# Exercise 8

## Car Price Prediction

Predict if the price of a car is low or high

#### Angie Paola Chacon Pinzon: 201012536
#### Jahir Stevens Rodriguez Riveros: 201819361
#### Juan Andres Cabal Jaramillo: 200620778
#### David Tavera Sánchez: 201016123

In [7]:
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn import metrics

data = pd.read_csv('./datasets/dataTrain_carListings.zip')
data = data.loc[data['Model'].str.contains('Camry')].drop(['Make', 'State'], axis=1)
data = data.join(pd.get_dummies(data['Model'], prefix='M'))
data['HighPrice'] = (data['Price'] > data['Price'].mean()).astype(int)
data = data.drop(['Model', 'Price'], axis=1)

data.head()

Unnamed: 0,Year,Mileage,M_Camry,M_Camry4dr,M_CamryBase,M_CamryL,M_CamryLE,M_CamrySE,M_CamryXLE,HighPrice
15,2016,29242,0,0,0,0,1,0,0,1
47,2015,26465,0,0,0,0,1,0,0,1
85,2012,46739,0,1,0,0,0,0,0,1
141,2017,41722,0,0,0,0,0,1,0,1
226,2014,77669,0,0,0,0,0,0,1,0


In [8]:
data.shape

(13150, 10)

In [9]:
y = data['HighPrice']
X = data.drop(['HighPrice'], axis=1)

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)


# Exercise 8.1

Estimate a Decision Tree Classifier Manually using the code created in the Notebook #13

Evaluate the accuracy on the testing set

In [11]:
def gini(y):
    if y.shape[0] == 0:
        return 0
    else:
        return 1 - (y.mean()**2 + (1 - y.mean())**2)
    
def gini_impurity(X_col, y, split):
    "Calculate the gain of an split k on feature j"
    
    filter_l = X_col < split
    y_l = y.loc[filter_l]
    y_r = y.loc[~filter_l]
    
    n_l = y_l.shape[0]
    n_r = y_r.shape[0]
    
    gini_y = gini(y)
    gini_l = gini(y_l)
    gini_r = gini(y_r)
    
    gini_impurity_ = gini_y - (n_l / (n_l + n_r) * gini_l + n_r / (n_l + n_r) * gini_r)
    
    return gini_impurity_

def best_split(X, y, num_pct=10):
    
    features = range(X.shape[1])
    
    best_split = [0, 0, 0]  # j, split, gain
    
    # For all features
    for j in features:
        
        splits = np.percentile(X.iloc[:, j], np.arange(0, 100, 100.0 / (num_pct+1)).tolist())
        splits = np.unique(splits)[1:]
        
        # For all splits
        for split in splits:
            gain = gini_impurity(X.iloc[:, j], y, split)
            #print(gain)            
            if gain > best_split[2]:
                best_split = [j, split, gain]
    
    return best_split

def tree_grow(X, y, level=0, min_gain=0.001, max_depth=None, num_pct=10):
    
    # If only one observation
    if X.shape[0] == 1:
        tree = dict(y_pred=y.iloc[:1].values[0], y_prob=0.5, level=level, split=-1, n_samples=1, gain=0)
        return tree
    
    # Calculate the best split
    j, split, gain = best_split(X, y, num_pct)
    
    # save tree and estimate prediction
    y_pred = int(y.mean() >= 0.5) 
    y_prob = (y.sum() + 1.0) / (y.shape[0] + 2.0)  # Laplace correction
    
    tree = dict(y_pred=y_pred, y_prob=y_prob, level=level, split=-1, n_samples=X.shape[0], gain=gain)
    
    # Check stooping criteria
    if gain < min_gain:
        return tree
    if max_depth is not None:
        if level >= max_depth:
            return tree   
    
    # No stooping criteria was meet, then continue to create the partition
    filter_l = X.iloc[:, j] < split
    X_l, y_l = X.loc[filter_l], y.loc[filter_l]
    X_r, y_r = X.loc[~filter_l], y.loc[~filter_l]
    tree['split'] = [j, split]

    # Next iteration to each split
    
    tree['sl'] = tree_grow(X_l, y_l, level + 1, min_gain=min_gain, max_depth=max_depth, num_pct=num_pct)
    tree['sr'] = tree_grow(X_r, y_r, level + 1, min_gain=min_gain, max_depth=max_depth, num_pct=num_pct)
    
    return tree 
def tree_predict(X, tree, proba=False):
    
    predicted = np.ones(X.shape[0])

    # Check if final node
    if tree['split'] == -1:
        if not proba:
            predicted = predicted * tree['y_pred']
        else:
            predicted = predicted * tree['y_prob']
            
    else:
        
        j, split = tree['split']
        filter_l = (X.iloc[:, j] < split)
        X_l = X.loc[filter_l]
        X_r = X.loc[~filter_l]

        if X_l.shape[0] == 0:  # If left node is empty only continue with right
            predicted[~filter_l] = tree_predict(X_r, tree['sr'], proba)
        elif X_r.shape[0] == 0:  # If right node is empty only continue with left
            predicted[filter_l] = tree_predict(X_l, tree['sl'], proba)
        else:
            predicted[filter_l] = tree_predict(X_l, tree['sl'], proba)
            predicted[~filter_l] = tree_predict(X_r, tree['sr'], proba)

    return predicted    


In [35]:
tree = tree_grow(X_train, y_train, level=0, min_gain=0.001, max_depth=3, num_pct=10)
y_hat=tree_predict(X_test, tree)
#y_hat[:4]

print("Accuracy:", metrics.accuracy_score(y_hat,y_test))
print("F1-Score:", metrics.f1_score(y_hat,y_test))

Accuracy: 0.8610599078341014
F1-Score: 0.882798833819242


# Exercise 8.2

Estimate a Bagging of 10 Decision Tree Classifiers Manually using the code created in the Notebook #7

Evaluate the accuracy on the testing set

In [26]:
# set a seed for reproducibility
np.random.seed(1)

n_samples = X_train.shape[0]
n_B = 10

samples = [np.random.choice(a = n_samples, size = n_samples, replace = True) for _ in range(1, n_B + 1 )]
samples

[array([ 235, 5192,  905, ..., 6990, 3970, 4337]),
 array([ 695, 2764, 6180, ..., 7495, 4266, 3256]),
 array([6434, 8533, 3845, ..., 8624, 2779, 1284]),
 array([6607,  682, 1153, ..., 4502, 3684, 8586]),
 array([5267, 1878, 3085, ..., 2889, 2678, 5273]),
 array([5564, 3084, 4061, ..., 3010, 3961, 5753]),
 array([3836,  646, 1289, ...,  608, 7604, 8458]),
 array([1201, 5768, 5392, ..., 1500, 1119, 3696]),
 array([4334, 3801, 2516, ..., 4745, 4402,  503]),
 array([3795, 4635, 6140, ..., 7759, 2223, 2405])]

In [32]:
y_pred = pd.DataFrame(index=y_test.index, columns=[list(range(n_B))])

for i  in range(1,n_B):
    X = X_train.iloc[samples[i], 0:]
    y = y_train.iloc[samples[i]]
    tree = tree_grow(X, y, level=0, min_gain=0.001, max_depth=3, num_pct=10)
    y_pred[[i]] = tree_predict(X_test, tree)

In [36]:
bag_pred = (y_pred.sum(axis = 1) >= (n_B / 2)).astype(np.int)
print("Accuracy:", metrics.accuracy_score(y_test, bag_pred))
print("F1-Score:", metrics.f1_score(y_test, bag_pred))

Accuracy: 0.8700460829493087
F1-Score: 0.8923253150057274


# Exercise 8.3

Implement the variable max_features on the Decision Tree Classifier created in 11.1.

Compare the impact in the results by varing the parameter max_features

Evaluate the accuracy on the testing set

In [37]:
def tree_grow_r(X, y, level=0, min_gain=0.001, max_depth=None, num_pct=10, max_features=None):
    #print("tr",max_features)
    # If only one observation
    if X.shape[0] == 1:
        tree = dict(y_pred=y.iloc[:1].values[0], y_prob=0.5, level=level, split=-1, n_samples=1, gain=0)
        return tree
    
    # select random columns
    X_rdn=X.sample(max_features, axis=1)
    #print("Xrdn:",X_rdn.head())
    # Calculate the best split
    j_rdn, split, gain = best_split(X_rdn, y, num_pct)
    
    j=X_train.columns.get_loc(X_rdn.columns[j_rdn])
    
    # save tree and estimate prediction
    y_pred = int(y.mean() >= 0.5) 
    y_prob = (y.sum() + 1.0) / (y.shape[0] + 2.0)  # Laplace correction
    
    tree = dict(y_pred=y_pred, y_prob=y_prob, level=level, split=-1, n_samples=X.shape[0], gain=gain)
    
    # Check stooping criteria
    if gain < min_gain:
        return tree
    if max_depth is not None:
        if level >= max_depth:
            return tree   
    
    # No stooping criteria was meet, then continue to create the partition
    filter_l = X.iloc[:, j] < split
    X_l, y_l = X.loc[filter_l], y.loc[filter_l]
    X_r, y_r = X.loc[~filter_l], y.loc[~filter_l]
    tree['split'] = [j, split]

    # Next iteration to each split
    
    tree['sl'] = tree_grow_r(X_l, y_l, level + 1, min_gain=min_gain, max_depth=max_depth, num_pct=num_pct,max_features=max_features)
    tree['sr'] = tree_grow_r(X_r, y_r, level + 1, min_gain=min_gain, max_depth=max_depth, num_pct=num_pct,max_features=max_features)
    
    return tree 

In [39]:
for m in range(1,10):
    tree_r = tree_grow_r(X_train, y_train, level=0, min_gain=0.001, max_depth=3, num_pct=10 ,max_features=m)
    y_hat_r=tree_predict(X_test, tree_r)
    print("maxf:",m," acc:",metrics.accuracy_score(y_hat_r,y_test))

maxf: 1  acc: 0.6612903225806451
maxf: 2  acc: 0.743778801843318
maxf: 3  acc: 0.5824884792626728
maxf: 4  acc: 0.8354838709677419
maxf: 5  acc: 0.8564516129032258
maxf: 6  acc: 0.8656682027649769
maxf: 7  acc: 0.864516129032258
maxf: 8  acc: 0.8626728110599078
maxf: 9  acc: 0.8610599078341014


# Exercise 8.4

Estimate a Bagging of 10 Decision Tree Classifiers with `max_features = log(n_features)`

Evaluate the accuracy on the testing set

In [44]:
mx=int(np.log(len(X.columns)))
np.random.seed(1)
n_samples = X_train.shape[0]
n_B = 10
samples_m = [np.random.choice(a = n_samples, size = n_samples, replace = True) for _ in range(1, n_B + 1 )]
y_pred_m = pd.DataFrame(index=y_test.index, columns=[list(range(n_B))])

for i  in range(1,n_B):
    X_m = X_train.iloc[samples_m[i], 0:]
    y_m = y_train.iloc[samples_m[i]]
    tree_m = tree_grow(X_m, y_m, level=0, min_gain=0.001, max_depth=mx, num_pct=10)
    y_pred_m[[i]] = tree_predict(X_test, tree_m)

In [45]:
bag_pred_m = (y_pred_m.sum(axis = 1) >= (n_B / 2)).astype(np.int)
print("Accuracy:", metrics.accuracy_score(y_test, bag_pred_m))
print("F1-Score:", metrics.f1_score(y_test, bag_pred_m))

Accuracy: 0.8610599078341014
F1-Score: 0.882798833819242


# Exercise 8.5

Using sklearn, train a RandomForestClassifier

Evaluate the accuracy on the testing set

In [47]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

clf = RandomForestClassifier(n_estimators=100, max_features=7,random_state=1, n_jobs=-1)
clf.fit(X_train, y_train)
y_pred=clf.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.845852534562212


# Exercise 8.6

Find the best parameters of the RandomForestClassifier (max_depth, max_features, n_estimators)

Evaluate the accuracy on the testing set

In [72]:
# list of values to try for max_features
depth_range = range(1,5)
feature_range = range(1, len(X_train.columns)+1)
estimator_range = range(10, 20, 5)

# list to store the average Accuracy for each value of max_features
accuracy_scores = []


for feature in feature_range:
    for depth in depth_range:
        for estim in estimator_range:
            clf = RandomForestClassifier(n_estimators=estim, max_depth=depth,max_features=feature, random_state=1, n_jobs=-1)
            #clf.fit(X_train,y_train)
            #y_pred=clf.predict(X_test)
            #acc=metrics.accuracy_score(y_pred,y_test)
            cv_acc=cross_val_score(clf, X_train, y_train, cv=5, scoring='accuracy').mean()
            accuracy_scores.append([feature,depth,estim,cv_acc])

In [79]:
df_acc=pd.DataFrame(accuracy_scores,columns=['f','d','e','acc'])
bm=df_acc[df_acc['acc']==df_acc['acc'].max()]
bm

Unnamed: 0,f,d,e,acc
23,3,4,15,0.877185


In [99]:
clf = RandomForestClassifier(n_estimators=bm['e'].values[0], max_depth=bm['d'].values[0],
                             max_features=bm['f'].values[0], random_state=1, n_jobs=-1)
clf.fit(X_train,y_train)
y_pred=clf.predict(X_test)
y_pred
#print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

array([0, 1, 1, ..., 1, 1, 0])

# Exercise 8.7 

Using xgboost train a XGBClassifier 

Evaluate the accuracy on the testing set

In [101]:
from xgboost import XGBClassifier

clfXG = XGBClassifier()
clfXG.fit(X_train, y_train)
y_pred = clf.predict(X_test)
metrics.f1_score(y_pred, y_test.values), metrics.accuracy_score(y_pred, y_test.values)

(0.8917995444191343, 0.868663594470046)

# Exercise 8.8

Using xgboost train a XGBClassifier 

Modify the parameters learning rate, gamma, colsample_bytree. Explain what each parameter means.

Evaluate the accuracy on the testing set

- Learning rate:  0<𝜂≤1. Fracción de los componentes de un árbol que serán usadas para la estimación del siguiente árbol. Es usado para mejorar la generalización del árbol. Cuando más pequeño es el parámetro es mejor la generalización que hace. Cuándo el valor es 1, significa que se usan todos los componentes del árbol actual para crear el siguiente árbol
- gamma: Ganancia mínima que debe superar una partición para ser incluida en el árbol. Es un número entre [0,inf] y a medida que es más grande el árbol final es más simple.
- colsample_bytree: Fracción de las columnas que serán tomadas para la construcción de cada árbol. tiene un valor entre 0 y 1 siendo 1:"tomar todas las columnas"

In [None]:
from xgboost import XGBClassifier

clfXG = XGBClassifier()
clfXG.fit(X_train, y_train)
y_pred = clf.predict(X_test)
metrics.f1_score(y_pred, y_test.values), metrics.accuracy_score(y_pred, y_test.values)

In [108]:
# list of values to try for max_features
learning_range = np.arange(0, 1, 0.1)
gamma_range = range(1, 10)
colsample_range =np.arange(0, 1, 0.1)

# list to store the average Accuracy for each value of max_features
accuracy_scores = []

for lr in learning_range:
    for g in gamma_range:
        for cs in colsample_range:
            clf = XGBClassifier(eta=lr, gamma=g,colsample_bytree=cs)
            clf.fit(X_train,y_train)
            y_pred=clf.predict(X_test)
            acc=metrics.accuracy_score(y_pred,y_test)
            accuracy_scores.append([lr,g,cs,acc])

In [109]:
dfXB_acc=pd.DataFrame(accuracy_scores,columns=['lr','g','cs','acc'])
bmXG=dfXB_acc[dfXB_acc['acc']==dfXB_acc['acc'].max()]
bmXG

Unnamed: 0,lr,g,cs,acc
396,0.4,4,0.6,0.885023
