# Train Test Split

## Data loading

In [40]:
import pandas as pd
import seaborn as sns

df_mpg = pd.get_dummies(sns.load_dataset('mpg', index_col='name').dropna())
df_mpg

Unnamed: 0_level_0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin_europe,origin_japan,origin_usa
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
chevrolet chevelle malibu,18.0,8,307.0,130.0,3504,12.0,70,0,0,1
buick skylark 320,15.0,8,350.0,165.0,3693,11.5,70,0,0,1
...,...,...,...,...,...,...,...,...,...,...
ford ranger,28.0,4,120.0,79.0,2625,18.6,82,0,0,1
chevy s-10,31.0,4,119.0,82.0,2720,19.4,82,0,0,1


## Feature selection

In [41]:
target = df_mpg.mpg
explanatory = df_mpg.drop(columns='mpg')

## Machine Learning System

Decision ensemble Regressor

### Calculate the score in all dataset

In [43]:
from sklearn.tree import DecisionTreeRegressor

model_dt_all = DecisionTreeRegressor()
model_dt_all.fit(X=explanatory, y=target)
model_dt_all.predict(X=explanatory)
model_dt_all.score(X=explanatory, y=target)

1.0

### Calculate the score in the test set

#### Create train and test sets

Look for the function in `model_selection`

In [44]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(explanatory, target, test_size=0.30, random_state=1)
X_train

Unnamed: 0_level_0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin_europe,origin_japan,origin_usa
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
triumph tr7 coupe,4,122.0,88.0,2500,15.1,80,1,0,0
chevy s-10,4,119.0,82.0,2720,19.4,82,0,0,1
...,...,...,...,...,...,...,...,...,...
chevrolet chevette,4,98.0,63.0,2051,17.0,77,0,0,1
chevrolet impala,8,350.0,165.0,4209,12.0,71,0,0,1


In [45]:
X_test

Unnamed: 0_level_0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin_europe,origin_japan,origin_usa
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
toyouta corona mark ii (sw),4,120.0,97.0,2506,14.5,72,0,1,0
toyota corolla,4,97.0,75.0,2171,16.0,75,0,1,0
...,...,...,...,...,...,...,...,...,...
chevrolet monza 2+2,8,262.0,110.0,3221,13.5,75,0,0,1
ford ltd,8,351.0,158.0,4363,13.0,73,0,0,1


#### Machine Learning development

In [46]:
model_dt_train = DecisionTreeRegressor(random_state=42)
model_dt_train.fit(X=X_train, y=y_train)
model_dt_train.predict(X=X_test)
model_dt_train.score(X=X_test, y=y_test)

0.8045060556582773

### Reflect

#### Compare predictions to reality in a `DataFrame`

In [47]:
df_pred = y_test.to_frame()
df_pred['prediction_all'] = model_dt_all.predict(X=X_test)
df_pred['prediction_train_test'] = model_dt_train.predict(X=X_test)
df_pred

Unnamed: 0_level_0,mpg,prediction_all,prediction_train_test
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
toyouta corona mark ii (sw),23.0,23.0,22.0
toyota corolla,29.0,29.0,28.0
...,...,...,...
chevrolet monza 2+2,20.0,20.0,13.0
ford ltd,13.0,13.0,10.0


## Other algorithms

### Linear Regression

In [48]:
from sklearn.linear_model import LinearRegression

#### Calculate the score in all dataset

In [49]:
model_lr_train = LinearRegression()
model_lr_train.fit(X=X_train, y=y_train)
score_lr_train = model_lr_train.score(X=X_test, y=y_test)
score_lr_train

0.8242172817490033

#### Calculate the score in the test set

In [50]:
model_lr_all = LinearRegression()
model_lr_all.fit(X=explanatory, y=target)
score_lr_all = model_lr_all.score(X=X_test, y=y_test)
score_lr_all

0.8369131723159857

### Random Forest

In [52]:
from sklearn.ensemble import RandomForestRegressor

#### Calculate the score in all dataset

In [53]:
model_rf_all = RandomForestRegressor()
model_rf_all.fit(X=explanatory, y=target)
score_rf_all = model_rf_all.score(X=X_test, y=y_test)
score_rf_all

0.9783493318318217

#### Calculate the score in the test set

In [54]:
model_rf_train = RandomForestRegressor()
model_rf_train.fit(X=X_train, y=y_train)
score_rf_train = model_rf_train.score(X=X_test, y=y_test)
score_rf_train

0.8544468187825718

## Compare all models in a `DataFrame`

In [60]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression

list_model = [DecisionTreeRegressor(), RandomForestRegressor(), LinearRegression()]

list_model_train = []
for model in list_model:
    model.fit(X=X_train, y=y_train)
    score = model.score(X=X_test, y=y_test)
    list_model_train.append(score)
    
list_model_all = []
for model in list_model:
    model.fit(X=explanatory, y=target)
    score = model.score(X=X_test, y=y_test)
    list_model_all.append(score)

In [61]:
df_score = pd.DataFrame({
    'model': list(map(lambda x: x.__class__.__name__, list_model)),
    'score_all': list_model_all,
    'score_train_test': list_model_train,
})

df_score

Unnamed: 0,model,score_all,score_train_test
0,DecisionTreeRegressor,1.0,0.807948
1,RandomForestRegressor,0.978396,0.845569
2,LinearRegression,0.836913,0.824217
