# Train Test Split

In [None]:
???

![](src/comparison.png)

## Data loading

In [1]:
import pandas as pd

df = pd.read_excel(
    '../../data/data_stock_apple.xlsx',
    parse_dates=['Date'], index_col=0
    )
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,change_tomorrow,change_tomorrow_direction
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2007-01-09,3.087500,3.320714,3.041071,3.306071,3349298400,4.567032,UP
2007-01-10,3.383929,3.492857,3.337500,3.464286,2952880000,-1.252610,DOWN
...,...,...,...,...,...,...,...
2023-06-28,187.929993,189.899994,187.600006,189.250000,51216800,0.179332,UP
2023-06-29,189.080002,190.070007,188.940002,189.589996,46347300,2.258084,UP


## Feature selection

In [19]:
target = df.change_tomorrow_direction
explanatory = df[['Open','High','Low','Close','Volume']]

## Machine Learning System

Decision Tree Classifier

### Calculate the score in all dataset

In [20]:
from sklearn.tree import DecisionTreeClassifier

model_dt_all = DecisionTreeClassifier()
model_dt_all.fit(X=explanatory, y=target)
model_dt_all.predict(X=explanatory)
model_dt_all.score(X=explanatory, y=target)

1.0

### Calculate the score in the test set

#### Create train and test sets

Look for the function in `model_selection`

In [46]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(explanatory, target, test_size=0.30, random_state=1)
X_train

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2016-05-11,23.370001,23.392500,23.115000,23.127501,114876400
2011-07-26,14.285714,14.446429,14.274286,14.407500,476582400
...,...,...,...,...,...
2022-10-28,148.199997,157.500000,147.820007,155.740005,164762400
2007-12-13,6.792500,6.861429,6.707857,6.851071,864617600


In [47]:
X_test

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-03-05,73.879997,74.887497,72.852501,73.230003,187572800
2008-06-12,6.481786,6.521429,6.114286,6.187857,1308333600
...,...,...,...,...,...
2014-08-05,23.840000,23.920000,23.590000,23.780001,223732000
2010-08-03,9.321786,9.402143,9.265000,9.354643,417653600


#### Machine Learning development

In [48]:
model_dt_train = DecisionTreeClassifier(random_state=42)
model_dt_train.fit(X=X_train, y=y_train)
model_dt_train.predict(X=X_test)
model_dt_train.score(X=X_test, y=y_test)

0.5140562248995983

### Reflect

#### Compare predictions to reality in a `DataFrame`

In [49]:
df_pred = y_test.to_frame()
df_pred['prediction_all'] = model_dt_all.predict(X=X_test)
df_pred['prediction_train_test'] = model_dt_train.predict(X=X_test)
df_pred

Unnamed: 0_level_0,change_tomorrow_direction,prediction_all,prediction_train_test
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-03-05,DOWN,DOWN,UP
2008-06-12,DOWN,DOWN,UP
...,...,...,...
2014-08-05,DOWN,DOWN,UP
2010-08-03,UP,UP,UP


## Other algorithms

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

#### Calculate the score in all dataset

In [64]:
model_lr_train = LogisticRegression()
model_lr_train.fit(X=X_train, y=y_train)
score_lr_train = model_lr_train.score(X=X_test, y=y_test)
score_lr_train

0.5389558232931727

#### Calculate the score in the test set

In [66]:
model_lr_all = LogisticRegression()
model_lr_all.fit(X=explanatory, y=target)
score_lr_all = model_lr_all.score(X=X_test, y=y_test)
score_lr_all

0.5389558232931727

### Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

#### Calculate the score in all dataset

In [68]:
model_gb_all = GradientBoostingClassifier()
model_gb_all.fit(X=explanatory, y=target)
score_gb_all = model_gb_all.score(X=X_test, y=y_test)
score_gb_all

0.6538152610441768

#### Calculate the score in the test set

In [None]:
model_gb_train = GradientBoostingClassifier()
model_gb_train.fit(X=X_train, y=y_train)
score_gb_train = model_gb_train.score(X=X_test, y=y_test)
score_gb_train

0.5148594377510041

## Compare all models in a `DataFrame`

In [60]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier

list_model = [LogisticRegression(), DecisionTreeClassifier(), GradientBoostingClassifier()]

list_model_train = []
for model in list_model:
    model.fit(X=X_train, y=y_train)
    score = model.score(X=X_test, y=y_test)
    list_model_train.append(score)
    
list_model_all = []
for model in list_model:
    model.fit(X=explanatory, y=target)
    score = model.score(X=X_test, y=y_test)
    list_model_all.append(score)

In [61]:
df_score = pd.DataFrame({
    'model': list(map(lambda x: x.__class__.__name__, list_model)),
    'score_all': list_model_all,
    'score_train_test': list_model_train,
})

df_score

Unnamed: 0,model,score_all,score_train_test
0,LogisticRegression,0.538956,0.538956
1,DecisionTreeClassifier,1.0,0.507631
2,GradientBoostingClassifier,0.653815,0.515663
