# Import libraries

In [38]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['figure.figsize'] = (10,10)

# Create model

In [73]:
tree = DecisionTreeClassifier()

# Balanced + 0

## Import dataset

In [3]:
X_0_oh = pd.read_csv('data/X_0_oh.csv')
y_0_oh = pd.read_csv('data/y_0_oh.csv')

Divide in train and test

In [41]:
X_train_0, X_test_0, y_train_0, y_test_0 = train_test_split(X_0_oh, y_0_oh, test_size=0.2)

## Fit the model

In [43]:
tree.fit(X_train_0, y_train_0)
tree.score(X_test_0, y_test_0)

0.7978090766823162

In [44]:
tree.score(X_train_0, y_train_0)

1.0

As the result is 1.0 with the train data, it means that the model is doing overfitting

Check parameters of the decision tree:

In [54]:
tree.max_features_

19

In [55]:
tree.min_samples_split

2

## GridSearch

In [46]:
params = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [5,10, 15, 20, None],
    'min_samples_split': [2, 3, 4, 5, 6]
}

In [47]:
model_sel = GridSearchCV(estimator = tree, 
                        param_grid = params, 
                        cv = 5, 
                        n_jobs = -1, 
                        verbose = 2)

In [51]:
model_sel.fit(X_train_0, y_train_0)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    4.0s finished


GridSearchCV(cv=5, estimator=DecisionTreeClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [5, 10, 15, 20, None],
                         'min_samples_split': [2, 3, 4, 5, 6],
                         'splitter': ['best', 'random']},
             verbose=2)

In [56]:
model_sel.best_estimator_

DecisionTreeClassifier(min_samples_split=6)

In [57]:
model_sel.best_params_

{'criterion': 'gini',
 'max_depth': None,
 'min_samples_split': 6,
 'splitter': 'best'}

In [53]:
model_sel.score(X_test_0, y_test_0)

0.8068857589984351

# Balanced X,y with mean and KNN

In [4]:
X_mean_knn = pd.read_csv('data/X_mean_knn.csv')
y_mean_knn = pd.read_csv('data/y_mean_knn.csv')

Divide the data en train and test:

In [90]:
X_train_mean, X_test_mean, y_train_mean, y_test_mean = train_test_split(X_mean_knn, y_mean_knn, test_size=0.2)

In [91]:
tree.fit(X_train_mean, y_train_mean)
tree.score(X_test_mean, y_test_mean)

0.8208908406524467

In [92]:
tree.score(X_train_mean, y_train_mean)

1.0

As the result is 1.0 with the train data, it means that the model is doing overfitting

## GridSearch

In [93]:
params = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [5,10, 15, 20, None],
    'min_samples_split': [2, 3, 4, 5, 6]
}

In [98]:
model_sel = GridSearchCV(estimator = tree, 
                        param_grid = params, 
                        cv = 5,  
                        verbose = 1)

In [99]:
model_sel.fit(X_train_mean, y_train_mean)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:   18.2s finished


GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [5, 10, 15, 20, None],
                         'min_samples_split': [2, 3, 4, 5, 6],
                         'splitter': ['best', 'random']},
             verbose=1)

In [100]:
model_sel.best_estimator_

DecisionTreeClassifier(criterion='entropy', min_samples_split=3)

In [101]:
model_sel.best_params_

{'criterion': 'entropy',
 'max_depth': None,
 'min_samples_split': 3,
 'splitter': 'best'}

In [102]:
model_sel.score(X_test_mean, y_test_mean)

0.8321831869510665

# Conclusions

The results are better with the dataset with balanced data, filled with mean values and KNN for the page values feature