In [33]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from careful_split import careful_split
from helpers import my_train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

%matplotlib inline

In [23]:
numerical_features = ['satisfaction']
categorical_features = ['relevance', 'pace']
predictors = numerical_features + categorical_features

In [24]:
ds = pd.read_csv('ds_tidy.csv', parse_dates=['date'])
ds.head()

Unnamed: 0,turma,date,student,topicos,component,relevance,learn,satisfaction,pace,tool,algorithm
0,6,2019-04-18,25,aplicações de clusterização de dados,Agrupamento,4,1,6,5,False,False
1,6,2019-04-18,25,MeanShift e DBSCAN,Agrupamento/Algoritmo/ML,4,2,6,5,False,True
2,6,2019-04-18,47,aplicações de clusterização de dados,Agrupamento,5,1,7,7,False,False
3,6,2019-04-18,47,MeanShift e DBSCAN,Agrupamento/Algoritmo/ML,5,2,7,7,False,True
4,6,2019-04-18,32,aplicações de clusterização de dados,Agrupamento,4,1,5,7,False,False


In [25]:
ds['lesson'] = ds.date.astype(str) + '/' + ds.turma.astype(str)

In [26]:
subset_1, subset_2 = careful_split(ds)

In [27]:
data_train, data_test = my_train_test_split(subset_1)

In [28]:
dummies = pd.get_dummies(data_train[categorical_features].astype(str), drop_first=True)
features = pd.concat([data_train[numerical_features], dummies], axis=1)

In [29]:
X_train = features.to_numpy()
y_train = data_train.learn.to_numpy()

X_test = features.to_numpy()
y_test = data_train.learn.to_numpy()

In [30]:
params = {
    'max_depth': np.linspace(10, 100, 5),
    'min_samples_split': [2,3,4]
}

regressor = GridSearchCV(DecisionTreeRegressor(random_state=42), params)
model = regressor.fit(X_train, y_train)

In [31]:
y_pred = model.predict(X_test)

In [35]:
def r2_adjusted(r2, n, p):
    return 1 - (1 - r2) * (n-1) / (n-p-1)

In [36]:
mse = mean_squared_error(y_test, y_pred)
print(f'MSE: {mse}')

rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f'RMSE: {rmse}')

mae = mean_absolute_error(y_test, y_pred)
print(f'MAE: {mae}')

r2 = r2_score(y_test, y_pred)
print(f'R2: {r2}')

n = len(y_test)
print(f'R2 ajustado: {r2_adjusted(r2, n, 3)}')

MSE: 0.6002385786880667
RMSE: 0.7747506558164806
MAE: 0.5958716727113171
R2: 0.18073884774149307
R2 ajustado: 0.17874064980915527
