In [5]:
import pandas as pd
import numpy as np

from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
from careful_split import careful_split
from helpers import my_train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import smogn

%matplotlib inline

In [6]:
numerical_features = ['satisfaction']
categorical_features = ['relevance', 'pace']
predictors = numerical_features + categorical_features

In [7]:
ds = pd.read_csv('ds_tidy.csv', parse_dates=['date'])
ds.head()

Unnamed: 0,turma,date,student,topicos,component,relevance,learn,satisfaction,pace,tool,algorithm
0,6,2019-04-18,25,aplicações de clusterização de dados,Agrupamento,4,1,6,5,False,False
1,6,2019-04-18,25,MeanShift e DBSCAN,Agrupamento/Algoritmo/ML,4,2,6,5,False,True
2,6,2019-04-18,47,aplicações de clusterização de dados,Agrupamento,5,1,7,7,False,False
3,6,2019-04-18,47,MeanShift e DBSCAN,Agrupamento/Algoritmo/ML,5,2,7,7,False,True
4,6,2019-04-18,32,aplicações de clusterização de dados,Agrupamento,4,1,5,7,False,False


In [8]:
ds['lesson'] = ds.date.astype(str) + '/' + ds.turma.astype(str)

In [9]:
subset_1, subset_2 = careful_split(ds)

In [10]:
data_train, data_test = my_train_test_split(subset_1)

In [11]:
dummies = pd.get_dummies(data_train[categorical_features].astype(str), drop_first=True)
features = pd.concat([data_train[numerical_features], dummies], axis=1)

In [12]:
X_train = features.to_numpy()
y_train = data_train.learn.to_numpy()

X_test = features.to_numpy()
y_test = data_train.learn.to_numpy()

In [15]:
params = {
    'alpha': np.logspace(-3,1,5)
}

model = GridSearchCV(ElasticNet(random_state=42), params)
model.fit(X_train, y_train)

GridSearchCV(estimator=ElasticNet(random_state=42),
             param_grid={'alpha': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01])})

In [16]:
y_pred = model.predict(X_test)

In [17]:
def r2_adjusted(r2, n, p):
    return 1 - (1 - r2) * (n-1) / (n-p-1)

In [18]:
mse = mean_squared_error(y_test, y_pred)
print(f'MSE: {mse}')

rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f'RMSE: {rmse}')

mae = mean_absolute_error(y_test, y_pred)
print(f'MAE: {mae}')

r2 = r2_score(y_test, y_pred)
print(f'R2: {r2}')

n = len(y_test)
print(f'R2 ajustado: {r2_adjusted(r2, n, 3)}')

MSE: 0.6714189740399347
RMSE: 0.8194015950923788
MAE: 0.6399646061564104
R2: 0.08358525784454518
R2 ajustado: 0.08135009993684894


In [113]:
feature_importance = model.feature_importances_
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5
fig = plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.barh(pos, feature_importance[sorted_idx], align='center')
plt.yticks(pos, np.array(features.columns)[sorted_idx])
plt.title('Feature Importance')

AttributeError: 'GridSearchCV' object has no attribute 'feature_importances_'