---
### Preparando os dados

#### Import da base com todas as variáveis

In [1]:
import pandas as pd

df = pd.read_csv('dados/final.csv',index_col=0)
df.head()

Unnamed: 0,qt_salas_utilizadas,qt_salas_utiliza_climatizadas,qt_equip_dvd,qt_equip_som,qt_equip_tv,qt_equip_lousa_digital,qt_equip_multimidia,qt_desktop_aluno,qt_comp_portatil_aluno,qt_tablet_aluno,...,in_prof_sim,in_prof_tec_sim,in_eja_sim,in_eja_fund_sim,in_eja_med_sim,in_esp_sim,in_esp_cc_sim,in_esp_ce_sim,tp_lingua_indigena,tp_lingua_portugues
11024968,15,15,1,1,3,3,10,12,0,0,...,0,0,0,0,0,0,0,0,0,1
11025638,44,31,1,2,21,0,15,22,15,98,...,0,0,0,0,0,1,1,0,0,1
11006773,25,0,0,1,11,0,6,25,0,0,...,0,0,1,0,1,1,1,0,0,1
11006889,17,17,4,4,4,0,5,12,0,0,...,0,0,0,0,0,1,1,0,0,1
11007168,18,18,0,0,3,2,10,4,7,4,...,0,0,0,0,0,1,1,0,0,1


#### Divisão Variáveis Explicativas / Variável Resposta

In [2]:
import numpy as np
df = df.sample(frac=1)
y = df[['nota enem']]
X = df.drop(columns=['nota enem'])

print(X.shape, y.shape)

(19022, 218) (19022, 1)


In [3]:
pd.options.display.max_columns = 300
print(X.columns)

Index(['qt_salas_utilizadas', 'qt_salas_utiliza_climatizadas', 'qt_equip_dvd',
       'qt_equip_som', 'qt_equip_tv', 'qt_equip_lousa_digital',
       'qt_equip_multimidia', 'qt_desktop_aluno', 'qt_comp_portatil_aluno',
       'qt_tablet_aluno',
       ...
       'in_prof_sim', 'in_prof_tec_sim', 'in_eja_sim', 'in_eja_fund_sim',
       'in_eja_med_sim', 'in_esp_sim', 'in_esp_cc_sim', 'in_esp_ce_sim',
       'tp_lingua_indigena', 'tp_lingua_portugues'],
      dtype='object', length=218)


#### Normalização

Como nossos dados possuem valores considerados outlier, e não esses outliers podem indicar algum tipo de relação, decidimos utilizar o RobustScaler para realizar a normalização.

In [4]:
from sklearn.preprocessing import RobustScaler

scaler_X, scaler_y = RobustScaler(), RobustScaler()
scaler_X.fit(X)
scaler_y.fit(y)

X_norm, y_norm = scaler_X.transform(X), scaler_y.transform(y)
X, y = pd.DataFrame(X_norm, columns=X.columns), pd.DataFrame(y_norm, columns=y.columns)

X, y = np.array(X), np.array(y)
print(X.shape, y.shape)

(19022, 218) (19022, 1)


#### Divisão Treino / Teste

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, train_size=0.7, random_state=42)
print('Dados Treino', X_train.shape, y_train.shape)
print('Dados Teste', X_test.shape, y_test.shape)

Dados Treino (13315, 218) (13315, 1)
Dados Teste (5707, 218) (5707, 1)


---
### Modelo - Ridge

In [6]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

model = Ridge()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
MAE = mean_absolute_error(y_pred=y_pred, y_true=y_test)
MSE = mean_squared_error(y_pred=y_pred, y_true=y_test)
R2 = r2_score(y_pred=y_pred, y_true=y_test)
print(f'MAE:{MAE}    MSE:{MSE}    R2:{R2}')

MAE:0.30974285733061946    MSE:0.17048411025003377    R2:0.7753556236537882


In [7]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_validate

model = Ridge()
cv_results = cross_validate(estimator=model, X=X, y=y, cv=10, n_jobs=-1, return_train_score=True, 
    scoring=('neg_mean_absolute_error', 'neg_mean_squared_error'))

print(cv_results['test_neg_mean_absolute_error'])
print(cv_results['test_neg_mean_squared_error'])

[-0.30315079 -0.30823456 -0.3061251  -0.31611158 -0.36441121 -0.30399576
 -0.31359403 -0.3052551  -0.2978458  -0.30416329]
[-0.16511184 -0.1634745  -0.15835135 -0.1739074  -7.70044954 -0.16968431
 -0.17798866 -0.16174233 -0.15437362 -0.16347345]


---
### Modelo - Elastic Net

In [8]:
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

model = ElasticNet()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
MAE = mean_absolute_error(y_pred=y_pred, y_true=y_test)
MSE = mean_squared_error(y_pred=y_pred, y_true=y_test)
R2 = r2_score(y_pred=y_pred, y_true=y_test)
print(f'MAE:{MAE}    MSE:{MSE}    R2:{R2}')

print(df.columns[np.where(np.abs(model.coef_) > 0)])
print('Indices das variaveis com coeficientes não zerados {}')

MAE:0.6355738883301159    MSE:0.6859695984552339    R2:0.09610806302456099
Index(['qt_equip_multimidia', 'qt_comp_portatil_aluno', 'qt_tablet_aluno',
       'qt_prof_administrativos', 'qt_prof_bibliotecario', 'qt_prof_saude',
       'qt_prof_seguranca', 'qt_prof_monitores', 'qt_doc_eja', 'qt_tur_prof',
       'qt_tur_prof_tec'],
      dtype='object')
Indices das variaveis com coeficientes não zerados {}


In [9]:
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import cross_validate

model = ElasticNet()
cv_results = cross_validate(estimator=model, X=X, y=y, cv=10, n_jobs=-1, return_train_score=True, 
    scoring=('neg_mean_absolute_error', 'neg_mean_squared_error'))

print(cv_results['test_neg_mean_absolute_error'])
print(cv_results['test_neg_mean_squared_error'])

[-0.63906317 -0.63251279 -0.6378395  -0.64023167 -0.63792826 -0.6359887
 -0.65956708 -0.64175679 -0.63775149 -0.62957085]
[-0.6756713  -0.67000112 -0.66689043 -0.69117833 -0.69162331 -0.69023396
 -0.72510635 -0.67941952 -0.66030681 -0.67626878]


---
### Modelo - Lasso

In [10]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

model = Lasso()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
MAE = mean_absolute_error(y_pred=y_pred, y_true=y_test)
MSE = mean_squared_error(y_pred=y_pred, y_true=y_test)
R2 = r2_score(y_pred=y_pred, y_true=y_test)
print(f'MAE:{MAE}    MSE:{MSE}    R2:{R2}')

print(df.columns[np.where(np.abs(model.coef_) > 0)])
print('Indices das variaveis com coeficientes não zerados {}')

MAE:0.6607944475925429    MSE:0.7356299032766539    R2:0.03067141799406936
Index(['qt_comp_portatil_aluno', 'qt_tablet_aluno', 'qt_prof_saude',
       'qt_prof_seguranca', 'qt_prof_monitores', 'qt_doc_eja'],
      dtype='object')
Indices das variaveis com coeficientes não zerados {}


In [11]:
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import cross_validate

model = ElasticNet()
cv_results = cross_validate(estimator=model, X=X, y=y, cv=10, n_jobs=-1, return_train_score=True, 
    scoring=('neg_mean_absolute_error', 'neg_mean_squared_error'))

print(cv_results['test_neg_mean_absolute_error'])
print(cv_results['test_neg_mean_squared_error'])

[-0.63906317 -0.63251279 -0.6378395  -0.64023167 -0.63792826 -0.6359887
 -0.65956708 -0.64175679 -0.63775149 -0.62957085]
[-0.6756713  -0.67000112 -0.66689043 -0.69117833 -0.69162331 -0.69023396
 -0.72510635 -0.67941952 -0.66030681 -0.67626878]


---
### Modelo - Regressão Linear

In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
MAE = mean_absolute_error(y_pred=y_pred, y_true=y_test)
MSE = mean_squared_error(y_pred=y_pred, y_true=y_test)
R2 = r2_score(y_pred=y_pred, y_true=y_test)
print(f'MAE:{MAE}    MSE:{MSE}    R2:{R2}')

MAE:0.30984762228399887    MSE:0.1706591374296615    R2:0.7751249929424954


In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate

model = LinearRegression()
cv_results = cross_validate(estimator=model, X=X, y=y, cv=10, n_jobs=-1, return_train_score=True, 
    scoring=('neg_mean_absolute_error', 'neg_mean_squared_error'))

print(cv_results['test_neg_mean_absolute_error'])
print(cv_results['test_neg_mean_squared_error'])

[-0.30329332 -0.30860724 -0.30631918 -0.31619375 -0.36452795 -0.30411833
 -0.31366691 -0.30521076 -0.29808923 -0.30468195]
[-0.16526499 -0.16400377 -0.15855883 -0.17412914 -7.70813133 -0.16968012
 -0.1779659  -0.16167925 -0.15470435 -0.16446078]
