In [187]:

import functools
from math import sqrt

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
import scipy.stats as sct
import seaborn as sns
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, median_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [188]:

# Algumas configurações para o matplotlib.
%matplotlib inline

from IPython.core.pylabtools import figsize


figsize(12, 12)

sns.set()

In [189]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [190]:
df_train = df_train[['NU_INSCRICAO', 'CO_UF_RESIDENCIA', 'SG_UF_RESIDENCIA', 'NU_IDADE',
       'TP_SEXO', 'TP_COR_RACA', 'TP_NACIONALIDADE', 'TP_ST_CONCLUSAO',
       'TP_ANO_CONCLUIU', 'TP_ESCOLA', 'TP_ENSINO', 'IN_TREINEIRO',
       'TP_DEPENDENCIA_ADM_ESC', 'IN_BAIXA_VISAO', 'IN_CEGUEIRA', 'IN_SURDEZ',
       'IN_DISLEXIA', 'IN_DISCALCULIA', 'IN_SABATISTA', 'IN_GESTANTE',
       'IN_IDOSO', 'TP_PRESENCA_CN', 'TP_PRESENCA_CH', 'TP_PRESENCA_LC',
       'CO_PROVA_CN', 'CO_PROVA_CH', 'CO_PROVA_LC', 'CO_PROVA_MT',
       'NU_NOTA_CN', 'NU_NOTA_CH', 'NU_NOTA_LC', 'NU_NOTA_MT', 'TP_LINGUA',
       'TP_STATUS_REDACAO', 'NU_NOTA_COMP1', 'NU_NOTA_COMP2', 'NU_NOTA_COMP3',
       'NU_NOTA_COMP4', 'NU_NOTA_COMP5', 'NU_NOTA_REDACAO', 'Q001', 'Q002',
       'Q006', 'Q024', 'Q025', 'Q026', 'Q027', 'Q047']]

In [191]:
df_train.drop(['NU_INSCRICAO', 'CO_PROVA_CN', 'CO_PROVA_CH', 'CO_PROVA_LC', 'CO_PROVA_MT'], axis=1, inplace=True)
df_test.drop(['NU_INSCRICAO', 'CO_PROVA_CN', 'CO_PROVA_CH', 'CO_PROVA_LC', 'CO_PROVA_MT'], axis=1, inplace=True)

In [192]:
df_train = pd.get_dummies(df_train)
df_test = pd.get_dummies(df_test)

In [193]:
df_train['TP_DEPENDENCIA_ADM_ESC'].fillna(1, inplace=True)
df_train['TP_ENSINO'].fillna(1, inplace=True)
df_train['TP_STATUS_REDACAO'].fillna(1, inplace=True)
df_train['NU_NOTA_COMP2'].fillna(df_train['NU_NOTA_COMP2'].kurtosis(), inplace=True)
df_train['NU_NOTA_LC'].fillna(df_train['NU_NOTA_LC'].kurtosis(), inplace=True)
df_train['NU_NOTA_COMP1'].fillna(df_train['NU_NOTA_COMP1'].kurtosis(), inplace=True)
df_train['NU_NOTA_COMP3'].fillna(df_train['NU_NOTA_COMP3'].kurtosis(), inplace=True)
df_train['NU_NOTA_COMP5'].fillna(df_train['NU_NOTA_COMP5'].kurtosis(), inplace=True)
df_train['NU_NOTA_REDACAO'].fillna(df_train['NU_NOTA_REDACAO'].kurtosis(), inplace=True)
df_train['NU_NOTA_COMP4'].fillna(df_train['NU_NOTA_COMP4'].kurtosis(), inplace=True)
df_train['NU_NOTA_CH'].fillna(df_train['NU_NOTA_CH'].kurtosis(), inplace=True)
df_train['NU_NOTA_CN'].fillna(df_train['NU_NOTA_CN'].kurtosis(), inplace=True)
df_train['NU_NOTA_MT'].fillna(0, inplace=True)

# df_train['TP_DEPENDENCIA_ADM_ESC'].fillna(1, inplace=True)
# df_train['TP_ENSINO'].fillna(1, inplace=True)
# df_train['TP_STATUS_REDACAO'].fillna(1, inplace=True)
# df_train['NU_NOTA_COMP2'].fillna(df_train['NU_NOTA_COMP2'].kurtosis(), inplace=True)
# df_train['NU_NOTA_LC'].fillna(df_train['NU_NOTA_LC'].kurtosis(), inplace=True)
# df_train['NU_NOTA_COMP1'].fillna(df_train['NU_NOTA_COMP1'].kurtosis(), inplace=True)
# df_train['NU_NOTA_COMP3'].fillna(df_train['NU_NOTA_COMP3'].kurtosis(), inplace=True)
# df_train['NU_NOTA_COMP5'].fillna(df_train['NU_NOTA_COMP5'].kurtosis(), inplace=True)
# df_train['NU_NOTA_REDACAO'].fillna(df_train['NU_NOTA_REDACAO'].kurtosis(), inplace=True)
# df_train['NU_NOTA_COMP4'].fillna(df_train['NU_NOTA_COMP4'].kurtosis(), inplace=True)
# df_train['NU_NOTA_CH'].fillna(317.4, inplace=True)
# df_train['NU_NOTA_CN'].fillna(316.5, inplace=True)
# df_train['NU_NOTA_MT'].fillna(0, inplace=True)

In [194]:
df_test['TP_DEPENDENCIA_ADM_ESC'].fillna(1, inplace=True)
df_test['TP_ENSINO'].fillna(1, inplace=True)
df_test['TP_STATUS_REDACAO'].fillna(1, inplace=True)
df_test['NU_NOTA_COMP2'].fillna(df_test['NU_NOTA_COMP2'].kurtosis(), inplace=True)
df_test['NU_NOTA_LC'].fillna(df_test['NU_NOTA_LC'].kurtosis(), inplace=True)
df_test['NU_NOTA_COMP1'].fillna(df_test['NU_NOTA_COMP1'].kurtosis(), inplace=True)
df_test['NU_NOTA_COMP3'].fillna(df_test['NU_NOTA_COMP3'].kurtosis(), inplace=True)
df_test['NU_NOTA_COMP5'].fillna(df_test['NU_NOTA_COMP5'].kurtosis(), inplace=True)
df_test['NU_NOTA_REDACAO'].fillna(df_test['NU_NOTA_REDACAO'].kurtosis(), inplace=True)
df_test['NU_NOTA_COMP4'].fillna(df_test['NU_NOTA_COMP4'].kurtosis(), inplace=True)
df_test['NU_NOTA_CH'].fillna(317.4, inplace=True)
df_test['NU_NOTA_CN'].fillna(316.5, inplace=True)

In [195]:
features_train = df_train.drop(['NU_NOTA_MT'], axis=1)
target_train = df_train['NU_NOTA_MT']

In [196]:
features_test = df_test

In [159]:
linear_regression = LinearRegression(normalize=True)
linear_regression.fit(features_train, target_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)

In [160]:
predicted = linear_regression.predict(features_test)

In [161]:
acuracy = linear_regression.score(features_train, target_train)

In [162]:
acuracy

0.920414298798914

In [203]:
test = pd.read_csv('test.csv')
num_inscricao = test['NU_INSCRICAO']
resposta = pd.DataFrame(columns=['NU_INSCRICAO', 'NU_NOTA_MT'])
resposta['NU_INSCRICAO'] = num_inscricao
resposta['NU_NOTA_MT'] = predicted
resposta.set_index(['NU_INSCRICAO'], inplace=True)
resposta

Unnamed: 0_level_0,NU_NOTA_MT
NU_INSCRICAO,Unnamed: 1_level_1
73ff9fcc02f0a99919906c942c2e1a1042cdcf98,116853.051002
71a95f9f1b91a82c65ad94abbdf9f54e6066f968,137163.465811
b38a03232f43b11c9d0788abaf060f7366053b6d,133985.896493
70b682d9a3636be23f6120fa9d6b164eb3c6002d,26872.743042
715494628a50142ce8cb17191cfe6d0f3cae0934,128881.674829
e656d6bad65c93fb2880f1eba5037008c8e75774,137188.874958
465cd2a6907fb37d9d8ad3c065f0e2dabdba9b13,154660.447643
11539e86171bf07d3a36f09377d7f54ebcc8406a,116099.807938
043c544a2104aa8a9849f1a703a08d37a2f16839,155216.498694
76ba050e64ad100b856f0eaabd8f539d5c7dd185,137260.482296


In [204]:
for i in resposta.index:
    if resposta.loc[i,'NU_NOTA_MT'] < 100:
                    resposta.loc[i,'NU_NOTA_MT'] = 309.7

resposta.to_csv('answer.csv')
resposta

Unnamed: 0_level_0,NU_NOTA_MT
NU_INSCRICAO,Unnamed: 1_level_1
73ff9fcc02f0a99919906c942c2e1a1042cdcf98,116853.051002
71a95f9f1b91a82c65ad94abbdf9f54e6066f968,137163.465811
b38a03232f43b11c9d0788abaf060f7366053b6d,133985.896493
70b682d9a3636be23f6120fa9d6b164eb3c6002d,26872.743042
715494628a50142ce8cb17191cfe6d0f3cae0934,128881.674829
e656d6bad65c93fb2880f1eba5037008c8e75774,137188.874958
465cd2a6907fb37d9d8ad3c065f0e2dabdba9b13,154660.447643
11539e86171bf07d3a36f09377d7f54ebcc8406a,116099.807938
043c544a2104aa8a9849f1a703a08d37a2f16839,155216.498694
76ba050e64ad100b856f0eaabd8f539d5c7dd185,137260.482296


In [197]:
scaler = StandardScaler()

In [198]:
features_train_scaled = scaler.fit_transform(features_train)

In [199]:
ridge_regression = Ridge(alpha=1, solver="cholesky")

ridge_regression.fit(features_train_scaled, target_train)

Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None, normalize=False,
      random_state=None, solver='cholesky', tol=0.001)

In [200]:
predicted = ridge_regression.predict(features_test)

In [201]:
acuracy = ridge_regression.score(features_train_scaled, target_train)

In [202]:
acuracy

0.9164235886116098