# Importing Libraries

In [48]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold


from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RANSACRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR

import pickle

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Importing and Analysing Data

In [49]:
# Load the training and test sets from CSV files in the working directory.
train_path, test_path = 'train.csv', 'test.csv'
df = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

In [50]:
# Restrict the training frame to the columns that also exist in the test set.
# NOTE(review): any train-only column (presumably including the target
# NU_NOTA_MT) is dropped from `df` here — confirm the target is recovered later.
test_columns = df_test.columns
df = df[test_columns]

In [51]:
# Number of columns (features) kept in the training frame
df.shape[1]

47

In [52]:
# Build the answer frame: one row per test-set row, holding the registration
# ID and an (initially empty) predicted math grade.
answer_columns = ['NU_INSCRICAO', 'NU_NOTA_MT']
n_test_rows = df_test.shape[0]
df_answer = pd.DataFrame(index=range(n_test_rows), columns=answer_columns)
# Assignment aligns on the index, so this relies on df_test having the same
# 0..n-1 index labels as df_answer.
df_answer['NU_INSCRICAO'] = df_test['NU_INSCRICAO']

In [53]:
# Display the names of the columns currently kept in the training frame
df.columns

Index(['NU_INSCRICAO', 'CO_UF_RESIDENCIA', 'SG_UF_RESIDENCIA', 'NU_IDADE',
       'TP_SEXO', 'TP_COR_RACA', 'TP_NACIONALIDADE', 'TP_ST_CONCLUSAO',
       'TP_ANO_CONCLUIU', 'TP_ESCOLA', 'TP_ENSINO', 'IN_TREINEIRO',
       'TP_DEPENDENCIA_ADM_ESC', 'IN_BAIXA_VISAO', 'IN_CEGUEIRA', 'IN_SURDEZ',
       'IN_DISLEXIA', 'IN_DISCALCULIA', 'IN_SABATISTA', 'IN_GESTANTE',
       'IN_IDOSO', 'TP_PRESENCA_CN', 'TP_PRESENCA_CH', 'TP_PRESENCA_LC',
       'CO_PROVA_CN', 'CO_PROVA_CH', 'CO_PROVA_LC', 'CO_PROVA_MT',
       'NU_NOTA_CN', 'NU_NOTA_CH', 'NU_NOTA_LC', 'TP_LINGUA',
       'TP_STATUS_REDACAO', 'NU_NOTA_COMP1', 'NU_NOTA_COMP2', 'NU_NOTA_COMP3',
       'NU_NOTA_COMP4', 'NU_NOTA_COMP5', 'NU_NOTA_REDACAO', 'Q001', 'Q002',
       'Q006', 'Q024', 'Q025', 'Q026', 'Q027', 'Q047'],
      dtype='object')

## Analysing the missing values

In [54]:
# Fraction (between 0 and 1) of missing values in each column
df.isna().sum().div(len(df))

NU_INSCRICAO              0.000000
CO_UF_RESIDENCIA          0.000000
SG_UF_RESIDENCIA          0.000000
NU_IDADE                  0.000000
TP_SEXO                   0.000000
TP_COR_RACA               0.000000
TP_NACIONALIDADE          0.000000
TP_ST_CONCLUSAO           0.000000
TP_ANO_CONCLUIU           0.000000
TP_ESCOLA                 0.000000
TP_ENSINO                 0.688128
IN_TREINEIRO              0.000000
TP_DEPENDENCIA_ADM_ESC    0.688128
IN_BAIXA_VISAO            0.000000
IN_CEGUEIRA               0.000000
IN_SURDEZ                 0.000000
IN_DISLEXIA               0.000000
IN_DISCALCULIA            0.000000
IN_SABATISTA              0.000000
IN_GESTANTE               0.000000
IN_IDOSO                  0.000000
TP_PRESENCA_CN            0.000000
TP_PRESENCA_CH            0.000000
TP_PRESENCA_LC            0.000000
CO_PROVA_CN               0.000000
CO_PROVA_CH               0.000000
CO_PROVA_LC               0.000000
CO_PROVA_MT               0.000000
NU_NOTA_CN          

In [55]:
# Share of training rows whose LC presence code is 0 or 2
lc_presence = df['TP_PRESENCA_LC']
absent_rows = df[lc_presence.isin([0, 2])]
len(absent_rows) / len(df)

0.2619810633648944

About 26.2% of the students were absent, so they have a grade equal to 0.
They will be removed from the training dataframe, and a grade of 0 will be applied to them in the test set answer.

In [57]:
# Keep only the training rows that have a value in TP_STATUS_REDACAO
required_columns = ['TP_STATUS_REDACAO']
df = df.dropna(axis=0, how='any', subset=required_columns)

In [58]:
# Re-check the per-column fraction of missing values after the drop
missing_counts = df.isna().sum()
missing_counts / len(df)

NU_INSCRICAO              0.000000
CO_UF_RESIDENCIA          0.000000
SG_UF_RESIDENCIA          0.000000
NU_IDADE                  0.000000
TP_SEXO                   0.000000
TP_COR_RACA               0.000000
TP_NACIONALIDADE          0.000000
TP_ST_CONCLUSAO           0.000000
TP_ANO_CONCLUIU           0.000000
TP_ESCOLA                 0.000000
TP_ENSINO                 0.640876
IN_TREINEIRO              0.000000
TP_DEPENDENCIA_ADM_ESC    0.640876
IN_BAIXA_VISAO            0.000000
IN_CEGUEIRA               0.000000
IN_SURDEZ                 0.000000
IN_DISLEXIA               0.000000
IN_DISCALCULIA            0.000000
IN_SABATISTA              0.000000
IN_GESTANTE               0.000000
IN_IDOSO                  0.000000
TP_PRESENCA_CN            0.000000
TP_PRESENCA_CH            0.000000
TP_PRESENCA_LC            0.000000
CO_PROVA_CN               0.000000
CO_PROVA_CH               0.000000
CO_PROVA_LC               0.000000
CO_PROVA_MT               0.000000
NU_NOTA_CN          

In [60]:
# Also drop the students with no CN grade (absent from the CN test);
# per the original analysis they were only 0.35% of the rows.
cn_grade_columns = ['NU_NOTA_CN']
df = df.dropna(axis=0, how='any', subset=cn_grade_columns)

In [70]:
# Split the test-set registration IDs by LC presence code:
# codes 0 and 2 are treated as absent, code 1 as present.
lc_presence_test = df_test['TP_PRESENCA_LC']
absent_test = df_test.loc[lc_presence_test.isin([0, 2]), 'NU_INSCRICAO']
present_test = df_test.loc[lc_presence_test.eq(1), 'NU_INSCRICAO']

In [71]:
# Convert both ID Series into plain Python lists
absent_test, present_test = absent_test.tolist(), present_test.tolist()

In [74]:
# Sanity check: the absent and present groups together cover every test row
n_classified = len(absent_test) + len(present_test)
len(df_test) == n_classified

True

In [78]:
# Students absent from the test set get a math grade (NU_NOTA_MT) of 0
is_absent = df_answer['NU_INSCRICAO'].isin(absent_test)
df_answer.loc[is_absent, 'NU_NOTA_MT'] = 0