In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA

In [2]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [3]:
num_inscricao = df_test.NU_INSCRICAO
columns = df_test.columns

In [5]:
X_train = df_train.drop(['IN_TREINEIRO'], axis=1)[columns]
y_train = df_train.IN_TREINEIRO

In [6]:
#determine categorical and numerical features

def separating(df):
    
    num = df.select_dtypes(include = 'number').columns
    cat = df.select_dtypes(exclude = 'number').columns
    
    return num, cat

In [7]:
#Applies separating
num_test, cat_test = separating(df_test)
num_train, cat_train = separating(X_train)

In [9]:
#Create new variables
columns_train = num_train
columns_test = num_test

In [10]:
#Pipeline for numerical columns
numeric_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'constant')),
    ('scaler', StandardScaler())
    ])

#Pipeline for categorical columns
categorical_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

In [11]:
def fitting(df, num, cat):
    
    a = numeric_transformer.fit_transform(df[num])
    b = categorical_transformer.fit_transform(df[cat])
    
    return a, b

In [12]:
#Preprocessing df_test
num_test, cat_test = fitting(df_test, num_test, cat_test)

#Preprocessing df_train
num_train, cat_train = fitting(X_train, num_train, cat_train)

In [14]:
#Trasforms in DataFrame
X_train = pd.DataFrame(num_train, columns=[columns_train])
df_test = pd.DataFrame(num_test, columns=[columns_test])

In [15]:
#Feature selection with RFE
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

selector = RFE(LinearRegression(), n_features_to_select=5, step=1)
selector = selector.fit(X_train, y_train)

Another options: SelectKBest and PCA

In [16]:
df_test

Unnamed: 0,CO_UF_RESIDENCIA,NU_IDADE,TP_COR_RACA,TP_NACIONALIDADE,TP_ST_CONCLUSAO,TP_ANO_CONCLUIU,TP_ESCOLA,TP_ENSINO,TP_DEPENDENCIA_ADM_ESC,IN_BAIXA_VISAO,...,NU_NOTA_CH,NU_NOTA_LC,TP_LINGUA,TP_STATUS_REDACAO,NU_NOTA_COMP1,NU_NOTA_COMP2,NU_NOTA_COMP3,NU_NOTA_COMP4,NU_NOTA_COMP5,NU_NOTA_REDACAO
0,1.209124,-0.374856,-1.180663,-0.190554,-0.881021,0.254485,-0.631128,-0.586404,-0.629501,-0.046829,...,0.820475,-1.647927,0.800951,-1.085054,-1.535180,-1.470134,-1.445743,-1.476754,-1.135260,-1.480858
1,-1.619076,0.361515,0.826267,3.878750,-0.881021,0.562155,-0.631128,-0.586404,-0.629501,-0.046829,...,-1.697694,-1.647927,0.800951,-1.085054,-1.535180,-1.470134,-1.445743,-1.476754,-1.135260,-1.480858
2,-0.204976,-0.816679,-0.177198,-0.190554,1.414456,-0.668527,-0.631128,-0.586404,-0.629501,-0.046829,...,0.548837,-1.647927,-1.248515,-1.085054,-1.535180,-1.470134,-1.445743,-1.476754,-1.135260,-1.480858
3,1.007110,-0.669405,-1.180663,-0.190554,0.266717,-0.668527,1.131864,1.025062,1.135419,-0.046829,...,0.669800,0.678228,0.800951,0.252965,0.545818,0.654649,0.019051,0.642334,-0.336101,0.337254
4,2.219195,-0.374856,-1.180663,-0.190554,-0.881021,-0.360857,-0.631128,-0.586404,-0.629501,-0.046829,...,0.832359,0.902259,0.800951,0.252965,1.239484,1.362910,1.483845,1.348697,0.463057,1.246310
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4565,-0.002962,-0.669405,-0.177198,-0.190554,0.266717,-0.668527,2.894855,1.025062,2.900339,-0.046829,...,0.720308,0.503981,0.800951,0.252965,0.892651,0.654649,0.751448,0.642334,1.262216,0.867537
4566,0.401067,-0.227582,-1.180663,-0.190554,-0.881021,-0.053186,-0.631128,-0.586404,-0.629501,-0.046829,...,0.139682,0.313425,0.800951,0.252965,0.198985,-0.407742,-0.347147,-0.064029,-0.735681,-0.268783
4567,-2.023104,0.066966,0.826267,-0.190554,-0.881021,0.869826,-0.631128,-0.586404,-0.629501,-0.046829,...,0.722855,0.771789,0.800951,0.252965,0.545818,0.654649,0.385250,0.289153,0.063478,0.413009
4568,0.401067,-0.374856,-1.180663,-0.190554,0.266717,-0.668527,1.131864,1.025062,1.135419,-0.046829,...,0.789915,0.752476,-1.248515,0.252965,0.545818,0.300519,0.385250,0.642334,0.463057,0.488764


In [17]:
selected = []

#Summarize all features
for i, j in zip(X_train, range(len(X_train))):
    
    if selector.support_[j]:
        selected.append(i)
        
selected

[('NU_IDADE',),
 ('TP_ST_CONCLUSAO',),
 ('TP_ANO_CONCLUIU',),
 ('TP_ESCOLA',),
 ('NU_NOTA_COMP1',)]

### Train model

In [18]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier().fit(X_train[selected], y_train)

### Make Predictions 

In [19]:
pred_y = clf.predict(df_test[selected])

In [20]:
df_answer = pd.DataFrame({'NU_INSCRICAO': num_inscricao, 'IN_TREINEIRO': pred_y})

In [21]:
#Save file 
df_answer.to_csv('answer.csv', index=False)