In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

# Use white for tick marks, titles, axis labels and axes borders in every plot
params = dict.fromkeys(
    ("ytick.color", "xtick.color", "axes.titlecolor", "axes.labelcolor", "axes.edgecolor"),
    "w",
)
plt.rcParams.update(params)

In [2]:
# Load the training and test sets from the local dataset/ folder
train, test = map(pd.read_csv, ('dataset/train.csv', 'dataset/test.csv'))
# sample = pd.read_csv('dataset/sample_submission.csv')

# Display the training frame (the rendered view is truncated by pandas)
train

Unnamed: 0,Id,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,16280,34,Private,204991,Some-college,10,Divorced,Exec-managerial,Own-child,White,Male,0,0,44,United-States,<=50K
1,16281,58,Local-gov,310085,10th,6,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,40,United-States,<=50K
2,16282,25,Private,146117,Some-college,10,Never-married,Machine-op-inspct,Not-in-family,White,Male,0,0,42,United-States,<=50K
3,16283,24,Private,138938,Some-college,10,Divorced,Adm-clerical,Not-in-family,White,Female,0,0,40,United-States,<=50K
4,16284,57,Self-emp-inc,258883,HS-grad,9,Married-civ-spouse,Transport-moving,Husband,White,Male,5178,0,60,Hungary,>50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32555,48835,42,Private,384236,Masters,14,Married-civ-spouse,Prof-specialty,Husband,White,Male,7688,0,40,United-States,>50K
32556,48836,23,Private,129042,HS-grad,9,Never-married,Machine-op-inspct,Unmarried,Black,Female,0,0,40,United-States,<=50K
32557,48837,30,Private,195488,HS-grad,9,Never-married,Priv-house-serv,Own-child,White,Female,0,0,40,Guatemala,<=50K
32558,48838,18,Private,27620,HS-grad,9,Never-married,Adm-clerical,Not-in-family,White,Female,0,0,25,United-States,<=50K


### Data Analysis

In [3]:
# Print the schema summary: non-null count and dtype of every column. A column
# has no NaN/None when its non-null count equals the number of entries.
# NOTE(review): info() only counts true nulls; placeholder strings (e.g. '?')
# would not be reported here -- confirm the dataset contains none.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32560 entries, 0 to 32559
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Id              32560 non-null  int64 
 1   age             32560 non-null  int64 
 2   workclass       32560 non-null  object
 3   fnlwgt          32560 non-null  int64 
 4   education       32560 non-null  object
 5   education.num   32560 non-null  int64 
 6   marital.status  32560 non-null  object
 7   occupation      32560 non-null  object
 8   relationship    32560 non-null  object
 9   race            32560 non-null  object
 10  sex             32560 non-null  object
 11  capital.gain    32560 non-null  int64 
 12  capital.loss    32560 non-null  int64 
 13  hours.per.week  32560 non-null  int64 
 14  native.country  32560 non-null  object
 15  income          32560 non-null  object
dtypes: int64(7), object(9)
memory usage: 4.0+ MB


In [4]:
# Separate the target from the features and discard the columns not used for modelling
columns_to_remove = ['Id','fnlwgt', 'education']
y = train['income']
X = train.drop(columns=[*columns_to_remove, 'income'])

# The test set gets the same columns removed (it is not stripped of 'income' here)
X_test = test.drop(columns=columns_to_remove)

# Sanity check: shapes of the features, the target and the test features
tuple(part.shape for part in (X, y, X_test))

((32560, 12), (32560,), (16280, 12))

In [9]:
# Column names grouped by dtype: object columns are treated as categorical,
# int64 columns as numerical.
categorical_columns, numerical_columns = (
    X.select_dtypes(include=kind).columns for kind in ([object], ['int64'])
)

In [10]:
# There are many categorical variables, so each category is mapped to a numeric
# code. This is ordinal encoding, not one-hot: every categorical column is
# replaced in place and the number of columns stays the same.
# NOTE(review): with the encoder's default settings, transform() raises on a
# category that was not seen while fitting on the training data.
from sklearn.preprocessing import OrdinalEncoder

oe = OrdinalEncoder()
# Learn the category -> code mapping on the training data only, then reuse it for test
X[categorical_columns] = oe.fit_transform(X[categorical_columns])
X_test[categorical_columns] = oe.transform(X_test[categorical_columns])

X_test

Unnamed: 0,age,workclass,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
0,25,4.0,13,4.0,10.0,1.0,4.0,1.0,0,0,44,39.0
1,64,7.0,13,2.0,4.0,0.0,4.0,1.0,0,0,45,39.0
2,31,4.0,14,0.0,4.0,1.0,4.0,0.0,0,0,40,39.0
3,45,4.0,9,4.0,7.0,3.0,4.0,1.0,0,0,40,39.0
4,64,5.0,13,2.0,4.0,0.0,4.0,1.0,0,0,50,39.0
...,...,...,...,...,...,...,...,...,...,...,...,...
16275,40,4.0,9,2.0,3.0,0.0,4.0,1.0,0,0,40,39.0
16276,30,2.0,9,2.0,8.0,0.0,4.0,1.0,0,0,40,39.0
16277,25,4.0,9,4.0,1.0,3.0,1.0,1.0,0,0,40,30.0
16278,60,4.0,14,4.0,4.0,1.0,4.0,0.0,3325,0,35,39.0


### Neural Network Modeling

In [12]:
# Data normalization: min-max scale the numerical columns. The scaler is fitted
# on the training data only and the same fitted scaler is applied to the test data.
# NOTE(review): only the columns in numerical_columns are rescaled; confirm that
# leaving the encoded categorical columns on their original scale is intended.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler().fit(X[numerical_columns])
X[numerical_columns] = scaler.transform(X[numerical_columns])
X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])

In [14]:
from sklearn.neural_network import MLPClassifier

# Fixed seed: MLPClassifier draws random initial weights and (with the adam
# solver) random mini-batches, so without a seed every run fits a different
# model and produces a different submission.
RANDOM_STATE = 42

# MLPClassifier optimises the log-loss by default. Two hidden layers
# (60 and 20 units), Adam solver, L2 regularisation strength alpha=0.001.
model_nn = MLPClassifier(hidden_layer_sizes=(60,20), solver='adam', alpha=0.001,
                         random_state=RANDOM_STATE)

model_nn.fit(X, y)

In [17]:
# Predict an income label for every row of the test features.
# Note: y_test holds the model's predictions for X_test.
y_test = model_nn.predict(X_test)
# Single-column frame with the predictions, ready to be written to disk
output = pd.DataFrame(data=dict(income=y_test))


In [18]:
# Write the submission file with columns Id,income. Each prediction is labelled
# with the Id taken from the test set (matched by row position) instead of the
# positional row index, so the file stays correct even if the test Ids are not
# exactly 0..n-1 in row order. set_index returns a new frame; `output` is unchanged.
output.set_index(test['Id']).to_csv("submission3.csv", index=True, index_label='Id')