In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.neural_network import MLPClassifier

In [3]:
import warnings
warnings.filterwarnings("ignore")

# Data

In [4]:
train = pd.read_csv("voting_train.csv")
test = pd.read_csv("voting_test.csv")

In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 218 entries, 0 to 217
Data columns (total 18 columns):
 #   Column                                  Non-Null Count  Dtype 
---  ------                                  --------------  ----- 
 0   ID                                      218 non-null    int64 
 1   class                                   218 non-null    object
 2   handicapped-infants                     218 non-null    object
 3   water-project-cost-sharing              218 non-null    object
 4   adoption-of-the-budget-resolution       218 non-null    object
 5   physician-fee-freeze                    218 non-null    object
 6   el-salvador-aid                         218 non-null    object
 7   religious-groups-in-schools             218 non-null    object
 8   anti-satellite-test-ban                 218 non-null    object
 9   aid-to-nicaraguan-contras               218 non-null    object
 10  mx-missile                              218 non-null    object
 11  immigr

In [6]:
train.head()

Unnamed: 0,ID,class,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,synfuels-crporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa
0,67,republican,n,y,n,y,y,y,y,n,n,n,y,y,y,y,n,y
1,338,democrat,y,n,y,n,n,n,y,y,y,n,n,n,n,n,y,y
2,35,democrat,y,y,y,n,n,n,y,y,y,n,n,n,n,n,y,y
3,122,republican,n,unknown,n,y,y,y,n,n,n,y,n,y,y,y,n,y
4,420,democrat,y,y,y,n,n,n,y,y,y,n,n,n,n,n,n,y


In [7]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 217 entries, 0 to 216
Data columns (total 17 columns):
 #   Column                                  Non-Null Count  Dtype 
---  ------                                  --------------  ----- 
 0   ID                                      217 non-null    int64 
 1   handicapped-infants                     217 non-null    object
 2   water-project-cost-sharing              217 non-null    object
 3   adoption-of-the-budget-resolution       217 non-null    object
 4   physician-fee-freeze                    217 non-null    object
 5   el-salvador-aid                         217 non-null    object
 6   religious-groups-in-schools             217 non-null    object
 7   anti-satellite-test-ban                 217 non-null    object
 8   aid-to-nicaraguan-contras               217 non-null    object
 9   mx-missile                              217 non-null    object
 10  immigration                             217 non-null    object
 11  synfue

In [8]:
test.head()

Unnamed: 0,ID,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,synfuels-crporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa
0,430,y,n,y,n,unknown,n,y,y,y,y,n,y,n,unknown,y,y
1,411,n,n,n,y,y,y,n,n,n,n,y,y,y,y,n,y
2,167,y,n,y,y,y,y,y,y,n,y,n,y,n,y,y,y
3,99,y,y,y,n,n,y,y,y,y,y,n,n,n,n,n,y
4,415,y,y,y,n,n,n,y,y,y,n,n,n,n,n,n,y


In [9]:
X_train = train.drop(["ID", "class"], axis=1)
y_train = train["class"]

X_test = test.drop(["ID"], axis=1)

# Pre-processing

## Encoding

In [10]:
ohe = OneHotEncoder()
X_train_enc = ohe.fit_transform(X_train)
X_test_enc = ohe.fit_transform(X_test)

# Model selection

In [11]:
# Grid Search

cv = StratifiedKFold(5, shuffle=True, random_state=1987)
verbose = 3

## Multi-layer Perceptron

In [15]:
parameters = {
    "hidden_layer_sizes":[
        (10), (20), (30), (50), (100), (150), 
        (10,5), (20,5), (30, 5), (50,5), (100,5), (100,5), 
        (10,10), (20,10), (30, 10), (50,10), (100,10), (150,10)],
    "solver":["lbfgs"],
    "activation":["relu", "logistic"],
    "alpha":[0.0001, 0.001, 0.01, 0.1]}
model = GridSearchCV(MLPClassifier(), parameters, cv=cv, verbose=verbose, scoring="accuracy")
model.fit(X_train_enc, y_train)

results = pd.DataFrame(model.cv_results_)
results = results[["param_hidden_layer_sizes", "param_activation", "param_alpha", "mean_test_score", "std_test_score"]]
results.sort_values(["mean_test_score"], ascending=False).head(10)

Fitting 5 folds for each of 144 candidates, totalling 720 fits
[CV 1/5] END activation=relu, alpha=0.0001, hidden_layer_sizes=(10,), solver=lbfgs;, score=0.977 total time=   0.0s
[CV 2/5] END activation=relu, alpha=0.0001, hidden_layer_sizes=(10,), solver=lbfgs;, score=0.955 total time=   0.0s
[CV 3/5] END activation=relu, alpha=0.0001, hidden_layer_sizes=(10,), solver=lbfgs;, score=0.955 total time=   0.0s
[CV 4/5] END activation=relu, alpha=0.0001, hidden_layer_sizes=(10,), solver=lbfgs;, score=0.930 total time=   0.0s
[CV 5/5] END activation=relu, alpha=0.0001, hidden_layer_sizes=(10,), solver=lbfgs;, score=0.953 total time=   0.0s
[CV 1/5] END activation=relu, alpha=0.0001, hidden_layer_sizes=(20,), solver=lbfgs;, score=0.977 total time=   0.0s
[CV 2/5] END activation=relu, alpha=0.0001, hidden_layer_sizes=(20,), solver=lbfgs;, score=0.955 total time=   0.0s
[CV 3/5] END activation=relu, alpha=0.0001, hidden_layer_sizes=(20,), solver=lbfgs;, score=0.955 total time=   0.0s
[CV 4/5] 

Unnamed: 0,param_hidden_layer_sizes,param_activation,param_alpha,mean_test_score,std_test_score
38,"(30,)",relu,0.01,0.976956,0.014711
119,"(100, 5)",logistic,0.01,0.976956,0.014711
143,"(150, 10)",logistic,0.1,0.97241,0.017209
130,"(100,)",logistic,0.1,0.97241,0.017209
36,"(10,)",relu,0.01,0.97241,0.017209
106,"(100, 10)",logistic,0.001,0.97241,0.017209
51,"(50, 10)",relu,0.01,0.97241,0.017209
121,"(20, 10)",logistic,0.01,0.97241,0.017209
113,"(150,)",logistic,0.01,0.97241,0.017209
54,"(10,)",relu,0.1,0.97241,0.017209


# Final model

In [34]:
best_model = model.best_estimator_
best_model.fit(X_train_enc, y_train)

MLPClassifier(hidden_layer_sizes=(20,), solver='lbfgs')

In [35]:
predictions = pd.DataFrame(test["ID"])
predictions["class"] = best_model.predict(X_test_enc)

In [36]:
predictions.to_csv("submission.csv", index=False)