In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.neural_network import MLPClassifier

In [14]:
import warnings
warnings.filterwarnings("ignore")

# Data

In [15]:
train = pd.read_csv("voting_train.csv")
test = pd.read_csv("voting_test.csv")

In [16]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 218 entries, 0 to 217
Data columns (total 18 columns):
 #   Column                                  Non-Null Count  Dtype 
---  ------                                  --------------  ----- 
 0   ID                                      218 non-null    int64 
 1   class                                   218 non-null    object
 2   handicapped-infants                     218 non-null    object
 3   water-project-cost-sharing              218 non-null    object
 4   adoption-of-the-budget-resolution       218 non-null    object
 5   physician-fee-freeze                    218 non-null    object
 6   el-salvador-aid                         218 non-null    object
 7   religious-groups-in-schools             218 non-null    object
 8   anti-satellite-test-ban                 218 non-null    object
 9   aid-to-nicaraguan-contras               218 non-null    object
 10  mx-missile                              218 non-null    object
 11  immigr

In [17]:
train.head()

Unnamed: 0,ID,class,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,synfuels-crporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa
0,67,republican,n,y,n,y,y,y,y,n,n,n,y,y,y,y,n,y
1,338,democrat,y,n,y,n,n,n,y,y,y,n,n,n,n,n,y,y
2,35,democrat,y,y,y,n,n,n,y,y,y,n,n,n,n,n,y,y
3,122,republican,n,unknown,n,y,y,y,n,n,n,y,n,y,y,y,n,y
4,420,democrat,y,y,y,n,n,n,y,y,y,n,n,n,n,n,n,y


In [18]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 217 entries, 0 to 216
Data columns (total 17 columns):
 #   Column                                  Non-Null Count  Dtype 
---  ------                                  --------------  ----- 
 0   ID                                      217 non-null    int64 
 1   handicapped-infants                     217 non-null    object
 2   water-project-cost-sharing              217 non-null    object
 3   adoption-of-the-budget-resolution       217 non-null    object
 4   physician-fee-freeze                    217 non-null    object
 5   el-salvador-aid                         217 non-null    object
 6   religious-groups-in-schools             217 non-null    object
 7   anti-satellite-test-ban                 217 non-null    object
 8   aid-to-nicaraguan-contras               217 non-null    object
 9   mx-missile                              217 non-null    object
 10  immigration                             217 non-null    object
 11  synfue

In [19]:
test.head()

Unnamed: 0,ID,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,synfuels-crporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa
0,430,y,n,y,n,unknown,n,y,y,y,y,n,y,n,unknown,y,y
1,411,n,n,n,y,y,y,n,n,n,n,y,y,y,y,n,y
2,167,y,n,y,y,y,y,y,y,n,y,n,y,n,y,y,y
3,99,y,y,y,n,n,y,y,y,y,y,n,n,n,n,n,y
4,415,y,y,y,n,n,n,y,y,y,n,n,n,n,n,n,y


In [20]:
X_train = train.drop(["ID", "class"], axis=1)
y_train = train["class"]

X_test = test.drop(["ID"], axis=1)

# Pre-processing

## Encoding

In [21]:
ohe = OneHotEncoder()
X_train_enc = ohe.fit_transform(X_train)
X_test_enc = ohe.fit_transform(X_test)

# Exploratory data analysis

In [22]:
X_train.isnull().sum()

handicapped-infants                       0
water-project-cost-sharing                0
adoption-of-the-budget-resolution         0
physician-fee-freeze                      0
el-salvador-aid                           0
religious-groups-in-schools               0
anti-satellite-test-ban                   0
aid-to-nicaraguan-contras                 0
mx-missile                                0
immigration                               0
synfuels-crporation-cutback               0
education-spending                        0
superfund-right-to-sue                    0
crime                                     0
duty-free-exports                         0
export-administration-act-south-africa    0
dtype: int64

# Model selection

In [23]:
# Grid Search

cv = 10         # number of folds
verbose = 1     # information shown during training

## Multi-layer Perceptron

In [31]:
parameters = {
    "hidden_layer_sizes":[(50,), (100,), (150,), (50,5), (100,5), (150,5), (50,10), (100,10), (150,10), (200,10)],
    "alpha":[0.001, 0.01, 0.1]}
model = GridSearchCV(MLPClassifier(), parameters, cv=cv, verbose=verbose, scoring="f1_weighted")
model.fit(X_train_enc, y_train)

results = pd.DataFrame(model.cv_results_)
results = results[["param_hidden_layer_sizes", "param_alpha", "mean_test_score"]]
results.sort_values(["mean_test_score"], ascending=False).head(10)

Fitting 10 folds for each of 30 candidates, totalling 300 fits
[CV 1/10] END alpha=0.001, hidden_layer_sizes=(50,);, score=1.000 total time=   0.1s
[CV 2/10] END alpha=0.001, hidden_layer_sizes=(50,);, score=0.909 total time=   0.1s
[CV 3/10] END alpha=0.001, hidden_layer_sizes=(50,);, score=0.909 total time=   0.1s
[CV 4/10] END alpha=0.001, hidden_layer_sizes=(50,);, score=1.000 total time=   0.1s
[CV 5/10] END alpha=0.001, hidden_layer_sizes=(50,);, score=0.955 total time=   0.1s
[CV 6/10] END alpha=0.001, hidden_layer_sizes=(50,);, score=1.000 total time=   0.1s
[CV 7/10] END alpha=0.001, hidden_layer_sizes=(50,);, score=0.954 total time=   0.1s
[CV 8/10] END alpha=0.001, hidden_layer_sizes=(50,);, score=0.954 total time=   0.1s
[CV 9/10] END alpha=0.001, hidden_layer_sizes=(50,);, score=0.953 total time=   0.1s
[CV 10/10] END alpha=0.001, hidden_layer_sizes=(50,);, score=0.905 total time=   0.1s
[CV 1/10] END alpha=0.001, hidden_layer_sizes=(100,);, score=1.000 total time=   0.2s


Unnamed: 0,param_hidden_layer_sizes,param_alpha,mean_test_score
20,"(50,)",0.1,0.967514
12,"(150,)",0.01,0.958443
9,"(200, 10)",0.001,0.958443
28,"(150, 10)",0.1,0.958402
8,"(150, 10)",0.001,0.958402
1,"(100,)",0.001,0.958364
6,"(50, 10)",0.001,0.958023
26,"(50, 10)",0.1,0.954066
0,"(50,)",0.001,0.953848
22,"(150,)",0.1,0.953848


# Final model

In [13]:
best_model = model.best_estimator_
best_model.fit(X_train_enc, y_train)

LogisticRegression(C=10, class_weight='balanced', solver='liblinear')

In [14]:
predictions = pd.DataFrame(test["ID"])
predictions["class"] = best_model.predict(X_test_enc)

In [15]:
predictions.to_csv("submission.csv", index=False)