In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import neighbors
from sklearn import svm

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [15]:
import warnings
warnings.filterwarnings("ignore")

# Data

In [16]:
train = pd.read_csv("voting_train.csv")
test = pd.read_csv("voting_test.csv")

In [17]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 218 entries, 0 to 217
Data columns (total 18 columns):
 #   Column                                  Non-Null Count  Dtype 
---  ------                                  --------------  ----- 
 0   ID                                      218 non-null    int64 
 1   class                                   218 non-null    object
 2   handicapped-infants                     218 non-null    object
 3   water-project-cost-sharing              218 non-null    object
 4   adoption-of-the-budget-resolution       218 non-null    object
 5   physician-fee-freeze                    218 non-null    object
 6   el-salvador-aid                         218 non-null    object
 7   religious-groups-in-schools             218 non-null    object
 8   anti-satellite-test-ban                 218 non-null    object
 9   aid-to-nicaraguan-contras               218 non-null    object
 10  mx-missile                              218 non-null    object
 11  immigr

In [18]:
train.head()

Unnamed: 0,ID,class,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,synfuels-crporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa
0,67,republican,n,y,n,y,y,y,y,n,n,n,y,y,y,y,n,y
1,338,democrat,y,n,y,n,n,n,y,y,y,n,n,n,n,n,y,y
2,35,democrat,y,y,y,n,n,n,y,y,y,n,n,n,n,n,y,y
3,122,republican,n,unknown,n,y,y,y,n,n,n,y,n,y,y,y,n,y
4,420,democrat,y,y,y,n,n,n,y,y,y,n,n,n,n,n,n,y


In [19]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 217 entries, 0 to 216
Data columns (total 17 columns):
 #   Column                                  Non-Null Count  Dtype 
---  ------                                  --------------  ----- 
 0   ID                                      217 non-null    int64 
 1   handicapped-infants                     217 non-null    object
 2   water-project-cost-sharing              217 non-null    object
 3   adoption-of-the-budget-resolution       217 non-null    object
 4   physician-fee-freeze                    217 non-null    object
 5   el-salvador-aid                         217 non-null    object
 6   religious-groups-in-schools             217 non-null    object
 7   anti-satellite-test-ban                 217 non-null    object
 8   aid-to-nicaraguan-contras               217 non-null    object
 9   mx-missile                              217 non-null    object
 10  immigration                             217 non-null    object
 11  synfue

In [20]:
test.head()

Unnamed: 0,ID,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,synfuels-crporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa
0,430,y,n,y,n,unknown,n,y,y,y,y,n,y,n,unknown,y,y
1,411,n,n,n,y,y,y,n,n,n,n,y,y,y,y,n,y
2,167,y,n,y,y,y,y,y,y,n,y,n,y,n,y,y,y
3,99,y,y,y,n,n,y,y,y,y,y,n,n,n,n,n,y
4,415,y,y,y,n,n,n,y,y,y,n,n,n,n,n,n,y


In [21]:
X = train.drop(["ID", "class"], axis=1)
y = train["class"]

X_test = test.drop(["ID"], axis=1)

# Exploratory data analysis

In [22]:
X.isnull().sum()

handicapped-infants                       0
water-project-cost-sharing                0
adoption-of-the-budget-resolution         0
physician-fee-freeze                      0
el-salvador-aid                           0
religious-groups-in-schools               0
anti-satellite-test-ban                   0
aid-to-nicaraguan-contras                 0
mx-missile                                0
immigration                               0
synfuels-crporation-cutback               0
education-spending                        0
superfund-right-to-sue                    0
crime                                     0
duty-free-exports                         0
export-administration-act-south-africa    0
dtype: int64

# Pre-processing

## Encoding

In [23]:
ohe = OneHotEncoder()
X_enc = ohe.fit_transform(X)
X_test_enc = ohe.fit_transform(X_test)

# Model selection

In [24]:
cv = 5
verbose = 1

## KNN

In [25]:
parameters = {
    "n_neighbors":[1, 5, 10, 20], 
    "weights":["uniform", "distance"],
    "metric":["euclidean", "manhattan", "chebyshev", "minkowski", "wminkowski", "seuclidean", "mahalanobis"]}
knn = GridSearchCV(neighbors.KNeighborsClassifier(), parameters, cv=cv, verbose=verbose)
knn.fit(X_enc, y)

Fitting 5 folds for each of 56 candidates, totalling 280 fits


GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'metric': ['euclidean', 'manhattan', 'chebyshev',
                                    'minkowski', 'wminkowski', 'seuclidean',
                                    'mahalanobis'],
                         'n_neighbors': [1, 5, 10, 20],
                         'weights': ['uniform', 'distance']},
             verbose=1)

In [26]:
knn_results = pd.DataFrame(knn.cv_results_)
knn_results= knn_results[["param_n_neighbors", "param_weights", "param_metric", "mean_test_score"]]
knn_results.sort_values(["mean_test_score"], ascending=False).head(10)

Unnamed: 0,param_n_neighbors,param_weights,param_metric,mean_test_score
0,1,uniform,euclidean,0.930973
24,1,uniform,minkowski,0.930973
2,5,uniform,euclidean,0.930973
3,5,distance,euclidean,0.930973
27,5,distance,minkowski,0.930973
8,1,uniform,manhattan,0.930973
9,1,distance,manhattan,0.930973
10,5,uniform,manhattan,0.930973
11,5,distance,manhattan,0.930973
1,1,distance,euclidean,0.930973


## Decision Tree

In [27]:
parameters = {
    "criterion":["gini", "entropy"],
    "splitter":["best", "random"], 
    "max_depth":["None", 5, 10, 20],
    "max_features":["auto", "sqrt", "log2"]}    
dt = GridSearchCV(DecisionTreeClassifier(), parameters, cv=cv, verbose=verbose)
dt.fit(X_enc, y)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': ['None', 5, 10, 20],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'splitter': ['best', 'random']},
             verbose=1)

In [28]:
dt_results = pd.DataFrame(dt.cv_results_)
dt_results = dt_results[["param_criterion", "param_splitter", "param_max_depth", "param_max_features", "mean_test_score"]]
dt_results.sort_values(["mean_test_score"], ascending=False).head(10)

Unnamed: 0,param_criterion,param_splitter,param_max_depth,param_max_features,mean_test_score
40,entropy,best,10,log2,0.94482
42,entropy,best,20,auto,0.940486
19,gini,random,20,auto,0.935729
21,gini,random,20,sqrt,0.931184
44,entropy,best,20,sqrt,0.930973
34,entropy,best,5,log2,0.930973
37,entropy,random,10,auto,0.926638
20,gini,best,20,sqrt,0.922199
6,gini,best,5,auto,0.922093
39,entropy,random,10,sqrt,0.921987


## Random Forest

In [29]:
parameters = {
    "n_estimators":[10, 50, 100, 200, 300], 
    "criterion":["gini", "entropy"], 
    "max_depth":["None", 5, 10, 20],
    "max_features":["auto", "sqrt", "log2"]}
rf = GridSearchCV(RandomForestClassifier(), parameters, cv=cv, verbose=verbose)
rf.fit(X_enc, y)

Fitting 5 folds for each of 120 candidates, totalling 600 fits


GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': ['None', 5, 10, 20],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': [10, 50, 100, 200, 300]},
             verbose=1)

In [30]:
rf_results = pd.DataFrame(rf.cv_results_)
rf_results = rf_results[["param_n_estimators", "param_criterion", "param_max_depth", "param_max_features", "mean_test_score"]]
rf_results.sort_values(["mean_test_score"], ascending=False).head(10)

Unnamed: 0,param_n_estimators,param_criterion,param_max_depth,param_max_features,mean_test_score
47,100,gini,20,auto,0.963319
17,100,gini,5,auto,0.963214
18,200,gini,5,auto,0.963214
41,50,gini,10,log2,0.963214
75,10,entropy,5,auto,0.958668
32,100,gini,10,auto,0.958668
52,100,gini,20,sqrt,0.958668
51,50,gini,20,sqrt,0.958668
102,100,entropy,10,log2,0.958668
59,300,gini,20,log2,0.958668


## SVM

In [31]:
parameters = {
    "C":[1, 5, 10],
    "kernel":["linear", "sigmoid", "rbf", "poly"],     
    "degree":[3, 5],
    "gamma":["scale", "auto"],
    "class_weight":["None", "balanced"]}
svm = GridSearchCV(svm.SVC(), parameters, cv=cv, verbose=verbose)
svm.fit(X_enc, y)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': [1, 5, 10], 'class_weight': ['None', 'balanced'],
                         'degree': [3, 5], 'gamma': ['scale', 'auto'],
                         'kernel': ['linear', 'sigmoid', 'rbf', 'poly']},
             verbose=1)

In [32]:
svm_results = pd.DataFrame(svm.cv_results_)
svm_results = svm_results[["param_C", "param_kernel", "param_degree", "param_gamma", "param_class_weight", "mean_test_score"]]
svm_results.sort_values(["mean_test_score"], ascending=False).head(10)

Unnamed: 0,param_C,param_kernel,param_degree,param_gamma,param_class_weight,mean_test_score
16,1,linear,3,scale,balanced,0.96797
20,1,linear,3,auto,balanced,0.96797
24,1,linear,5,scale,balanced,0.96797
28,1,linear,5,auto,balanced,0.96797
85,10,sigmoid,3,auto,balanced,0.963425
93,10,sigmoid,5,auto,balanced,0.963425
54,5,rbf,3,auto,balanced,0.963319
18,1,rbf,3,scale,balanced,0.963319
26,1,rbf,5,scale,balanced,0.963319
62,5,rbf,5,auto,balanced,0.963319


## Logistic Regression

In [33]:
parameters = {
    "penalty":["l1", "l2", "elasticnet", "none"], 
    "C":[1, 5, 10, 20],
    "class_weight":["None", "balanced"],
    "solver":["liblinear"]}
logreg = GridSearchCV(LogisticRegression(), parameters, cv=cv, verbose=verbose)
logreg.fit(X_enc, y)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


GridSearchCV(cv=5, estimator=LogisticRegression(),
             param_grid={'C': [1, 5, 10, 20],
                         'class_weight': ['None', 'balanced'],
                         'penalty': ['l1', 'l2', 'elasticnet', 'none'],
                         'solver': ['liblinear']},
             verbose=1)

In [34]:
logreg_results = pd.DataFrame(logreg.cv_results_)
logreg_results = logreg_results[["param_penalty", "param_C", "param_class_weight", "mean_test_score"]]
logreg_results.sort_values(["mean_test_score"], ascending=False).head(10)

Unnamed: 0,param_penalty,param_C,param_class_weight,mean_test_score
21,l2,10,balanced,0.963319
29,l2,20,balanced,0.963319
5,l2,1,balanced,0.963214
12,l1,5,balanced,0.958774
13,l2,5,balanced,0.958774
4,l1,1,balanced,0.954017
20,l1,10,balanced,0.949471
28,l1,20,balanced,0.949471
0,l1,1,,
1,l2,1,,


# Results

In [35]:
print(knn_results.sort_values(["mean_test_score"], ascending=False).head(20).to_markdown(index=False))

|   param_n_neighbors | param_weights   | param_metric   |   mean_test_score |
|--------------------:|:----------------|:---------------|------------------:|
|                   1 | uniform         | euclidean      |          0.930973 |
|                   1 | uniform         | minkowski      |          0.930973 |
|                   5 | uniform         | euclidean      |          0.930973 |
|                   5 | distance        | euclidean      |          0.930973 |
|                   5 | distance        | minkowski      |          0.930973 |
|                   1 | uniform         | manhattan      |          0.930973 |
|                   1 | distance        | manhattan      |          0.930973 |
|                   5 | uniform         | manhattan      |          0.930973 |
|                   5 | distance        | manhattan      |          0.930973 |
|                   1 | distance        | euclidean      |          0.930973 |
|                   5 | uniform         | minkowski 

In [36]:
print(dt_results.sort_values(["mean_test_score"], ascending=False).head(20).to_markdown(index=False))

| param_criterion   | param_splitter   |   param_max_depth | param_max_features   |   mean_test_score |
|:------------------|:-----------------|------------------:|:---------------------|------------------:|
| entropy           | best             |                10 | log2                 |          0.94482  |
| entropy           | best             |                20 | auto                 |          0.940486 |
| gini              | random           |                20 | auto                 |          0.935729 |
| gini              | random           |                20 | sqrt                 |          0.931184 |
| entropy           | best             |                20 | sqrt                 |          0.930973 |
| entropy           | best             |                 5 | log2                 |          0.930973 |
| entropy           | random           |                10 | auto                 |          0.926638 |
| gini              | best             |                20 | sqr

In [37]:
print(rf_results.sort_values(["mean_test_score"], ascending=False).head(20).to_markdown(index=False))

|   param_n_estimators | param_criterion   |   param_max_depth | param_max_features   |   mean_test_score |
|---------------------:|:------------------|------------------:|:---------------------|------------------:|
|                  100 | gini              |                20 | auto                 |          0.963319 |
|                  100 | gini              |                 5 | auto                 |          0.963214 |
|                  200 | gini              |                 5 | auto                 |          0.963214 |
|                   50 | gini              |                10 | log2                 |          0.963214 |
|                   10 | entropy           |                 5 | auto                 |          0.958668 |
|                  100 | gini              |                10 | auto                 |          0.958668 |
|                  100 | gini              |                20 | sqrt                 |          0.958668 |
|                   50 | gin

In [38]:
print(svm_results.sort_values(["mean_test_score"], ascending=False).head(20).to_markdown(index=False))

|   param_C | param_kernel   |   param_degree | param_gamma   | param_class_weight   |   mean_test_score |
|----------:|:---------------|---------------:|:--------------|:---------------------|------------------:|
|         1 | linear         |              3 | scale         | balanced             |          0.96797  |
|         1 | linear         |              3 | auto          | balanced             |          0.96797  |
|         1 | linear         |              5 | scale         | balanced             |          0.96797  |
|         1 | linear         |              5 | auto          | balanced             |          0.96797  |
|        10 | sigmoid        |              3 | auto          | balanced             |          0.963425 |
|        10 | sigmoid        |              5 | auto          | balanced             |          0.963425 |
|         5 | rbf            |              3 | auto          | balanced             |          0.963319 |
|         1 | rbf            |       

In [39]:
print(logreg_results.sort_values(["mean_test_score"], ascending=False).head(20).to_markdown(index=False))

| param_penalty   |   param_C | param_class_weight   |   mean_test_score |
|:----------------|----------:|:---------------------|------------------:|
| l2              |        10 | balanced             |          0.963319 |
| l2              |        20 | balanced             |          0.963319 |
| l2              |         1 | balanced             |          0.963214 |
| l1              |         5 | balanced             |          0.958774 |
| l2              |         5 | balanced             |          0.958774 |
| l1              |         1 | balanced             |          0.954017 |
| l1              |        10 | balanced             |          0.949471 |
| l1              |        20 | balanced             |          0.949471 |
| l1              |         1 | None                 |        nan        |
| l2              |         1 | None                 |        nan        |
| elasticnet      |         1 | None                 |        nan        |
| none            |      

# Final model

In [40]:
model = LogisticRegression(penalty="l2", C=10, class_weight="balanced")
model.fit(X_enc, y)

LogisticRegression(C=10, class_weight='balanced')

In [41]:
predictions = pd.DataFrame(test["ID"])
predictions["class"] = model.predict(X_test_enc)

In [42]:
predictions.to_csv("submission.csv", index=False)