# Congressional voting data

Identify the party allegiance of members of the US Congress based on their voting behaviour.

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import time
from math import sqrt
from math import log2
%matplotlib inline

# General sklearn stuff
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler,PowerTransformer,MinMaxScaler,QuantileTransformer,normalize
from sklearn.feature_selection import VarianceThreshold, SelectKBest
from sklearn.model_selection import cross_val_score

# Models
from sklearn.ensemble import RandomForestClassifier as RFC

from IPython.display import display

In [None]:
# Custom functions
import pathlib
import os
import sys

# Put the parent of the current working directory on the import path (once),
# so that the project-local imports below can be resolved.
module_path = pathlib.Path.cwd().parent
if sys.path.count(str(module_path)) == 0:
    sys.path.append(str(module_path))
print(sys.path)

from common.dataset_grabber import get_data_path
import functions

In [None]:
# Ask the project's dataset helper for the training CSV and show what it returned
# (presumably a filesystem path — see common.dataset_grabber).
datapath = get_data_path(
    "Congressional_Voting",
    "CongressionalVotingID.shuf.train.csv",
)
display(datapath)

In [None]:
def preprocess_voting_rf(df:pd.DataFrame):
    """Prepare the congressional voting table for a random-forest model.

    The label column is normalised to the name ``"Class"`` (a lower-case
    ``"class"`` column is renamed) and the vote strings are encoded as
    numbers: ``"y" -> 1``, ``"n" -> -1``, ``"unknown" -> 0``.

    The input frame is left untouched; a new frame is returned.

    Parameters
    ----------
    df : pd.DataFrame
        Raw voting data with an ``"ID"`` column, a ``"class"``/``"Class"``
        label column and one column per vote.

    Returns
    -------
    tuple
        ``(df, (num_samples, num_cols, num_feats), (target, feats))`` where
        ``df`` is the encoded frame, ``target`` is the label column name and
        ``feats`` is the list of feature column names (every column except
        ``target`` and ``"ID"``).
    """
    target = "Class"

    # rename() returns a new frame, so the result has to be kept; it is a
    # no-op when the column is already called "Class".
    df = df.rename(columns={"class": target})
    if target not in df.columns:
        print("Did not find col \"class\"!")

    feats = [col for col in df.columns if col not in (target, "ID")]

    num_samples, num_cols = df.shape
    num_feats = len(feats)

    # Encode the votes without mutating the caller's frame. infer_objects()
    # makes the encoded columns numeric regardless of whether this pandas
    # version still downcasts silently inside replace().
    df = df.replace({"y": 1, "n": -1, "unknown": 0}).infer_objects()

    return df, (num_samples, num_cols, num_feats), (target, feats)

In [None]:
# Load the raw training data, show it, then run the preprocessing step and
# show the encoded result for comparison.
print("Before preprocess:")
df = pd.read_csv(datapath)
df = df.rename(columns={"class": "Class"})
display(df.head())
print("-" * 50)

print("After preprocess:")
(
    df,
    (num_samples, num_cols, num_feats),
    (target, feats),
) = preprocess_voting_rf(df)
df.head()

In [None]:
# Print a summary of the preprocessed frame (column dtypes and non-null counts).
df.info()

Based on the `df.info()` output above, all entries should now be properly encoded :)

In [None]:
# Encode the party labels numerically (republican -> 0, democrat -> 1) and hand
# the label + feature columns to the project's heatmap helper.
# infer_objects() guarantees the encoded label column is numeric even on pandas
# versions that no longer downcast silently inside replace().
# NOTE(review): the return value is presumably the correlation of every column
# with "Class" — confirm in functions.plot_corr_heatmap.
corr_to_feat = functions.plot_corr_heatmap(
    df[[target] + feats]
    .replace({"republican": 0, "democrat": 1})
    .infer_objects(),
    feat_to_ret="Class",
    ticksfont=16,
)

In [None]:
# Show what plot_corr_heatmap returned for feat_to_ret="Class"
# (presumably each column's correlation with the class label — confirm in functions.py).
corr_to_feat

In [None]:
# Select the columns that are strongly correlated with the class label.
# The comparison uses the absolute value: a strongly negative correlation is
# just as informative as a strongly positive one and must not be dropped.
# Each entry is a (correlation, column name) pair.
# NOTE(review): assumes corr_to_feat is a pandas Series indexed by column name.
keep = [
    (corr, name)
    for corr, name in zip(corr_to_feat, corr_to_feat.index)
    if abs(corr) > 0.3
]
keep

In [None]:
# Column names of the selected (correlation, name) pairs, without the label itself.
feats_keep = [name for _corr, name in keep if name != "Class"]
feats_keep

# Model

In [None]:
# Hold out 30% of the rows for validation, using all feature columns.
# A fixed random_state makes the split reproducible across kernel restarts.
# NOTE(review): the next cell redefines train/valid/x/y from feats_keep, so
# whichever of the two cells runs last decides what these names hold.
train, valid = train_test_split(df, test_size=0.3, random_state=42)
x, y = train[feats], train[target]
train.head()

In [None]:
# Hold out 30% of the rows for validation, restricted to the label and the
# correlation-selected feature columns (feats_keep).
# A fixed random_state makes the split reproducible across kernel restarts.
# NOTE(review): this overwrites the train/valid/x/y defined by the previous cell.
train, valid = train_test_split(
    df[[target] + feats_keep], test_size=0.3, random_state=42
)
x, y = train[feats_keep], train[target]
train.head()

In [None]:
# Show the training features followed by the training labels.
display(x, y)

In [None]:
# Cross-validation results, filled in by the loop below (key: str(n_estimators)).
scores = dict()

In [None]:
# Candidate random-forest hyper-parameter values.
rf_paramgrid = dict(
    n_estimators=[1, 5, 10, 20, 100, 200, 500, 1000],
    criterion=["gini", "entropy"],
    max_features=["sqrt", "log2"],
)

In [None]:
# 5-fold cross-validation of a random forest for every forest size in the
# parameter grid; the per-fold scores are stored under the key str(n).
# - The sizes come from rf_paramgrid so the list is defined in one place only.
# - random_state makes the forests (and therefore the scores) reproducible.
# - oob_score is not requested: cross_val_score discards the fitted clones, so
#   the OOB estimate was never read and only produced warnings for tiny forests.
# NOTE(review): this always evaluates all features (`feats`), never `feats_keep`.
for n in rf_paramgrid["n_estimators"]:
    start = time.perf_counter()  # monotonic clock, intended for measuring durations
    rf = RFC(n_jobs=-1, bootstrap=True, n_estimators=n, random_state=42, verbose=0)
    scores[str(n)] = cross_val_score(rf, df[feats], df[target], cv=5, n_jobs=-1)
    print(f"Runtime(n={n}): {time.perf_counter()-start:.3f}s")
print()
print(f"scores: {scores}")

In [None]:
# Average the per-fold scores of every run, keeping the keys of `scores`.
scores_mean = {run: fold_scores.mean() for run, fold_scores in scores.items()}
scores_mean

In [None]:
# Mean cross-validation score per forest size. The keys of scores_mean are
# strings, so the x axis is categorical (evenly spaced ticks).
# NOTE(review): this draws the same `scores_mean` as the "Without dropping"
# figure below; the CV loop has to be re-run on `feats_keep` for the two
# figures to differ.
fig, ax = plt.subplots(figsize=(10, 8))
keys = list(scores_mean.keys())
values = list(scores_mean.values())
ax.plot(keys, values)
ax.set_xlabel("n_estimators")
ax.set_ylabel("Mean CV accuracy")
fig.suptitle("With dropping");

In [None]:
# Mean cross-validation score per forest size. The keys of scores_mean are
# strings, so the x axis is categorical (evenly spaced ticks).
fig, ax = plt.subplots(figsize=(10, 8))
keys = list(scores_mean.keys())
values = list(scores_mean.values())
ax.plot(keys, values)
ax.set_xlabel("n_estimators")
ax.set_ylabel("Mean CV accuracy")
fig.suptitle("Without dropping");

In [None]:
# Per-fold cross-validation scores, one line per forest size.
# The x range is derived from the number of stored scores instead of a
# hard-coded 5, so the plot keeps working if the `cv` setting changes.
fig, ax = plt.subplots(figsize=(10, 8))
for key in scores:
    ax.plot(range(1, len(scores[key]) + 1), scores[key], label=key)
ax.set_xlabel("CV fold")
ax.set_ylabel("Accuracy")
ax.legend(title="n_estimators");