In [None]:
import pandas as pd
import numpy as np
import random
from sklearn import metrics
import matplotlib
import matplotlib.pyplot as plt
# Notebook-wide plotting defaults: FiveThirtyEight theme and a 9x6 figure size.
plt.style.use("fivethirtyeight")
plt.rcParams.update({"figure.figsize": [9, 6]})

In [None]:
# Read the election dataset from its path relative to this notebook.
ELECTION_CSV = '../../datasets/usa_election_dataset.csv'
df = pd.read_csv(ELECTION_CSV)

In [None]:
# Show the first five rows as a quick sanity check of the load.
df.head(5)

In [None]:
# Binary target: 1 when the `winner` text contains 'Trump', otherwise 0.
df['winnerc'] = df['winner'].str.contains('Trump').astype('int64')
# Size flag: 1 when `tot_pop` is above the dataset mean, otherwise 0.
df['largecity'] = (df['tot_pop'] > df['tot_pop'].mean()).astype('int64')

## plots

In [None]:
# df.boxplot('yougn', by='winner')

## entropy & information gain

In [None]:
from math import log

In [None]:
def pdentropy(dataf, column):
    """Return the Shannon entropy, in bits, of the values in ``dataf[column]``.

    The entropy is ``-sum(p_i * log2(p_i))`` over the relative frequency
    ``p_i`` of every distinct value in the column. For a two-class column this
    equals the binary entropy ``-p1*log2(p1) - p2*log2(p2)``.

    Parameters
    ----------
    dataf : pandas.DataFrame
        Frame holding the column to measure.
    column : str
        Name of the column whose class distribution is measured.

    Returns
    -------
    float
        Entropy rounded to 3 decimals. A column with a single distinct value,
        or an empty frame, has entropy 0.0.
    """
    total = len(dataf)
    if total == 0:
        # No observations means no uncertainty; also avoids dividing by zero.
        return 0.0
    # dropna=False keeps missing values as their own class so the frequencies
    # still sum to 1 over len(dataf).
    class_counts = dataf[column].value_counts(dropna=False).tolist()
    # Zero counts (e.g. unused categories) are skipped: log(0) is undefined and
    # their contribution to the entropy is 0 by convention.
    entropy = sum(
        -(count / total) * log(count / total, 2)
        for count in class_counts
        if count > 0
    )
    return round(entropy, 3)

In [None]:
# Entropy of the election outcome over the whole dataset.
pdentropy(dataf=df, column='winner')

In [None]:
# Entropy of `winner` before splitting.
entr_un = pdentropy(df, 'winner')

# Split the rows on the `largecity` flag (0 vs 1).
dfseg1 = df[df['largecity'] == 0]
dfseg2 = df[df['largecity'] == 1]

# Share of all rows in each segment, and the entropy of `winner` inside it.
freq_seg1 = len(dfseg1) / len(df)
entr_seg1 = pdentropy(dfseg1, 'winner')
freq_seg2 = len(dfseg2) / len(df)
entr_seg2 = pdentropy(dfseg2, 'winner')

In [None]:
def information_gain_two_segments(entr_unseg, freq1, entr1, freq2, entr2):
    """Information gain of a two-way split, rounded to 3 decimals.

    Subtracts each segment's entropy, weighted by that segment's share of the
    rows (``freq1``/``freq2``), from the entropy of the unsegmented data.
    """
    remaining = entr_unseg - freq1 * entr1
    remaining = remaining - freq2 * entr2
    return round(remaining, 3)

In [None]:
# Information gain obtained by splitting on `largecity`.
information_gain_two_segments(
    entr_unseg=entr_un,
    freq1=freq_seg1,
    entr1=entr_seg1,
    freq2=freq_seg2,
    entr2=entr_seg2,
)

## => Classification modeling

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
# Seed reused for the train/test split and by later model cells.
seed = 3
# Feature columns used as model inputs (identifier spelling kept as-is because
# later cells reference this name).
indepedents = ['tot_pop', 'yougn', 'female', 'black']

# Feature matrix and binary target as plain NumPy arrays.
X = df[indepedents].to_numpy()
y = df['winnerc'].to_numpy()

# Hold out 22% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.22,
    random_state=seed,
)

In [None]:
# Learn the standardisation parameters from the training rows only, then apply
# the same transform to both splits.
# NOTE: X_train/X_test are overwritten, so re-running this cell without
# re-running the split cell would scale already-scaled data.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
## chance-level baseline: accuracy of uniformly random 0/1 guesses
# A dedicated generator seeded with `seed` keeps this number stable across
# "Restart & Run All" and does not disturb the global `random` state.
baseline_rng = random.Random(seed)
y_random = [baseline_rng.randint(0, 1) for _ in range(len(y_test))]
print(f'random accuracy: {round(metrics.accuracy_score(y_test, y_random), 3)}')

In [None]:
# `plot_confusion_matrix` was removed from scikit-learn in version 1.2.
# Use it when it exists; otherwise define a drop-in replacement with the same
# call shape so the plotting cells below run unchanged on newer releases.
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    from sklearn.metrics import ConfusionMatrixDisplay

    def plot_confusion_matrix(estimator, X, y_true, **kwargs):
        """Plot a confusion matrix for ``estimator`` evaluated on ``X``/``y_true``.

        Thin wrapper that forwards all keyword arguments (e.g.
        ``display_labels``, ``cmap``, ``normalize``) to
        ``ConfusionMatrixDisplay.from_estimator`` and returns its display.
        """
        return ConfusionMatrixDisplay.from_estimator(estimator, X, y_true, **kwargs)

# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*"; pick
# whichever name this installation provides.
dark_style = "seaborn-v0_8-dark" if "seaborn-v0_8-dark" in plt.style.available else "seaborn-dark"
plt.style.use(dark_style)

## 1.1 logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression fitted with the SAG solver on the standardised features.
# The `multi_class` argument is deprecated in recent scikit-learn releases and
# has no effect on a binary target, so it is not passed.
classifier = LogisticRegression(random_state=seed, solver='sag')
classifier.fit(X_train, y_train)
y_predict = classifier.predict(X_test)
print(f'model accuracy: {round(metrics.accuracy_score(y_test, y_predict), 3)}')

In [None]:
# Count test rows for every (true label, predicted label) pair.
dftmp = pd.DataFrame({'true': y_test, 'pred': y_predict}).assign(count=1)
dftmp.groupby(['true', 'pred'])[['count']].sum()

In [None]:
# Row-normalised confusion matrix of the current classifier on the test split.
plot_confusion_matrix(
    classifier,
    X_test,
    y_test,
    normalize='true',
    cmap=plt.cm.Reds,
    display_labels=['BIDEN', 'TRUMP'],
)

## 1.2 SVM

In [None]:
from sklearn.svm import LinearSVC

In [None]:
# Linear SVM (dual formulation) with a generous iteration budget.
classifier = LinearSVC(dual=True, max_iter=100000).fit(X_train, y_train)
y_predict = classifier.predict(X_test)
print(f'model accuracy: {round(metrics.accuracy_score(y_test, y_predict), 3)}')

In [None]:
# Row-normalised confusion matrix of the current classifier on the test split.
plot_confusion_matrix(
    classifier,
    X_test,
    y_test,
    normalize='true',
    cmap=plt.cm.Reds,
    display_labels=['BIDEN', 'TRUMP'],
)

## 1.3 classification tree

#### 1.3.1 Decision stump: 1 level tree

In [None]:
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Decision stump: a tree limited to a single split.
classifier = DecisionTreeClassifier(random_state=0, max_depth=1).fit(X_train, y_train)
y_predict = classifier.predict(X_test)
print(f'model accuracy: {round(metrics.accuracy_score(y_test, y_predict), 3)}')

In [None]:
# Draw the fitted tree with real column names and class labels instead of the
# default X[i] placeholders. Class names follow ascending label order
# (0 -> BIDEN, 1 -> TRUMP), matching the confusion-matrix labels.
tree.plot_tree(classifier, feature_names=indepedents, class_names=['BIDEN', 'TRUMP'])

In [None]:
# Print the position of each feature column (the index used in X[i] labels).
for position, feature in enumerate(indepedents):
    print(f'{position}: {feature}')

#### 1.3.2 Tree: minimum leaf size set to 20% of the training set

In [None]:
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Tree whose leaves must each hold at least 20% of the training rows.
classifier = DecisionTreeClassifier(
    random_state=10,
    min_samples_leaf=int(len(y_train) * 0.2),
).fit(X_train, y_train)
y_predict = classifier.predict(X_test)
print(f'model accuracy: {round(metrics.accuracy_score(y_test, y_predict), 3)}')

In [None]:
# Draw the fitted tree with real column names and class labels instead of the
# default X[i] placeholders. Class names follow ascending label order
# (0 -> BIDEN, 1 -> TRUMP), matching the confusion-matrix labels.
tree.plot_tree(classifier, feature_names=indepedents, class_names=['BIDEN', 'TRUMP'])

In [None]:
# Print the position of each feature column (the index used in X[i] labels).
for position, feature in enumerate(indepedents):
    print(f'{position}: {feature}')

#### 1.3.3 Random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Forest of 100 depth-1 trees (an ensemble of decision stumps).
classifier = RandomForestClassifier(
    max_depth=1,
    n_estimators=100,
    random_state=0,
).fit(X_train, y_train)
y_predict = classifier.predict(X_test)
print(f'model accuracy: {round(metrics.accuracy_score(y_test, y_predict), 3)}')

In [None]:
# Row-normalised confusion matrix of the current classifier on the test split.
plot_confusion_matrix(
    classifier,
    X_test,
    y_test,
    normalize='true',
    cmap=plt.cm.Reds,
    display_labels=['BIDEN', 'TRUMP'],
)

## 1.4 neural networks

In [None]:
import tensorflow as tf
from tensorflow import keras

In [None]:
# Seed TensorFlow so weight initialisation and shuffling are more repeatable
# between runs (the other models in this notebook are seeded as well).
tf.random.set_seed(seed)

# Take the input width from the data instead of hard-coding it, so the network
# keeps working if the feature list changes.
n_features = X_train.shape[1]

# Small fully connected network: two hidden ReLU layers of 16 units and a
# sigmoid output giving the probability of class 1.
classifier = keras.Sequential([
    keras.layers.Flatten(input_shape=(n_features,)),
    keras.layers.Dense(16, activation=tf.nn.relu),
    keras.layers.Dense(16, activation=tf.nn.relu),
    keras.layers.Dense(1, activation=tf.nn.sigmoid),
])

classifier.compile(optimizer='adam',
                   loss='binary_crossentropy',
                   metrics=['accuracy'])

# batch_size=1 updates the weights after every sample; this is slow but kept
# as in the original experiment.
classifier.fit(X_train, y_train, epochs=50, batch_size=1)

In [None]:
# Loss and accuracy of the network on the held-out test split.
test_loss, test_acc = classifier.evaluate(x=X_test, y=y_test)
print(f'model accuracy: {round(test_acc, 3)}')