In [None]:
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and later
# releases dropped the old names; use whichever spelling this install provides.
_dark_style = "seaborn-v0_8-dark" if "seaborn-v0_8-dark" in plt.style.available else "seaborn-dark"
plt.style.use(_dark_style)
# Default figure size (width, height) in inches for every plot in the notebook.
plt.rcParams["figure.figsize"] = [9, 6]

## -- read in data

In [None]:
# Load the raw dataset; the path is resolved relative to the current working directory.
df = pd.read_csv(
    filepath_or_buffer='../../datasets/usa_election_dataset.csv',
)

In [None]:
# Binary target: 1 where the `winner` string contains "Trump", otherwise 0.
# na=False makes a missing `winner` count as "no match"; without it the mask would
# contain NaN, which boolean indexing rejects.
df['winnerc'] = df['winner'].str.contains('Trump', na=False).astype('int64')
# Flag rows whose `tot_pop` is above the dataset-wide mean.
df['largecity'] = (df['tot_pop'] > df['tot_pop'].mean()).astype('int64')

## -- model configuration and fit

In [None]:
# Single seed reused for the split and the classifier so runs are repeatable.
seed = 3
# Predictor columns used as model inputs.
# NOTE(review): 'yougn' looks like a typo of 'young' -- confirm against the CSV header.
indepedents = ['tot_pop', 'yougn', 'female', 'black']
X = df[indepedents].values
y = df['winnerc'].values
# Hold out 22% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.22,
    random_state=seed,
)

In [None]:
# Fit the scaling statistics on the training split only, then apply that same
# transformation to both splits so no information from the test rows leaks in.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Fit the logistic-regression model on the scaled training split.
classifier = LogisticRegression(
    random_state=seed,
    solver='sag',
    multi_class='ovr',
).fit(X_train, y_train)
# Hard class predictions for the held-out split.
y_predict = classifier.predict(X_test)

## -- results

In [None]:
# One row per test instance with a constant counter column, so that summing the
# counter per (true, pred) pair yields the confusion counts.
dftmp = pd.DataFrame({'true': y_test, 'pred': y_predict}).assign(count=1)
dftmp.groupby(['true', 'pred'])[['count']].sum()

In [None]:
# Confusion matrix on the held-out split, drawn with the red colour map.
# NOTE(review): plot_confusion_matrix is no longer available in recent scikit-learn
# releases; ConfusionMatrixDisplay.from_estimator is the documented replacement --
# confirm the installed version before upgrading.
plot_confusion_matrix(
    classifier, X_test, y_test,
    display_labels=['BIDEN', 'TRUMP'],
    cmap=plt.cm.Reds,
    normalize='true',
)

## 1. model evaluation

## 1.1 Accuracy

In [None]:
# sklearn metrics

# Printed explicitly: only the final expression of a cell is displayed, so a bare
# call here would silently discard the reference value.
print('sklearn accuracy:', accuracy_score(y_test, y_predict))

# defined

def accuracy(y_test, y_predict):
    """Share of predictions that equal the true label: (tp + tn) / (p + n).

    `y_test` holds the true labels and `y_predict` the predicted labels, paired
    position by position; the count of matches is divided by ``len(y_test)``.
    """
    matches = sum(1 for actual, predicted in zip(y_test, y_predict) if actual == predicted)
    return matches / len(y_test)

# Hand-written accuracy on the held-out split (last expression, so it is displayed).
accuracy(y_test=y_test, y_predict=y_predict)

## 1.2 Precision

In [None]:
# sklearn metrics

# Printed explicitly: only the final expression of a cell is displayed, so a bare
# call here would silently discard the reference value.
print('sklearn precision:', precision_score(y_test, y_predict))

# defined

def precision(y_test, y_predict):
    """Fraction of positive predictions that are correct: tp / (tp + fp).

    Parameters
    ----------
    y_test : sequence of true labels (1 marks the positive class).
    y_predict : sequence of predicted labels, paired position by position with ``y_test``.

    Returns
    -------
    float
        The precision. When nothing was predicted positive the ratio is undefined;
        0.0 is returned (the value sklearn's ``precision_score`` falls back to by
        default) instead of raising ZeroDivisionError.
    """
    true_positives = sum(1 for t, p in zip(y_test, y_predict) if t == p and p == 1)
    predicted_positives = sum(1 for p in y_predict if p == 1)
    if predicted_positives == 0:
        return 0.0
    return true_positives / predicted_positives

# Hand-written precision on the held-out split (last expression, so it is displayed).
precision(y_test=y_test, y_predict=y_predict)

## 1.3 Recall

In [None]:
# sklearn metrics

# Printed explicitly: only the final expression of a cell is displayed, so a bare
# call here would silently discard the reference value.
print('sklearn recall:', recall_score(y_test, y_predict))

# defined

def recall(y_test, y_predict):
    """True positive rate: fraction of positive instances predicted positive, tp / (tp + fn).

    Parameters
    ----------
    y_test : sequence of true labels (1 marks the positive class).
    y_predict : sequence of predicted labels, paired position by position with ``y_test``.

    Returns
    -------
    float
        The recall. When ``y_test`` has no positive instance the ratio is undefined;
        0.0 is returned (the value sklearn's ``recall_score`` falls back to by
        default) instead of raising ZeroDivisionError.
    """
    true_positives = sum(1 for t, p in zip(y_test, y_predict) if t == p and t == 1)
    actual_positives = sum(1 for t in y_test if t == 1)
    if actual_positives == 0:
        return 0.0
    return true_positives / actual_positives

# Hand-written recall on the held-out split (last expression, so it is displayed).
recall(y_test=y_test, y_predict=y_predict)

## 1.4 Specificity

In [None]:
def specificity(y_test, y_predict):
    """True negative rate: fraction of negative instances predicted negative, tn / (fp + tn).

    Parameters
    ----------
    y_test : sequence of true labels (0 marks the negative class).
    y_predict : sequence of predicted labels, paired position by position with ``y_test``.

    Returns
    -------
    float
        The specificity, or NaN when ``y_test`` has no negative instance (the rate
        is undefined there) instead of raising ZeroDivisionError.
    """
    true_negatives = sum(1 for t, p in zip(y_test, y_predict) if t == p and t == 0)
    actual_negatives = sum(1 for t in y_test if t == 0)
    if actual_negatives == 0:
        return float('nan')
    return true_negatives / actual_negatives

# Hand-written specificity on the held-out split (last expression, so it is displayed).
specificity(y_test=y_test, y_predict=y_predict)

## 1.5 FPR

In [None]:
def false_positive_rate(y_test, y_predict):
    """False positive rate: fraction of negative instances predicted positive, fp / (fp + tn).

    Parameters
    ----------
    y_test : sequence of true labels (0 marks the negative class).
    y_predict : sequence of predicted labels, paired position by position with ``y_test``.

    Returns
    -------
    float
        The false positive rate, or NaN when ``y_test`` has no negative instance
        (the rate is undefined there) instead of raising ZeroDivisionError.
    """
    false_positives = sum(1 for t, p in zip(y_test, y_predict) if t != p and t == 0)
    actual_negatives = sum(1 for t in y_test if t == 0)
    if actual_negatives == 0:
        return float('nan')
    return false_positives / actual_negatives

# Hand-written false positive rate on the held-out split (last expression, so it is displayed).
false_positive_rate(y_test=y_test, y_predict=y_predict)

## 1.6 F-measure

In [None]:
# sklearn metrics

# Printed explicitly: only the final expression of a cell is displayed, so a bare
# call here would silently discard the reference value.
print('sklearn f1:', f1_score(y_test, y_predict))

# defined

# `precision` and `recall` are defined once, in sections 1.2 and 1.3; they are not
# redefined here so a single implementation stays in effect for the whole notebook.

def f_measure(y_test, y_predict):
    """F1 score: harmonic mean of precision and recall, 2 * p * r / (p + r).

    Balances false positives (via precision) against false negatives (via recall).

    Parameters
    ----------
    y_test : sequence of true labels (1 marks the positive class).
    y_predict : sequence of predicted labels, paired position by position with ``y_test``.

    Returns
    -------
    float
        The F1 score, or 0.0 when precision and recall are both zero (the ratio
        would otherwise be 0 / 0 and raise ZeroDivisionError).
    """
    prec = precision(y_test, y_predict)
    rec = recall(y_test, y_predict)
    if prec + rec == 0:
        return 0.0
    return 2 * (prec * rec) / (prec + rec)

# Hand-written F1 on the held-out split (last expression, so it is displayed).
f_measure(y_test=y_test, y_predict=y_predict)

## 2. ROC graph - interesting to compare different models

In [None]:
# `recall` and `false_positive_rate` are defined once, in sections 1.3 and 1.5; they
# are not redefined here so a single implementation stays in effect for the notebook.

def roc_graph(y_test, y_predict):
    """Scatter the classifier's single (FPR, TPR) point on ROC axes.

    Hard predictions give one operating point rather than a curve: x is the
    false positive rate and y is the true positive rate (recall). Both axes are
    fixed to [0, 1]. Draws on the current matplotlib figure; returns None.

    Parameters
    ----------
    y_test : sequence of true labels.
    y_predict : sequence of predicted labels, paired position by position with ``y_test``.
    """
    tpr = recall(y_test, y_predict)
    fpr = false_positive_rate(y_test, y_predict)
    plt.scatter(fpr, tpr)
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    
# Plot the operating point of the hard predictions on the held-out split.
roc_graph(y_test=y_test, y_predict=y_predict)

## 3. ROC curve

In [None]:
# define sets

# Score the classifier fitted earlier in the notebook, so the curve describes the
# same model as the metrics above (refitting with a different random_state would
# evaluate a different model and rebind `classifier`).
y_probs = classifier.predict_proba(X_test)
# Column 1 holds the predicted probability of the positive class (label 1).
y_probs_pos = y_probs[:, 1]
# Constant scores carry no ranking information: this is the chance-level baseline.
random_probs_pos = [0] * len(y_test)

random_auc = roc_auc_score(y_test, random_probs_pos)
classifier_auc = roc_auc_score(y_test, y_probs_pos)
# Report the areas under the curves; they summarise each curve in a single number.
print(f'Random AUC:   {random_auc:.3f}')
print(f'Logistic AUC: {classifier_auc:.3f}')

rn_fpr, rn_tpr, _ = roc_curve(y_test, random_probs_pos)
lr_fpr, lr_tpr, _ = roc_curve(y_test, y_probs_pos)

fig, ax = plt.subplots()
ax.plot(rn_fpr, rn_tpr, linestyle='--', label='Random')
ax.plot(lr_fpr, lr_tpr, marker='.', label='Logistic')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC curve')
ax.legend()
plt.show()