In [None]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
# Default size (in inches) for every figure drawn in this notebook.
plt.rc("figure", figsize=(10, 10))

In [None]:
# Read the election dataset into the notebook-wide frame used by all sections below.
df = pd.read_csv(filepath_or_buffer='../../datasets/usa_election_dataset.csv')

In [None]:
# Binary target: 1 where the `winner` text mentions Trump, otherwise 0.
# `na=False` makes rows with a missing `winner` count as 0; without it the
# match yields NaN for those rows and cannot be used as a boolean mask.
df['winnerc'] = df['winner'].str.contains('Trump', na=False).astype('int64')

# Indicator: 1 where `tot_pop` is above the mean of the `tot_pop` column, otherwise 0.
df['largecity'] = (df['tot_pop'] > df['tot_pop'].mean()).astype('int64')

In [None]:
# Preview the first five rows, including the derived indicator columns.
df.head(5)

## 1. overfitting

### 1.1 tree induction: fitting graph

In [None]:
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and
# later removed the old names; prefer the new name and fall back to the
# legacy one so the cell runs on both old and new matplotlib.
plt.style.use(
    "seaborn-v0_8-dark" if "seaborn-v0_8-dark" in plt.style.available else "seaborn-dark"
)

In [None]:
def tree_fit_score(X_train, y_train, X_test=None, y_test=None, n_nodes=2, use_testset=True):
    """Fit a decision tree on the training data and return its accuracy.

    Parameters
    ----------
    X_train, y_train
        Features and labels the tree is fitted on.
    X_test, y_test
        Held-out features and labels; required when ``use_testset`` is truthy.
    n_nodes
        Passed to ``DecisionTreeClassifier`` as ``max_leaf_nodes``.
    use_testset
        If truthy, accuracy is computed on ``X_test``/``y_test``; otherwise
        on the training data itself.

    Returns
    -------
    The value of ``metrics.accuracy_score`` for the chosen evaluation data.

    Raises
    ------
    ValueError
        If ``use_testset`` is truthy but ``X_test`` or ``y_test`` is missing.
    """
    # Validate before fitting so a forgotten test set fails fast with a clear message.
    if use_testset and (X_test is None or y_test is None):
        raise ValueError("X_test and y_test are required when use_testset is true")

    classifier = DecisionTreeClassifier(max_leaf_nodes=n_nodes, random_state=3)
    classifier.fit(X_train, y_train)

    if use_testset:
        X_eval, y_eval = X_test, y_test
    else:
        X_eval, y_eval = X_train, y_train
    return metrics.accuracy_score(y_eval, classifier.predict(X_eval))
    
def n_node_increase_scores(X_train, y_train, X_test, y_test, node_range):
    """Score a tree for every leaf-node limit in ``node_range``.

    Returns a list with one ``[n, train_accuracy, test_accuracy]`` entry per
    value ``n`` of ``node_range``, in iteration order. Both accuracies come
    from ``tree_fit_score`` fitted on the training data.
    """
    return [
        [
            leaf_limit,
            tree_fit_score(X_train, y_train, n_nodes=leaf_limit, use_testset=False),
            tree_fit_score(X_train, y_train, X_test, y_test, n_nodes=leaf_limit),
        ]
        for leaf_limit in node_range
    ]

def plot_fitting_graph(rows):
    """Plot training and test accuracy against the number of tree nodes.

    ``rows`` is a sequence of ``[n_nodes, train_score, test_score]`` entries.
    The training scores are drawn as a solid black line and the test scores
    as a dashed black line on the current pyplot axes.
    """
    node_counts = [row[0] for row in rows]
    train_scores = [row[1] for row in rows]
    test_scores = [row[2] for row in rows]

    plt.plot(node_counts, train_scores, label='train_performance', color='k')
    plt.plot(node_counts, test_scores, label='test_performance', color='k', linestyle='dashed')
    # Label the axes so the figure can be read without the surrounding code.
    plt.xlabel('number of nodes')
    plt.ylabel('accuracy')
    plt.legend()
    
def main():
    """Draw the fitting graph: tree accuracy on train vs. test data for 2 to 50 leaf nodes."""
    feature_columns = ['tot_pop', 'yougn', 'female', 'black']
    target_column = 'winnerc'
    min_leaf_nodes, max_leaf_nodes = 2, 50

    features = df[feature_columns].values
    target = df[target_column].values
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.25, random_state=3
    )

    # Standardise both splits with statistics learned from the training split only.
    scaler = StandardScaler().fit(X_train)
    X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

    leaf_limits = range(min_leaf_nodes, max_leaf_nodes + 1)
    plot_fitting_graph(n_node_increase_scores(X_train, y_train, X_test, y_test, leaf_limits))
    
# Run the fitting-graph experiment defined by main() in this cell.
main()

### 1.2 classification: learning curve

In [None]:
import numpy as np
import random
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn import metrics

In [None]:
def run_models(X_train, X_test, y_train, y_test, seed=10):
    """Fit a tree, a linear SVM and a logistic regression; return their test accuracies.

    Each classifier is fitted on ``X_train``/``y_train`` and scored with
    ``metrics.accuracy_score`` on ``X_test``/``y_test``. ``seed`` is passed
    as ``random_state`` to all three models.

    Returns
    -------
    list
        ``[tree_score, svm_score, lr_score]``.
    """
    classifiers = [
        DecisionTreeClassifier(max_leaf_nodes=7, random_state=seed),
        LinearSVC(max_iter=100000, dual=True, random_state=seed),
        # `multi_class` is deprecated in recent scikit-learn; for a binary
        # target (as used in this notebook) the default already behaves
        # like the former `multi_class='ovr'`.
        LogisticRegression(solver='sag', random_state=seed),
    ]

    scores = []
    for classifier in classifiers:
        classifier.fit(X_train, y_train)
        scores.append(metrics.accuracy_score(y_test, classifier.predict(X_test)))
    return scores

def trainset_increase_scores(X_train, X_test, y_train, y_test, steps, end, seed=None):
    """Score the models on random training subsets of increasing size.

    Subset sizes are ``steps, 2*steps, ...`` as produced by
    ``range(steps, end + steps, steps)``. Sizes larger than the number of
    available training rows are skipped rather than raising inside
    ``random.sample``.

    Parameters
    ----------
    X_train, y_train
        Training pool the subsets are drawn from (without replacement).
    X_test, y_test
        Fixed evaluation data handed to ``run_models``.
    steps, end
        Increment and upper bound of the subset sizes.
    seed
        Optional seed for the subset sampling. When ``None`` (default) the
        module-level ``random`` generator is used, as before.

    Returns
    -------
    list
        One ``[n, tree_score, svm_score, lr_score]`` entry per subset size.
    """
    X_pool = np.asarray(X_train)
    y_pool = np.asarray(y_train)
    n_available = len(X_pool)
    # A private generator keeps seeded runs independent of global RNG state.
    rng = random if seed is None else random.Random(seed)

    scores = []
    for n in range(steps, end + steps, steps):
        if n > n_available:
            # Cannot draw more distinct rows than the pool contains.
            break
        picked = rng.sample(range(n_available), n)
        runscores = run_models(X_pool[picked], X_test, y_pool[picked], y_test)
        scores.append([n] + runscores)

    return scores

def plot_learning_curve(rows):
    """Plot each model's accuracy against the training-set size.

    ``rows`` is a sequence of ``[n, tree_score, svm_score, lr_score]``
    entries. The tree is drawn in red, the SVM in blue and the logistic
    regression in black on the current pyplot axes.
    """
    sizes = [row[0] for row in rows]

    plt.plot(sizes, [row[1] for row in rows], label='tree', color='r')
    plt.plot(sizes, [row[2] for row in rows], label='svm', color='b')
    plt.plot(sizes, [row[3] for row in rows], label='lr', color='k')
    # Label the axes so the figure can be read without the surrounding code.
    plt.xlabel('training set size')
    plt.ylabel('accuracy')
    plt.legend()
    
def main():
    """Draw the learning curve: test accuracy of three models as the training subset grows.

    Training subsets of 20, 40, ..., 1000 rows are scored by
    ``trainset_increase_scores`` and plotted by ``plot_learning_curve``.
    """
    indepedents = [
        'tot_pop',
        'yougn',
        'female',
        'black',
    ]
    dependent = 'winnerc'

    X = df[indepedents].values
    y = df[dependent].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=3)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # The subsets are drawn with the `random` module; seed it so that
    # re-running the cell reproduces the same curve.
    random.seed(3)
    scores = trainset_increase_scores(X_train, X_test, y_train, y_test, 20, 1000)
    plot_learning_curve(scores)
    
# Run the learning-curve experiment defined by main() in this cell.
main()

### 1.3 bagging

In [None]:
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn import metrics

In [None]:
# Feature columns and target column used for the bagging model.
indepedents = ['tot_pop', 'yougn', 'female', 'black']
dependent = 'winnerc'

# 75/25 train/test split with a fixed seed.
X = df[indepedents].values
y = df[dependent].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=3, test_size=0.25
)

# Standardise both splits with statistics learned from the training split only.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# The wrapped estimator is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, while the
# first positional slot works with both old and new versions.
classifier = BaggingClassifier(SVC(),
                               n_estimators=10,
                               random_state=30)
classifier.fit(X_train, y_train)
y_predict = classifier.predict(X_test)
print(f'model accuracy: {round(metrics.accuracy_score(y_test, y_predict), 3)}')

### 1.4 boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics

In [None]:
# Feature columns and target column used for the boosting model.
indepedents = ['tot_pop', 'yougn', 'female', 'black']
dependent = 'winnerc'

# 75/25 train/test split with a fixed seed.
X = df[indepedents].values
y = df[dependent].values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=3, test_size=0.25
)

# Standardise both splits with statistics learned from the training split only.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Hyper-parameters for the gradient-boosting model.
params = dict(
    n_estimators=500,
    max_depth=4,
    random_state=10,
    min_samples_split=5,
    learning_rate=0.01,
)

# Fit on the training split, predict the held-out split and report accuracy.
classifier = GradientBoostingClassifier(**params)
classifier.fit(X_train, y_train)
y_predict = classifier.predict(X_test)
accuracy = metrics.accuracy_score(y_test, y_predict)
print(f'model accuracy: {round(accuracy, 3)}')