# Imports

In [None]:
# Standard library
import warnings
from os.path import join

# Scientific stack
import numpy as np
import pandas as pd
from scipy import stats
from scipy.stats import norm

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.plotting import plot_decision_regions
from mpl_toolkits.mplot3d import Axes3D

# Modelling
import sklearn.metrics
from sklearn.metrics import classification_report, confusion_matrix,f1_score,make_scorer,roc_curve, auc
from sklearn.model_selection import (  # splitting, curves and hyperparameter search
    GridSearchCV,
    ShuffleSplit,
    learning_curve,
    train_test_split,
    validation_curve,
)
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import label_binarize
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from tpot import TPOTClassifier

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the plotting and
# modelling libraries — consider narrowing the filter.
warnings.filterwarnings('ignore')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Retrieve dataset

In [None]:
# Location of the raw BEPS CSV; read over HTTP by the loading cell below.
DS_URL = "https://raw.githubusercontent.com/clintonyeb/ml-dataset/master/BEPS.csv"
# Default `figsize` passed to most plots in this notebook.
FIG_SIZE=(12, 6)

In [None]:
# Column names that replace the CSV's own header row (`header=0` discards it);
# the "id" column becomes the index.
BEPS_COLUMNS = [
    "id",
    "vote",
    "age",
    "nat_cond",
    "hhold_cond",
    "labor_lead_assmnt",
    "cons_lead_assmnt",
    "democ_lead_assmnt",
    "euro_intg_attud",
    "political_knowledge",
    "gender",
]
beps = pd.read_csv(DS_URL, header=0, names=BEPS_COLUMNS, index_col="id")
beps.head(10)

# Exploratory Data Analysis (EDA)

In [None]:
# Size of the data set, followed by the number of missing values per column.
n_records = len(beps)
print("Number of records: ", n_records)
print("Shape: ", beps.shape)
print("\nMissing data?")
beps.isna().sum()

In [None]:
# Number of respondents per party voted for.
sns.countplot(data=beps, x="vote");

The Labour Party won that election, which may be why it is the most represented class here.

In [None]:
# Switch seaborn to the white-grid theme (applies to later plots as well),
# then show the age distribution of each party's voters.
sns.set(style="whitegrid")
sns.violinplot(data=beps, x="vote", y="age");

In [None]:
# Age quartiles and outliers per party.
sns.boxplot(data=beps, x="vote", y="age");

We can tell from the two plots above that Conservative voters are typically older than the voters of the other two parties.

In [None]:
# Overlaid per-party histograms of the national economic conditions rating.
# alpha < 1 keeps every party's bars visible where the histograms overlap.
beps.groupby('vote')['nat_cond'].plot.hist(legend=True, alpha=0.5, figsize=FIG_SIZE)
plt.xlabel('nat_cond')
plt.title('nat_cond by vote');

It seems that Labour voters were happier with the national economic conditions than the others, followed by the Liberal Democrats.

In [None]:
# Overlaid per-party histograms of the household economic conditions rating.
# alpha < 1 keeps every party's bars visible where the histograms overlap.
beps.groupby('vote')['hhold_cond'].plot.hist(legend=True, alpha=0.5, figsize=FIG_SIZE)
plt.xlabel('hhold_cond')
plt.title('hhold_cond by vote');

Attitudes towards household economic conditions mirror those towards national economic conditions.

In [None]:
# Overlaid per-party histograms of the Labour leader assessment.
# alpha < 1 keeps every party's bars visible where the histograms overlap.
beps.groupby('vote')['labor_lead_assmnt'].plot.hist(legend=True, alpha=0.5, figsize=FIG_SIZE)
plt.xlabel('labor_lead_assmnt')
plt.title('labor_lead_assmnt by vote');

It seems that the Labour leader (i.e. Tony Blair) was rated as just fine, but voters might have wanted more: even among Labour voters there were far more 4s than 5s. He also seems to have been more popular among Liberal Democrats than among Conservatives.

In [None]:
# Overlaid per-party histograms of the Conservative leader assessment.
# alpha < 1 keeps every party's bars visible where the histograms overlap.
beps.groupby('vote')['cons_lead_assmnt'].plot.hist(legend=True, alpha=0.5, figsize=FIG_SIZE)
plt.xlabel('cons_lead_assmnt')
plt.title('cons_lead_assmnt by vote');

The Conservative leader (William Hague during the 1997–2001 BEPS survey) does not seem to have been any more popular among Labour voters than the Labour leader was among Conservative voters!
But Liberal Democrat voters seemed to prefer the Labour leader to the Conservative leader.

In [None]:
# Overlaid per-party histograms of the Liberal Democrat leader assessment.
# alpha < 1 keeps every party's bars visible where the histograms overlap.
beps.groupby('vote')['democ_lead_assmnt'].plot.hist(legend=True, alpha=0.5, figsize=FIG_SIZE)
plt.xlabel('democ_lead_assmnt')
plt.title('democ_lead_assmnt by vote');

The Liberal Democrat leader (Charles Kennedy during the 1997–2001 BEPS survey) was rated as just fine, but he was not especially popular even among Liberal Democrat or Labour voters.
It is obvious, however, that the Conservatives didn't like him at all.

In [None]:
# Overlaid per-party histograms of the attitude towards European integration.
# alpha < 1 keeps every party's bars visible where the histograms overlap.
beps.groupby('vote')['euro_intg_attud'].plot.hist(legend=True, alpha=0.5, figsize=FIG_SIZE)
plt.xlabel('euro_intg_attud')
plt.title('euro_intg_attud by vote');

The most striking attitude is that of the Conservatives: they seem very Eurosceptic!

In [None]:
# Respondents per party, split by gender, drawn as stacked bars.
votes_by_gender = beps.groupby(['vote', 'gender'])['vote'].count().unstack('gender')
votes_by_gender.plot.bar(stacked=True, figsize=FIG_SIZE);

The number of female voters in almost all the parties was almost half the number of male voters!

In [None]:
# One panel per party, each holding a histogram of political_knowledge.
g = sns.FacetGrid(data=beps, col="vote", margin_titles=True)
g.map(plt.hist, "political_knowledge", color="steelblue");

In [None]:
# Respondents per political_knowledge level, with one bar per party.
plt.figure(figsize=FIG_SIZE)
sns.countplot(data=beps, x='political_knowledge', hue='vote');

We can vaguely say that the Conservatives tend to report higher knowledge of parties' positions on European integration than the other parties' voters tend to do!

In [None]:
# Number of respondents for every (nat_cond, hhold_cond) combination.
nat_hhold = beps.groupby(["nat_cond", "hhold_cond"])["nat_cond"].count()
plt.figure(figsize=FIG_SIZE)
# fmt=".0f" prints the counts as whole numbers; the default annotation format
# (".2g") would render counts of 100 or more in scientific notation.
sns.heatmap(nat_hhold.unstack("hhold_cond"), annot=True, fmt=".0f", cmap="YlGnBu");

In [None]:
# Same counts as line plots: the innermost index level is spread into columns,
# giving one line per column across the remaining index.
nat_hhold_wide = nat_hhold.unstack()
nat_hhold_wide.plot(figsize=FIG_SIZE);

The relationship between voter's assessment of current national vs. household economic conditions is not linear! Voters seemed half-half satisfied with both!

In [None]:
# Joint density of the European-integration attitude and age, one panel per party.
vote_lab = beps.loc[beps.vote == 'Labour']
vote_cons = beps.loc[beps.vote == 'Conservative']
vote_democ = beps.loc[beps.vote == 'Liberal Democrat']

_, kde_axes = plt.subplots(1, 3, figsize=(17, 6))
kde_panels = [
    ('Labour', vote_lab, "YlOrBr"),
    ('Conservative', vote_cons, "Reds"),
    ('Liberal Democrat', vote_democ, "Blues"),
]
for kde_ax, (party, voters, cmap) in zip(kde_axes, kde_panels):
    # Keyword x=/y= and fill= are the current seaborn spelling of the former
    # positional data arguments and shade=; the default `thresh` already leaves
    # the lowest density level unfilled, as shade_lowest=False did.
    sns.kdeplot(data=voters, x='euro_intg_attud', y='age',
                cmap=cmap, fill=True, ax=kde_ax)
    kde_ax.set_title(party)

The trend of older and more Eurosceptic Conservatives is obvious once more!

In [None]:
# 3-D scatter of the three leader assessments, coloured by the party voted for
# (Labour red, Conservative blue, Liberal Democrat green).
fig = plt.figure(figsize=FIG_SIZE)
ax = fig.add_subplot(111, projection='3d')
beps_c = beps['vote'].map({'Labour':'r', 'Conservative':'b', 'Liberal Democrat':'g'})
ax.scatter(beps['labor_lead_assmnt'], beps['cons_lead_assmnt'], beps['democ_lead_assmnt'], s = 60, c=beps_c)
ax.set_xlabel('labor_lead_assmnt')
ax.set_ylabel('cons_lead_assmnt')
# The z label is the name of the column plotted on that axis.
ax.set_zlabel('democ_lead_assmnt')
plt.show()

In [None]:
# Pairwise scatter plots (histograms on the diagonal) of the listed columns.
PAIRPLOT_COLUMNS = [
    'age',
    'nat_cond',
    'hhold_cond',
    'labor_lead_assmnt',
    'cons_lead_assmnt',
    'democ_lead_assmnt',
    'euro_intg_attud',
    'political_knowledge',
]
sns.pairplot(beps[PAIRPLOT_COLUMNS]);

There is no linear correlation between any pair of variables! Even between variables like age and attitudes toward European integration for example, or age and political knowledge!

# Support Vector Machine Model (SVM) Analysis

In [None]:
# Feature matrix and target, built from the loaded frame so the section runs on
# a fresh kernel.
# NOTE(review): the feature choice is an assumption — every column except
# `vote`, with non-numeric columns (presumably only `gender`) one-hot encoded.
# Confirm against the intended model inputs.
X = pd.get_dummies(beps.drop(columns='vote'), drop_first=True, dtype=int)
# Integer class codes are needed later for label_binarize and the ROC curves.
y = beps['vote'].map({'Labour': 0, 'Conservative': 1, 'Liberal Democrat': 2})

In [None]:
# Hold out 20% of the rows for testing. The fixed seed makes the split (and
# every score computed from it) reproducible; stratifying on `y` keeps each
# party's share equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
# Baseline classifier: an SVC with all-default hyperparameters.
model = SVC()

In [None]:
# Fit the baseline SVC on the training split.
# NOTE(review): no scaling step is visible in this notebook although
# StandardScaler is imported — confirm whether the features should be scaled.
model.fit(X_train, y_train)

In [None]:
# Mean accuracy of the baseline model on the held-out test split.
model.score(X_test, y_test)

In [None]:
# Hyperparameter search space.
# `gamma` only affects the RBF kernel, so the linear kernel gets its own
# sub-grid instead of being refit once for every gamma value.
C_VALUES = [1, 100]
param_grid = [
    {'kernel': ['rbf'],
     'C': C_VALUES,
     'gamma': [0.1, 0.01, 0.001, 'auto', 'scale']},
    {'kernel': ['linear'],
     'C': C_VALUES},
]


In [None]:
# Scorer for the grid search: micro-averaged F1.
# NOTE(review): for single-label multiclass targets micro-averaged F1 equals
# plain accuracy — confirm this is the intended selection metric.
f1_scorer = make_scorer(f1_score, average='micro')

In [None]:
# Exhaustive 5-fold cross-validated search over `param_grid`, scored with
# `f1_scorer` and run on all available cores. GridSearchCV comes from
# sklearn.model_selection (see the imports cell).
grid = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    scoring=f1_scorer,
    cv=5,
    return_train_score=False,
    n_jobs=-1,
    verbose=2,
)
# fit() returns the fitted search object, so `svm_grid` is the same object as `grid`.
svm_grid = grid.fit(X_train, y_train)

In [None]:
# Hyperparameter combination with the best mean cross-validated score.
grid.best_params_

In [None]:
# Mean cross-validated score (per `scoring=f1_scorer`) of the best combination.
grid.best_score_

In [None]:
# Predict the held-out test split with the search's best estimator, then print
# the confusion matrix followed by the per-class precision/recall/F1 report.
grid_predictions = grid.predict(X_test)
for make_report in (confusion_matrix, classification_report):
    print(make_report(y_test, grid_predictions))

In [None]:
# One 0/1 indicator column per class code, as needed for per-class ROC curves.
class_codes = [0, 1, 2]
yy = label_binarize(y, classes=class_codes)
n_classes = yy.shape[1]

In [None]:
# Train/test split of the binarized targets. The fixed seed makes the ROC
# figures reproducible; stratifying on the integer labels `y` keeps each
# party's share equal in both parts.
Xx_train, Xx_test, yx_train, yx_test = train_test_split(
    X, yy, test_size=0.2, random_state=42, stratify=y)

In [None]:
# Learn to predict each class against the others (one binary SVC per class).
# The hyperparameters are read from the fitted grid search rather than typed
# in by hand, so the ROC model always matches the search result.
classifier = OneVsRestClassifier(
    SVC(probability=True, random_state=42, **grid.best_params_))

In [None]:
# Fit the one-vs-rest model on the binarized training targets, then take the
# per-class decision values for the test rows (one column per class).
classifier.fit(Xx_train, yx_train)
yy_score = classifier.decision_function(Xx_test)

In [None]:
# ROC curve and area under it for every class, keyed by class index.
fpr, tpr, roc_auc = {}, {}, {}
for class_idx in range(n_classes):
    truth = yx_test[:, class_idx]
    scores = yy_score[:, class_idx]
    fpr[class_idx], tpr[class_idx], _ = roc_curve(truth, scores)
    roc_auc[class_idx] = auc(fpr[class_idx], tpr[class_idx])

# Micro-average: pool every (row, class) decision into one binary problem.
fpr["micro"], tpr["micro"], _ = roc_curve(yx_test.ravel(), yy_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

In [None]:
# Draw every computed ROC curve (one per class plus the micro-average), each
# labelled with its class so the legend says which curve is which.
# Class codes follow the label encoding used for `y`.
CLASS_NAMES = {0: 'Labour', 1: 'Conservative', 2: 'Liberal Democrat'}
CLASS_COLORS = {0: 'red', 1: 'blue', 2: 'green'}

plt.figure()
lw = 2
plt.plot(fpr["micro"], tpr["micro"], color='darkorange', lw=lw, linestyle=':',
         label='micro-average (area = %0.2f)' % roc_auc["micro"])
for class_idx in range(n_classes):
    plt.plot(fpr[class_idx], tpr[class_idx],
             color=CLASS_COLORS.get(class_idx), lw=lw,
             label='%s (area = %0.2f)' % (CLASS_NAMES.get(class_idx, class_idx),
                                          roc_auc[class_idx]))
# Diagonal = performance of a random classifier.
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Plot a learning curve and a validation curve for one estimator
def plot_learning_curve(estimator, title, X, y, axes=None, ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5),
                        param_name="gamma", param_range=None):
    """Draw a learning curve and a validation curve for `estimator`.

    Parameters
    ----------
    estimator : estimator object
        Passed unchanged to `learning_curve` and `validation_curve`.
    title : str
        Title of the learning-curve panel.
    X, y : array-like
        Features and target handed to both curve computations.
    axes : sequence of matplotlib Axes, optional
        ``axes[0]`` receives the learning curve and ``axes[1]`` the validation
        curve. If only one Axes is supplied, the validation curve is drawn on
        the current pyplot axes. If omitted, a 1x2 figure is created.
    ylim : tuple, optional
        ``(ymin, ymax)`` for the learning-curve panel.
    cv : int, cross-validation generator or None
        Cross-validation strategy used for both curves.
    n_jobs : int or None
        Parallelism used for both curves.
    train_sizes : array-like
        Training-set sizes forwarded to `learning_curve`.
    param_name : str, default "gamma"
        Hyperparameter varied for the validation curve.
    param_range : array-like, optional
        Values tried for `param_name`; defaults to ``np.logspace(-6, -1, 5)``.

    Returns
    -------
    module
        The `matplotlib.pyplot` module, for call chaining.
    """
    if param_range is None:
        param_range = np.logspace(-6, -1, 5)

    if axes is None:
        # Two panels are needed: learning curve and validation curve.
        _, axes = plt.subplots(1, 2, figsize=(20, 5))
    axes = np.atleast_1d(axes).ravel()
    learn_ax = axes[0]
    valid_ax = axes[1] if axes.size > 1 else plt.gca()

    # ---- learning curve ----
    learn_ax.set_title(title)
    if ylim is not None:
        learn_ax.set_ylim(*ylim)
    learn_ax.set_xlabel("Training examples")
    learn_ax.set_ylabel("Score")

    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    # Aggregate over the CV splits (axis 1).
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    learn_ax.grid()
    learn_ax.fill_between(train_sizes, train_scores_mean - train_scores_std,
                          train_scores_mean + train_scores_std, alpha=0.1,
                          color="r")
    learn_ax.fill_between(train_sizes, test_scores_mean - test_scores_std,
                          test_scores_mean + test_scores_std, alpha=0.1,
                          color="g")
    learn_ax.plot(train_sizes, train_scores_mean, 'o-', color="r",
                  label="Training score")
    learn_ax.plot(train_sizes, test_scores_mean, 'o-', color="g",
                  label="Cross-validation score")
    learn_ax.legend(loc="best")

    # ---- validation curve ----
    # Uses the same cv / n_jobs as the learning curve so both panels are
    # computed under the same resampling scheme.
    trainVC_scores, testVC_scores = validation_curve(
        estimator, X, y, param_name=param_name, param_range=param_range,
        scoring="accuracy", cv=cv, n_jobs=n_jobs)
    trainVC_scores_mean = np.mean(trainVC_scores, axis=1)
    trainVC_scores_std = np.std(trainVC_scores, axis=1)
    testVC_scores_mean = np.mean(testVC_scores, axis=1)
    testVC_scores_std = np.std(testVC_scores, axis=1)

    valid_ax.set_title("Validation Curve with SVM")
    valid_ax.set_xlabel(r"$\gamma$" if param_name == "gamma" else param_name)
    valid_ax.set_ylabel("Score")
    valid_ax.set_ylim(0.0, 1.1)
    lw = 2
    valid_ax.semilogx(param_range, trainVC_scores_mean, label="Training score",
                      color="darkorange", lw=lw)
    valid_ax.fill_between(param_range, trainVC_scores_mean - trainVC_scores_std,
                          trainVC_scores_mean + trainVC_scores_std, alpha=0.2,
                          color="darkorange", lw=lw)
    valid_ax.semilogx(param_range, testVC_scores_mean, label="Cross-validation score",
                      color="navy", lw=lw)
    valid_ax.fill_between(param_range, testVC_scores_mean - testVC_scores_std,
                          testVC_scores_mean + testVC_scores_std, alpha=0.2,
                          color="navy", lw=lw)
    valid_ax.legend(loc="best")

    return plt


In [None]:
# Two stacked panels: learning curve on top, validation curve below.
fig, axes = plt.subplots(2, 1, figsize=(10, 15))
title = r"Learning Curves (SVM, RBF kernel, scale)"
# SVC is more expensive so we do a lower number of CV iterations:
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
# probability=True is left out on purpose: both curves are scored on accuracy,
# which only needs predict(), while probability estimates add an internal
# cross-validation to every single fit.
estimator = SVC(C=100, kernel='rbf', gamma='scale')
plot_learning_curve(estimator, title, X, y, axes=axes,
                    cv=cv, n_jobs=4)

plt.show()

From the learning curve we can see that as the training set grows, the training accuracy decreases rapidly while the cross-validation score increases, until both reach a point where they start converging to a lower value. We can conclude that our dataset is really complex and that the model would not benefit much from adding more data.
From the validation curve: **TODO** — interpretation still to be written.

References:

- https://github.com/corymaklin/svm/blob/master/svm.ipynb
- https://scikit-learn.org/stable/auto_examples/datasets/plot_random_dataset.html
- https://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html
- https://stats.stackexchange.com/questions/437072/use-f1-score-in-gridsearchcv
- https://matplotlib.org/3.1.1/tutorials/toolkits/mplot3d.html
- https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
- https://docs.w3cub.com/scikit_learn/auto_examples/model_selection/plot_validation_curve/
- https://matplotlib.org/3.1.1/api/_as_gen/mpl_toolkits.mplot3d.axes3d.Axes3D.html
- https://pythonprogramming.net/matplotlib-3d-scatterplot-tutorial/
- https://stackoverflow.com/questions/55375515/change-marker-color-in-3d-scatter-plot-based-on-condition
- https://stackoverflow.com/questions/19451400/matplotlib-scatter-marker-size
- https://github.com/dataprofessor/code/blob/master/python/ROC_curve.ipynb


# Auto ML

In [None]:
# Map party names to the integer codes 1..3 for TPOT.
# NOTE(review): `y` is already encoded as 0/1/2 before the train/test split in
# the SVM section, so these replacements look like no-ops — confirm which
# encoding TPOT is meant to receive.
tpot_label_codes = {'Labour': 1, 'Conservative': 2, 'Liberal Democrat': 3}
y_train_num = y_train.replace(tpot_label_codes)
y_test_num = y_test.replace(tpot_label_codes)

In [None]:
# AutoML search configuration: 5 generations, 5-fold CV, fixed seed, progress output on.
tpot_settings = dict(generations=5, cv=5, random_state=42, verbosity=2)
pipeline_optimizer = TPOTClassifier(**tpot_settings)

In [None]:
# Run the TPOT pipeline search on the training split.
pipeline_optimizer.fit(X_train, y_train_num)

In [None]:
# Score of the best pipeline found by TPOT on the held-out test split.
print(pipeline_optimizer.score(X_test, y_test_num)) 

In [None]:
# TODO: empty scratch cell — remove before sharing the notebook.