# Classification for Prediction

<p>Here, we want to classify cheaters. Today's data set reports the results of an old survey of married people. Each row corresponds to one person, and the columns report:</p>
<ul>
<li><b>age, children</b>: age and number of children</li>
<li><b>religious</b>: the person's religiousness</li>
<li><b>educ</b>: the person's education level</li>
<li><b>occupation</b>: a code that identifies the person's occupation</li>
<li><b>rate_marriage</b>: how the person rates his or her marriage</li>
<li><b>yrs_married</b>: length of the marriage, in years</li>
<li><b>affairs</b>: time spent, in hours/week, in extra-marital affairs</li>
</ul>
<p>That is all we need to know about the columns' meaning.</p>

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
%pylab inline

import sklearn as sk

In [None]:
# Load the survey data; column 0 of the CSV is used as the row index.
df = pd.read_csv('affairs.csv',index_col=0)

In [None]:
# Display the loaded table for a first look at the data.
df

In [None]:
# Share of respondents whose `affairs` value is exactly zero.
df['affairs'].eq(0).mean()

In [None]:
# Binary target: 1.0 when any affair time is reported, 0.0 otherwise.
df['Cheater'] = df['affairs'].gt(0).astype(float)

In [None]:
# One-hot encode the categorical `occupation` code: the original column is
# replaced by one indicator column per distinct code.
df = pd.get_dummies(df,columns=['occupation'])

In [None]:
# Feature matrix: every column except the raw `affairs` value and the
# `Cheater` label that was derived from it.
X = df.drop(['affairs', 'Cheater'], axis=1)

In [None]:
# Target vector: the binary `Cheater` label.
Y = df['Cheater']

# Hold-out sample

Split the data into a 70% training set and a 30% test set.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed makes the partition
# reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=0
)

In [None]:
# Number of rows in the training features.
X_train.shape[0]

In [None]:
# Number of training labels; should match the training feature row count.
Y_train.shape[0]

In [None]:
# Number of rows in the test features.
X_test.shape[0]

In [None]:
# Number of test labels; should match the test feature row count.
Y_test.shape[0]


### Train on the training set

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyper-parameters; the seed fixes the model's
# internal randomness so reruns produce the same fitted forest.
cl = RandomForestClassifier(random_state=2)
cl.fit(X=X_train, y=Y_train)

### Predict on the test set

In [None]:
# Hard class predictions from the fitted classifier for the held-out rows.
Y_pred = cl.predict(X_test)

In [None]:
# Number of misclassified test rows: labels are 0.0/1.0, so every wrong
# prediction contributes an absolute difference of exactly 1.
(Y_test - Y_pred).abs().sum()

In [None]:
# Number of predictions produced for the test set.
Y_pred.shape[0]

### Collect scores

#### Confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Cross-tabulate true labels against predicted labels on the test set.
confusion_matrix(y_true=Y_test, y_pred=Y_pred)

#### Accuracy

In [None]:
# Accuracy by hand: one minus the mean 0/1 error over the test set.
1 - (Y_test - Y_pred).abs().mean()

In [None]:
import sklearn
# Load the submodule explicitly: a bare `import sklearn` is not guaranteed to
# make `sklearn.metrics` available as an attribute.
import sklearn.metrics

# Library accuracy; should agree with the manual computation in this section.
sklearn.metrics.accuracy_score(Y_test, Y_pred)

#### Precision

In [None]:
# Precision on the test set: of the rows predicted as cheaters, the share
# that truly are.
sklearn.metrics.precision_score(y_true=Y_test, y_pred=Y_pred)

#### Recall

In [None]:
# Recall on the test set: of the true cheaters, the share the model found.
sklearn.metrics.recall_score(y_true=Y_test, y_pred=Y_pred)

#### AUC score

In [None]:
# Predicted probabilities for the test rows; column 1 is the classifier's
# second class (presumably Cheater == 1.0 — confirm against cl.classes_).
cl.predict_proba(X_test)[:,1]

In [None]:
# Keep the second-class probabilities as continuous scores for the AUC.
Y_proba = cl.predict_proba(X_test)[:,1]

In [None]:
# Area under the ROC curve, computed from the probability scores rather than
# the hard 0/1 predictions.
sklearn.metrics.roc_auc_score(y_true=Y_test, y_score=Y_proba)

# Cross-validation

In [None]:
# Show the classifier that will be cross-validated below.
cl

In [None]:
from sklearn.model_selection import KFold

# Shuffled 10-fold splitter with a fixed seed, then one ROC AUC per fold.
kf = KFold(n_splits=10, shuffle=True, random_state=0)
sklearn.model_selection.cross_val_score(cl, X, Y, scoring='roc_auc', cv=kf)

In [None]:
# Average the per-fold ROC AUC values into a single cross-validated score.
np.mean(sklearn.model_selection.cross_val_score(cl, X, Y, scoring='roc_auc', cv=kf))

# Which classifier obtains the highest performance?

Here is a list of classifiers:

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
# Imported explicitly: `import sklearn as sk` alone does not guarantee that
# the `sk.linear_model` attribute exists.
from sklearn.linear_model import LogisticRegression

# Candidate models, one instance per algorithm, all with default
# hyper-parameters (n_jobs=-1 only asks for all CPU cores where supported).
# NOTE(review): BaggingClassifier and KNeighborsClassifier are imported above
# but are not part of the comparison — confirm whether they should be added.
clfs = [
    DecisionTreeClassifier(),
    RandomForestClassifier(n_jobs=-1),
    GaussianNB(),
    LogisticRegression(n_jobs=-1),
    AdaBoostClassifier(),
    QuadraticDiscriminantAnalysis(),
    MLPClassifier(),
    SVC(),
]

Let's find the best one in terms of average cross-validated AUC.

In [None]:
# Cross-validate every classifier in clfs and keep the one with the largest
# mean ROC AUC. Results: bestCL (the winning estimator, or None if clfs is
# empty) and maxAUC (its mean AUC).
maxAUC = -1
bestCL = None
# One splitter for all candidates: with shuffle=True and a fixed integer
# random_state every split() call yields the same folds, so rebuilding it per
# iteration is unnecessary and every model is scored on identical partitions.
kf = KFold(n_splits=10, random_state=2, shuffle=True)
# The loop variable is deliberately not named `cl`, so the random forest
# fitted in the hold-out section is not overwritten by this search.
for candidate in clfs:
    auc = sklearn.model_selection.cross_val_score(
        candidate, X, Y, cv=kf, scoring='roc_auc'
    ).mean()
    if auc > maxAUC:
        bestCL = candidate
        maxAUC = auc
print('***********************************************')
print(str(bestCL) + ': ' + str(maxAUC))

In [None]:
# Show the estimator that achieved the highest mean cross-validated AUC.
bestCL

In [None]:
# Show the best mean cross-validated AUC found by the search.
maxAUC