In [1]:
# sklearn

In [2]:
# imports

# linalg
import math
import numpy as np
import pandas as pd

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='white', color_codes=True)
%matplotlib inline

# pre-processing
from sklearn.pipeline import make_pipeline  
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split # train test split

# decomposition
from sklearn.decomposition import PCA

# supervised algorithms
from sklearn.linear_model import LinearRegression # linear regression
from sklearn.linear_model import RANSACRegressor # linear regression
from sklearn.linear_model import LogisticRegression #logistic regression
from sklearn.model_selection import GridSearchCV
from sklearn import svm #support vector Machine
from sklearn.ensemble import RandomForestClassifier #Random Forest
from sklearn.neighbors import KNeighborsClassifier #KNN
from sklearn.naive_bayes import GaussianNB #Naive bayes
from sklearn.tree import DecisionTreeClassifier #Decision Tree

# artificial/simulated data
from sklearn.datasets import make_blobs

# metrics
from sklearn import metrics #accuracy measure
from sklearn.metrics import confusion_matrix #for confusion matrix
from sklearn.metrics import classification_report

# cross validation
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

# misc
import sys,os
from IPython.display import Image, display

# to show images
def show_image(fn, width=500):
    """Display an image stored under ``./diagrams/`` inline in the notebook.

    Parameters
    ----------
    fn : str
        File name, joined onto the ``./diagrams/`` directory.
    width : int, optional
        Width passed to ``IPython.display.Image`` (default 500).
    """
    image_path = os.path.join('./diagrams/', fn)
    picture = Image(filename=image_path, width=width)
    display(picture)


In [3]:
# versions of the core libraries and of the running interpreter
print(
    f"np: {np.__version__}",
    f"pd: {pd.__version__}",
    f"sns: {sns.__version__}",
    f"py: {sys.version_info[0:3]}",  # (major, minor, micro); see sys.version for the full string
    sep="\n",
)


np: 1.17.3
pd: 0.25.2
sns: 0.9.0
py: (3, 7, 3)


In [4]:
# kaggle titanic example (pt1)
#   walkthrough followed: https://www.kaggle.com/aaysbt/titanic-datasets-eda-fe-dc-model-predictions
#   question being modelled: did the passenger survive?

# read both CSV files shipped in ./data
train, test = (
    pd.read_csv('./data/titanic_train.csv'),
    pd.read_csv('./data/titanic_test.csv'),
)

# -----------
# data preparation steps
# -----------

# fill in missing values (imputation)
# impute age
    # determine average age, based on class
    # fill in null values with associated averages
def impute_age(cols):
    """Return the age from a row, filling a missing age from the passenger class.

    Parameters
    ----------
    cols : iterable
        Row-like object whose first two values are the age and the passenger
        class, in that order (e.g. one row of ``train[['Age', 'Pclass']]`` as
        handed over by ``DataFrame.apply(..., axis=1)``). Extra trailing
        values are ignored.

    Returns
    -------
    The age unchanged when it is not null. Otherwise 37, 29 or 24 for class
    1, 2 or 3 respectively, and 30 for any other class value.
    """
    # Unpack by position rather than `cols[0]` / `cols[1]`: on a Series whose
    # index holds the labels 'Age'/'Pclass', integer keys rely on a positional
    # fallback that pandas 2.x deprecates (FutureWarning) in favour of treating
    # integer keys as labels. Iteration always yields the values in order and
    # also works for plain lists/tuples.
    age, pclass, *_ = cols
    if not pd.isnull(age):
        return age
    # hard-coded per-class replacement ages
    age_by_class = {1: 37, 2: 29, 3: 24}
    # this fallback value is arbitrary. do better.
    return age_by_class.get(pclass, 30)
# fill in 'age' row by row with the class-based imputer
train['Age'] = train[['Age', 'Pclass']].apply(impute_age, axis=1)
# remove 'cabin', then the remaining null rows (only a handful left)
train = train.drop(columns='Cabin').dropna()
# convert categories into dummy values
    # drop_first should be true
    # http://www.statsmodels.org/dev/contrasts.html
sex, embark = (
    pd.get_dummies(train[column], drop_first=True)
    for column in ('Sex', 'Embarked')
)
# append the dummy columns, then drop the categorical originals
# together with the other unusable columns
train = pd.concat([train, sex, embark], axis=1).drop(
    columns=['Sex', 'Embarked', 'Name', 'Ticket']
)

# split: target vs. features, then an 80/20 train/validation partition
y = train['Survived']
X = train.drop(columns='Survived')
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, random_state=101, test_size=0.2
)

In [5]:
# kaggle titanic example (pt2)

# -----------
# classification
# -----------

def fit_and_report(label, model, X_fit, y_fit, X_eval, y_eval):
    """Fit ``model``, predict on the evaluation split and print the accuracy.

    Parameters
    ----------
    label : str
        Name used as the prefix of the printed line.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    X_fit, y_fit
        Features and target used for fitting.
    X_eval, y_eval
        Features and target used for scoring.

    Returns
    -------
    tuple
        ``(predictions, accuracy)`` for the evaluation split.
    """
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)
    # accuracy_score expects (y_true, y_pred) in that order
    accuracy = metrics.accuracy_score(y_eval, predictions)
    print(f"{label} accuracy: {round(accuracy,2)}")
    return predictions, accuracy


# logistic regression
logr = LogisticRegression(solver='liblinear')

# linear SVM
    # https://medium.com/@ankitnitjsr13/math-behind-support-vector-machine-svm-5e7376d0ee4d
    # (no gamma here: it only applies to the 'rbf', 'poly' and 'sigmoid' kernels)
model_svm_l = svm.SVC(kernel='linear', C=0.1)

# radial SVM
model_svm_rbf = svm.SVC(kernel='rbf', C=0.1, gamma=0.1)

# decision tree (seeded so the reported accuracy is repeatable between runs)
tree = DecisionTreeClassifier(random_state=101)

# K nearest neighbors (KNN)
knn = KNeighborsClassifier(n_neighbors=4) # most accurate in her example

# gaussian naive bayes
gnb = GaussianNB()

# random forests (seeded so the reported accuracy is repeatable between runs)
rf = RandomForestClassifier(n_estimators=200, random_state=101) # most accurate in her example

# fit every model on the training split and score it on the validation split;
# after the loop `y_pred` / `acc` hold the values of the last model (random forest)
for label, model in [
    ("logistic regression", logr),
    ("linear SVM", model_svm_l),
    ("radial SVM", model_svm_rbf),
    ("decision tree", tree),
    ("knn", knn),
    ("gaussian naive bayes", gnb),
    ("random forest", rf),
]:
    y_pred, acc = fit_and_report(label, model, X_train, y_train, X_valid, y_valid)

logistic regression accuracy: 0.81
linear SVM accuracy: 0.79
radial SVM accuracy: 0.6
decision tree accuracy: 0.8
knn accuracy: 0.66
gaussian naive bayes accuracy: 0.8
random forest accuracy: 0.85


In [7]:
# kaggle titanic example (pt3)

#------------------
# cross validation
#------------------

# cross validation
    # https://towardsdatascience.com/cross-validation-70289113a072
    # assess how well a model will generalize to an independent data set

# 10 consecutive (unshuffled) folds. No random_state is passed: without
# shuffle=True a seed has no effect, and current scikit-learn raises a
# ValueError for that combination. Use KFold(n_splits=10, shuffle=True,
# random_state=...) instead if shuffled folds are wanted.
kfold = KFold(n_splits=10)
classifiers = [
    'Linear Svm', 'Radial Svm', 'Logistic Regression', 'KNN', 'Decision Tree',
    'Naive Bayes' , 'Random Forest'
]
# same order as `classifiers`; the tree-based models are seeded so their
# scores are repeatable between runs
models = [
    svm.SVC(kernel='linear', gamma='scale'), svm.SVC(kernel='rbf', gamma='scale'),
    LogisticRegression(solver='liblinear'), KNeighborsClassifier(n_neighbors=9),
    DecisionTreeClassifier(random_state=22), GaussianNB(),
    RandomForestClassifier(n_estimators=100, random_state=22)
]
# the two lists are matched by position, so they must stay the same length
assert len(classifiers) == len(models), "classifiers and models must line up"
# results
cv_mean = []  # mean accuracy over the folds, one entry per model
cv_std = []   # standard deviation of the fold accuracies
cv_acc = []   # raw per-fold accuracy arrays
# cv
for model in models:
    cv_result = cross_val_score(model, X, y, cv=kfold, scoring='accuracy')
    cv_mean.append(cv_result.mean())
    cv_std.append(cv_result.std())
    cv_acc.append(cv_result)
    print(f"mean: {round(cv_result.mean(),2)}")

# summary table, rounded to two decimals (DataFrame.round replaces the
# element-wise applymap, which newer pandas deprecates)
df_results = pd.DataFrame({'CV Mean': cv_mean, 'Std': cv_std}, index=classifiers)
df_results = df_results.round(2)
print(df_results)

mean: 0.79
mean: 0.64
mean: 0.8
mean: 0.64
mean: 0.72
mean: 0.78
mean: 0.82
                     CV Mean   Std
Linear Svm              0.79  0.04
Radial Svm              0.64  0.07
Logistic Regression     0.80  0.03
KNN                     0.64  0.07
Decision Tree           0.72  0.09
Naive Bayes             0.78  0.02
Random Forest           0.82  0.04


In [None]:
# kaggle titanic example (pt4)

#------------------
# confusion matrix
#------------------

# HEATMAP IS BROKEN FOR MATPLOTLIB 3.1.1

# (panel title, estimator) pairs, listed in the row-major order in which they
# fill the 3x3 grid; the tree-based models are seeded so the panels are
# repeatable between runs
panels = [
    ('Linear SVM', svm.SVC(kernel='linear', gamma='scale')),
    ('Radial SVM', svm.SVC(kernel='rbf', gamma='scale')),
    ('KNN', KNeighborsClassifier(n_neighbors=9)),
    ('Logistic Regression', LogisticRegression(solver='liblinear')),
    ('Random Forest', RandomForestClassifier(n_estimators=100, random_state=22)),
    ('Decision Tree', DecisionTreeClassifier(random_state=22)),
    ('Naive Bayes', GaussianNB()),
]

f, ax = plt.subplots(3,3, figsize=(12,10))
all_axes = ax.ravel()
for axis, (title, estimator) in zip(all_axes, panels):
    # out-of-fold predictions from 10-fold cross validation
    y_pred = cross_val_predict(estimator, X, y, cv=10)
    cm = confusion_matrix(y, y_pred)
    sns.heatmap(cm, ax=axis, annot=True, fmt='2.0f')
    # NOTE(review): re-assert the full (inverted) y-range of the heatmap;
    # presumably this works around the clipped first/last rows seen with
    # matplotlib 3.1.1 - confirm on that version. Redundant elsewhere.
    axis.set_ylim(cm.shape[0], 0)
    axis.set_title(title)
    # confusion_matrix puts the true labels on rows, predictions on columns
    axis.set_xlabel('Predicted')
    axis.set_ylabel('Actual')

# the grid has more cells than models: hide the unused axes
for axis in all_axes[len(panels):]:
    axis.set_visible(False)

plt.subplots_adjust(hspace=0.5, wspace=0.5)
plt.show()