In [42]:
import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis,QuadraticDiscriminantAnalysis
from sklearn.svm import LinearSVC
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [43]:
# Load the pre-processed Titanic table from the project-relative data folder.
titanic_df = pd.read_csv(filepath_or_buffer='data/titanic_processed.csv')

# Preview the first five rows as the cell's rich output.
titanic_df.head(5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S
0,0,3,1,15.0,1,1,7.2292,0,0
1,0,3,1,22.0,0,0,7.25,0,1
2,0,2,1,18.0,0,0,13.0,0,1
3,1,2,0,18.0,0,1,23.0,0,1
4,0,3,1,20.0,0,0,7.8542,0,1


In [44]:
# Every column after the first one is used as a predictor
# (the first column is skipped by position, not by name).
features = titanic_df.columns[1:].tolist()
features

['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_Q', 'Embarked_S']

In [45]:
# Registry of evaluation results, keyed by a human-readable model label;
# filled in by the cells below and read by compare_results().
results_dict = dict()

In [46]:
def summarize_classification(y_test, y_pred):
    """Score a set of predictions against the true labels.

    Parameters
    ----------
    y_test : true labels.
    y_pred : predicted labels, aligned with ``y_test``.

    Returns
    -------
    dict with the keys ``'accuracy'`` (fraction correct), ``'precision'``,
    ``'recall'`` and ``'accurcy_count'`` (number of correct predictions).
    The last key is misspelled (sic); the spelling is kept because the
    key is part of the returned structure.
    """
    # normalize=True yields the fraction of correct predictions,
    # normalize=False the raw count of correct predictions.
    fraction_correct = accuracy_score(y_test, y_pred, normalize=True)
    n_correct = accuracy_score(y_test, y_pred, normalize=False)

    summary = {}
    summary['accuracy'] = fraction_correct
    summary['precision'] = precision_score(y_test, y_pred)
    summary['recall'] = recall_score(y_test, y_pred)
    summary['accurcy_count'] = n_correct
    return summary

In [47]:
def build_model(classifier_fn, name_of_y_col, names_of_x_cols, dataset, test_frac = 0.2,
                random_state = None):
    """Split ``dataset``, train a classifier and evaluate it on both splits.

    Parameters
    ----------
    classifier_fn : callable
        Called as ``classifier_fn(x_train, y_train)``; must return a fitted
        model exposing ``predict``.
    name_of_y_col : column label of the target in ``dataset``.
    names_of_x_cols : column labels of the predictors in ``dataset``.
    dataset : table indexable by column label (a pandas DataFrame in this
        notebook).
    test_frac : float, default 0.2
        Passed to ``train_test_split`` as ``test_size``.
    random_state : int or None, default None
        Passed to ``train_test_split``. With the default ``None`` the split
        differs on every call (the original behaviour); pass a fixed integer
        to make the split reproducible and to evaluate different classifiers
        on the same held-out rows.

    Returns
    -------
    dict with keys ``'training'`` and ``'test'`` (each the dict produced by
    ``summarize_classification``) and ``'confusion_matrix'`` (a
    ``pd.crosstab`` of true vs. predicted test labels).
    """
    x = dataset[names_of_x_cols]
    y = dataset[name_of_y_col]

    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size = test_frac, random_state = random_state
    )

    model = classifier_fn(x_train, y_train)

    # Predict on both splits so training and test scores can be compared
    # side by side (a large gap hints at over-fitting).
    y_pred = model.predict(x_test)
    y_pred_train = model.predict(x_train)

    train_summary = summarize_classification(y_train, y_pred_train)
    test_summary = summarize_classification(y_test, y_pred)

    pred_results = pd.DataFrame({'y_test': y_test,
                                 'y_pred': y_pred,
                                })

    # Confusion matrix: rows are the true labels, columns the predicted labels.
    model_crosstab = pd.crosstab(pred_results.y_test, pred_results.y_pred)

    return {'training' : train_summary,
            'test': test_summary,
            'confusion_matrix' : model_crosstab
           }

In [48]:
def compare_results():
    """Print the training and test scores of every entry in ``results_dict``.

    For each stored label this prints a one-entry dict naming the model,
    then a 'Training Data' section and a 'Test Data' section, each listing
    one ``<score name> <value>`` pair per line.
    """
    sections = (('Training Data', 'training'), ('Test Data', 'test'))

    for label, outcome in results_dict.items():
        print({'classification ' : label})

        for heading, split in sections:
            print()
            print(heading)
            for metric, value in outcome[split].items():
                print(metric, value)

        print()

In [49]:
def logistic_fn(x_train, y_train):
    """Fit a logistic regression (``liblinear`` solver) and return it."""
    # fit() hands back the estimator itself, so build-and-fit can be chained.
    return LogisticRegression(solver = 'liblinear').fit(x_train, y_train)

In [50]:
# Train and evaluate the logistic-regression classifier, then store the
# result under its label.
results_dict['survived - logistic'] = build_model(
    classifier_fn = logistic_fn,
    name_of_y_col = 'Survived',
    names_of_x_cols = features,
    dataset = titanic_df,
)

# Print the scores of every model evaluated so far.
compare_results()

{'classification ': 'survived - logistic'}

Training Data
accuracy 0.7943760984182777
precision 0.7772277227722773
recall 0.6855895196506551
accurcy_count 452

Test Data
accuracy 0.7902097902097902
precision 0.7543859649122807
recall 0.7288135593220338
accurcy_count 113



In [51]:
def linear_discriminant_fn(x_train, y_train, solver ='svd'):
    """Fit a linear discriminant analysis classifier and return it.

    ``solver`` is forwarded unchanged to ``LinearDiscriminantAnalysis``.
    """
    lda = LinearDiscriminantAnalysis(solver = solver)
    # fit() returns the fitted estimator itself.
    return lda.fit(x_train, y_train)

In [52]:
# Train and evaluate linear discriminant analysis, then store the result
# under its label.
results_dict['survived - Linear_Discriminat_Analysis'] = build_model(
    classifier_fn = linear_discriminant_fn,
    name_of_y_col = 'Survived',
    names_of_x_cols = features,
    dataset = titanic_df,
)

# Print the scores of every model evaluated so far.
compare_results()

{'classification ': 'survived - logistic'}

Training Data
accuracy 0.7943760984182777
precision 0.7772277227722773
recall 0.6855895196506551
accurcy_count 452

Test Data
accuracy 0.7902097902097902
precision 0.7543859649122807
recall 0.7288135593220338
accurcy_count 113

{'classification ': 'survived - Linear_Discriminat_Analysis'}

Training Data
accuracy 0.789103690685413
precision 0.7666666666666667
recall 0.6939655172413793
accurcy_count 449

Test Data
accuracy 0.8181818181818182
precision 0.7777777777777778
recall 0.75
accurcy_count 117



In [53]:
def quadratic_discriminant_fn(x_train, y_train):
    """Fit a quadratic discriminant analysis classifier (default settings) and return it."""
    # fit() hands back the estimator itself, so build-and-fit can be chained.
    return QuadraticDiscriminantAnalysis().fit(x_train, y_train)

In [54]:
# Train and evaluate quadratic discriminant analysis, then store the result
# under its label.
results_dict['survived - Quadratic_discriminat_analysis'] = build_model(
    classifier_fn = quadratic_discriminant_fn,
    name_of_y_col = 'Survived',
    names_of_x_cols = features,
    dataset = titanic_df,
)

# Print the scores of every model evaluated so far.
compare_results()

{'classification ': 'survived - logistic'}

Training Data
accuracy 0.7943760984182777
precision 0.7772277227722773
recall 0.6855895196506551
accurcy_count 452

Test Data
accuracy 0.7902097902097902
precision 0.7543859649122807
recall 0.7288135593220338
accurcy_count 113

{'classification ': 'survived - Linear_Discriminat_Analysis'}

Training Data
accuracy 0.789103690685413
precision 0.7666666666666667
recall 0.6939655172413793
accurcy_count 449

Test Data
accuracy 0.8181818181818182
precision 0.7777777777777778
recall 0.75
accurcy_count 117

{'classification ': 'survived - Quadratic_discriminat_analysis'}

Training Data
accuracy 0.8014059753954306
precision 0.7799043062200957
recall 0.7086956521739131
accurcy_count 456

Test Data
accuracy 0.7972027972027972
precision 0.7457627118644068
recall 0.7586206896551724
accurcy_count 114



In [55]:
def sgd_fn(x_train, y_train, max_iter = 1000, tol = 1e-3):
    """Fit a linear classifier trained with stochastic gradient descent and return it.

    ``max_iter`` and ``tol`` are forwarded unchanged to ``SGDClassifier``.
    """
    sgd_options = {'max_iter': max_iter, 'tol': tol}
    # fit() returns the fitted estimator itself.
    return SGDClassifier(**sgd_options).fit(x_train, y_train)

In [56]:
# Train and evaluate the SGD-trained linear classifier, then store the result
# under its label.
results_dict['survived - Stochastic_Gradient_Descent'] = build_model(
    classifier_fn = sgd_fn,
    name_of_y_col = 'Survived',
    names_of_x_cols = features,
    dataset = titanic_df,
)

# Print the scores of every model evaluated so far.
compare_results()

{'classification ': 'survived - logistic'}

Training Data
accuracy 0.7943760984182777
precision 0.7772277227722773
recall 0.6855895196506551
accurcy_count 452

Test Data
accuracy 0.7902097902097902
precision 0.7543859649122807
recall 0.7288135593220338
accurcy_count 113

{'classification ': 'survived - Linear_Discriminat_Analysis'}

Training Data
accuracy 0.789103690685413
precision 0.7666666666666667
recall 0.6939655172413793
accurcy_count 449

Test Data
accuracy 0.8181818181818182
precision 0.7777777777777778
recall 0.75
accurcy_count 117

{'classification ': 'survived - Quadratic_discriminat_analysis'}

Training Data
accuracy 0.8014059753954306
precision 0.7799043062200957
recall 0.7086956521739131
accurcy_count 456

Test Data
accuracy 0.7972027972027972
precision 0.7457627118644068
recall 0.7586206896551724
accurcy_count 114

{'classification ': 'survived - Stochastic_Gradient_Descent'}

Training Data
accuracy 0.7750439367311072
precision 0.7727272727272727
recall 0.648305084745762

In [57]:
def linear_svc_fn(x_train, y_train, C = 1, max_iter = 1000, tol = 1e-3):
    """Fit a linear support vector classifier and return it.

    ``C``, ``max_iter`` and ``tol`` are forwarded unchanged to ``LinearSVC``;
    ``dual`` is always set to ``False``.
    """
    svc = LinearSVC(
        C = C,
        max_iter = max_iter,
        tol = tol,
        dual = False,
    )
    # fit() returns the fitted estimator itself.
    return svc.fit(x_train, y_train)

In [59]:
# Train and evaluate the linear support vector classifier, then store the
# result under its label.
results_dict['survived - Linear_SVC'] = build_model(
    classifier_fn = linear_svc_fn,
    name_of_y_col = 'Survived',
    names_of_x_cols = features,
    dataset = titanic_df,
)

# Print the scores of every model evaluated so far.
compare_results()

{'classification ': 'survived - logistic'}

Training Data
accuracy 0.7943760984182777
precision 0.7772277227722773
recall 0.6855895196506551
accurcy_count 452

Test Data
accuracy 0.7902097902097902
precision 0.7543859649122807
recall 0.7288135593220338
accurcy_count 113

{'classification ': 'survived - Linear_Discriminat_Analysis'}

Training Data
accuracy 0.789103690685413
precision 0.7666666666666667
recall 0.6939655172413793
accurcy_count 449

Test Data
accuracy 0.8181818181818182
precision 0.7777777777777778
recall 0.75
accurcy_count 117

{'classification ': 'survived - Quadratic_discriminat_analysis'}

Training Data
accuracy 0.8014059753954306
precision 0.7799043062200957
recall 0.7086956521739131
accurcy_count 456

Test Data
accuracy 0.7972027972027972
precision 0.7457627118644068
recall 0.7586206896551724
accurcy_count 114

{'classification ': 'survived - Stochastic_Gradient_Descent'}

Training Data
accuracy 0.7750439367311072
precision 0.7727272727272727
recall 0.648305084745762

In [60]:
def radius_neighbor_fn(x_train, y_train, radius = 40.0):
    """Fit a radius-based nearest-neighbours classifier and return it.

    ``radius`` is forwarded unchanged to ``RadiusNeighborsClassifier``.
    """
    # fit() hands back the estimator itself, so build-and-fit can be chained.
    return RadiusNeighborsClassifier(radius = radius).fit(x_train, y_train)

In [61]:
# Train and evaluate the radius-neighbours classifier, then store the result
# under its label.
results_dict['survived - Radius_Neighbor'] = build_model(
    classifier_fn = radius_neighbor_fn,
    name_of_y_col = 'Survived',
    names_of_x_cols = features,
    dataset = titanic_df,
)

# Print the scores of every model evaluated so far.
compare_results()

{'classification ': 'survived - logistic'}

Training Data
accuracy 0.7943760984182777
precision 0.7772277227722773
recall 0.6855895196506551
accurcy_count 452

Test Data
accuracy 0.7902097902097902
precision 0.7543859649122807
recall 0.7288135593220338
accurcy_count 113

{'classification ': 'survived - Linear_Discriminat_Analysis'}

Training Data
accuracy 0.789103690685413
precision 0.7666666666666667
recall 0.6939655172413793
accurcy_count 449

Test Data
accuracy 0.8181818181818182
precision 0.7777777777777778
recall 0.75
accurcy_count 117

{'classification ': 'survived - Quadratic_discriminat_analysis'}

Training Data
accuracy 0.8014059753954306
precision 0.7799043062200957
recall 0.7086956521739131
accurcy_count 456

Test Data
accuracy 0.7972027972027972
precision 0.7457627118644068
recall 0.7586206896551724
accurcy_count 114

{'classification ': 'survived - Stochastic_Gradient_Descent'}

Training Data
accuracy 0.7750439367311072
precision 0.7727272727272727
recall 0.648305084745762

In [62]:
def decision_tree_fn(x_train, y_train, max_depth = None, max_features = None):
    """Fit a decision tree classifier and return it.

    ``max_depth`` and ``max_features`` are forwarded unchanged to
    ``DecisionTreeClassifier``.
    """
    tree_options = {'max_depth': max_depth, 'max_features': max_features}
    # fit() returns the fitted estimator itself.
    return DecisionTreeClassifier(**tree_options).fit(x_train, y_train)

In [63]:
# Train and evaluate the decision tree classifier, then store the result
# under its label.
results_dict['survived - Decision_Tree_Classifier'] = build_model(
    classifier_fn = decision_tree_fn,
    name_of_y_col = 'Survived',
    names_of_x_cols = features,
    dataset = titanic_df,
)

# Print the scores of every model evaluated so far.
compare_results()

{'classification ': 'survived - logistic'}

Training Data
accuracy 0.7943760984182777
precision 0.7772277227722773
recall 0.6855895196506551
accurcy_count 452

Test Data
accuracy 0.7902097902097902
precision 0.7543859649122807
recall 0.7288135593220338
accurcy_count 113

{'classification ': 'survived - Linear_Discriminat_Analysis'}

Training Data
accuracy 0.789103690685413
precision 0.7666666666666667
recall 0.6939655172413793
accurcy_count 449

Test Data
accuracy 0.8181818181818182
precision 0.7777777777777778
recall 0.75
accurcy_count 117

{'classification ': 'survived - Quadratic_discriminat_analysis'}

Training Data
accuracy 0.8014059753954306
precision 0.7799043062200957
recall 0.7086956521739131
accurcy_count 456

Test Data
accuracy 0.7972027972027972
precision 0.7457627118644068
recall 0.7586206896551724
accurcy_count 114

{'classification ': 'survived - Stochastic_Gradient_Descent'}

Training Data
accuracy 0.7750439367311072
precision 0.7727272727272727
recall 0.648305084745762

In [64]:
def naive_bayes_fn(x_train, y_train,priors = None):
    """Fit a Gaussian naive Bayes classifier and return it.

    ``priors`` is forwarded unchanged to ``GaussianNB``.
    """
    # fit() hands back the estimator itself, so build-and-fit can be chained.
    return GaussianNB(priors = priors).fit(x_train, y_train)

In [65]:
# Train and evaluate the Gaussian naive Bayes classifier, then store the
# result under its label.
results_dict['survived - Naive_Bayes'] = build_model(
    classifier_fn = naive_bayes_fn,
    name_of_y_col = 'Survived',
    names_of_x_cols = features,
    dataset = titanic_df,
)

# Print the scores of every model evaluated so far.
compare_results()

{'classification ': 'survived - logistic'}

Training Data
accuracy 0.7943760984182777
precision 0.7772277227722773
recall 0.6855895196506551
accurcy_count 452

Test Data
accuracy 0.7902097902097902
precision 0.7543859649122807
recall 0.7288135593220338
accurcy_count 113

{'classification ': 'survived - Linear_Discriminat_Analysis'}

Training Data
accuracy 0.789103690685413
precision 0.7666666666666667
recall 0.6939655172413793
accurcy_count 449

Test Data
accuracy 0.8181818181818182
precision 0.7777777777777778
recall 0.75
accurcy_count 117

{'classification ': 'survived - Quadratic_discriminat_analysis'}

Training Data
accuracy 0.8014059753954306
precision 0.7799043062200957
recall 0.7086956521739131
accurcy_count 456

Test Data
accuracy 0.7972027972027972
precision 0.7457627118644068
recall 0.7586206896551724
accurcy_count 114

{'classification ': 'survived - Stochastic_Gradient_Descent'}

Training Data
accuracy 0.7750439367311072
precision 0.7727272727272727
recall 0.648305084745762