In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [2]:
titanic_df = pd.read_csv('datasets/titanic_processed.csv')

titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SiblingSpouse,ParentChild,Fare,Embarked_C,Embarked_Q,Embarked_S
0,1,3,0,27.0,0,0,7.925,0,0,1
1,1,3,1,0.42,0,1,8.5167,1,0,0
2,0,3,0,22.0,0,0,9.8375,0,0,1
3,1,3,0,5.0,4,2,31.3875,0,0,1
4,0,3,1,28.0,0,0,9.5,0,0,1


In [5]:
titanic_df.columns

Index(['Survived', 'Pclass', 'Sex', 'Age', 'SiblingSpouse', 'ParentChild',
       'Fare', 'Embarked_C', 'Embarked_Q', 'Embarked_S'],
      dtype='object')

In [3]:
FEATURES = list(titanic_df.columns[1:])

FEATURES

['Pclass',
 'Sex',
 'Age',
 'SiblingSpouse',
 'ParentChild',
 'Fare',
 'Embarked_C',
 'Embarked_Q',
 'Embarked_S']

In [6]:
result_dict = {}

In [30]:
def summarize_classification(y_test, y_pred):
    
    acc = accuracy_score(y_test, y_pred, normalize=True)
    num_acc = accuracy_score(y_test, y_pred, normalize=False)

    prec = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    
    return {'accuracy': acc, 
            'precision': prec,
            'recall':recall, 
            'accuracy_count':num_acc}

In [31]:
def build_model(classifier_fn,                
                name_of_y_col, 
                names_of_x_cols, 
                dataset, 
                test_frac=0.2):
    
    X = dataset[names_of_x_cols]
    Y = dataset[name_of_y_col]

    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=test_frac)
       
    model = classifier_fn(x_train, y_train)
    
    y_pred = model.predict(x_test)

    y_pred_train = model.predict(x_train)
    
    train_summary = summarize_classification(y_train, y_pred_train)
    test_summary = summarize_classification(y_test, y_pred)
    
    pred_results = pd.DataFrame({'y_test': y_test,
                                 'y_pred': y_pred})
    
    model_crosstab = pd.crosstab(pred_results.y_pred, pred_results.y_test)
    
    return {'training': train_summary, 
            'test': test_summary,
            'confusion_matrix': model_crosstab}

In [32]:
def compare_results():
    for key in result_dict:
        print('Classification: ', key)

        print()
        print('Training data')
        for score in result_dict[key]['training']:
            print(score, result_dict[key]['training'][score])

        print()
        print('Test data')
        for score in result_dict[key]['test']:
            print(score, result_dict[key]['test'][score])
       
        print()

In [33]:
def logistic_fn(x_train, y_train):
    
    model = LogisticRegression(solver='liblinear')
    model.fit(x_train, y_train)
    
    return model

In [34]:
result_dict['survived ~ logistic'] = build_model(logistic_fn,
                                              'Survived',
                                               FEATURES,
                                               titanic_df)

compare_results()

Classification:  survived ~ logistic

Training data
accuracy 0.8031634446397188
precision 0.7832512315270936
recall 0.7004405286343612
accuracy_count 457

Test data
accuracy 0.7692307692307693
precision 0.7692307692307693
recall 0.6557377049180327
accuracy_count 110

Classification:  survived ~ linear_discriminant_analysis

Training data
accuracy 0.7978910369068541
precision 0.7695852534562212
recall 0.7198275862068966
accuracy_count 454

Test data
accuracy 0.7762237762237763
precision 0.7307692307692307
recall 0.6785714285714286
accuracy_count 111

Classification:  survived ~ quadratic_discriminant_analysis

Training data
accuracy 0.7961335676625659
precision 0.7587719298245614
recall 0.7393162393162394
accuracy_count 453

Test data
accuracy 0.7552447552447552
precision 0.6666666666666666
recall 0.7037037037037037
accuracy_count 108

Classification:  survived ~ sgd

Training data
accuracy 0.4200351493848858
precision 0.40942028985507245
recall 0.9826086956521739
accuracy_count 239

Te

In [14]:
def linear_discriminant_fn(x_train, y_train, solver='svd'):
    
    model = LinearDiscriminantAnalysis(solver=solver)
    model.fit(x_train, y_train)
    
    return model

In [15]:
result_dict['survived ~ linear_discriminant_analysis'] = build_model(linear_discriminant_fn,
                                                                 'Survived',
                                                                  FEATURES,
                                                                  titanic_df)
compare_results()

Classification:  survived ~ logistic

Training data
accuracy 0.7996485061511424
precision 0.7836538461538461
recall 0.7025862068965517
accuracy_count 455

Test data
accuracy 0.8041958041958042
precision 0.7916666666666666
recall 0.6785714285714286
accuracy_count 115

Classification:  survived ~ linear_discriminant_analysis

Training data
accuracy 0.7961335676625659
precision 0.783410138248848
recall 0.7112970711297071
accuracy_count 453

Test data
accuracy 0.7552447552447552
precision 0.6346153846153846
recall 0.673469387755102
accuracy_count 108



In [16]:
result_dict['survived ~ linear_discriminant_analysis'] = build_model(linear_discriminant_fn,
                                                                     'Survived',
                                                                      FEATURES[0:-1],
                                                                      titanic_df)
compare_results()

Classification:  survived ~ logistic

Training data
accuracy 0.7996485061511424
precision 0.7836538461538461
recall 0.7025862068965517
accuracy_count 455

Test data
accuracy 0.8041958041958042
precision 0.7916666666666666
recall 0.6785714285714286
accuracy_count 115

Classification:  survived ~ linear_discriminant_analysis

Training data
accuracy 0.7978910369068541
precision 0.7695852534562212
recall 0.7198275862068966
accuracy_count 454

Test data
accuracy 0.7762237762237763
precision 0.7307692307692307
recall 0.6785714285714286
accuracy_count 111



In [17]:
def quadratic_discriminant_fn(x_train, y_train):
    
    model = QuadraticDiscriminantAnalysis()
    model.fit(x_train, y_train)
    
    return model

In [18]:
result_dict['survived ~ quadratic_discriminant_analysis'] = build_model(quadratic_discriminant_fn,
                                                                        'Survived',
                                                                        FEATURES[0:-1],
                                                                        titanic_df)

compare_results()

Classification:  survived ~ logistic

Training data
accuracy 0.7996485061511424
precision 0.7836538461538461
recall 0.7025862068965517
accuracy_count 455

Test data
accuracy 0.8041958041958042
precision 0.7916666666666666
recall 0.6785714285714286
accuracy_count 115

Classification:  survived ~ linear_discriminant_analysis

Training data
accuracy 0.7978910369068541
precision 0.7695852534562212
recall 0.7198275862068966
accuracy_count 454

Test data
accuracy 0.7762237762237763
precision 0.7307692307692307
recall 0.6785714285714286
accuracy_count 111

Classification:  survived ~ quadratic_discriminant_analysis

Training data
accuracy 0.7961335676625659
precision 0.7587719298245614
recall 0.7393162393162394
accuracy_count 453

Test data
accuracy 0.7552447552447552
precision 0.6666666666666666
recall 0.7037037037037037
accuracy_count 108



In [19]:
def sgd_fn(x_train, y_train, max_iter=1000, tol=1e-3):
    
    model = SGDClassifier(max_iter=max_iter, tol=tol)
    model.fit(x_train, y_train)
     
    return model

In [20]:
result_dict['survived ~ sgd'] = build_model(sgd_fn,
                                           'Survived',
                                            FEATURES,
                                            titanic_df)

compare_results()

Classification:  survived ~ logistic

Training data
accuracy 0.7996485061511424
precision 0.7836538461538461
recall 0.7025862068965517
accuracy_count 455

Test data
accuracy 0.8041958041958042
precision 0.7916666666666666
recall 0.6785714285714286
accuracy_count 115

Classification:  survived ~ linear_discriminant_analysis

Training data
accuracy 0.7978910369068541
precision 0.7695852534562212
recall 0.7198275862068966
accuracy_count 454

Test data
accuracy 0.7762237762237763
precision 0.7307692307692307
recall 0.6785714285714286
accuracy_count 111

Classification:  survived ~ quadratic_discriminant_analysis

Training data
accuracy 0.7961335676625659
precision 0.7587719298245614
recall 0.7393162393162394
accuracy_count 453

Test data
accuracy 0.7552447552447552
precision 0.6666666666666666
recall 0.7037037037037037
accuracy_count 108

Classification:  survived ~ sgd

Training data
accuracy 0.4200351493848858
precision 0.40942028985507245
recall 0.9826086956521739
accuracy_count 239

Te

### LinearSVC

https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html

* SVC with a linear kernel
* dual=False when number of samples > number of features

In [21]:
def linear_svc_fn(x_train, y_train, C=1.0, max_iter=1000, tol=1e-3):
    
    model = LinearSVC(C=C, max_iter=max_iter, tol=tol, dual=False)
    model.fit(x_train, y_train) 
    
    return model

In [22]:
result_dict['survived ~ linear_svc'] = build_model(linear_svc_fn,
                                                  'Survived',
                                                   FEATURES,
                                                   titanic_df)

compare_results()

Classification:  survived ~ logistic

Training data
accuracy 0.7996485061511424
precision 0.7836538461538461
recall 0.7025862068965517
accuracy_count 455

Test data
accuracy 0.8041958041958042
precision 0.7916666666666666
recall 0.6785714285714286
accuracy_count 115

Classification:  survived ~ linear_discriminant_analysis

Training data
accuracy 0.7978910369068541
precision 0.7695852534562212
recall 0.7198275862068966
accuracy_count 454

Test data
accuracy 0.7762237762237763
precision 0.7307692307692307
recall 0.6785714285714286
accuracy_count 111

Classification:  survived ~ quadratic_discriminant_analysis

Training data
accuracy 0.7961335676625659
precision 0.7587719298245614
recall 0.7393162393162394
accuracy_count 453

Test data
accuracy 0.7552447552447552
precision 0.6666666666666666
recall 0.7037037037037037
accuracy_count 108

Classification:  survived ~ sgd

Training data
accuracy 0.4200351493848858
precision 0.40942028985507245
recall 0.9826086956521739
accuracy_count 239

Te

In [23]:
def radius_neighbor_fn(x_train, y_train, radius=40.0):

    model = RadiusNeighborsClassifier(radius=radius)
    model.fit(x_train, y_train) 
    
    return model

In [24]:
result_dict['survived ~ radius_neighbors'] = build_model(radius_neighbor_fn,
                                                         'Survived',
                                                         FEATURES,
                                                         titanic_df)
compare_results()

Classification:  survived ~ logistic

Training data
accuracy 0.7996485061511424
precision 0.7836538461538461
recall 0.7025862068965517
accuracy_count 455

Test data
accuracy 0.8041958041958042
precision 0.7916666666666666
recall 0.6785714285714286
accuracy_count 115

Classification:  survived ~ linear_discriminant_analysis

Training data
accuracy 0.7978910369068541
precision 0.7695852534562212
recall 0.7198275862068966
accuracy_count 454

Test data
accuracy 0.7762237762237763
precision 0.7307692307692307
recall 0.6785714285714286
accuracy_count 111

Classification:  survived ~ quadratic_discriminant_analysis

Training data
accuracy 0.7961335676625659
precision 0.7587719298245614
recall 0.7393162393162394
accuracy_count 453

Test data
accuracy 0.7552447552447552
precision 0.6666666666666666
recall 0.7037037037037037
accuracy_count 108

Classification:  survived ~ sgd

Training data
accuracy 0.4200351493848858
precision 0.40942028985507245
recall 0.9826086956521739
accuracy_count 239

Te

max_depth = None [ If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples ]

max_features = None [None -- max_features=n_features, 
                     auto -- then max_features=sqrt(n_features), 
                     sqrt -- then max_features=sqrt(n_features), 
                     log2 -- then max_features=log2(n_features)]

In [25]:
def decision_tree_fn(x_train, y_train, max_depth=None, max_features=None): 
    
    model = DecisionTreeClassifier(max_depth=max_depth, max_features=max_features)
    model.fit(x_train, y_train)
    
    return model

In [26]:
result_dict['survived ~ decision_tree'] = build_model(decision_tree_fn,
                                                 'Survived',
                                                  FEATURES,
                                                  titanic_df)

compare_results()

Classification:  survived ~ logistic

Training data
accuracy 0.7996485061511424
precision 0.7836538461538461
recall 0.7025862068965517
accuracy_count 455

Test data
accuracy 0.8041958041958042
precision 0.7916666666666666
recall 0.6785714285714286
accuracy_count 115

Classification:  survived ~ linear_discriminant_analysis

Training data
accuracy 0.7978910369068541
precision 0.7695852534562212
recall 0.7198275862068966
accuracy_count 454

Test data
accuracy 0.7762237762237763
precision 0.7307692307692307
recall 0.6785714285714286
accuracy_count 111

Classification:  survived ~ quadratic_discriminant_analysis

Training data
accuracy 0.7961335676625659
precision 0.7587719298245614
recall 0.7393162393162394
accuracy_count 453

Test data
accuracy 0.7552447552447552
precision 0.6666666666666666
recall 0.7037037037037037
accuracy_count 108

Classification:  survived ~ sgd

Training data
accuracy 0.4200351493848858
precision 0.40942028985507245
recall 0.9826086956521739
accuracy_count 239

Te

In [27]:
def naive_bayes_fn(x_train,y_train, priors=None):
    
    model = GaussianNB(priors=priors)
    model.fit(x_train, y_train)
    
    return model

In [28]:
result_dict['survived ~ naive_bayes'] = build_model(naive_bayes_fn,
                                                    'Survived',
                                                    FEATURES,
                                                    titanic_df)

compare_results()

Classification:  survived ~ logistic

Training data
accuracy 0.7996485061511424
precision 0.7836538461538461
recall 0.7025862068965517
accuracy_count 455

Test data
accuracy 0.8041958041958042
precision 0.7916666666666666
recall 0.6785714285714286
accuracy_count 115

Classification:  survived ~ linear_discriminant_analysis

Training data
accuracy 0.7978910369068541
precision 0.7695852534562212
recall 0.7198275862068966
accuracy_count 454

Test data
accuracy 0.7762237762237763
precision 0.7307692307692307
recall 0.6785714285714286
accuracy_count 111

Classification:  survived ~ quadratic_discriminant_analysis

Training data
accuracy 0.7961335676625659
precision 0.7587719298245614
recall 0.7393162393162394
accuracy_count 453

Test data
accuracy 0.7552447552447552
precision 0.6666666666666666
recall 0.7037037037037037
accuracy_count 108

Classification:  survived ~ sgd

Training data
accuracy 0.4200351493848858
precision 0.40942028985507245
recall 0.9826086956521739
accuracy_count 239

Te

In [41]:
from sklearn.datasets import load_iris

In [42]:
     iris = load_iris()
     X = iris.data
     y = iris.target
     from sklearn.neighbors import KNeighborsClassifier
     model = KNeighborsClassifier(n_neighbors=1)
     model.fit(X, y)
     y_model = model.predict(X)
     from sklearn.metrics import accuracy_score
     accuracy_score(y, y_model)

1.0