In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Read the Titanic CSV into a DataFrame and preview its first five rows.
titanic = pd.read_csv(filepath_or_buffer='titanic_processed1.csv')
titanic.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,3,0,2.0,3,2,27.9,0,0,1
1,1,2,0,13.0,0,1,19.5,0,0,1
2,0,3,1,30.0,0,0,7.225,1,0,0
3,0,3,1,25.0,0,0,7.225,1,0,0
4,0,3,0,18.0,1,0,17.8,0,0,1


In [3]:
# Build the list of feature columns: every column except the 'Survived' label.
# Selecting by name (instead of slicing off the first column by position)
# keeps the label out of the features even if the column order changes.
FEATURES = [column for column in titanic.columns if column != 'Survived']
FEATURES

['Pclass',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Embarked_C',
 'Embarked_Q',
 'Embarked_S']

In [4]:
# Registry of evaluation results, filled in as each model is built.
#   key   -> label naming the model
#   value -> that model's evaluation metrics (accuracy, precision, recall)

result_dict = dict()


In [5]:
# Reusable helper: score a set of predictions against the actual labels.

def summarize_classification(y_test, y_pred):
    """Compute accuracy, precision and recall for a set of predictions.

    Parameters
    ----------
    y_test : actual label values.
    y_pred : predicted label values.

    Returns
    -------
    dict with the keys 'accuracy' (fraction of correctly predicted labels),
    'precision', 'recall' and 'accuracy_count' (number of correctly
    predicted labels).
    """
    summary = {}
    # normalize=True -> accuracy expressed as a fraction
    summary['accuracy'] = accuracy_score(y_test, y_pred, normalize=True)
    summary['precision'] = precision_score(y_test, y_pred)
    summary['recall'] = recall_score(y_test, y_pred)
    # normalize=False -> number of accurately predicted labels
    summary['accuracy_count'] = accuracy_score(y_test, y_pred, normalize=False)
    return summary

In [6]:
# Helper that builds, trains and evaluates the different classification models in this notebook.

def build_model(classifier_fn, name_of_y_col, name_of_x_cols, dataset, test_frac=0.2,
                random_state=None):
    """Split the data, train a classifier and score it on both splits.

    Parameters
    ----------
    classifier_fn : callable
        Called as ``classifier_fn(x_train, y_train)``; must return a trained
        model that exposes ``predict``.
    name_of_y_col : str
        Name of the target (label) column.
    name_of_x_cols : list of str
        Names of the feature columns.
    dataset : pandas.DataFrame
        Frame holding both the features and the label.
    test_frac : float, default 0.2
        Fraction of the rows held out for testing.
    random_state : int or None, default None
        Seed forwarded to ``train_test_split``. ``None`` keeps the previous
        behaviour (a different random split on every call); pass an integer
        to make the split, and therefore the reported scores, reproducible.

    Returns
    -------
    dict
        'training'          -> scores on the training split
        'test'              -> scores on the test split
        'confusrion_matrix' -> crosstab of predicted vs. actual test labels
    """
    X = dataset[name_of_x_cols]
    Y = dataset[name_of_y_col]

    # Train/test split; seeded only when the caller asks for it.
    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_frac, random_state=random_state
    )

    # The classifier function instantiates and fits the estimator.
    model = classifier_fn(x_train, y_train)

    # Predictions on the held-out rows and on the rows the model was trained on.
    y_pred = model.predict(x_test)
    y_pred_train = model.predict(x_train)

    # Scores for both splits.
    train_summary = summarize_classification(y_train, y_pred_train)
    test_summary = summarize_classification(y_test, y_pred)

    # Actual vs. predicted labels side by side, then cross-tabulated into a
    # confusion matrix (rows: predicted, columns: actual).
    pred_results = pd.DataFrame({'y_test' : y_test, 'y_pred' : y_pred})
    model_crosstab = pd.crosstab(pred_results.y_pred, pred_results.y_test)

    # NOTE: the key 'confusrion_matrix' is misspelled; it is kept unchanged so
    # that existing code reading this dictionary keeps working.
    return {'training' : train_summary,
            'test' : test_summary,
            'confusrion_matrix' : model_crosstab}


In [8]:
# Helper that lets us quickly compare the results of the different classification models we built.

def compare_results(results=None):
    """Print the training and test scores of every recorded model.

    Parameters
    ----------
    results : dict, optional
        Mapping of model label -> dict holding a 'training' and a 'test'
        score dictionary. When omitted, the notebook-level ``result_dict``
        is used, so existing ``compare_results()`` calls behave as before.
    """
    if results is None:
        # Fall back to the notebook-wide registry of results.
        results = result_dict

    for key, summaries in results.items():
        print('Classification : ', key)

        print()
        print('Training Data')
        for score, value in summaries['training'].items():
            print(score, value)
        print()
        print('Test Data')
        for score, value in summaries['test'].items():
            print(score, value)
        print()
        

In [9]:
# Classifier function for logistic regression: it takes the training data and
# training labels, instantiates the estimator and calls fit to train it.

def logistic_fn(x_train, y_train):
    """Return a LogisticRegression estimator (liblinear solver) fitted on the training data."""
    classifier = LogisticRegression(solver='liblinear')
    # fit() returns the fitted estimator itself
    return classifier.fit(x_train, y_train)

In [10]:
# Train and score logistic regression through the reusable helper,
# then print the scores recorded so far.

result_dict['survived - logistic'] = build_model(
    classifier_fn=logistic_fn,
    name_of_y_col='Survived',
    name_of_x_cols=FEATURES,
    dataset=titanic,
)

compare_results()

Classification :  survived - logistic

Training Data
accuracy 0.7961335676625659
precision 0.7783505154639175
recall 0.6741071428571429
accuracy_count 453

Test Data
accuracy 0.7972027972027972
precision 0.8070175438596491
recall 0.71875
accuracy_count 114



In [17]:
# Inspect the raw results dictionary; unlike compare_results(), this also
# shows the confusion matrix stored for each model.
result_dict

{'survived - logistic': {'training': {'accuracy': 0.789103690685413,
   'precision': 0.7674418604651163,
   'recall': 0.7021276595744681,
   'accuracy_count': 449},
  'test': {'accuracy': 0.8321678321678322,
   'precision': 0.8536585365853658,
   'recall': 0.660377358490566,
   'accuracy_count': 119},
  'confusrion_matrix': y_test   0   1
  y_pred        
  0       84  18
  1        6  35}}

<h2>Linear Discriminant Analysis</h2>

A linear discriminant is any line that can be used to separate the two classes into which we are categorizing data. The goal is to find the axes that best separate the classes, such that all instances of a class are in the same quadrant.

In [11]:
# Classifier function for Linear Discriminant Analysis from scikit-learn: it takes
# the training data and training labels and instantiates and fits the estimator.

# Our training data is numeric, so every record can be seen as a point in an N-dimensional space.
# The best axes are those that best separate the data into the different classes.

def linear_discriminant_fn(x_train, y_train, solver='svd'):
    """Return a LinearDiscriminantAnalysis estimator fitted on the training data.

    solver : defaults to 'svd' (singular value decomposition), which is the
        default solver. It finds the axes without calculating the covariance
        matrix of the features, which is useful when the training data has
        many features and many records.
    """
    lda = LinearDiscriminantAnalysis(solver=solver)
    lda.fit(x_train, y_train)

    return lda

In [12]:
# Train and score Linear Discriminant Analysis on all features.
result_dict['survived - linear_discriminant_analysis'] = build_model(
    classifier_fn=linear_discriminant_fn,
    name_of_y_col='Survived',
    name_of_x_cols=FEATURES,
    dataset=titanic,
)

compare_results()

Classification :  survived - logistic

Training Data
accuracy 0.7961335676625659
precision 0.7783505154639175
recall 0.6741071428571429
accuracy_count 453

Test Data
accuracy 0.7972027972027972
precision 0.8070175438596491
recall 0.71875
accuracy_count 114

Classification :  survived - linear_discriminant_analysis

Training Data
accuracy 0.7961335676625659
precision 0.7621359223300971
recall 0.7008928571428571
accuracy_count 453

Test Data
accuracy 0.8251748251748252
precision 0.8823529411764706
recall 0.703125
accuracy_count 118



For many machine learning models, if you include all of the columns from your one-hot encoded features in your training data, you'll encounter something called the dummy variable trap. This occurs when there is perfect collinearity between two or more features in your training set.

The dummy variable trap can result in poor ML models. The way to fix this is to use dummy encoding of our categorical variables instead of one-hot encoding; this can be done easily by simply dropping one of the columns from our one-hot encoded set.



In [14]:
# Using dummy encoding:
# instead of using all of the features, drop the last column, which is one of the one-hot encoded columns.
# NOTE: this stores the result under the same key as the previous LDA run, replacing that entry.
dummy_encoded_features = FEATURES[:-1]
result_dict['survived - linear_discriminant_analysis'] = build_model(
    classifier_fn=linear_discriminant_fn,
    name_of_y_col='Survived',
    name_of_x_cols=dummy_encoded_features,
    dataset=titanic,
)

compare_results()

Classification :  survived - logistic

Training Data
accuracy 0.7961335676625659
precision 0.7783505154639175
recall 0.6741071428571429
accuracy_count 453

Test Data
accuracy 0.7972027972027972
precision 0.8070175438596491
recall 0.71875
accuracy_count 114

Classification :  survived - linear_discriminant_analysis

Training Data
accuracy 0.8014059753954306
precision 0.7819905213270142
recall 0.7112068965517241
accuracy_count 456

Test Data
accuracy 0.7622377622377622
precision 0.6896551724137931
recall 0.7142857142857143
accuracy_count 109



<h2>Quadratic Discriminant Analysis</h2>

In [15]:
# Quadratic Discriminant Analysis is useful when the X variables corresponding to
# different labels have different covariances, i.e. the covariance of X differs
# across the values of Y.

def quadratic_discriminany_fn(x_train, y_train):
    """Return a QuadraticDiscriminantAnalysis estimator fitted on the training data."""
    # fit() returns the fitted estimator itself
    return QuadraticDiscriminantAnalysis().fit(x_train, y_train)

In [16]:
# Train and score Quadratic Discriminant Analysis on the dummy-encoded features
# (all features except the last one-hot column).
result_dict['survived - quadratic_discriminant_analysis'] = build_model(
    classifier_fn=quadratic_discriminany_fn,
    name_of_y_col='Survived',
    name_of_x_cols=FEATURES[:-1],
    dataset=titanic,
)

compare_results()

Classification :  survived - logistic

Training Data
accuracy 0.7961335676625659
precision 0.7783505154639175
recall 0.6741071428571429
accuracy_count 453

Test Data
accuracy 0.7972027972027972
precision 0.8070175438596491
recall 0.71875
accuracy_count 114

Classification :  survived - linear_discriminant_analysis

Training Data
accuracy 0.8014059753954306
precision 0.7819905213270142
recall 0.7112068965517241
accuracy_count 456

Test Data
accuracy 0.7622377622377622
precision 0.6896551724137931
recall 0.7142857142857143
accuracy_count 109

Classification :  survived - quadratic_discriminant_analysis

Training Data
accuracy 0.7961335676625659
precision 0.7843137254901961
recall 0.6896551724137931
accuracy_count 453

Test Data
accuracy 0.8321678321678322
precision 0.82
recall 0.7321428571428571
accuracy_count 119



<h2> Stochastic Gradient Descent </h2> 

Stochastic Gradient Descent (SGD) performs a numeric optimization, using one training instance at a time to find the best model parameters. The training instances are fed into the model during an epoch (iteration), so you have to run several iterations to improve the model.<br>

You can specify different hyperparameters for all of these classification models; these are design factors of your model. Two parameters of the SGD classifier are max_iter, the number of iterations, and tol, the tolerance value used as the stopping criterion for model training.<br>

When we specify a tolerance, the model will stop training if the loss calculated at a particular iteration has improved by less than the specified tolerance compared with the previous iteration.


In [32]:
def sgd_fn(x_train, y_train, max_iter=10000, tol=1e-3):
    """Return an SGDClassifier fitted on the training data.

    max_iter : number of iterations passed to the estimator.
    tol      : tolerance passed to the estimator as its stopping criterion.
    """
    sgd_classifier = SGDClassifier(tol=tol, max_iter=max_iter)
    sgd_classifier.fit(x_train, y_train)

    return sgd_classifier

In [36]:
# Train and score the stochastic gradient descent classifier on all features.
result_dict['survived - sgd'] = build_model(
    classifier_fn=sgd_fn,
    name_of_y_col='Survived',
    name_of_x_cols=FEATURES,
    dataset=titanic,
)

compare_results()

Classification :  survived - logistic

Training Data
accuracy 0.7961335676625659
precision 0.7783505154639175
recall 0.6741071428571429
accuracy_count 453

Test Data
accuracy 0.7972027972027972
precision 0.8070175438596491
recall 0.71875
accuracy_count 114

Classification :  survived - linear_discriminant_analysis

Training Data
accuracy 0.8014059753954306
precision 0.7819905213270142
recall 0.7112068965517241
accuracy_count 456

Test Data
accuracy 0.7622377622377622
precision 0.6896551724137931
recall 0.7142857142857143
accuracy_count 109

Classification :  survived - quadratic_discriminant_analysis

Training Data
accuracy 0.7961335676625659
precision 0.7843137254901961
recall 0.6896551724137931
accuracy_count 453

Test Data
accuracy 0.8321678321678322
precision 0.82
recall 0.7321428571428571
accuracy_count 119

Classification :  survived - sgd

Training Data
accuracy 0.6942003514938488
precision 0.7938144329896907
recall 0.3333333333333333
accuracy_count 395

Test Data
accuracy 0.734

<h1> Support Vector Machine </h1>

Find a hyperplane that separates the points so that all points on the same side belong to the same class.

C - the inverse of the regularization strength; smaller values indicate stronger regularization. It is used to penalize more complex models, i.e. to penalize points on the wrong side of the margin.<br>

Tolerance is what we use to decide when model training should be stopped. If the change in the calculated loss between two consecutive iterations is less than the tolerance value, the model infers that additional training isn't really improving the model parameters by much, and it stops training.<br>

We use the LinearSVC estimator object to perform support vector machine classification. There is another estimator object available in scikit-learn that you can use to perform SVM: the SVC object.

LinearSVC is similar to SVC(kernel='linear')<br>

dual = False refers to the optimization problem: when you perform optimization in machine learning, it is possible to convert what is called the primal problem into a dual problem, and the dual problem can be easier to solve.<br>

Prefer dual = False when n_samples > n_features



In [38]:
def linear_svc_fn(x_train, y_train, C=1.0, max_iter=1000, tol=1e-3):
    """Return a LinearSVC estimator fitted on the training data.

    C, max_iter and tol are passed straight to the estimator; dual is
    always set to False.
    """
    svc = LinearSVC(dual=False, C=C, tol=tol, max_iter=max_iter)
    # fit() returns the fitted estimator itself
    return svc.fit(x_train, y_train)

In [39]:
# Train and score the linear support vector classifier on all features.
result_dict['survived - linear_svc'] = build_model(
    classifier_fn=linear_svc_fn,
    name_of_y_col='Survived',
    name_of_x_cols=FEATURES,
    dataset=titanic,
)

compare_results()

Classification :  survived - logistic

Training Data
accuracy 0.7961335676625659
precision 0.7783505154639175
recall 0.6741071428571429
accuracy_count 453

Test Data
accuracy 0.7972027972027972
precision 0.8070175438596491
recall 0.71875
accuracy_count 114

Classification :  survived - linear_discriminant_analysis

Training Data
accuracy 0.8014059753954306
precision 0.7819905213270142
recall 0.7112068965517241
accuracy_count 456

Test Data
accuracy 0.7622377622377622
precision 0.6896551724137931
recall 0.7142857142857143
accuracy_count 109

Classification :  survived - quadratic_discriminant_analysis

Training Data
accuracy 0.7961335676625659
precision 0.7843137254901961
recall 0.6896551724137931
accuracy_count 453

Test Data
accuracy 0.8321678321678322
precision 0.82
recall 0.7321428571428571
accuracy_count 119

Classification :  survived - sgd

Training Data
accuracy 0.6942003514938488
precision 0.7938144329896907
recall 0.3333333333333333
accuracy_count 395

Test Data
accuracy 0.734

<h1> Nearest Neighbors </h1>

<h3>K - Nearest Neighbors<br>
    
Radius Neighbors</h3>

In [40]:
# Classifier function for the radius-based nearest neighbors model.

def radius_neighbor_fn(x_train, y_train, radius=40.0, outlier_label=None):
    """Return a RadiusNeighborsClassifier fitted on the training data.

    Parameters
    ----------
    x_train, y_train : training features and training labels.
    radius : float, default 40.0
        Points within this radius of a sample are considered its neighbors
        and can vote on its label.
    outlier_label : optional, default None
        Label assigned to samples that have no training point within
        ``radius``. The default ``None`` keeps the previous behaviour, where
        scikit-learn raises a ValueError at predict time for such samples;
        pass a class label (e.g. 0) to get a prediction instead.
    """
    model = RadiusNeighborsClassifier(radius=radius, outlier_label=outlier_label)
    model.fit(x_train, y_train)

    return model
    

In [41]:
# Train and score the radius neighbors classifier on all features.
result_dict['survived - radius_neighbors'] = build_model(
    classifier_fn=radius_neighbor_fn,
    name_of_y_col='Survived',
    name_of_x_cols=FEATURES,
    dataset=titanic,
)

compare_results()

Classification :  survived - logistic

Training Data
accuracy 0.7961335676625659
precision 0.7783505154639175
recall 0.6741071428571429
accuracy_count 453

Test Data
accuracy 0.7972027972027972
precision 0.8070175438596491
recall 0.71875
accuracy_count 114

Classification :  survived - linear_discriminant_analysis

Training Data
accuracy 0.8014059753954306
precision 0.7819905213270142
recall 0.7112068965517241
accuracy_count 456

Test Data
accuracy 0.7622377622377622
precision 0.6896551724137931
recall 0.7142857142857143
accuracy_count 109

Classification :  survived - quadratic_discriminant_analysis

Training Data
accuracy 0.7961335676625659
precision 0.7843137254901961
recall 0.6896551724137931
accuracy_count 453

Test Data
accuracy 0.8321678321678322
precision 0.82
recall 0.7321428571428571
accuracy_count 119

Classification :  survived - sgd

Training Data
accuracy 0.6942003514938488
precision 0.7938144329896907
recall 0.3333333333333333
accuracy_count 395

Test Data
accuracy 0.734

# Decision Tree

Fit a decision tree to the training data using the CART (Classification And Regression Tree) algorithm.

In [43]:
def decision_tree_fn(x_train, y_train, max_depth=None, max_features=None):
    """Return a DecisionTreeClassifier fitted on the training data.

    max_depth and max_features are passed straight to the estimator; both
    default to None.
    """
    tree = DecisionTreeClassifier(max_features=max_features, max_depth=max_depth)
    tree.fit(x_train, y_train)

    return tree

In [45]:
# Train and score the decision tree classifier on all features.
result_dict['survived - decison_tree'] = build_model(
    classifier_fn=decision_tree_fn,
    name_of_y_col='Survived',
    name_of_x_cols=FEATURES,
    dataset=titanic,
)

compare_results()

Classification :  survived - logistic

Training Data
accuracy 0.7961335676625659
precision 0.7783505154639175
recall 0.6741071428571429
accuracy_count 453

Test Data
accuracy 0.7972027972027972
precision 0.8070175438596491
recall 0.71875
accuracy_count 114

Classification :  survived - linear_discriminant_analysis

Training Data
accuracy 0.8014059753954306
precision 0.7819905213270142
recall 0.7112068965517241
accuracy_count 456

Test Data
accuracy 0.7622377622377622
precision 0.6896551724137931
recall 0.7142857142857143
accuracy_count 109

Classification :  survived - quadratic_discriminant_analysis

Training Data
accuracy 0.7961335676625659
precision 0.7843137254901961
recall 0.6896551724137931
accuracy_count 453

Test Data
accuracy 0.8321678321678322
precision 0.82
recall 0.7321428571428571
accuracy_count 119

Classification :  survived - sgd

Training Data
accuracy 0.6942003514938488
precision 0.7938144329896907
recall 0.3333333333333333
accuracy_count 395

Test Data
accuracy 0.734

# Naive Bayes'

Naive Bayes makes a naive (strong) assumption about the independence of features. It uses Bayes' theorem to find which label is most likely, given the attributes observed in the feature vector and given how often the different labels occur in the data.

In [46]:
def naive_bayes_fn(x_train, y_train, priors=None):
    """Return a GaussianNB estimator fitted on the training data.

    priors : prior probabilities of the classes; when not specified, the
        priors are adjusted based on the data.

    The different features in the dataset are considered to be independent.
    """
    # fit() returns the fitted estimator itself
    return GaussianNB(priors=priors).fit(x_train, y_train)
    

In [48]:
# Train and score the Gaussian naive Bayes classifier on all features.
result_dict['survived - naive_bayes'] = build_model(
    classifier_fn=naive_bayes_fn,
    name_of_y_col='Survived',
    name_of_x_cols=FEATURES,
    dataset=titanic,
)

compare_results()

Classification :  survived - logistic

Training Data
accuracy 0.7961335676625659
precision 0.7783505154639175
recall 0.6741071428571429
accuracy_count 453

Test Data
accuracy 0.7972027972027972
precision 0.8070175438596491
recall 0.71875
accuracy_count 114

Classification :  survived - linear_discriminant_analysis

Training Data
accuracy 0.8014059753954306
precision 0.7819905213270142
recall 0.7112068965517241
accuracy_count 456

Test Data
accuracy 0.7622377622377622
precision 0.6896551724137931
recall 0.7142857142857143
accuracy_count 109

Classification :  survived - quadratic_discriminant_analysis

Training Data
accuracy 0.7961335676625659
precision 0.7843137254901961
recall 0.6896551724137931
accuracy_count 453

Test Data
accuracy 0.8321678321678322
precision 0.82
recall 0.7321428571428571
accuracy_count 119

Classification :  survived - sgd

Training Data
accuracy 0.6942003514938488
precision 0.7938144329896907
recall 0.3333333333333333
accuracy_count 395

Test Data
accuracy 0.734