# Classification Test
## Purpose: To analyze which classification model works the best on the given data
### Dataset: Titanic from Seaborn library
### Predictor Variables: pclass, sex, age, sibsp, parch, fare, embarked
### Predicted Variable: survived
### Algorithms: Logistic Regression, Stochastic Gradient Descent, Support Vector Machines, Decision Tree, Random Forest, Naive Bayes, K-Nearest Neighbors

Import needed libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
titanic = sns.load_dataset("titanic")
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


Get rid of columns not being used

In [3]:
titanic = titanic.iloc[:,:8]
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


Check for null values

In [4]:
titanic.isna().sum()

survived      0
pclass        0
sex           0
age         177
sibsp         0
parch         0
fare          0
embarked      2
dtype: int64

Group by pclass and fill all age null values with the median

In [5]:
titanic["age"] = titanic.groupby(["pclass"])["age"].transform(lambda x: x.fillna(x.median()))
titanic.isna().sum()

survived    0
pclass      0
sex         0
age         0
sibsp       0
parch       0
fare        0
embarked    2
dtype: int64

Drop the rows that contain null values

In [6]:
titanic = titanic.dropna()
titanic.isna().sum()

survived    0
pclass      0
sex         0
age         0
sibsp       0
parch       0
fare        0
embarked    0
dtype: int64

Use get_dummies to create columns that only contain binary values for categorical variables

In [7]:
titanic = pd.get_dummies(titanic, columns=["sex", "embarked"])
titanic.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,sex_female,sex_male,embarked_C,embarked_Q,embarked_S
0,0,3,22.0,1,0,7.25,0,1,0,0,1
1,1,1,38.0,1,0,71.2833,1,0,1,0,0
2,1,3,26.0,0,0,7.925,1,0,0,0,1
3,1,1,35.0,1,0,53.1,1,0,0,0,1
4,0,3,35.0,0,0,8.05,0,1,0,0,1


Check data types

In [8]:
titanic.dtypes

survived        int64
pclass          int64
age           float64
sibsp           int64
parch           int64
fare          float64
sex_female      uint8
sex_male        uint8
embarked_C      uint8
embarked_Q      uint8
embarked_S      uint8
dtype: object

Seperate X variables and y variable

In [9]:
X = titanic.drop(columns="survived")
y = titanic["survived"]

Split training and testing data

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

Create functions to create model and get model metrics

In [11]:
def create_model(X_train_data, y_train_data, algorithm):
    model = algorithm
    model.fit(X_train_data, y_train_data)
    return model

def model_metrics(X_test_data, y_test_data, m):
    # accuracy, precision, recall, f1
    preds = m.predict(X_test_data)
    acc = accuracy_score(preds, y_test_data)
    prec = precision_score(preds, y_test_data)
    recall = recall_score(preds, y_test_data)
    f1 = f1_score(preds, y_test_data)
    metrics = [acc, prec, recall, f1]
    return metrics


Perform all classifiers

In [12]:
# Logistic Regression Classifier
logit_reg = create_model(X_train, y_train, LogisticRegression(max_iter=500))
logit_metrics = model_metrics(X_test, y_test, logit_reg)
print(f"Logistic Regression\nAccuracy:\t {logit_metrics[0]}")
print(f"Precision:\t {logit_metrics[1]}")
print(f"Recall:\t\t {logit_metrics[2]}")
print(f"F1:\t\t {logit_metrics[3]}")

Logistic Regression
Accuracy:	 0.7921348314606742
Precision:	 0.75
Recall:		 0.7397260273972602
F1:		 0.7448275862068966


In [13]:
# SGD Classifier
sgd_clf = create_model(X_train, y_train, SGDClassifier())
sgd_metrics = model_metrics(X_test, y_test, sgd_clf)
print(f"SGD Classifier\nAccuracy:\t {sgd_metrics[0]}")
print(f"Precision:\t {sgd_metrics[1]}")
print(f"Recall:\t\t {sgd_metrics[2]}")
print(f"F1:\t\t {sgd_metrics[3]}")

SGD Classifier
Accuracy:	 0.702247191011236
Precision:	 0.8611111111111112
Recall:		 0.5904761904761905
F1:		 0.7005649717514124


In [14]:
# SVM Classifier
svm_clf = create_model(X_train, y_train, SVC())
svm_metrics = model_metrics(X_test, y_test, svm_clf)
print(f"SVM Classifier\nAccuracy:\t {svm_metrics[0]}")
print(f"Precision:\t {svm_metrics[1]}")
print(f"Recall:\t\t {svm_metrics[2]}")
print(f"F1:\t\t {svm_metrics[3]}")

SVM Classifier
Accuracy:	 0.6404494382022472
Precision:	 0.2777777777777778
Recall:		 0.625
F1:		 0.3846153846153846


In [15]:
# Decision Tree Classifier
tree_clf = create_model(X_train, y_train, DecisionTreeClassifier())
tree_metrics = model_metrics(X_test, y_test, tree_clf)
print(f"Decision Tree Classifier\nAccuracy:\t {tree_metrics[0]}")
print(f"Precision:\t {tree_metrics[1]}")
print(f"Recall:\t\t {tree_metrics[2]}")
print(f"F1:\t\t {tree_metrics[3]}")

Decision Tree Classifier
Accuracy:	 0.7696629213483146
Precision:	 0.6666666666666666
Recall:		 0.7384615384615385
F1:		 0.7007299270072994


In [16]:
# Random Forest Classifier
rf_clf = create_model(X_train, y_train, RandomForestClassifier())
rf_metrics = model_metrics(X_test, y_test, rf_clf)
print(f"Decision Tree Classifier\nAccuracy:\t {rf_metrics[0]}")
print(f"Precision:\t {rf_metrics[1]}")
print(f"Recall:\t\t {rf_metrics[2]}")
print(f"F1:\t\t {rf_metrics[3]}")

Decision Tree Classifier
Accuracy:	 0.797752808988764
Precision:	 0.6805555555555556
Recall:		 0.7903225806451613
F1:		 0.7313432835820894


In [17]:
# Naive Bayes Classifier
nb_clf = create_model(X_train, y_train, BernoulliNB())
nb_metrics = model_metrics(X_test, y_test, nb_clf)
print(f"Naive Bayes Classifier\nAccuracy:\t {nb_metrics[0]}")
print(f"Precision:\t {nb_metrics[1]}")
print(f"Recall:\t\t {nb_metrics[2]}")
print(f"F1:\t\t {nb_metrics[3]}")

Naive Bayes Classifier
Accuracy:	 0.7696629213483146
Precision:	 0.6666666666666666
Recall:		 0.7384615384615385
F1:		 0.7007299270072994


In [18]:
# K-Nearest Neighbors Classifier
knn_clf = create_model(X_train, y_train, KNeighborsClassifier())
knn_metrics = model_metrics(X_test, y_test, knn_clf)
print(f"KNN Classifier\nAccuracy:\t {knn_metrics[0]}")
print(f"Precision:\t {knn_metrics[1]}")
print(f"Recall:\t\t {knn_metrics[2]}")
print(f"F1:\t\t {knn_metrics[3]}")

KNN Classifier
Accuracy:	 0.6910112359550562
Precision:	 0.625
Recall:		 0.6164383561643836
F1:		 0.6206896551724138


Create dataframe to store algorithm metrics

In [19]:
df = pd.DataFrame({"Algorithm":["Logistic_Regression", "Stochastic_Gradient_Descent", "Support_Vector_Machines",
                           "Decision_Tree", "Random_Forest", "Naive_Bayes", "K-Nearest_Neighbors"],
             "Accuracy":[logit_metrics[0], sgd_metrics[0], svm_metrics[0], tree_metrics[0], rf_metrics[0], nb_metrics[0], knn_metrics[0]],
                  "Precision":[logit_metrics[1], sgd_metrics[1], svm_metrics[1], tree_metrics[1], rf_metrics[1], nb_metrics[1], knn_metrics[1]],
                  "Recall":[logit_metrics[2], sgd_metrics[2], svm_metrics[2], tree_metrics[2], rf_metrics[2], nb_metrics[2], knn_metrics[2]],
                  "F1":[logit_metrics[3], sgd_metrics[3], svm_metrics[3], tree_metrics[3], rf_metrics[3], nb_metrics[3], knn_metrics[3]]})


#### Sort by Accuracy

In [20]:
df.sort_values(by="Accuracy", ascending=False).reset_index(drop=True)

Unnamed: 0,Algorithm,Accuracy,Precision,Recall,F1
0,Random_Forest,0.797753,0.680556,0.790323,0.731343
1,Logistic_Regression,0.792135,0.75,0.739726,0.744828
2,Decision_Tree,0.769663,0.666667,0.738462,0.70073
3,Naive_Bayes,0.769663,0.666667,0.738462,0.70073
4,Stochastic_Gradient_Descent,0.702247,0.861111,0.590476,0.700565
5,K-Nearest_Neighbors,0.691011,0.625,0.616438,0.62069
6,Support_Vector_Machines,0.640449,0.277778,0.625,0.384615


#### Sort by Precision

In [21]:
df.sort_values(by="Precision", ascending=False).reset_index(drop=True)

Unnamed: 0,Algorithm,Accuracy,Precision,Recall,F1
0,Stochastic_Gradient_Descent,0.702247,0.861111,0.590476,0.700565
1,Logistic_Regression,0.792135,0.75,0.739726,0.744828
2,Random_Forest,0.797753,0.680556,0.790323,0.731343
3,Decision_Tree,0.769663,0.666667,0.738462,0.70073
4,Naive_Bayes,0.769663,0.666667,0.738462,0.70073
5,K-Nearest_Neighbors,0.691011,0.625,0.616438,0.62069
6,Support_Vector_Machines,0.640449,0.277778,0.625,0.384615


#### Sort by Recall 

In [22]:
df.sort_values(by="Recall", ascending=False).reset_index(drop=True)

Unnamed: 0,Algorithm,Accuracy,Precision,Recall,F1
0,Random_Forest,0.797753,0.680556,0.790323,0.731343
1,Logistic_Regression,0.792135,0.75,0.739726,0.744828
2,Decision_Tree,0.769663,0.666667,0.738462,0.70073
3,Naive_Bayes,0.769663,0.666667,0.738462,0.70073
4,Support_Vector_Machines,0.640449,0.277778,0.625,0.384615
5,K-Nearest_Neighbors,0.691011,0.625,0.616438,0.62069
6,Stochastic_Gradient_Descent,0.702247,0.861111,0.590476,0.700565


#### Sort by F1

In [23]:
df.sort_values(by="Recall", ascending=False).reset_index(drop=True)

Unnamed: 0,Algorithm,Accuracy,Precision,Recall,F1
0,Random_Forest,0.797753,0.680556,0.790323,0.731343
1,Logistic_Regression,0.792135,0.75,0.739726,0.744828
2,Decision_Tree,0.769663,0.666667,0.738462,0.70073
3,Naive_Bayes,0.769663,0.666667,0.738462,0.70073
4,Support_Vector_Machines,0.640449,0.277778,0.625,0.384615
5,K-Nearest_Neighbors,0.691011,0.625,0.616438,0.62069
6,Stochastic_Gradient_Descent,0.702247,0.861111,0.590476,0.700565
