In [1]:
# data visualization
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns

# linear algebra
import numpy as np

# data processing
import pandas as pd

# algorithms
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC

# serializing
import pickle


In [2]:
# create df
train = pd.read_csv('titanic.csv')

# drop null values
train.dropna(inplace=True)

# features and target
target = 'Survived'
features = ['Pclass', 'Age', 'SibSp', 'Fare'] 

# X matrix, y vector
X = train[features]
y = train[target]


train_X, test_X, train_y, test_y = train_test_split(X, y, test_size = 0.2, random_state=0)


In [3]:
# Logistic Regression
lr = LogisticRegression()

lr.fit(train_X, train_y)

lr_predictions = lr.predict(test_X)
lr_predictions


array([1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1], dtype=int64)

In [4]:
# Stochastic Gradient Descent (SGD)
sgd = SGDClassifier(max_iter=5, tol=None)

sgd.fit(train_X, train_y)

sgd_predictions = sgd.predict(test_X)
sgd_predictions


array([0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1], dtype=int64)

In [5]:
# Random Forest
rf = RandomForestClassifier(n_estimators=100)

rf.fit(train_X, train_y)

rf_predictions = rf.predict(test_X)
rf_predictions


array([1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,
       0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1], dtype=int64)

In [6]:
# K Nearest Neighbor
knn = KNeighborsClassifier(n_neighbors = 3)

knn.fit(train_X, train_y)

knn_predictions = knn.predict(test_X)
knn_predictions


array([1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1], dtype=int64)

In [7]:
# Gaussian Naive Bayes
gnb = GaussianNB()

gnb.fit(train_X, train_y)

gnb_predictions = gnb.predict(test_X)
gnb_predictions


array([0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1], dtype=int64)

In [8]:
# Perceptron
perceptron = Perceptron(max_iter=10)

perceptron.fit(train_X, train_y)

perceptron_predictions = perceptron.predict(test_X)
perceptron_predictions


array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)

In [9]:
# Linear Support Vector Machine
linear_svc = LinearSVC()

linear_svc.fit(train_X, train_y)

linear_svc_predictions = linear_svc.predict(test_X)
linear_svc_predictions




array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)

In [10]:
# Decision Tree
dt = DecisionTreeClassifier()

dt.fit(train_X, train_y)

dt_predictions = dt.predict(test_X)
dt_predictions



array([1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1], dtype=int64)

In [11]:
# Score models
# Logistic Regression
lr_accuracy = accuracy_score(test_y, lr_predictions)

# Stochastic Gradient Descent (SGD)
sgd_accuracy = accuracy_score(test_y, sgd_predictions)

# Random Forest
rf_accuracy = accuracy_score(test_y, rf_predictions)

# K Nearest Neighbor
knn_accuracy = accuracy_score(test_y, knn_predictions)

# Gaussian Naive Bayes
gnb_accuracy = accuracy_score(test_y, gnb_predictions)

# Perceptron
perceptron_accuracy = accuracy_score(test_y, perceptron_predictions)

# Linear Support Vector Machine
linear_svc_accuracy = accuracy_score(test_y, linear_svc_predictions)

# Decision Tree
dt_accuracy = accuracy_score(test_y, dt_predictions)


In [12]:
# create new dataframe
results = pd.DataFrame({
    'Model': ['Logistic Regression',
              'Stochastic Gradient Descent (SGD)',
              'Random Forest', 
              'K Nearest Neighbor',
              'Gaussian Naive Bayes',
              'Perceptron', 
              'Linear Support Vector Machine', 
              'Decision Tree'],
    'Score': [lr_accuracy,
              sgd_accuracy,
              rf_accuracy, 
              knn_accuracy,
              gnb_accuracy,
              perceptron_accuracy, 
              linear_svc_accuracy,
              dt_accuracy]
})

# sort results by 'Score'
result_df = results.sort_values(by='Score', ascending=False)

# preview the dataframe
result_df.head(8)


Unnamed: 0,Model,Score
5,Perceptron,0.810811
6,Linear Support Vector Machine,0.810811
0,Logistic Regression,0.675676
1,Stochastic Gradient Descent (SGD),0.675676
3,K Nearest Neighbor,0.648649
2,Random Forest,0.621622
4,Gaussian Naive Bayes,0.621622
7,Decision Tree,0.594595
