#  Importing Packages

In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

## Data collection 

In [2]:
gender_sub=pd.read_csv("data/gender_submission.csv")
train_data=pd.read_csv("data/train.csv")
test_data=pd.read_csv("data/test.csv")

In [3]:
gender_sub.head(5)


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [4]:
train_data.head(8)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S


# Data Cleaning

In [5]:
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [6]:
# Controlling Missing values in dataset
train_data.isnull().sum()/train_data.count()*100

PassengerId      0.000000
Survived         0.000000
Pclass           0.000000
Name             0.000000
Sex              0.000000
Age             24.789916
SibSp            0.000000
Parch            0.000000
Ticket           0.000000
Fare             0.000000
Cabin          336.764706
Embarked         0.224972
dtype: float64

In [7]:
#Identify which columns are numerical and categorical:
train_data.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

## Dealing with the Nan values

In [8]:
# calculating the mean of age column
age_mean=train_data["Age"].astype("float").mean(axis=0) # getting the age values mean
train_data.replace(np.nan,age_mean,inplace=True)
# same process for fare
fare_mean=train_data["Fare"].astype("float").mean(axis=0)
train_data.replace(np.nan,fare_mean,inplace=True)

# SAME PROCESS FOR TEST SET
# calculating the mean of age column
age_mean=test_data["Age"].astype("float").mean(axis=0) # getting the age values mean
test_data.replace(np.nan,age_mean,inplace=True)
# same process for fare
fare_mean=test_data["Fare"].astype("float").mean(axis=0)
test_data.replace(np.nan,fare_mean,inplace=True)

In [9]:
y=train_data["Survived"].to_numpy()

y[0:100]



array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0])

In [10]:
train_data["Sex"].replace("male",0,inplace=True)
train_data["Sex"].replace("female",1,inplace=True)
test_data["Sex"].replace("male",0,inplace=True)
test_data["Sex"].replace("female",1,inplace=True)
train_data["Sex"].head()

0    0
1    1
2    1
3    1
4    0
Name: Sex, dtype: int64

In [11]:
train_data["Embarked"].replace("S",0,inplace=True)
train_data["Embarked"].replace("C",1,inplace=True)
train_data["Embarked"].replace("Q",2,inplace=True)
train_data["Embarked"]

test_data["Embarked"].replace("S",0,inplace=True)
test_data["Embarked"].replace("C",1,inplace=True)
test_data["Embarked"].replace("Q",2,inplace=True)


In [12]:
X=train_data[["PassengerId","Pclass","Sex","Age","SibSp","Parch","Fare","Embarked"]]
X_test=test_data[["PassengerId","Pclass","Sex","Age","SibSp","Parch","Fare","Embarked"]]
X

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,3,0,22.000000,1,0,7.2500,0.0
1,2,1,1,38.000000,1,0,71.2833,1.0
2,3,3,1,26.000000,0,0,7.9250,0.0
3,4,1,1,35.000000,1,0,53.1000,0.0
4,5,3,0,35.000000,0,0,8.0500,0.0
...,...,...,...,...,...,...,...,...
886,887,2,0,27.000000,0,0,13.0000,0.0
887,888,1,1,19.000000,0,0,30.0000,0.0
888,889,3,1,29.699118,1,2,23.4500,0.0
889,890,1,0,26.000000,0,0,30.0000,1.0


In [48]:
# Normalize training
transform=preprocessing.StandardScaler()
X=transform.fit(X).transform(X)
X_test=transform.fit(X_test).transform(X_test)
X

array([[-1.73010796,  0.82737724, -0.73769513, ..., -0.47367361,
        -0.50244517, -0.28037604],
       [-1.72622007, -1.56610693,  1.35557354, ..., -0.47367361,
         0.78684529,  0.37462188],
       [-1.72233219,  0.82737724,  1.35557354, ..., -0.47367361,
        -0.48885426, -0.28037604],
       ...,
       [ 1.72233219,  0.82737724,  1.35557354, ...,  2.00893337,
        -0.17626324, -0.28037604],
       [ 1.72622007, -1.56610693, -0.73769513, ..., -0.47367361,
        -0.04438104,  0.37462188],
       [ 1.73010796,  0.82737724, -0.73769513, ..., -0.47367361,
        -0.49237783,  1.0296198 ]])

# LOGISTIC REGRESSION

In [14]:
parameters={"C":[0.01,0.1,1],"penalty":["l2"],'solver':['lbfgs']}
lr= LogisticRegression()
logreg_cv=GridSearchCV(lr,parameters,cv=10)
logreg_cv.fit(X,y)

GridSearchCV(cv=10, estimator=LogisticRegression(),
             param_grid={'C': [0.01, 0.1, 1], 'penalty': ['l2'],
                         'solver': ['lbfgs']})

In [15]:
print("tuned hpyerparameters :(best parameters) ",logreg_cv.best_params_)
print("accuracy :",logreg_cv.best_score_)

tuned hpyerparameters :(best parameters)  {'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}
accuracy : 0.7968539325842696


# KNN

In [16]:
parameters = {'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
              'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
              'p': [1,2]}

KNN = KNeighborsClassifier()

In [17]:

knn_cv=GridSearchCV(KNN,parameters,cv=10)
knn_cv.fit(X, y)

GridSearchCV(cv=10, estimator=KNeighborsClassifier(),
             param_grid={'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                         'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                         'p': [1, 2]})

In [18]:
print("tuned hpyerparameters :(best parameters) ",knn_cv.best_params_)
print("accuracy :",knn_cv.best_score_)

tuned hpyerparameters :(best parameters)  {'algorithm': 'auto', 'n_neighbors': 10, 'p': 1}
accuracy : 0.8249063670411985


# TREE

In [19]:
parameters = {'criterion': ['gini', 'entropy'],
     'splitter': ['best', 'random'],
     'max_depth': [2*n for n in range(1,10)],
     'max_features': ['auto', 'sqrt'],
     'min_samples_leaf': [1, 2, 4],
     'min_samples_split': [2, 5, 10]}

tree = DecisionTreeClassifier()

In [20]:
tree_cv=GridSearchCV(tree,parameters,cv=10)
tree_cv.fit(X, y)

GridSearchCV(cv=10, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [2, 4, 6, 8, 10, 12, 14, 16, 18],
                         'max_features': ['auto', 'sqrt'],
                         'min_samples_leaf': [1, 2, 4],
                         'min_samples_split': [2, 5, 10],
                         'splitter': ['best', 'random']})

In [21]:
print("tuned hpyerparameters :(best parameters) ",tree_cv.best_params_)
print("accuracy :",tree_cv.best_score_)

tuned hpyerparameters :(best parameters)  {'criterion': 'entropy', 'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 2, 'splitter': 'best'}
accuracy : 0.8148564294631709


# ANN

In [22]:

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

print(tf.__version__)

2.6.2


In [23]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [54]:
model = Sequential()
model.add(Dense(12, input_dim=8, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [56]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, y, epochs=150, batch_size=10)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

<keras.callbacks.History at 0x7f7ed7685c50>

In [58]:
# evaluate the keras model
_, accuracy = model.evaluate(X, y)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 85.63


In [71]:
ann_results= model.predict(X_test)
ann_results[ann_results<0.5] = 0
ann_results[ann_results>=0.5] = 1
yhat=ann_results.astype("int")


## Submitting CSV

In [72]:
test_data["PassengerId"]

0       892
1       893
2       894
3       895
4       896
       ... 
413    1305
414    1306
415    1307
416    1308
417    1309
Name: PassengerId, Length: 418, dtype: int64

In [26]:
yhat.size


418

In [73]:

result= pd.DataFrame(test_data["PassengerId"].values,columns=["PassengerId"])
result


Unnamed: 0,PassengerId
0,892
1,893
2,894
3,895
4,896
...,...
413,1305
414,1306
415,1307
416,1308


In [74]:
result.insert(1,"Survived",yhat)
result


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [75]:
result.to_csv('result.csv', index=False)