In [208]:
# Import the required packages here

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Use this to list the files present in the directory
# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

In [209]:
# Import the datasets here
train = pd.read_csv('/kaggle/input/titanic/train.csv')
test = pd.read_csv('/kaggle/input/titanic/test.csv')

In [210]:
print(train.shape)
print(test.shape)

(891, 12)
(418, 11)


In [211]:
# Append the train and test dataframes for data cleaning

train['test_flag'] = 0
test['test_flag'] = 1
df_combined = pd.concat([train, test], axis=0, copy=True)

### High level EDA

In [212]:
# Check the % missing values in all the columns of the train set
print(df_combined.isnull().sum()*100/df_combined.shape[0])

PassengerId     0.000000
Survived       31.932773
Pclass          0.000000
Name            0.000000
Sex             0.000000
Age            20.091673
SibSp           0.000000
Parch           0.000000
Ticket          0.000000
Fare            0.076394
Cabin          77.463713
Embarked        0.152788
test_flag       0.000000
dtype: float64


In [213]:
df_combined.dtypes

PassengerId      int64
Survived       float64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
test_flag        int64
dtype: object

In [214]:
# Subsetting for the list of columns which has less to no missing values

df_subset = df_combined[['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Name', 'Embarked', 'test_flag']]

### Data Cleaning and missing value imputation for the columns

In [215]:
# categorical = train_subset[['Pclass', 'Sex', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Name']]
# continuous = train_subset[['Age', 'Fare']]

In [216]:
# Cleaning and level modifications for the categorical features

for dataset in df_subset:
    df_subset['Title'] = df_subset.Name.str.extract(' ([A-Za-z]+)\.', expand=False)

pd.crosstab(df_subset['Title'], df_subset['Sex'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Sex,female,male
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Capt,0,1
Col,0,4
Countess,1,0
Don,0,1
Dona,1,0
Dr,1,7
Jonkheer,0,1
Lady,1,0
Major,0,2
Master,0,61


In [217]:
for dataset in df_subset:
    df_subset['Title'] = df_subset['Title'].replace(['Lady', 'Countess','Capt', 'Col',\
     'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

    df_subset['Title'] = df_subset['Title'].replace('Mlle', 'Miss')
    df_subset['Title'] = df_subset['Title'].replace('Ms', 'Miss')
    df_subset['Title'] = df_subset['Title'].replace('Mme', 'Mrs')
    
df_subset[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value inste

Unnamed: 0,Title,Survived
0,Master,0.575
1,Miss,0.702703
2,Mr,0.156673
3,Mrs,0.793651
4,Rare,0.347826


In [218]:
cols_new = ['Title']

for col in cols_new:
    df_subset[col] = pd.factorize(df_subset[col])[0] + 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [219]:
# print(df_subset.isnull().sum()*100/df_subset.shape[0])

In [220]:
df_subset = df_subset.drop(['Name'], axis = 1)

In [221]:
cols_new = ['Sex']

for col in cols_new:
    df_subset[col] = pd.factorize(df_subset[col])[0] + 1

In [222]:
# df_subset['Cabin'].unique()

In [223]:
df_subset['Cabin'] = df_subset['Cabin'].replace(np.nan, 'U')
df_subset['Cabin_Class'] = df_subset['Cabin'].astype(str).str[0]

In [224]:
cols_new = ['Cabin_Class']

for col in cols_new:
    df_subset[col] = pd.factorize(df_subset[col])[0] + 1

In [225]:
df_subset = df_subset.drop(['Ticket', 'Cabin'], axis = 1)

In [226]:
median_age = df_subset["Age"].median()
df_subset["Age"].fillna(median_age, inplace=True)

In [227]:
for dataset in df_subset:
    df_subset['FamilySize'] = df_subset['SibSp'] + df_subset['Parch'] + 1

In [228]:
df_subset = df_subset.drop(['SibSp', 'Parch'], axis = 1)

In [229]:
mode_embarked = df_subset["Embarked"].mode()
df_subset["Embarked"].fillna(mode_embarked, inplace=True)

In [230]:
cols_new = ['Embarked']

for col in cols_new:
    df_subset[col] = pd.factorize(df_subset[col])[0] + 1
    
# df_subset = df_subset.drop(['Embarked'], axis = 1)

In [231]:
df_subset['Fare'].fillna(df_subset['Fare'].dropna().median(), inplace=True)

In [232]:
df_subset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Fare,Embarked,test_flag,Title,Cabin_Class,FamilySize
0,1,0.0,3,1,22.0,7.25,1,0,1,1,2
1,2,1.0,1,2,38.0,71.2833,2,0,2,2,2
2,3,1.0,3,2,26.0,7.925,1,0,3,1,1
3,4,1.0,1,2,35.0,53.1,1,0,2,2,2
4,5,0.0,3,1,35.0,8.05,1,0,1,1,1


In [233]:
train_set = df_subset[df_subset['test_flag']==0]
test_set = df_subset[df_subset['test_flag']==1]

In [234]:
print(train_set.shape)
print(test_set.shape)

(891, 11)
(418, 11)


In [235]:
test_set = test_set.drop(['Survived', 'test_flag'], axis = 1)
train_set = train_set.drop(['test_flag'], axis = 1)

In [236]:
train_set.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Fare,Embarked,Title,Cabin_Class,FamilySize
0,1,0.0,3,1,22.0,7.25,1,1,1,2
1,2,1.0,1,2,38.0,71.2833,2,2,2,2
2,3,1.0,3,2,26.0,7.925,1,3,1,1
3,4,1.0,1,2,35.0,53.1,1,2,2,2
4,5,0.0,3,1,35.0,8.05,1,1,1,1


In [237]:
from sklearn.model_selection import train_test_split

X = train_set[['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'Title', 'Cabin_Class',
               'FamilySize']]
y = train_set[['Survived']]

In [238]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(596, 8) (596, 1)
(295, 8) (295, 1)


In [239]:
X_test.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Title,Cabin_Class,FamilySize
709,3,1,28.0,15.2458,2,4,1,3
439,2,1,31.0,10.5,1,1,1,1
840,3,1,20.0,7.925,1,1,1,1
720,2,2,6.0,33.0,1,3,1,2
39,3,2,14.0,11.2417,2,3,1,2


In [240]:
# Logistic Regression Model

from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()
logistic_model = logreg.fit(X_train, y_train)
# logistic_model
predictions = logistic_model.predict(X_test)
predictions[0:10]

  return f(*args, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


array([0., 0., 0., 1., 1., 1., 1., 0., 1., 1.])

In [241]:
from sklearn.metrics import classification_report

print(classification_report(y_test, predictions))
# test_set.head()

              precision    recall  f1-score   support

         0.0       0.84      0.89      0.86       175
         1.0       0.82      0.76      0.79       120

    accuracy                           0.83       295
   macro avg       0.83      0.82      0.83       295
weighted avg       0.83      0.83      0.83       295



In [250]:
# Decision Tree model
from sklearn.tree import DecisionTreeClassifier

dec_tree = DecisionTreeClassifier()
decision_tree = dec_tree.fit(X_train, y_train)
prediction_tree = decision_tree.predict(X_test)
acc_decision_tree = round(decision_tree.score(X_train, y_train) * 100, 2)
acc_decision_tree

98.32

In [246]:
print(classification_report(y_test, prediction_tree))

              precision    recall  f1-score   support

         0.0       0.80      0.79      0.80       175
         1.0       0.70      0.72      0.71       120

    accuracy                           0.76       295
   macro avg       0.75      0.75      0.75       295
weighted avg       0.76      0.76      0.76       295



In [266]:
# Random Forest Model

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100)
random_forest = rf.fit(X_train, y_train)
prediction_forest = random_forest.predict(X_test)
acc_random_forest = round(random_forest.score(X_train, y_train) * 100, 2)
acc_random_forest

  


98.32

In [267]:
print(classification_report(y_test, prediction_forest))

              precision    recall  f1-score   support

         0.0       0.82      0.82      0.82       175
         1.0       0.73      0.73      0.73       120

    accuracy                           0.78       295
   macro avg       0.78      0.78      0.78       295
weighted avg       0.78      0.78      0.78       295



In [272]:
# Naive Bayes

from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gaussian = gnb.fit(X_train, y_train)
prediction_naive = gaussian.predict(X_test)
acc_gaussian = round(gaussian.score(X_train, y_train) * 100, 2)
acc_gaussian

  return f(*args, **kwargs)


77.01

In [274]:
print(classification_report(y_test, prediction_naive))

              precision    recall  f1-score   support

         0.0       0.85      0.81      0.83       175
         1.0       0.74      0.79      0.77       120

    accuracy                           0.80       295
   macro avg       0.80      0.80      0.80       295
weighted avg       0.81      0.80      0.80       295



In [273]:
# Perceptron Model

from sklearn.linear_model import Perceptron

pcp = Perceptron()
perceptron = pcp.fit(X_train, y_train)
prediction_perceptron = perceptron.predict(X_test)
acc_perceptron = round(perceptron.score(X_train, y_train) * 100, 2)
acc_perceptron

  return f(*args, **kwargs)


62.75

In [275]:
print(classification_report(y_test, prediction_perceptron))

              precision    recall  f1-score   support

         0.0       0.60      1.00      0.75       175
         1.0       1.00      0.01      0.02       120

    accuracy                           0.60       295
   macro avg       0.80      0.50      0.38       295
weighted avg       0.76      0.60      0.45       295



### Use the section below to submit on the test set

In [259]:
test_feature = test_set[['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'Title',
                         'Cabin_Class', 'FamilySize']]
test_id = test_subset['PassengerId']
test_feature.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,Title,Cabin_Class,FamilySize
0,3,1,34.5,7.8292,3,1,1,1
1,3,2,47.0,7.0,1,2,1,2
2,2,1,62.0,9.6875,3,1,1,1
3,3,1,27.0,8.6625,1,1,1,1
4,3,2,22.0,12.2875,1,2,1,3


In [269]:
test_predictions = random_forest.predict(test_feature).astype(int)

In [270]:
output = pd.DataFrame({'PassengerId' : test_id, 'Survived': test_predictions})
output.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [None]:
# Use this if the score comes as 0 in submission

# model.predict(test_data).astype(int)

In [271]:
output.to_csv('/kaggle/working/submission.csv', index=False)