In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# The CSV paths are relative, so they are resolved against the notebook's working directory.
TRAIN_CSV, TEST_CSV, SUBMISSION_CSV = 'train (1).csv', 'test.csv', 'gender_submission.csv'

df_train = pd.read_csv(TRAIN_CSV)
df_test = pd.read_csv(TEST_CSV)
df_submission = pd.read_csv(SUBMISSION_CSV)

In [6]:
# Preview the first rows of the training frame.
df_train.head()

In [7]:
# Preview the first rows of the test frame (no 'Survived' column has been attached yet).
df_test.head()

In [8]:
# Preview the first rows of the frame loaded from gender_submission.csv.
df_submission.head()

In [9]:
# NOTE(review): gender_submission.csv appears to be a sample submission rather than
# verified ground truth -- confirm before treating these values as real labels.
# Align on PassengerId instead of on row position, so a re-ordered or re-indexed
# submission file cannot silently attach labels to the wrong passengers.
# Assumes both frames carry a 'PassengerId' column -- TODO confirm.
survived_by_id = df_submission.set_index('PassengerId')['Survived']
df_test['Survived'] = df_test['PassengerId'].map(survived_by_id)

In [10]:
# Preview the test frame again, now that it has a 'Survived' column.
df_test.head()

In [11]:
# Stack the train and test frames into one dataset. ignore_index=True gives every row a
# unique label; without it each frame keeps its own labels, the combined index contains
# duplicates, and any later label-based operation (e.g. df.drop(labels)) would touch
# rows from both source frames.
df = pd.concat([df_train, df_test], ignore_index=True)

In [12]:
# Preview the first rows of the combined frame.
df.head()

## EDA and Data Cleaning

In [13]:
# (rows, columns) of the combined frame.
df.shape

In [14]:
# Column dtypes and non-null counts; used to spot columns with missing values.
df.info()

### Age, Fare, Cabin, Embarked have missing values

In [15]:
# Show the rows whose Fare value is missing.
df.loc[df['Fare'].isna()]

In [16]:
# Discard the rows that have no Fare value (dropna works row-wise by default).
df = df.dropna(subset=['Fare'])

In [17]:
# Show the rows whose Embarked value is missing.
df.loc[df['Embarked'].isna()]

In [18]:
# Discard the rows that have no Embarked value (dropna works row-wise by default).
df = df.dropna(subset=['Embarked'])

In [19]:
# Percentage of missing values per column (the mean of the null mask is the missing fraction).
df.isna().mean() * 100

In [20]:
# Kernel density estimate of the Age column, drawn before the missing ages are filled.
sns.kdeplot(data=df['Age'])

In [21]:
# Summary statistics produced by DataFrame.describe().
df.describe()

In [22]:
# Fill the missing ages with the mean of the observed ages.
mean_age = df['Age'].mean()
df['Age'] = df['Age'].fillna(value=mean_age)

### Roughly 77% of the values in the Cabin column are missing, so it is better to drop the column

In [23]:
# Remove the Cabin column from the frame.
df = df.drop(columns=['Cabin'])

In [24]:
# Count the remaining missing values in each column.
df.isna().sum()

### Checking Outliers

In [25]:
# Fare distribution for each Survived value; used to look for outliers.
sns.boxplot(data=df, x='Survived', y='Fare')

In [26]:
# List the survivors whose fare exceeds 400.
is_high_fare_survivor = (df['Survived'] == 1) & (df['Fare'] > 400)
df.loc[is_high_fare_survivor, ['Survived', 'Fare']]

In [27]:
# Remove the survivors whose fare exceeds 400. The rows are filtered with the boolean
# mask itself rather than by collecting index labels and calling df.drop(labels):
# df was built by concatenating two frames, so if its index contains repeated labels a
# label-based drop would also delete unrelated rows that merely share a label.
fare_outliers = (df['Survived'] == 1) & (df['Fare'] > 400)
drop_index = df.index[fare_outliers]  # labels of the removed rows, kept for inspection

df = df.loc[~fare_outliers].copy()

In [28]:
# Same Fare-by-Survived box plot, redrawn after the outlier rows were removed.
sns.boxplot(data=df, x='Survived', y='Fare')

## EDA

In [29]:
# Number of rows for each Survived value.
sns.countplot(x=df['Survived'], palette='RdBu_r')

In [30]:
# Fare aggregated per Survived value (seaborn's default bar-plot estimator).
sns.barplot(data=df, x='Survived', y='Fare', palette='RdBu_r')

### Passengers who survived paid higher fares on average

In [31]:
# Survived aggregated per Sex value (seaborn's default bar-plot estimator).
sns.barplot(data=df, x='Sex', y='Survived', palette='RdBu_r')

### The survival rate was clearly higher for female passengers

In [32]:
# Age aggregated per Survived value (seaborn's default bar-plot estimator).
sns.barplot(data=df, x='Survived', y='Age', palette='RdBu_r')

In [33]:
# Row counts per Survived value, split by Pclass.
sns.countplot(data=df, x='Survived', hue='Pclass', palette='hls')

In [34]:
# Row counts per Survived value, split by Embarked.
sns.countplot(x=df['Survived'], hue=df['Embarked'], palette='rainbow')

In [35]:
plt.figure(figsize=(15,6))

# Correlation is only defined for numeric columns. df still holds non-numeric columns at
# this point (e.g. Sex and Embarked are one-hot encoded only in a later cell), and
# pandas >= 2.0 raises on them instead of silently skipping them, so restrict the frame
# to numeric columns explicitly before computing the correlation matrix.
numeric_df = df.select_dtypes(include='number')
sns.heatmap(data=numeric_df.corr(), annot=True)

### Converting categorical features - Embarked, Sex, Pclass

In [36]:
# One-hot encode the categorical columns; drop_first=True omits one dummy per column.
categorical_cols = ['Embarked', 'Sex', 'Pclass']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [37]:
# Preview the frame after one-hot encoding.
df.head()

We won't need the Name column, so let's drop it.

In [38]:
# Remove the Name column; rebinding df avoids mutating the frame in place.
df = df.drop(columns=['Name'])

### Training Model

In [39]:
from sklearn.model_selection import train_test_split

# Features are every remaining column except the target and Ticket.
# NOTE(review): any identifier-like column still present in df is passed to the model
# as a feature -- confirm that is intended.
features = df.drop(columns=['Survived', 'Ticket'])
target = df['Survived']

# Hold out 20% of the rows; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=100
)

In [40]:
# Preview the training feature matrix.
X_train.head()

In [41]:
from sklearn.linear_model import LogisticRegression
# Logistic-regression classifier using the 'liblinear' solver.
log_model = LogisticRegression(solver='liblinear')

In [42]:
# Fit the model on the (unscaled) training split.
log_model.fit(X_train,y_train)

In [43]:
# Predict labels for the held-out split.
y_pred = log_model.predict(X_test)

In [44]:
# Copy of the held-out features with the predicted and actual labels placed side by side.
X_test2 = X_test.assign(predictions=y_pred, Actual=y_test)

In [45]:
# Show only a bounded preview of the comparison frame instead of dumping every row.
X_test2.head(10)

In [46]:
from sklearn.metrics import classification_report, precision_score, recall_score, accuracy_score, f1_score, confusion_matrix

In [47]:
# Precision of the unscaled-feature model on the held-out split.
precision_score(y_test,y_pred)

In [48]:
# Accuracy of the unscaled-feature model on the held-out split.
accuracy_score(y_test,y_pred)

In [49]:
# Recall of the unscaled-feature model on the held-out split.
recall_score(y_test,y_pred)

In [50]:
# F1 score of the unscaled-feature model on the held-out split.
f1_score(y_test,y_pred)

In [51]:
# Confusion matrix of actual vs. predicted labels, drawn as an annotated heatmap.
conf_mat = confusion_matrix(y_test, y_pred)
sns.heatmap(conf_mat, annot=True, fmt='0.0f')
plt.show()

In [52]:
# Text report from classification_report for the unscaled-feature model; print() keeps its line breaks.
print(classification_report(y_test,y_pred))

Let us check whether scaling the features improves the model.

In [53]:
from sklearn.preprocessing import StandardScaler
# Scaler that is fitted on the training features in a later cell.
sc = StandardScaler()

In [54]:
from sklearn.model_selection import train_test_split

# Rebuild the split with the same columns, test size and seed as the earlier split, so
# the scaled and unscaled experiments are evaluated on the same rows.
features = df.drop(columns=['Survived', 'Ticket'])
target = df['Survived']

X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=100
)

In [55]:
# Learn the scaling parameters from the training split only, then apply them to both splits.
scaled_Xtrain = sc.fit_transform(X_train)
scaled_Xtest = sc.transform(X_test)

In [56]:
# Fresh classifier for the scaled features; rebinding log_model replaces the earlier fitted model.
log_model = LogisticRegression(solver='liblinear')

In [57]:
# Fit the model on the scaled training features.
log_model.fit(scaled_Xtrain,y_train)

In [58]:
# Predict labels for the scaled held-out features.
y_pred_1 = log_model.predict(scaled_Xtest)

In [59]:
# Precision of the scaled-feature model on the held-out split.
precision_score(y_test,y_pred_1)

In [60]:
# Accuracy of the scaled-feature model on the held-out split.
accuracy_score(y_test,y_pred_1)

In [61]:
# Recall of the scaled-feature model on the held-out split.
recall_score(y_test,y_pred_1)

In [62]:
# Confusion matrix for the scaled-feature model, drawn as an annotated heatmap.
conf_mat_scaled = confusion_matrix(y_test, y_pred_1)
sns.heatmap(conf_mat_scaled, annot=True, fmt='0.0f')
plt.show()

In [63]:
# Text report from classification_report for the scaled-feature model; print() keeps its line breaks.
print(classification_report(y_test,y_pred_1))

In [64]:
from sklearn.metrics import precision_recall_curve, plot_precision_recall_curve, plot_roc_curve

ROC Curve - Helpes you to identify the best threshold for making a decision

In [67]:
plot_roc_curve(log_model, scaled_Xtest, y_test)