In [None]:
# Import the required packages here

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Use this to list the files present in the directory
# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

In [None]:
# Import the datasets here
# Read the train and test CSV files into two separate DataFrames.
train = pd.read_csv('/kaggle/input/titanic/train.csv')
test = pd.read_csv('/kaggle/input/titanic/test.csv')

In [None]:
# Report the (rows, columns) shape of each raw frame, train first
for frame in (train, test):
    print(frame.shape)

In [None]:
# Stack train and test into one frame so that cleaning and encoding are applied
# to both in exactly the same way. `test_flag` records where each row came from
# (0 = train, 1 = test) so the two sets can be separated again afterwards.
train['test_flag'] = 0
test['test_flag'] = 1

# ignore_index=True gives every row of the combined frame a unique label.
# Both inputs are read with the default index, so keeping their labels would
# repeat them, which makes label-aligned operations on the result ambiguous.
df_combined = pd.concat([train, test], axis=0, ignore_index=True)

### High-level EDA

In [None]:
# Percentage of missing values per column in the combined (train + test) frame
missing_pct = df_combined.isnull().sum() * 100 / len(df_combined)
print(missing_pct)

In [None]:
# Show the dtype of every column in the combined frame
df_combined.dtypes

In [None]:
# Select the working set of columns for cleaning and feature engineering.
#
# .copy() makes df_subset an independent frame. The cells below add and
# overwrite columns on it; without the copy those assignments target a
# selection of df_combined, which is what triggers pandas'
# SettingWithCopyWarning.
df_subset = df_combined[['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Name', 'Embarked', 'test_flag']].copy()

### Data cleaning and missing-value imputation for the columns

In [None]:
# categorical = train_subset[['Pclass', 'Sex', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Name']]
# continuous = train_subset[['Age', 'Fare']]

In [None]:
# Cleaning and level modifications for the categorical features

# Extract the honorific from Name: the run of letters that follows a space and
# is terminated by a period (e.g. "Mr" in "Smith, Mr. John").
# The pattern is a raw string so that `\.` reaches the regex engine as an
# escaped dot instead of being parsed as an (invalid) string escape.
# The extraction is vectorised over the whole column, so it runs exactly once.
df_subset['Title'] = df_subset['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# Cross-tabulate title against sex to see how often each title occurs
pd.crosstab(df_subset['Title'], df_subset['Sex'])

In [None]:
# Consolidate the Title levels in a single vectorised pass:
#   - the listed uncommon titles are grouped under 'Rare'
#   - 'Mlle' and 'Ms' are folded into 'Miss', and 'Mme' into 'Mrs'
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev',
               'Sir', 'Jonkheer', 'Dona']
title_map = {title: 'Rare' for title in rare_titles}
title_map.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

df_subset['Title'] = df_subset['Title'].replace(title_map)

# Mean of Survived for each consolidated title
df_subset[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()

In [None]:
# Replace each Title level with an integer code. pd.factorize numbers the
# levels from 0 in order of first appearance; the +1 shifts them to start at 1.
cols_new = ['Title']

for col in cols_new:
    codes, uniques = pd.factorize(df_subset[col])
    df_subset[col] = codes + 1

In [None]:
# print(df_subset.isnull().sum()*100/df_subset.shape[0])

In [None]:
# Name is no longer needed once Title has been derived from it
df_subset = df_subset.drop(columns=['Name'])

In [None]:
# Integer-encode Sex: levels are numbered from 1 in order of first appearance
cols_new = ['Sex']

for col in cols_new:
    int_codes = pd.factorize(df_subset[col])[0]
    df_subset[col] = int_codes + 1

In [None]:
# df_subset['Cabin'].unique()

In [None]:
# Mark a missing cabin as 'U', then use the first character of the cabin
# string as the Cabin_Class feature.
df_subset['Cabin'] = df_subset['Cabin'].fillna('U')
cabin_text = df_subset['Cabin'].astype(str)
df_subset['Cabin_Class'] = cabin_text.str.get(0)

In [None]:
# Integer-encode Cabin_Class: levels are numbered from 1 in order of first appearance
cols_new = ['Cabin_Class']

for col in cols_new:
    level_codes = pd.factorize(df_subset[col])[0]
    df_subset[col] = level_codes + 1

In [None]:
# Drop the raw Ticket and Cabin columns; only the derived Cabin_Class is kept
df_subset = df_subset.drop(columns=['Ticket', 'Cabin'])

In [None]:
# Impute missing ages with the median age of the combined frame.
median_age = df_subset["Age"].median()

# Assign the filled column back instead of calling fillna(inplace=True) on
# df_subset["Age"]: the in-place form operates on an intermediate object, so
# pandas does not guarantee that df_subset itself is updated (it warns about
# this pattern, and it is a no-op under copy-on-write).
df_subset["Age"] = df_subset["Age"].fillna(median_age)

In [None]:
# FamilySize = siblings/spouses + parents/children + the passenger themselves.
# The addition is vectorised over all rows, so one assignment is enough.
df_subset['FamilySize'] = df_subset['SibSp'] + df_subset['Parch'] + 1

In [None]:
# SibSp and Parch are now summarised by FamilySize, so drop them
df_subset = df_subset.drop(columns=['SibSp', 'Parch'])

In [None]:
# Impute missing Embarked values with the most frequent port.
#
# Series.mode() returns a Series (there can be ties), so take its first entry
# to get a scalar. Passing the Series itself to fillna would align it on index
# labels and only fill rows whose label matches, leaving other gaps untouched.
mode_embarked = df_subset["Embarked"].mode().iloc[0]

# Assign the result back rather than using inplace=True on a column selection,
# which is not guaranteed to modify df_subset.
df_subset["Embarked"] = df_subset["Embarked"].fillna(mode_embarked)

In [None]:
# Integer-encode Embarked: levels are numbered from 1 in order of first
# appearance (pd.factorize codes a missing value as -1, i.e. 0 after the shift).
cols_new = ['Embarked']

for col in cols_new:
    port_codes = pd.factorize(df_subset[col])[0]
    df_subset[col] = port_codes + 1
    
# df_subset = df_subset.drop(['Embarked'], axis = 1)

In [None]:
# Impute missing fares with the median fare. median() already skips NaN, so no
# explicit dropna() is needed. The result is assigned back because
# fillna(inplace=True) on a column selection is not guaranteed to update df_subset.
median_fare = df_subset['Fare'].median()
df_subset['Fare'] = df_subset['Fare'].fillna(median_fare)

In [None]:
# Preview the first rows of the cleaned, encoded frame
df_subset.head()

In [None]:
# Separate the cleaned frame back into the rows that came from train
# (test_flag == 0) and from test (test_flag == 1).
origin_flag = df_subset['test_flag']
train_set = df_subset[origin_flag == 0]
test_set = df_subset[origin_flag == 1]

In [None]:
# Report the shape of each part after the split, train first
for part in (train_set, test_set):
    print(part.shape)

In [None]:
# Remove the bookkeeping flag from both parts; the test part also loses Survived
test_set = test_set.drop(columns=['Survived', 'test_flag'])
train_set = train_set.drop(columns=['test_flag'])

In [None]:
# Preview the first five rows of the training part
train_set.head(5)

In [None]:
from sklearn.model_selection import train_test_split

# Feature matrix: the cleaned and engineered columns fed to the model
X = train_set[['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'Title', 'Cabin_Class',
               'FamilySize']]

# Target as a 1-D integer Series.
#   - scikit-learn estimators expect a 1-D label vector; a single-column
#     DataFrame makes fit() emit a DataConversionWarning.
#   - integer labels keep the predicted classes as 0/1 (they would be 0.0/1.0
#     if Survived was upcast to float when the test rows were appended).
# NOTE(review): assumes every training row has a Survived value; astype(int)
# raises on NaN, which would surface a labelling problem immediately.
y = train_set['Survived'].astype(int)

In [None]:
# Hold out 33% of the training rows for validation; the fixed random_state
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# Shapes of (features, target) for the fit part, then for the hold-out part
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, target.shape)

In [None]:
# Preview the first rows of the hold-out features
X_test.head()

In [None]:
from sklearn.linear_model import LogisticRegression

# max_iter is raised above the default of 100 to give the solver more room to
# converge; the features are fed in unscaled, which tends to slow convergence.
logreg = LogisticRegression(max_iter=1000)

# fit() expects a 1-D label vector. np.ravel flattens y_train whether it is a
# single-column DataFrame or already a Series, avoiding a DataConversionWarning.
logistic_model = logreg.fit(X_train, np.ravel(y_train))

# Display the fitted estimator
logistic_model

In [None]:
# Predict labels for the hold-out features and preview the first ten
predictions = logistic_model.predict(X_test)
predictions[:10]

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 on the hold-out part
holdout_report = classification_report(y_test, predictions)
print(holdout_report)

In [None]:
# Preview the first rows of the test part before scoring it
test_set.head()

In [None]:
# Same feature columns, in the same order, as the matrix the model was fitted on
test_feature = test_set[['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'Title',
                         'Cabin_Class', 'FamilySize']]

# Take PassengerId from test_set, the same frame the features come from, so
# that ids and predictions stay row-aligned.
test_id = test_set['PassengerId']

test_feature.head()

In [None]:
# Predict a label for every row of the test features with the fitted model
test_predictions = logistic_model.predict(test_feature)

In [None]:
# Build the submission frame: one row per test passenger with its predicted label.
# The predictions are cast to int so Survived is written as 0/1. The model
# returns labels in the dtype it was trained on, which is float if Survived was
# upcast when the test rows were appended (NOTE(review): confirm against y's dtype).
output = pd.DataFrame({
    'PassengerId': test_id,
    'Survived': test_predictions.astype(int),
})
output.head()

In [None]:
# Write the submission file; index=False leaves the row index out of the CSV
output.to_csv('/kaggle/working/submission.csv', index=False)