# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
  
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn import metrics 
from sklearn.ensemble import BaggingClassifier

# Importing Data.

In [2]:
# Read the Titanic train/test splits from the local ./Dataset folder.
train_path, test_path = "./Dataset/train.csv", "./Dataset/test.csv"
train_ds = pd.read_csv(train_path)
test_ds = pd.read_csv(test_path)

In [3]:
# Report the (rows, columns) shape of each split.
print(f"Train data shape: {train_ds.shape}\nTest data shape: {test_ds.shape}")

Train data shape: (891, 12)
Test data shape: (418, 11)


In [4]:
# Work on a deep copy so that `train_ds` itself is not modified by the cells below.
data = train_ds.copy(deep=True)

# Data Preprocessing and Exploration

- SibSp: # of sibling/spouses aboard the Titanic
- Parch: # of parents/children aboard the Titanic
- embarked: port of embarkation (Boarding port of passengers)
    - C = Cherbourg
    - Q = Queenstown
    - S = Southampton
- Pclass: Socio-economic status
    - 1st = Upper
    - 2nd = middle
    - 3rd = lower
- SibSp: Sibling/Spouse(brother, sister, stepbrother, stepsister / husband, wife)
- parch: parent(mother, father), child(daughter, son, stepdaughter, stepson)
- rest are self explanatory

In [5]:
# Preview the first five rows of the working copy.
data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Dealing with missing values

In [6]:
# Count the missing entries in every column.
data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [7]:
# The mean and the median (50% row) of Age are close to each other,
# so the missing ages are imputed with the median in a later cell.
data["Age"].describe()

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64

In [8]:
# Most Cabin entries are missing, so instead of imputing them the column is
# turned into a 1/0 flag: does the passenger have a recorded cabin or not.
data["Cabin"].describe()

count         204
unique        147
top       B96 B98
freq            4
Name: Cabin, dtype: object

In [9]:
# Missing Embarked values are filled with the mode of the column, i.e. 'S'.
data["Embarked"].describe()

count     889
unique      3
top         S
freq      644
Name: Embarked, dtype: object

In [10]:
# Impute by plain assignment rather than `Series.fillna(..., inplace=True)` on a
# column selection: that form is chained assignment, which pandas warns about
# and which does not update `data` when Copy-on-Write is enabled.
data['Age'] = data['Age'].fillna(data['Age'].median())

# Fill Embarked with its most frequent value ('S' for this training set).
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode().iloc[0])

# Cabin becomes a 1/0 "has a recorded cabin" flag. Casting the boolean mask
# avoids the implicit downcasting that `replace({True: 1, False: 0})` relies on.
data['Cabin'] = data['Cabin'].notna().astype('int64')

# Confirm that no missing values remain.
data.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

## Feature Engineering

### Numerical Variable
- PassengerId
    - We are going to drop it
- Name
    - Extract the title and drop the `Name` column
- Age
    - Convert into categorical column using bins
- Ticket
    - drop it
- Fare
    - convert into categorical variable

### Categorical Variable

- Survived
    - we have to predict this variable
- Pclass
- Sex
- Embarked
- SibSp
- Parch
- Cabin

In [11]:
def _title_of(full_name):
    """Return the title from a 'Surname, Title. Given names' string.

    Takes the text before the first '.', then the part after the first ','
    within it, with surrounding whitespace stripped.
    """
    surname_and_title = full_name.split(".")[0]
    return surname_and_title.split(",")[1].strip()


# Derive a title column from the passenger names and list the distinct titles.
data['name_s'] = data['Name'].map(_title_of)
data['name_s'].unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms',
       'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'the Countess',
       'Jonkheer'], dtype=object)

In [12]:
# Map 'Mlle' to 'Miss' and 'Mme' to 'Mrs'.
data['name_s'] = data['name_s'].replace({'Mlle': 'Miss', 'Mme': 'Mrs'})

# Titles that are lumped together under the single label 'special'.
# NOTE(review): 'Ms' is neither merged into 'Miss' nor listed here, so it
# stays a category of its own — confirm that this is intended.
s = ('Major', 'Lady', 'Sir', 'the Countess', 'Jonkheer', 'Don', 'Rev', 'Dr', 'Capt', 'Col')
data['name_s'] = data['name_s'].mask(data['name_s'].isin(s), 'special')

In [13]:
# Replace Age and Fare by their quartile index (0-3, via labels=False).
# The columns are overwritten, so re-running this cell would bin the
# already-binned codes instead of the original values.
for col in ('Age', 'Fare'):
    data[col] = pd.qcut(data[col], q=4, labels=False)

In [14]:
# Name has been reduced to `name_s`; Ticket and PassengerId are not used as features.
data = data.drop(columns=['Name', 'Ticket', 'PassengerId'])

In [15]:
# Preview the first five rows of the engineered frame.
data.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,name_s
0,0,3,male,0,1,0,0,0,S,Mr
1,1,1,female,3,1,0,3,1,C,Mrs
2,1,3,female,1,0,0,1,0,S,Miss
3,1,1,female,2,1,0,3,1,S,Mrs
4,0,3,male,2,0,0,1,0,S,Mr
