### Modules Importing 

In [437]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
import os
import re
sns.set()

### Data Importing

In [438]:
print("Importing dataframes")
df_train = pd.read_csv('C:/Users/Jisnu/Downloads/Kaggle/train.csv', index_col=  'PassengerId')
df_test = pd.read_csv('C:/Users/Jisnu/Downloads/Kaggle/test.csv',index_col=  'PassengerId')
print ("")
print("Done !!!")
print('\nAll Data shape: {} Rows, {} Columns'.format(*df_train.shape))

Importing dataframes

Done !!!

All Data shape: 891 Rows, 11 Columns


### General Review

In [439]:
df_train.sample(5)
df_test_raw = df_test.copy()
df_train_raw = df_train.copy()
df_test_raw = df_test_raw.reset_index()

In [440]:
df_test.sample(5)

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1004,1,"Evans, Miss. Edith Corse",female,36.0,0,0,PC 17531,31.6792,A29,C
1075,3,"Lane, Mr. Patrick",male,,0,0,7935,7.75,,Q
998,3,"Buckley, Mr. Daniel",male,21.0,0,0,330920,7.8208,,Q
949,3,"Abelseth, Mr. Olaus Jorgensen",male,25.0,0,0,348122,7.65,F G63,S
1001,2,"Swane, Mr. George",male,18.5,0,0,248734,13.0,F,S


### Explorative Data Analysis

In [441]:
df_train.info()
print('-' * 50)
print('There are many missing values in cabin and age columns also is missing some values')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Name        891 non-null object
Sex         891 non-null object
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Ticket      891 non-null object
Fare        891 non-null float64
Cabin       204 non-null object
Embarked    889 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB
--------------------------------------------------
There are many missing values in cabin and age columns also is missing some values


In [442]:
# Dropping rows with missing values in Embarked columns
df_train.dropna(axis ='rows', subset = ['Embarked'], inplace = True)

#Convert the type to Catogorical values
df_train['Sex'] = df_train['Sex'].astype('category')
df_train['Embarked'] = df_train['Embarked'].astype('category')

In [443]:
# Dropping the Cabin column since most of the values are missing
df_train.drop(['Cabin'],axis = 1, inplace = True)
print('\n Data shape: {} Rows, {} Columns'.format(*df_train.shape))


 Data shape: 889 Rows, 10 Columns


In [444]:
print(df_train.describe())
print('-' * 50)
print('Looking at the summary, Scaling of the values is required.')

         Survived      Pclass         Age       SibSp       Parch        Fare
count  889.000000  889.000000  712.000000  889.000000  889.000000  889.000000
mean     0.382452    2.311586   29.642093    0.524184    0.382452   32.096681
std      0.486260    0.834700   14.492933    1.103705    0.806761   49.697504
min      0.000000    1.000000    0.420000    0.000000    0.000000    0.000000
25%      0.000000    2.000000   20.000000    0.000000    0.000000    7.895800
50%      0.000000    3.000000   28.000000    0.000000    0.000000   14.454200
75%      1.000000    3.000000   38.000000    1.000000    0.000000   31.000000
max      1.000000    3.000000   80.000000    8.000000    6.000000  512.329200
--------------------------------------------------
Looking at the summary, Scaling of the values is required.


In [445]:
# Checking for Class imbalance
print(df_train.groupby('Sex')['Sex'].count())
print('-' * 50)
print('There seems to be imbalance in the classes, therefore measures need to taken')

Sex
female    312
male      577
Name: Sex, dtype: int64
--------------------------------------------------
There seems to be imbalance in the classes, therefore measures need to taken


### Aligning the dataframes

In [446]:
y_train = df_train['Survived']

# Align the training and testing data, keep only columns present in both dataframes
df_train, df_test = df_train.align(df_test, join = 'inner', axis = 1)

print('Training Features shape: ', df_train.shape)
print('Testing Features shape: ', df_test.shape)


Training Features shape:  (889, 9)
Testing Features shape:  (418, 9)


In [447]:
df_train.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [448]:
df_train.drop(df_train.columns[[1,6]],axis =1, inplace =True)
df_test.drop(df_test.columns[[1,6]],axis =1, inplace =True)

### Analysis of Categorical Features

In [449]:
df_train= pd.get_dummies(df_train, drop_first = True)
df_test = pd.get_dummies(df_test, drop_first = True)

print('Training Features shape: ', df_train.shape)
print('Testing Features shape: ', df_test.shape)


Training Features shape:  (889, 8)
Testing Features shape:  (418, 8)


In [450]:
df_train.head()

Unnamed: 0_level_0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,3,22.0,1,0,7.25,1,0,1
2,1,38.0,1,0,71.2833,0,0,0
3,3,26.0,0,0,7.925,0,0,1
4,1,35.0,1,0,53.1,0,0,1
5,3,35.0,0,0,8.05,1,0,1


### Imputation and Feature Scaling

In [451]:
imputer = Imputer(missing_values='NaN', strategy='mean')
df_train = imputer.fit_transform(df_train)
df_test = imputer.fit_transform(df_test)

In [452]:
df_train = pd.DataFrame(df_train)
df_test= pd.DataFrame(df_test)
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 889 entries, 0 to 888
Data columns (total 8 columns):
0    889 non-null float64
1    889 non-null float64
2    889 non-null float64
3    889 non-null float64
4    889 non-null float64
5    889 non-null float64
6    889 non-null float64
7    889 non-null float64
dtypes: float64(8)
memory usage: 55.6 KB


### Prediction using differnt Models

In [453]:
#logistic regression
log_reg = LogisticRegression()
log_reg.fit(df_train, y_train)
y_pred = log_reg.predict(df_test)
 
# Print the score
print(log_reg.score(df_train,y_train))

0.802024746907


In [454]:
# Decision Tree
Decision_tree = DecisionTreeClassifier()
decision_tree.fit(df_train, y_train)
y_pred = decision_tree.predict(df_test)

#Print the predict score
print(decision_tree.score(df_train, y_train))

0.982002249719


In [455]:
submission = pd.DataFrame({
        "PassengerId": df_test_raw["PassengerId"],
        "Survived": y_pred
    })
submission.to_csv('titanic.csv', index=False)