In [213]:
import numpy as np
import pandas as pd
from collections import Counter

In [214]:
df_train = pd.read_csv('/home/jinesh/Desktop/Titanic/train.csv')
df_test  = pd.read_csv('/home/jinesh/Desktop/Titanic/test.csv')
df_sample= pd.read_csv('/home/jinesh/Desktop/Titanic/gender_submission.csv')

In [215]:
### Define function to remove Outleirs data
def detect_outliers(df,n,features):
    """
    Takes a dataframe df of features and returns a list of the indices
    corresponding to the observations containing more than n outliers according
    to the Tukey method.
    """
    outlier_indices = []
    
    # iterate over features(columns)
    for col in features:
        # 1st quartile (25%)
        Q1 = np.percentile(df[col], 25)
        # 3rd quartile (75%)
        Q3 = np.percentile(df[col],75)
        # Interquartile range (IQR)
        IQR = Q3 - Q1
        
        # outlier step
        outlier_step = 1.5 * IQR
        
        # Determine a list of indices of outliers for feature col
        outlier_list_col = df[(df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step )].index
        
        # append the found outlier indices for col to the list of outlier indices 
        outlier_indices.extend(outlier_list_col)
        
    # select observations containing more than 2 outliers
    outlier_indices = Counter(outlier_indices)        
    multiple_outliers = list( k for k, v in outlier_indices.items() if v > n )
    
    return multiple_outliers

In [216]:
Outliers_to_drop = detect_outliers(df_train,2,["Age","SibSp","Parch","Fare"])
df_train = df_train.drop(Outliers_to_drop, axis = 0).reset_index(drop=True)

  interpolation=interpolation)


In [217]:
def impute_age(cols):
    Age = cols[0]
    Pclass = cols[1]
    
    if pd.isnull(Age):

        if Pclass == 1:
            return 37

        elif Pclass == 2:
            return 29

        else:
            return 24

    else:
        return Age

In [218]:
def impute_fare(cols):
    Fare = cols[0]
    Pclass = cols[1]
    
    if pd.isnull(Fare):

        if Pclass == 1:
            return 84

        elif Pclass == 2:
            return 20

        else:
            return 13

    else:
        return Fare

In [219]:
df_train['Age'] = df_train[['Age','Pclass']].apply(impute_age,axis=1)

In [220]:
sex = pd.get_dummies(df_train['Sex'],drop_first=True)
embark = pd.get_dummies(df_train['Embarked'],drop_first=True)
df_train = pd.concat([df_train,sex,embark],axis=1)

In [221]:
dataset_title = [i.split(",")[1].split(".")[0].strip() for i in df_train["Name"]]
df_train["Title"] = pd.Series(dataset_title)
df_train["Title"] = df_train["Title"].replace(['Lady', 'the Countess','Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
df_train["Title"] = df_train["Title"].map({"Master":0, "Miss":1, "Ms" : 1 , "Mme":1, "Mlle":1, "Mrs":1, "Mr":2, "Rare":3})
df_train["Title"] = df_train["Title"].astype(int)

In [222]:
df_train["Family"] = df_train["SibSp"] + df_train["Parch"] + 1
df_train['Single'] = df_train['Family'].map(lambda s: 1 if s == 1 else 0)
df_train['SmallF'] = df_train['Family'].map(lambda s: 1 if  s == 2  else 0)
df_train['MedF']   = df_train['Family'].map(lambda s: 1 if 3 <= s <= 4 else 0)
df_train['LargeF'] = df_train['Family'].map(lambda s: 1 if s >= 5 else 0)
df_train['Senior'] = df_train['Age'].map(lambda s:1 if s>60 else 0)
df_train.drop(labels = ["Name"], axis = 1, inplace = True)

In [223]:
dataset_title = [i.split(",")[1].split(".")[0].strip() for i in df_test["Name"]]
df_test["Title"] = pd.Series(dataset_title)
df_test["Title"] = df_test["Title"].replace(['Lady', 'the Countess','Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
df_test["Title"] = df_test["Title"].map({"Master":0, "Miss":1, "Ms" : 1 , "Mme":1, "Mlle":1, "Mrs":1, "Mr":2, "Rare":3})
df_test["Title"] = df_test["Title"].astype(int)
df_test.drop(labels = ["Name"], axis = 1, inplace = True)

In [224]:
df_test['Age'] = df_test[['Age','Pclass']].apply(impute_age,axis=1)
sex = pd.get_dummies(df_test['Sex'],drop_first=True)
embark = pd.get_dummies(df_test['Embarked'],drop_first=True)
df_test = pd.concat([df_test,sex,embark],axis=1)

df_test['Fare'].fillna(value=df_test['Fare'].median(),inplace=True)

In [225]:
df_test['Fare'] = df_test[['Fare','Pclass']].apply(impute_fare,axis=1)
df_test["Fare"] = df_test["Fare"].map(lambda i: np.log(i) if i > 0 else 0)
df_test["Family"] = df_test["SibSp"] + df_test["Parch"] + 1

In [226]:
df_test['Single'] = df_test['Family'].map(lambda s: 1 if s == 1 else 0)
df_test['SmallF'] = df_test['Family'].map(lambda s: 1 if  s == 2  else 0)
df_test['MedF']   = df_test['Family'].map(lambda s: 1 if 3 <= s <= 4 else 0)
df_test['LargeF'] = df_test['Family'].map(lambda s: 1 if s >= 5 else 0)
df_test['Senior'] = df_test['Age'].map(lambda s:1 if s>60 else 0)

In [227]:
def get_person(passenger):
    age,sex = passenger
    return 'child' if age < 16 else sex

In [228]:
df_train['Person'] = df_train[['Age','Sex']].apply(get_person,axis=1)
df_test['Person']  = df_test[['Age','Sex']].apply(get_person,axis=1)

person_dummies_train  = pd.get_dummies(df_train['Person'])
person_dummies_train.columns = ['Child','Female','Male']
person_dummies_train.drop(['Male'], axis=1, inplace=True)

person_dummies_test  = pd.get_dummies(df_test['Person'])
person_dummies_test.columns = ['Child','Female','Male']
person_dummies_test.drop(['Male'], axis=1, inplace=True)

df_train = df_train.join(person_dummies_train)
df_test  = df_test.join(person_dummies_test)

df_train.drop(['Person'],axis=1,inplace=True)
df_test.drop(['Person'],axis=1,inplace=True)

In [229]:
df_train.drop('male',axis=1,inplace=True)
df_test.drop('male',axis=1,inplace=True)

In [230]:
df_train.drop(['Cabin','Ticket'],axis = 1, inplace= True)
df_test.drop(['Ticket','Cabin'],axis = 1, inplace= True)

In [231]:
df_train.drop(['Sex','Embarked'],axis=1,inplace=True)
df_test.drop(['Sex','Embarked'],axis=1,inplace=True)


In [232]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Q,S,Title,Family,Single,SmallF,MedF,LargeF,Senior,Child,Female
0,1,0,3,22.0,1,0,7.25,0,1,2,2,0,1,0,0,0,0,0
1,2,1,1,38.0,1,0,71.2833,0,0,1,2,0,1,0,0,0,0,1
2,3,1,3,26.0,0,0,7.925,0,1,1,1,1,0,0,0,0,0,1
3,4,1,1,35.0,1,0,53.1,0,1,1,2,0,1,0,0,0,0,1
4,5,0,3,35.0,0,0,8.05,0,1,2,1,1,0,0,0,0,0,0


In [233]:
# Now slipt the data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df_train.drop('Survived',axis=1), 
                                                    df_train['Survived'], test_size=0.20, 
                                                    random_state=101)
X_train

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Q,S,Title,Family,Single,SmallF,MedF,LargeF,Senior,Child,Female
247,253,1,62.0,0,0,26.5500,0,1,2,1,1,0,0,0,1,0,0
607,615,3,35.0,0,0,8.0500,0,1,2,1,1,0,0,0,0,0,0
299,305,3,24.0,0,0,8.0500,0,1,2,1,1,0,0,0,0,0,0
527,535,3,30.0,0,0,8.6625,0,1,1,1,1,0,0,0,0,0,1
113,116,3,21.0,0,0,7.9250,0,1,2,1,1,0,0,0,0,0,0
389,397,3,31.0,0,0,7.8542,0,1,1,1,1,0,0,0,0,0,1
27,29,3,24.0,0,0,7.8792,1,0,1,1,1,0,0,0,0,0,1
191,196,1,58.0,0,0,146.5208,0,0,1,1,1,0,0,0,0,0,1
630,638,2,31.0,1,1,26.2500,0,1,2,3,0,0,1,0,0,0,0
856,867,2,27.0,1,0,13.8583,0,0,1,2,0,1,0,0,0,0,1


In [234]:
from sklearn.ensemble import RandomForestClassifier

forest_clf = RandomForestClassifier(n_estimators=750,max_depth=6,min_samples_split=3)
forest_clf.fit(X_train,y_train)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=6, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=3,
            min_weight_fraction_leaf=0.0, n_estimators=750, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [235]:
forest_clf.score(X_test,y_test)

0.8531073446327684

In [236]:
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier(weights='distance',n_neighbors=12)

In [237]:
knn_clf.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=12, p=2,
           weights='distance')

In [238]:
knn_clf.score(X_test,y_test)

0.6440677966101694

In [239]:
data = forest_clf.predict(X_test)
X_test.columns

Index(['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Q', 'S',
       'Title', 'Family', 'Single', 'SmallF', 'MedF', 'LargeF', 'Senior',
       'Child', 'Female'],
      dtype='object')

In [256]:
df_test.columns

Index(['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Title', 'Q',
       'S', 'Family', 'Single', 'SmallF', 'MedF', 'LargeF', 'Senior', 'Child',
       'Female'],
      dtype='object')

In [257]:
data =forest_clf.predict(df_test)

In [289]:
df_final = df_test.copy()

In [290]:
df_final["Survived"] = data

In [291]:
df_final

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Title,Q,S,Family,Single,SmallF,MedF,LargeF,Senior,Child,Female,Survived
0,892,3,34.5,0,0,2.057860,2,1,0,1,1,0,0,0,0,0,0,0
1,893,3,47.0,1,0,1.945910,1,0,1,2,0,1,0,0,0,0,1,0
2,894,2,62.0,0,0,2.270836,2,1,0,1,1,0,0,0,1,0,0,0
3,895,3,27.0,0,0,2.159003,2,0,1,1,1,0,0,0,0,0,0,0
4,896,3,22.0,1,1,2.508582,1,0,1,3,0,0,1,0,0,0,1,0
5,897,3,14.0,0,0,2.221917,2,0,1,1,1,0,0,0,0,1,0,1
6,898,3,30.0,0,0,2.031983,1,1,0,1,1,0,0,0,0,0,1,0
7,899,2,26.0,1,1,3.367296,2,0,1,3,0,0,1,0,0,0,0,1
8,900,3,18.0,0,0,1.978128,1,0,0,1,1,0,0,0,0,0,1,1
9,901,3,21.0,2,0,3.184284,2,0,1,3,0,0,1,0,0,0,0,1


In [292]:
df_final.drop(['Pclass','Age','SibSp','Parch','Fare','Title','Q','S','Family','Single','SmallF','MedF','LargeF','Senior','Child','Female'],axis=1,inplace=True)

In [293]:
df_final.index = df_final.index + 1

In [295]:
df_final.to_csv("/home/jinesh/Desktop/submission.csv", index=False)