In [9]:
import pixiedust
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from keras.layers import *
import sys, os, pickle
from collections import OrderedDict as ODict

# Notebook display setup. The PixieDust debugger magic below is left commented out.
# %pixie_debugger
# Render matplotlib figures inline in the cell output.
%matplotlib inline
# Seaborn defaults for every later plot: white style, notebook context, husl palette.
sns.set(style='white', context='notebook', palette='husl')

In [10]:
# Load every CSV found in ../data (relative to the working directory) into
# `datadict`, keyed by file name; `filenames` records the order of discovery.
datadir = os.path.join(os.getcwd(), os.pardir, 'data')
datadict, filenames = ODict(), []
for fname in os.listdir(datadir):
    # Skip anything that is not a CSV (checkpoint folders, notes, ...) so a
    # stray entry in the data directory cannot break the load.
    if not fname.endswith('.csv'):
        continue
    # The original test compared the list with itself (`filenames not in
    # filenames`), which is always true; the intent is to record each name once.
    if fname not in filenames:
        filenames.append(fname)
    # read_csv opens and closes the file itself, so no explicit handle is needed.
    datadict[fname] = pd.read_csv(os.path.join(datadir, fname), header=0)
datadict.keys(), filenames

(odict_keys(['train.csv', 'test.csv', 'gender_submission.csv']),
 ['train.csv', 'test.csv', 'gender_submission.csv'])

In [12]:
# %%
# Print out data, for quick look.

# Look the frames up by file name: os.listdir() returns entries in arbitrary
# order, so positional indexing into `filenames` could silently swap
# train/test/submission on another machine.
genderdata = datadict['gender_submission.csv']
traindata = datadict['train.csv']
testdata = datadict['test.csv']

print(traindata.shape[0],"Rows")
# The former `traindata.set_index('PassengerId')` call discarded its result and
# therefore did nothing; PassengerId intentionally stays a regular column
# because later cells select and join on it.
traindata.info()

891 Rows
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [16]:
# Column dtypes and non-null counts for the test set.
# The former `testdata.set_index('PassengerId')` call returned a new frame that
# was discarded, so it had no effect and has been dropped.
testdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [15]:
# Column dtypes and non-null counts for the frame loaded from gender_submission.csv.
genderdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 2 columns):
PassengerId    418 non-null int64
Survived       418 non-null int64
dtypes: int64(2)
memory usage: 6.6 KB


In [22]:
# Average number of rows per distinct value in each column
# (row count divided by the column's unique-value count).
traindata.shape[0] / traindata.nunique()

PassengerId      1.000000
Survived       445.500000
Pclass         297.000000
Name             1.000000
Sex            445.500000
Age             10.125000
SibSp          127.285714
Parch          127.285714
Ticket           1.308370
Fare             3.592742
Cabin            6.061224
Embarked       297.000000
dtype: float64

In [None]:
# Set categorical dtypes on the training frame and create a working copy.

# Snapshot the per-column null counts BEFORE converting. `missingdata` was not
# defined by any earlier cell, so the sanity check below raised NameError on a
# fresh kernel (Restart & Run All).
missingdata = traindata.isnull().sum()

# NOTE(review): 'Fare' looks continuous; treating it as a category is kept
# as-is because later cells encode it via cat.codes -- confirm this is intended.
catcolumns = ['Pclass','Sex','Ticket','Fare','Cabin','Embarked']
for _col_ in catcolumns:
    traindata[_col_] = traindata[_col_].astype('category')

traindata['Name'] = traindata.Name.astype(str)

# The dtype conversions must not change which cells are missing.
print('All OK!' if (missingdata == traindata.isnull().sum()).all() else 'Bad Op')

# Working copy without the 'Cabin' and 'Embarked' columns; the trailing
# expression shows whatever null counts remain in the other columns.
nomissing = traindata.drop(['Cabin','Embarked'],axis=1).copy()
nomissing.isnull().sum()

In [None]:
# What's in a name?

# Honorifics searched for in passenger names. Order matters: the first entry
# that occurs in a name wins.
Prefixes = [
    'Mr.', 'Mrs.', 'Master.', 'Miss.', 'Don.', 'Dr.', 'Rev.', 'Col.', 'Major.',
    'Ms.', 'Mme.', 'Lady.', 'Sir.', 'Mlle.', 'Countess.', 'Capt.', 'Jonkheer.',
]

def chk_prefix(s)->str:
    """Return the first entry of ``Prefixes`` that occurs anywhere in ``s``.

    Entries are tried in list order. If none is a substring of ``s``, the
    string is printed (so unmatched names are visible) and ``None`` is returned.
    """
    found = next((title for title in Prefixes if title in s), None)
    if found is None:
        print(s)
    return found

def extract_FamilyName(s)->str:
    """Return the part of ``s`` before its first comma (all of ``s`` if it has none)."""
    family, _sep, _rest = s.partition(',')
    return family

# Build a per-passenger frame holding the raw name plus two derived columns:
# the matched title ('Prefixes') and the text before the first comma ('FamilyNames').
name_strings = traindata['Name'].astype(str)
Namedata = traindata.loc[:, ['PassengerId', 'Name']].copy()
Namedata['Prefixes'] = name_strings.map(chk_prefix)
Namedata['FamilyNames'] = name_strings.map(extract_FamilyName)

# Distinct-value counts followed by null counts for each column.
print(Namedata.nunique(),"\n\n",Namedata.isnull().sum())

In [None]:
# Combine the feature columns with the name-derived columns, matching rows on
# PassengerId. 'Name' and 'Survived' are left out of the working frame.
name_features = Namedata.set_index('PassengerId').drop(['Name'], axis=1)
workingdf = nomissing.drop(['Name', 'Survived'], axis=1).join(name_features, on='PassengerId')
workingdf.isnull().sum()

In [None]:
# Replace each listed column with its integer category codes
# (cat.codes uses -1 for missing values).
for code_col in ['Sex', 'Pclass', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Prefixes', 'FamilyNames']:
    workingdf[code_col] = workingdf[code_col].astype('category').cat.codes
workingdf.head()

In [None]:
from fancyimpute import KNN as impKNN

# Observed age range, handed to the imputer as min_value/max_value
# (presumably bounds on the imputed values -- confirm against fancyimpute docs).
minage, maxage = workingdf.Age.min(), workingdf.Age.max()
# Every column except the identifier takes part in the imputation.
newcols = [col for col in workingdf.columns if col != 'PassengerId']

agepredictor = impKNN(k=5, min_value=minage, max_value=maxage)
imputed_matrix = agepredictor.fit_transform(workingdf.drop(['PassengerId'], axis=1))
ages_df = pd.DataFrame(data=imputed_matrix, columns=newcols, index=workingdf.index)

# Re-attach the identifier and keep the original (possibly missing) ages
# next to the imputed ones for comparison.
ages_df['PassengerId'] = workingdf['PassengerId']
ages_df['Old_Age'] = workingdf['Age']

In [None]:
# Largest and smallest Age among the rows whose original age was missing.
imputed_ages = ages_df.loc[ages_df.Old_Age.isnull(), 'Age']
imputed_ages.max(), imputed_ages.min()

In [None]:
# Predicting Cabins (W/o using predicted ages)

# Encode cabins as category codes. cat.codes marks a missing cabin with -1, so
# those entries are converted back to NaN for the imputer to fill. The result
# is assigned to the column directly: calling replace(inplace=True) on
# `workingdf.Cabin` mutates an intermediate Series and is not guaranteed to
# write through to the frame.
workingdf['Cabin'] = (
    traindata.Cabin.astype('category').cat.codes.astype('float64').replace(-1, np.nan)
)

# Integer age bucket with -1 standing in for a missing age.
# `np.int` was only an alias of the builtin int and has been removed from NumPy;
# astype(int) truncates toward zero exactly as int() did.
workingdf['Age_Class'] = workingdf.Age.fillna(-1).astype(int)

# Range of the observed cabin codes (min/max skip NaN).
mincab, maxcab = workingdf.Cabin.min(), workingdf.Cabin.max()

dropcols = ['PassengerId','Age','Sex','SibSp','Parch','Ticket','FamilyNames']
newcols = [x for x in workingdf.columns if x not in dropcols]

# Bound the imputer with the CABIN code range. The original passed
# minage/maxage here (copied from the age imputation) and left mincab/maxcab unused.
# NOTE(review): KNN yields fractional codes for a nominal variable -- confirm
# that averaging cabin codes is meaningful for the downstream use.
cab_predictor = impKNN(k=5, min_value=mincab, max_value=maxcab)
cab_df = pd.DataFrame(data=cab_predictor.fit_transform(workingdf.drop(dropcols, axis=1)),
                      columns=newcols,
                      index=workingdf.index)

# Keep the un-imputed codes alongside the imputed ones and index by passenger.
cab_df['PassengerId'] = workingdf['PassengerId']
cab_df['Orig_Cabin'] = workingdf['Cabin']
cab_df = cab_df.set_index('PassengerId')

In [None]:
# Preview the first rows of the cabin-imputation result.
cab_df.head()

In [None]:
# Is it even worth predicting cabins?
# Compare the distinct cabin values in train and test: size of each set,
# then how many appear only in train and only in test.
trncset = set(traindata.Cabin.dropna())
tstcset = set(testdata.Cabin.dropna())
len(trncset), len(tstcset), len(trncset.difference(tstcset)), len(tstcset.difference(trncset))