# titanic

In [3]:

import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler 
from sklearn.feature_selection import chi2, SelectKBest

import statsmodels.formula.api as smf
import statsmodels.api as sm
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
%matplotlib inline

  from pandas.core import datetools


Keras does get its source of randomness from the NumPy random number generator, so this must be seeded regardless of whether you are using a Theano or TensorFlow backend.

In [7]:
from numpy.random import seed
seed(123)
from tensorflow import set_random_seed
set_random_seed(234)
train = pd.read_csv("../raw_data/train (1).csv")
test = pd.read_csv("../raw_data/test (1).csv")
full = train.append(test, ignore_index = True )

In [None]:
#full.Survived[891] #FYI check [890]
#full.tail(2)

In [8]:
titanic = full[:891]
#del train , test
titanic.columns
#titanic[:2]['Age']

Index(['Age', 'Cabin', 'Embarked', 'Fare', 'Name', 'Parch', 'PassengerId',
       'Pclass', 'Sex', 'SibSp', 'Survived', 'Ticket'],
      dtype='object')

In [12]:
test.head()
train.columns


Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [6]:
 def findnan(df, n):    
    '''n = "column"'''
    count = []
    for i in df[n]:
        if pd.isnull(i) == True:
            count.append(i)
    print('{0} = {1}'.format(n, len(count)))
    
def allnans(df):
    for i in df.columns:
        findnan(df, i)

In [7]:
allnans(full)

Age = 263
Cabin = 1014
Embarked = 2
Fare = 1
Name = 0
Parch = 0
PassengerId = 0
Pclass = 0
Sex = 0
SibSp = 0
Survived = 418
Ticket = 0


### Embarked

In [8]:
from collections import Counter
Counter(full.Embarked)

Counter({'C': 270, 'Q': 123, 'S': 914, nan: 2})

In [9]:
findnan(full, "Embarked")

Embarked = 2


In [10]:
full.Embarked = full.Embarked.fillna('S')

In [11]:
embarked = pd.get_dummies(full.Embarked, prefix='embarked', 
                          drop_first=True)

In [12]:
embarked.head(2)

Unnamed: 0,embarked_Q,embarked_S
0,0,1
1,0,0


### Sex

In [12]:
Counter(full.Sex)

Counter({'female': 466, 'male': 843})

In [13]:
sex = pd.get_dummies(full.Sex,drop_first=True)

### pclass

In [15]:
full.columns

Index(['Age', 'Cabin', 'Embarked', 'Fare', 'Name', 'Parch', 'PassengerId',
       'Pclass', 'Sex', 'SibSp', 'Survived', 'Ticket'],
      dtype='object')

In [16]:
#np.unique(full.Pclass)

In [17]:
from collections import Counter #like the R table function
Counter(full.Pclass)

Counter({1: 323, 2: 277, 3: 709})

In [14]:
pclass = pd.get_dummies(full.Pclass, prefix='pclass',
                       drop_first=True)

### Age and Fare

In [15]:
findnan(full, "Age")

Age = 263


In [20]:
full.Age = full.Age.fillna(full.Age.mean())
full.Fare = full.Fare.fillna(full.Fare.mean())

In [21]:
findnan(full, 'Fare')

Fare = 0


In [22]:
age = pd.DataFrame(full.Age)
fare = pd.DataFrame(full.Fare)


In [None]:
age.head(2)

### Title

In [23]:
import re
def name(n):
    '''n = df.column'''
    title = []
    for i in range(0, len(n)):
        x = re.sub("^(.*,[ ])|(\.[ ].*)$", "", n[i])
        title.append(x)
    return(title)

In [24]:
title= name(full.Name)

In [25]:
title_dict = {"Capt": "crew", "Col": "crew",
                    "Major":"crew","Jonkheer": "highso",
                    "Don": "highso","Sir" : "highso",
                    "Dr": "crew","Rev": "crew",
                    "the Countess":"highso","Dona": "highso",
                    "Mme": "Mrs", "Mlle": "Miss",
                    "Ms": "Miss", "Mr": "Mr",
                    "Mrs": "Mrs", "Miss": "Miss",
                    "Master": "highso", "Lady": "highso"}


In [26]:
title = [title_dict[key] for key in title]

In [27]:
print(np.unique(title)) 
title = pd.DataFrame(title)

['Miss' 'Mr' 'Mrs' 'crew' 'highso']


In [28]:
title = pd.get_dummies(title, prefix = 'title', 
                      drop_first=True)

In [29]:
title.shape

(1309, 4)

In [30]:
title.head(2)

Unnamed: 0,title_Mr,title_Mrs,title_crew,title_highso
0,1,0,0,0
1,0,1,0,0


### Cabin

cabin = pd.DataFrame()
g = lambda x: x[0]
cabin['Cabin'] = full.Cabin.fillna('U')
cabin['Cabin'] = cabin['Cabin'].map(g)
cabin = pd.get_dummies(cabin['Cabin'], prefix = 'Cabin')

In [31]:
cabin = pd.DataFrame()
g = lambda x: x[0] # just the first term
cabin = full.Cabin.fillna('U')

In [32]:
cabin = cabin.map(g)

In [33]:
cabin = pd.get_dummies(cabin, prefix = 'cabin',
                      drop_first=True)

In [34]:
cabin.head(2)

Unnamed: 0,cabin_B,cabin_C,cabin_D,cabin_E,cabin_F,cabin_G,cabin_T,cabin_U
0,0,0,0,0,0,0,0,1
1,0,1,0,0,0,0,0,0


### Cabin number

In [35]:
full.Cabin = full.Cabin.fillna(0)

In [36]:
cabin_num = pd.DataFrame(full.Cabin)

In [37]:
cabin_num.head(2)

Unnamed: 0,Cabin
0,0
1,C85


In [38]:
#cabin_num.Cabin[8] == 0

In [39]:
findnan(full, 'Cabin')

Cabin = 0


In [40]:
n = len(cabin_num)
for i in range(0, n):
    if cabin_num.Cabin[i] != 0:
        cabin_num.Cabin[i] = len(cabin_num.Cabin[i].split())
        

In [41]:
 cabin_num.head(2)

Unnamed: 0,Cabin
0,0
1,1


### Ticket

In [2]:
def tic(n):
    t = re.sub("[/,\.]", "", n).split() # can add the r[] if preferred
    for i in range(0, len(t)): 
        if t[i].isdigit() == False:
            return(len(t[i]))
        else:
            return(0)

In [43]:
ticket = pd.DataFrame(full.Ticket.map(tic))

### Family

In [44]:
family = pd.DataFrame()
family['FamilySize'] = full.Parch + full.SibSp + 1

In [45]:
def f(x):
    if x == 1:
        return 1
    else:
        return 0
    
def g(x):
    if 2 <= x <= 4:
        return 1
    else:
        return 0
def h(x):
    if x >= 5:
        return 1
    else: 
        return 0

In [46]:
family['Family_Single'] = family['FamilySize'].map(f)
family['Family_Small'] = family['FamilySize'].map(g)
family['Family_Large'] = family['FamilySize'].map(h)


In [47]:
family_n = family['FamilySize']

In [48]:
family = family.drop(['FamilySize'], axis = 1)

In [49]:
family_n.head(2)

0    2
1    2
Name: FamilySize, dtype: int64

In [50]:
mm = MinMaxScaler()

y = pd.concat([age, fare, cabin_num, family_n, ticket], axis = 1)



In [51]:
y.columns

Index(['Age', 'Fare', 'Cabin', 'FamilySize', 'Ticket'], dtype='object')

In [52]:
y = mm.fit_transform(y[['Age', 'FamilySize',
                'Cabin', 'Ticket', 'Fare']])

In [53]:
y = pd.DataFrame(y)

In [54]:
y.columns = ['Age', 'Fare', 'Cabin', 'FamilySize', 'Ticket']

In [55]:
y.head(2)

Unnamed: 0,Age,Fare,Cabin,FamilySize,Ticket
0,0.273456,0.1,0.0,0.285714,0.014151
1,0.473882,0.1,0.25,0.285714,0.139136


In [56]:
X = pd.concat([sex, title, pclass, cabin, embarked, family, y] , axis=1 )


In [57]:
X.shape

(1309, 25)

In [80]:
s = full.Survived
type(s)

pandas.core.series.Series

In [81]:
X_ = pd.concat([X, s], axis = 1)
# need to add the labels back on before sending

In [85]:
X_.tail(1)

Unnamed: 0,male,title_Mr,title_Mrs,title_crew,title_highso,pclass_2,pclass_3,cabin_B,cabin_C,cabin_D,...,embarked_S,Family_Single,Family_Small,Family_Large,Age,Fare,Cabin,FamilySize,Ticket,Survived
1308,1,0,0,0,1,0,1,0,0,0,...,0,0,1,0,0.37218,0.2,0.0,0.0,0.04364,


In [98]:
X_.to_csv('../processed_data/4_16.csv', index=False)


### Further processing

In [87]:
X_train_v = X[ 0:891 ]
y_train_v = titanic.Survived
X_test_f = X[891:]

In [88]:
print(X_train_v.shape)
np.linalg.matrix_rank(X_train_v) # needs to be full rank

(891, 25)


25

In [89]:
X_train_v = SelectKBest(chi2, k=20).fit_transform(X_train_v, y_train_v)

In [90]:
np.linalg.matrix_rank(X_train_v)

20

In [None]:
from statsmodels.api import add_constant
X_train_v = sm.tools.add_constant(X_train_v, has_constant = 'add') 


In [None]:
from scipy import stats
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)

logit = sm.Logit(y_train_v, X_train_v)
result = logit.fit_regularized()
result.summary() 

In [None]:
X_train_v.shape

In [94]:
X_train, X_test, y_train, y_test = train_test_split(X_train_v, y_train_v,
                                                    test_size = 0.2)

In [95]:
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression()
logmodel.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [96]:
predictions = logmodel.predict(X_test)

In [97]:
print(classification_report(y_test,predictions))

             precision    recall  f1-score   support

        0.0       0.86      0.92      0.89       106
        1.0       0.88      0.78      0.83        73

avg / total       0.87      0.87      0.86       179



Unfortunately, you'll need to preprocess all the test data step by step

In [None]:
passenger_id = full[891:].PassengerId

In [None]:
y_final = [ ]
for i in y_pred:
    if i >= 0.5:
        y_final.append(1)
    else:
        y_final.append(0)
        

In [None]:
test = pd.DataFrame({'PassengerId': passenger_id, 'Survived': y_final})


In [None]:
test.head()

In [None]:
test.to_csv('predictionANN_jander.csv', index=False)