In [1]:
import pandas as pd
from sklearn import tree
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
import numpy as np

In [2]:
# some information:
# pclass: A proxy for socio-economic status (SES)
# 1st = Upper
# 2nd = Middle
# 3rd = Lower

# age: Age is fractional if less than 1. If the age is estimated, is it in the form of xx.5

# sibsp: The dataset defines family relations in this way...
# Sibling = brother, sister, stepbrother, stepsister
# Spouse = husband, wife (mistresses and fiancés were ignored)

# parch: The dataset defines family relations in this way...
# Parent = mother, father
# Child = daughter, son, stepdaughter, stepson
# Some children travelled only with a nanny, therefore parch=0 for them. 

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [4]:
full = train.append(test,ignore_index=True)

In [5]:
full['Fare']=full['Fare'].fillna(full['Fare'].median())
full['Age']=full['Age'].fillna(full['Age'].median())
full['Cabin']=full['Cabin'].fillna('U')
full['Embarked']=full['Embarked'].fillna('S')

In [6]:
# change Sec into index
# male = 0 and female = 1
full.replace('male',0,inplace=True)
full.replace('female',1,inplace=True)

In [7]:
# Embarked
df = pd.DataFrame()
df=pd.get_dummies(full['Embarked'],prefix='Embarked')
df.head()

Unnamed: 0,Embarked_C,Embarked_Q,Embarked_S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1


In [8]:
def getTitle(name):
    str1=name.split(',')[1]
    str2=str1.split('.')[0]
    str3=str2.strip()
    return str3

titleDf=pd.DataFrame()
titleDf['Title']=full['Name'].map(getTitle)
Counter(titleDf['Title'])

Counter({'Mr': 757,
         'Mrs': 197,
         'Miss': 260,
         'Master': 61,
         'Don': 1,
         'Rev': 8,
         'Dr': 8,
         'Mme': 1,
         'Ms': 2,
         'Major': 2,
         'Lady': 1,
         'Sir': 1,
         'Mlle': 2,
         'Col': 4,
         'Capt': 1,
         'the Countess': 1,
         'Jonkheer': 1,
         'Dona': 1})

In [9]:
title_Dict={}
title_Dict.update(dict.fromkeys(['Capt', 'Col', 'Major', 'Dr', 'Rev'], 'Officer'))
title_Dict.update(dict.fromkeys(['Don', 'Sir', 'the Countess', 'Dona', 'Lady','Jonkheer'], 'Royalty'))
title_Dict.update(dict.fromkeys(['Mme', 'Ms', 'Mrs'], 'Mrs'))
title_Dict.update(dict.fromkeys(['Mlle', 'Miss'], 'Miss'))
title_Dict.update(dict.fromkeys(['Mr'], 'Mr'))
title_Dict.update(dict.fromkeys(['Master'], 'Master'))

titleDf['Title']=titleDf['Title'].map(title_Dict)
titleDf = pd.get_dummies(titleDf['Title'],prefix='Title')
titleDf.head()

Unnamed: 0,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty
0,0,0,1,0,0,0
1,0,0,0,1,0,0
2,0,1,0,0,0,0
3,0,0,0,1,0,0
4,0,0,1,0,0,0


In [10]:
# Cabin
cabinDf=pd.DataFrame()
full['Cabin']=full['Cabin'].map(lambda c:c[0])

cabinDf=pd.get_dummies(full['Cabin'],prefix='Cabin')
cabinDf.head()

Unnamed: 0,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Cabin_U
0,0,0,0,0,0,0,0,0,1
1,0,0,1,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1
3,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,1


In [11]:
# # Pclass
# pclassDf=pd.DataFrame()
# pclassDf=pd.get_dummies(full['Pclass'],prefix='Pclass')
# pclassDf.head()

In [12]:
# concat SibSp and Parch into one column Family
familyDf=pd.DataFrame()
familyDf['family'] = full['SibSp']+full['Parch']
familyDf.head()

Unnamed: 0,family
0,1
1,1
2,0
3,1
4,0


In [13]:
full=pd.concat([full,df,titleDf,cabinDf,familyDf],axis=1)
full.drop(['Cabin','Embarked','Name','Ticket'],axis=1,inplace=True)
full.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,...,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_T,Cabin_U,family
0,1,0.0,3,0,22.0,1,0,7.25,0,0,...,0,0,0,0,0,0,0,0,1,1
1,2,1.0,1,1,38.0,1,0,71.2833,1,0,...,0,0,1,0,0,0,0,0,0,1
2,3,1.0,3,1,26.0,0,0,7.925,0,0,...,0,0,0,0,0,0,0,0,1,0
3,4,1.0,1,1,35.0,1,0,53.1,0,0,...,0,0,1,0,0,0,0,0,0,1
4,5,0.0,3,0,35.0,0,0,8.05,0,0,...,0,0,0,0,0,0,0,0,1,0


In [14]:
traindf = pd.concat([full[full['Survived'] == 1],full[full['Survived'] == 0]])

In [15]:
# divide into train and test dataset
Xtrain,Xtest,Ytrain,Ytest = train_test_split(traindf,traindf['Survived'])

In [16]:
Xtest.drop(columns = 'Survived',inplace = True)
Xtrain.drop(columns = 'Survived',inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [17]:
# A very simple decision Tree prediction
clf = tree.DecisionTreeClassifier(criterion="entropy")
clf = clf.fit(Xtrain,Ytrain)
Ypred=clf.predict(Xtest)
score = clf.score(Xtest,Ytest)
f1score =  f1_score(Ytest,Ypred)
print(score,f1score)

0.7488789237668162 0.6744186046511628


In [18]:
# Logistic regression prediction
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression(C=1.0,penalty='none',tol=0.01)
LR.fit(Xtrain,Ytrain)
Ypred = LR.predict(Xtest)
f1score =  f1_score(Ytest,Ypred)
score = LR.score(Xtest,Ytest)
print(score,f1score)

0.8385650224215246 0.783132530120482


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
