In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import svm

train = pd.read_csv('titanic/train.csv', dtype={"Age": np.float64})
test = pd.read_csv('titanic/test.csv', dtype={"Age": np.float64})
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [9]:
from scipy.stats import mode
import string

def substrings_in_string(big_string, substrings):
    for substring in substrings:
        if big_string.find(substring) != -1:
            return substring
    return np.nan

In [10]:
def phase1clean(df):
    #setting silly values to nan
    df.Fare = df.Fare.map(lambda x: np.nan if x==0 else x)
    
    #Special case for cabins as nan may be signal
    df.Cabin = df.Cabin.fillna('Unknown') 
    
    df.Embarked = df.Embarked.fillna('Unknown')

    #creating a title column from name
    title_list=['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev',
                'Dr', 'Ms', 'Mlle','Col', 'Capt', 'Mme', 'Countess',
                'Don', 'Jonkheer']

    df['Title']=df['Name'].map(lambda x: substrings_in_string(x, title_list))
    
    #replacing all titles with mr, mrs, miss, master
    def replace_titles(x):
        title=x['Title']
        if title in ['Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col']:
            return 'Mr'
        elif title in ['Countess', 'Mme']:
            return 'Mrs'
        elif title in ['Mlle', 'Ms']:
            return 'Miss'
        elif title =='Dr':
            if x['Sex']=='Male':
                return 'Mr'
            else:
                return 'Mrs'
        else:
            return title
    df['Title']=df.apply(replace_titles, axis=1)

    #Turning cabin number into Deck
    cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G', 'Unknown']
    df['Deck']=df['Cabin'].map(lambda x: substrings_in_string(x, cabin_list))
        
    #Creating new family_size column
    df['Family_Size']=df['SibSp']+df['Parch']
    
    return df

def phase2clean(train, test):
    #data type dictionary
    data_type_dict={'Pclass':'ordinal', 'Sex':'nominal', 
                    'Age':'numeric', 
                    'Fare':'numeric', 'Embarked':'nominal', 'Title':'nominal',
                    'Deck':'nominal', 'Family_Size':'ordinal'}      
    for df in [train, test]:
        meanAge=np.mean(df.Age)
        df.Age=df.Age.fillna(meanAge)
    
#    Fare per person
    for df in [train, test]:
        df['Fare_Per_Person']=df['Fare']/(df['Family_Size']+1)
    
    #Age times class
    for df in [train, test]:
        df['Age*Class']=df['Age']*df['Pclass']
    
    data_type_dict['Fare_Per_Person']='numeric'
    data_type_dict['Age*Class']='numeric'
    
    return [train,test, data_type_dict]


In [11]:
traindf = phase1clean(train)
testdf = phase1clean(test)
train_cleaned, test_cleaned, data_type_dict = phase2clean(traindf,testdf)

train_cleaned.to_csv("train_cleaned.csv", index=False)
test_cleaned.to_csv("test_cleaned.csv", index=False)

In [12]:
pclass = np.array(train_cleaned['Pclass'])
familysize = np.array(train_cleaned['Family_Size'])
age = np.array(train_cleaned['Age'])
fareper = np.nan_to_num(np.array(train_cleaned['Fare_Per_Person']))
ageclass = np.array(train_cleaned['Age*Class'])
title = np.array(train_cleaned['Title'].apply(lambda x : 0 if x=='Mr' else(1 if x=='Mrs' else(2 if x=='Miss' else 3))))
sex = np.array(train_cleaned['Sex'].apply(lambda x : 1 if x=='male' else 0))
deck = np.array(train_cleaned['Deck'].apply(lambda x : 0 if x=='Unknown' else 1))
embarked = np.array(train_cleaned['Embarked'].apply(lambda x:1 if x=='S' else(2 if x=='C' else(3 if x =='Q' else 0))))
survived = np.array(train_cleaned['Survived'])

We make some plot here.

In [13]:
plt.clf()
sns.barplot(x="Sex", y="Survived", data=train, palette='Blues')
plt.savefig('images2/Sex.eps')
plt.clf()
sns.barplot(x="Family_Size", y="Survived", data=train, palette='Blues')
plt.savefig('images2/FamilySize.eps')
plt.clf()
sns.barplot(x="Title", y="Survived", data=train, palette='Blues')
plt.savefig('images2/Title.eps')
plt.clf()
sns.barplot(x="Deck", y="Survived", data=train, palette='Blues')
plt.savefig('images2/Deck.eps')
plt.clf()
sns.barplot(x="Embarked", y="Survived", data=train, palette='Blues')
plt.savefig('images2/Embarked.eps')
plt.clf()
sns.barplot(x="Pclass", y="Survived", data=train, palette='Blues')
plt.savefig('images2/Pclass.eps')

In [14]:
plt.clf()
facet = sns.FacetGrid(train_cleaned, hue='Survived')
facet.map(sns.kdeplot,'Fare_Per_Person')
facet.set(xlim=(0, 100))
facet.add_legend()
plt.savefig('images2/Fare_Per_Person.eps')
plt.clf()
facet = sns.FacetGrid(train_cleaned, hue='Survived')
facet.map(sns.kdeplot,'Age')
facet.set(xlim=(0, 80))
facet.add_legend()
plt.savefig('images2/Age.eps')

In [15]:
df = pd.DataFrame({'pclass': pclass,
                   'familysize': familysize,
                   'age': age,
                   'fareper': fareper,
                   'title': title,
                   'sex': sex,
                   'deck': deck,
                   'embarked': embarked,
                   'survived': survived
                   })
print(df.corr())
plt.clf()
sns.heatmap(df.corr(),cmap=sns.diverging_palette(220, 10, as_cmap=True))
plt.savefig('images2/correlation_map_2.eps')

              pclass  familysize       age   fareper     title       sex  \
pclass      1.000000    0.065997 -0.331339 -0.485079 -0.022762  0.131900   
familysize  0.065997    1.000000 -0.248512 -0.099173  0.394511 -0.200988   
age        -0.331339   -0.248512  1.000000  0.141649 -0.391449  0.084153   
fareper    -0.485079   -0.099173  0.141649  1.000000  0.062001 -0.115143   
title      -0.022762    0.394511 -0.391449  0.062001  1.000000 -0.689837   
sex         0.131900   -0.200988  0.084153 -0.115143 -0.689837  1.000000   
deck       -0.725541   -0.009175  0.233123  0.373966  0.072510 -0.140391   
embarked    0.050992   -0.077927  0.001932  0.075745  0.124234 -0.111249   
survived   -0.338481    0.016639 -0.069809  0.221600  0.473108 -0.543351   

                deck  embarked  survived  
pclass     -0.725541  0.050992 -0.338481  
familysize -0.009175 -0.077927  0.016639  
age         0.233123  0.001932 -0.069809  
fareper     0.373966  0.075745  0.221600  
title       0.072510  0.

In [16]:
x = np.vstack((sex,pclass,deck,familysize,title)).T
y = np.array(train['Survived'])

In [17]:
clf = svm.SVR(kernel='rbf')
clf.fit(x, y)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [18]:
test_pclass = np.array(test_cleaned['Pclass'])
test_familysize = np.array(test_cleaned['Family_Size'])
test_age = np.array(test_cleaned['Age'])
test_fareper = np.nan_to_num(np.array(test_cleaned['Fare_Per_Person']))
test_ageclass = np.array(test_cleaned['Age*Class'])
test_title = np.array(test_cleaned['Title'].apply(lambda x : 0 if x=='Mr' else(1 if x=='Mrs' else(2 if x=='Miss' else 3))))
test_sex = np.array(test_cleaned['Sex'].apply(lambda x : 1 if x=='male' else 0))
test_deck = np.array(test_cleaned['Deck'].apply(lambda x : 0 if x=='Unknown' else 1))
test_embarked = np.array(test_cleaned['Embarked'].apply(lambda x:1 if x=='S' else(2 if x=='C' else(3 if x =='Q' else 0))))

In [19]:
test_x = np.vstack((test_sex,test_pclass,test_deck,test_familysize,test_title)).T
y = clf.predict(test_x)
test_survived = []
for item in y:
    if item < 0.5:
        test_survived.append(0)
    else:
        test_survived.append(1)

In [20]:
#SVM Submission
passengerid = np.array(test['PassengerId'])
submission = pd.DataFrame({"PassengerId": passengerid, "Survived": test_survived})
submission.to_csv("submission_svm.csv", index=False)

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

x = np.vstack((sex,pclass,deck,familysize,title)).T
y = np.array(train['Survived'])
test_x = np.vstack((test_sex,test_pclass,test_deck,test_familysize,test_title)).T
linearReg = LinearRegression().fit(x, y)

y = linearReg.predict(test_x)
test_survived_linear_reg = []
for item in y:
    if item < 0.5:
        test_survived_linear_reg.append(0)
    else:
        test_survived_linear_reg.append(1)

  linalg.lstsq(X, y)


In [14]:
# Linear Reg submission
passengerid = np.array(test['PassengerId'])
submission = pd.DataFrame({"PassengerId": passengerid, "Survived": test_survived_linear_reg})
submission.to_csv("submission_LinearReg.csv", index=False)

In [15]:
from sklearn.neural_network import MLPRegressor

x = np.vstack((sex,pclass,deck,familysize,title)).T
y = np.array(train['Survived'])
NN = MLPRegressor(solver = 'lbfgs').fit(x,y)

y = NN.predict(test_x)
test_survived_NN = []
for item in y:
    if item < 0.5:
        test_survived_NN.append(0)
    else:
        test_survived_NN.append(1)

In [16]:
# NN submission
passengerid = np.array(test['PassengerId'])
submission = pd.DataFrame({"PassengerId": passengerid, "Survived": test_survived_NN})
submission.to_csv("submission_NN.csv", index=False)