In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import svm

# Read the training and test CSVs from the working directory, forcing the
# Age column to float64 in both frames.
train, test = (
    pd.read_csv(csv_name, dtype={"Age": np.float64})
    for csv_name in ('train.csv', 'test.csv')
)

In [3]:
from scipy.stats import mode
import string

def substrings_in_string(big_string, substrings):
    """Return the first entry of ``substrings`` that occurs inside ``big_string``.

    Candidates are tried in the order given, so when one candidate is
    contained in another (e.g. 'Mr' inside 'Mrs') the longer one has to be
    listed first in order to be returned.

    Parameters
    ----------
    big_string : str
        Text to search. Any non-string value (for example the float NaN that
        pandas uses for a missing cell, or None) is treated as "no match"
        instead of raising AttributeError.
    substrings : iterable of str
        Candidate substrings, in priority order.

    Returns
    -------
    str or float
        The first matching candidate, or ``np.nan`` when nothing matches.
    """
    # Missing cells arrive as float NaN / None and have no string methods.
    if not isinstance(big_string, str):
        return np.nan
    return next(
        (candidate for candidate in substrings if candidate in big_string),
        np.nan,
    )

In [4]:
def phase1clean(df):
    """First cleaning pass: normalise missing values and derive new columns.

    The frame is modified in place and also returned, so the caller's object
    and the return value are the same DataFrame.

    Steps, in order:
      * a Fare of exactly 0 becomes NaN;
      * missing Cabin and Embarked values become the string 'Unknown';
      * Title  - first entry of the title list found in Name, then collapsed
        to one of Mr / Mrs / Miss / Master (NaN if no title was found);
      * Deck   - first deck letter found in Cabin, or 'Unknown';
      * Family_Size = SibSp + Parch.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns Fare, Cabin, Embarked, Name, Sex, SibSp
        and Parch.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with Title, Deck and Family_Size added.
    """
    # A fare of exactly 0 is treated as a missing value.
    df['Fare'] = df['Fare'].map(lambda fare: np.nan if fare == 0 else fare)

    # A missing cabin may itself be signal, so keep it as its own category.
    df['Cabin'] = df['Cabin'].fillna('Unknown')
    df['Embarked'] = df['Embarked'].fillna('Unknown')

    # Order matters: the search returns the first hit, so 'Mrs' has to be
    # listed before 'Mr'.
    title_list = ['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev',
                  'Dr', 'Ms', 'Mlle', 'Col', 'Capt', 'Mme', 'Countess',
                  'Don', 'Jonkheer']
    df['Title'] = df['Name'].map(lambda name: substrings_in_string(name, title_list))

    # Rare titles that collapse to a fixed common title regardless of sex.
    fixed_title = {
        'Don': 'Mr', 'Major': 'Mr', 'Capt': 'Mr',
        'Jonkheer': 'Mr', 'Rev': 'Mr', 'Col': 'Mr',
        'Countess': 'Mrs', 'Mme': 'Mrs',
        'Mlle': 'Miss', 'Ms': 'Miss',
    }

    def replace_titles(row):
        """Collapse one row's title to Mr / Mrs / Miss / Master."""
        title = row['Title']
        if title == 'Dr':
            # 'Dr' is the only title resolved by sex. The comparison is
            # case-insensitive: the feature encoding elsewhere in this
            # notebook tests Sex against lowercase 'male', so a check
            # against 'Male' never matched and every doctor became 'Mrs'.
            is_male = str(row['Sex']).strip().lower() == 'male'
            return 'Mr' if is_male else 'Mrs'
        # Common titles (and NaN for "no title found") pass through unchanged.
        return fixed_title.get(title, title)

    df['Title'] = df.apply(replace_titles, axis=1)

    # Deck is the first deck letter found in the cabin string; the
    # 'Unknown' placeholder inserted above maps to itself.
    cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G', 'Unknown']
    df['Deck'] = df['Cabin'].map(lambda cabin: substrings_in_string(cabin, cabin_list))

    # Siblings/spouses plus parents/children (the passenger is not counted).
    df['Family_Size'] = df['SibSp'] + df['Parch']

    return df

def phase2clean(train, test):
    """Second cleaning pass: impute Age and add two engineered numeric columns.

    Both frames are modified in place. For each frame independently:
      * missing Age values are replaced by that frame's own mean Age;
      * Fare_Per_Person = Fare / (Family_Size + 1)  (NaN where Fare is NaN);
      * Age*Class       = Age * Pclass.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing Age, Fare, Family_Size and Pclass columns.

    Returns
    -------
    list
        ``[train, test, data_type_dict]`` where ``data_type_dict`` maps each
        feature column name to 'ordinal', 'nominal' or 'numeric'.
    """
    for frame in (train, test):
        # Series.mean skips NaN, so the mean is taken over known ages only.
        frame['Age'] = frame['Age'].fillna(frame['Age'].mean())
        # +1 counts the passenger as well as the relatives in Family_Size.
        frame['Fare_Per_Person'] = frame['Fare'] / (frame['Family_Size'] + 1)
        frame['Age*Class'] = frame['Age'] * frame['Pclass']

    data_type_dict = {
        'Pclass': 'ordinal',
        'Sex': 'nominal',
        'Age': 'numeric',
        'Fare': 'numeric',
        'Embarked': 'nominal',
        'Title': 'nominal',
        'Deck': 'nominal',
        'Family_Size': 'ordinal',
        'Fare_Per_Person': 'numeric',
        'Age*Class': 'numeric',
    }

    return [train, test, data_type_dict]


In [37]:
# Run both cleaning passes. phase1clean and phase2clean modify the frames in
# place and return them, so traindf/train_cleaned refer to the same object as
# `train` (likewise for the test frame).
traindf, testdf = phase1clean(train), phase1clean(test)
train_cleaned, test_cleaned, data_type_dict = phase2clean(traindf, testdf)

# Persist the cleaned frames next to the notebook, without the index column.
for cleaned_frame, out_path in (
    (train_cleaned, "train_cleaned.csv"),
    (test_cleaned, "test_cleaned.csv"),
):
    cleaned_frame.to_csv(out_path, index=False)

In [62]:
# Turn the cleaned training columns into 1-D NumPy feature arrays.

# Columns that are already numeric are copied out unchanged.
pclass, familysize, age, ageclass = (
    np.array(train_cleaned[column])
    for column in ('Pclass', 'Family_Size', 'Age', 'Age*Class')
)
# Fare_Per_Person is NaN wherever Fare was missing; nan_to_num turns those into 0.0.
fareper = np.nan_to_num(np.array(train_cleaned['Fare_Per_Person']))

# Integer codes for the categorical columns.
_train_title_codes = {'Mr': 0, 'Mrs': 1, 'Miss': 2}    # anything else -> 3
_train_embarked_codes = {'S': 1, 'C': 2, 'Q': 3}       # anything else -> 0
title = np.array(train_cleaned['Title'].apply(lambda t: _train_title_codes.get(t, 3)))
sex = np.array(train_cleaned['Sex'].apply(lambda s: int(s == 'male')))          # male=1, otherwise 0
deck = np.array(train_cleaned['Deck'].apply(lambda d: int(d != 'Unknown')))     # 1 when a deck is known
embarked = np.array(train_cleaned['Embarked'].apply(lambda e: _train_embarked_codes.get(e, 0)))

In [63]:
# Design matrix: one row per training passenger, columns in the order
# sex, pclass, deck, familysize, title.
x = np.stack((sex, pclass, deck, familysize, title), axis=0).T
# Target vector: the Survived column of the training frame.
y = train['Survived'].values.copy()

In [8]:
# Fit an RBF-kernel support-vector *regressor* on the Survived labels; the
# continuous predictions are thresholded at 0.5 further down.
# gamma is pinned to 'auto' (1 / n_features). The recorded output of this cell
# shows gamma='auto_deprecated', i.e. the run relied on a library default that
# later scikit-learn releases changed to 'scale'. Setting it explicitly keeps
# the recorded behaviour and avoids the deprecation warning.
clf = svm.SVR(kernel='rbf', gamma='auto')
clf.fit(x, y)



SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False)

In [40]:
# Encode the cleaned test columns with the same scheme used for the training
# features, producing 1-D NumPy arrays.

# Columns that are already numeric are copied out unchanged.
test_pclass, test_familysize, test_age, test_ageclass = (
    np.array(test_cleaned[column])
    for column in ('Pclass', 'Family_Size', 'Age', 'Age*Class')
)
# Fare_Per_Person is NaN wherever Fare was missing; nan_to_num turns those into 0.0.
test_fareper = np.nan_to_num(np.array(test_cleaned['Fare_Per_Person']))

# Integer codes for the categorical columns.
_test_title_codes = {'Mr': 0, 'Mrs': 1, 'Miss': 2}     # anything else -> 3
_test_embarked_codes = {'S': 1, 'C': 2, 'Q': 3}        # anything else -> 0
test_title = np.array(test_cleaned['Title'].apply(lambda t: _test_title_codes.get(t, 3)))
test_sex = np.array(test_cleaned['Sex'].apply(lambda s: int(s == 'male')))          # male=1, otherwise 0
test_deck = np.array(test_cleaned['Deck'].apply(lambda d: int(d != 'Unknown')))     # 1 when a deck is known
test_embarked = np.array(test_cleaned['Embarked'].apply(lambda e: _test_embarked_codes.get(e, 0)))

In [60]:

# Test design matrix, same column order as the training matrix:
# sex, pclass, deck, familysize, title.
test_x = np.stack(
    (test_sex, test_pclass, test_deck, test_familysize, test_title), axis=0
).T
# NOTE: this rebinds `y` - from here on it holds SVR predictions for the test
# set, not the training labels.
y = clf.predict(test_x)
# Cut the continuous output at 0.5: below -> 0, otherwise -> 1.
test_survived = [0 if score < 0.5 else 1 for score in y]

In [42]:
# SVM submission: one row per test passenger, pairing PassengerId with the
# thresholded SVR prediction; written without the index column.
passengerid = test['PassengerId'].values.copy()
submission = pd.DataFrame(
    {"PassengerId": passengerid, "Survived": test_survived},
    columns=["PassengerId", "Survived"],
)
submission.to_csv("submission_svm.csv", index=False)

In [83]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

# Ordinary least-squares regression on the same five features
# (sex, pclass, deck, familysize, title), thresholded at 0.5.
x = np.stack((sex, pclass, deck, familysize, title), axis=0).T
y = train['Survived'].values.copy()
test_x = np.stack(
    (test_sex, test_pclass, test_deck, test_familysize, test_title), axis=0
).T

linearReg = LinearRegression()
linearReg.fit(x, y)

# NOTE: `y` is rebound here - it now holds the test-set predictions.
y = linearReg.predict(test_x)
# Below 0.5 -> 0, otherwise -> 1.
test_survived_linear_reg = [0 if score < 0.5 else 1 for score in y]

(891, 5)
(418, 5)


In [84]:
# Linear-regression submission: PassengerId next to the thresholded
# LinearRegression prediction; written without the index column.
passengerid = test['PassengerId'].values.copy()
submission = pd.DataFrame({"PassengerId": passengerid}).assign(
    Survived=test_survived_linear_reg
)
submission.to_csv("submission_LinearReg.csv", index=False)

In [85]:
from sklearn.neural_network import MLPRegressor

# Multi-layer-perceptron regressor on the same five features
# (sex, pclass, deck, familysize, title), thresholded at 0.5.
x = np.vstack((sex, pclass, deck, familysize, title)).T
y = np.array(train['Survived'])

# random_state is fixed because the network's weights are initialised
# randomly; without a seed every run of this cell produces different
# predictions and therefore a different submission file.
NN = MLPRegressor(solver='lbfgs', random_state=0).fit(x, y)

# test_x is the test design matrix built in an earlier cell.
# NOTE: `y` is rebound here - it now holds the test-set predictions.
y = NN.predict(test_x)
# Below 0.5 -> 0, otherwise -> 1.
test_survived_NN = [0 if score < 0.5 else 1 for score in y]

In [86]:
# Neural-network submission: PassengerId next to the thresholded
# MLPRegressor prediction; written without the index column.
passengerid = test['PassengerId'].values.copy()
submission_columns = {"PassengerId": passengerid, "Survived": test_survived_NN}
submission = pd.DataFrame(submission_columns)
submission.to_csv("submission_NN.csv", index=False)