In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import scipy
import sys
import re
from scipy.io import loadmat

from sklearn.model_selection import train_test_split

from scipy.optimize import minimize

from sklearn.preprocessing import PolynomialFeatures

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import StandardScaler





In [2]:
# Define function to extract titles from passenger names
def get_title(name):
    """Extract the honorific (e.g. ``Mr``, ``Miss``) from a passenger name.

    The title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period, as in
    ``"Braund, Mr. Owen Harris"`` -> ``"Mr"``.

    Parameters
    ----------
    name : str
        Passenger name to search.

    Returns
    -------
    str
        The matched title without the trailing period, or ``""`` when the
        name contains no such pattern.
    """
    # Raw string: "\." inside a plain literal is an invalid escape sequence
    # that newer Python versions warn about; the pattern itself is unchanged.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""

def transform_data(dataset, fare_fill=None, random_state=None):
    """Engineer and numerically encode the passenger features of ``dataset``.

    The frame is modified **in place**; nothing is returned.

    Columns read: ``Cabin``, ``SibSp``, ``Parch``, ``Embarked``, ``Fare``,
    ``Age``, ``Name`` and ``Sex``.

    Columns added:
      * ``Has_Cabin``  - 1 when ``Cabin`` is present, else 0.
      * ``FamilySize`` - ``SibSp + Parch + 1``.
      * ``IsAlone``    - 1 when ``FamilySize`` is 1, else 0.
      * ``Title``      - integer code of the title found by ``get_title``
        (Mr=1, Miss=2, Mrs=3, Master=4, Rare=5, anything else=0).
      * ``Age0`` .. ``Age6`` - float 0.0/1.0 indicators of the age band
        (<=10, 11-18, 19-25, 26-32, 33-48, 49-64, >64).

    Columns overwritten:
      * ``Embarked`` - missing -> ``'S'``, then S=0, C=1, Q=2.
      * ``Fare``     - missing values filled, then binned into 0..3 using the
        right-inclusive edges 7.91, 14.454 and 31.
      * ``Age``      - missing values drawn uniformly from the integers in
        ``[mean - std, mean + std)``, then truncated to int.
      * ``Sex``      - female=0, male=1.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding the columns listed above.
    fare_fill : float, optional
        Value used for missing ``Fare`` entries. Defaults to the median of
        ``dataset['Fare']``, so the function no longer depends on a global
        frame (whose ``Fare`` column may already have been binned).
    random_state : int, optional
        Seed for the age imputation. When ``None`` the global NumPy random
        state is used, as before.
    """
    # Feature that tells whether a passenger had a cabin on the Titanic.
    dataset['Has_Cabin'] = dataset['Cabin'].notnull().astype(int)

    # Feature engineering steps taken from Sina
    # FamilySize combines SibSp and Parch (+1 for the passenger themselves).
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
    dataset['IsAlone'] = (dataset['FamilySize'] == 1).astype(int)

    # Remove all NULLS in the Embarked column.
    dataset['Embarked'] = dataset['Embarked'].fillna('S')

    # Remove all NULLS in the Fare column. The fill value must be computed
    # from raw fares, i.e. before the binning further down.
    if fare_fill is None:
        fare_fill = dataset['Fare'].median()
    dataset['Fare'] = dataset['Fare'].fillna(fare_fill)

    # Impute missing ages with random integers within one standard deviation
    # of the mean age.
    age_missing = dataset['Age'].isnull()
    age_null_count = int(age_missing.sum())
    if age_null_count:
        age_avg = dataset['Age'].mean()
        age_std = dataset['Age'].std()
        low = int(age_avg - age_std)
        # randint needs low < high; guard against a (near-)zero spread.
        high = max(int(age_avg + age_std), low + 1)
        rng = np.random if random_state is None else np.random.RandomState(random_state)
        # .loc writes into the frame itself; the chained form
        # dataset['Age'][mask] = ... may silently write to a temporary copy.
        dataset.loc[age_missing, 'Age'] = rng.randint(low, high, size=age_null_count)
    dataset['Age'] = dataset['Age'].astype(int)

    # Create a new feature Title, containing the titles of passenger names.
    dataset['Title'] = dataset['Name'].apply(get_title)
    # Group all non-common titles into one single grouping "Rare".
    rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major',
                   'Rev', 'Sir', 'Jonkheer', 'Dona']
    dataset['Title'] = dataset['Title'].replace(rare_titles, 'Rare')
    # Fold alternative spellings into their common equivalent.
    dataset['Title'] = dataset['Title'].replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

    # Mapping Sex
    dataset['Sex'] = dataset['Sex'].map({'female': 0, 'male': 1}).astype(int)

    # Mapping titles; titles outside the mapping (including "") become 0.
    title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
    dataset['Title'] = dataset['Title'].map(title_mapping).fillna(0).astype(int)

    # Mapping Embarked
    dataset['Embarked'] = dataset['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)

    # Mapping Fare into four right-inclusive bins:
    # (-inf, 7.91] -> 0, (7.91, 14.454] -> 1, (14.454, 31] -> 2, (31, inf) -> 3
    fare_bins = [-np.inf, 7.91, 14.454, 31, np.inf]
    dataset['Fare'] = pd.cut(dataset['Fare'], bins=fare_bins, labels=False).astype(int)

    # Mapping Age into one indicator column per band (lower, upper]. The
    # indicators are kept as floats, matching the previous column dtype.
    age_edges = [10, 18, 25, 32, 48, 64]
    lower_bounds = [-np.inf] + age_edges
    upper_bounds = age_edges + [np.inf]
    age = dataset['Age']
    for band, (lower, upper) in enumerate(zip(lower_bounds, upper_bounds)):
        dataset['Age%d' % band] = ((age > lower) & (age <= upper)).astype(float)

# Load the Titanic passenger training and test sets from the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Keep the test passenger ids: 'PassengerId' is dropped from the features below.
ypassengerid = test['PassengerId']

# Engineer and encode the features of both frames (transform_data works in place).
transform_data(train)
transform_data(test)

# Feature selection: drop identifiers, free-text columns and the raw columns
# that transform_data has re-encoded ('Age' is covered by Age0..Age6).
drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp', 'Age']
train = train.drop(drop_elements, axis=1)
test = test.drop(drop_elements, axis=1)

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(train.head(3))

   Survived  Pclass  Sex  Parch  Fare  Embarked  Has_Cabin  FamilySize  \
0         0       3    1      0     0         0          0           2   
1         1       1    0      0     3         1          1           2   
2         1       3    0      0     1         0          0           1   

   IsAlone  Title  Age0  Age1  Age2  Age3  Age4  Age5  Age6  
0        0      1   0.0   0.0   1.0   0.0   0.0   0.0   0.0  
1        0      3   0.0   0.0   0.0   0.0   1.0   0.0   0.0  
2        1      2   0.0   0.0   0.0   1.0   0.0   0.0   0.0  


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [3]:
# Pull the 'Survived' labels out of the training frame, then expose both
# feature frames as plain arrays for the estimators.
y_train = np.array(train['Survived'])
train = train.drop('Survived', axis=1)
x_train, x_test = train.values, test.values

In [4]:
# Share of positive labels (Survived == 1) in the training data, printed as a
# reference point for the accuracies below.
print('Random : %s' % (np.sum(y_train == 1) * 1.0 / y_train.size))

# Seeded like the random forest so that repeated runs give the same scores.
mlp = MLPClassifier(activation='relu', solver='lbfgs',
                    hidden_layer_sizes=(6, 5, 5), random_state=0)

classifiers = [LogisticRegression(),
               SVC(C=0.1, kernel='linear'),
               SVC(C=1, kernel='rbf', gamma=0.1),
               RandomForestClassifier(random_state=0),
               mlp
            ]

# The hold-out split is seeded, hence identical for every classifier:
# compute it once instead of once per loop iteration.
X_train, X_test, Y_train, Y_test = train_test_split(x_train, y_train, test_size=0.2, random_state=0)

for classif in classifiers:
    print(type(classif).__name__)
    classif.fit(X_train, Y_train)

    # Despite their names, both values are accuracies in percent:
    # `precision` on the held-out 20%, `trainprec` on the fitted 80%.
    prediction = classif.predict(X_test)
    precision = np.sum(prediction == Y_test) * 100.0 / Y_test.size
    prediction = classif.predict(X_train)
    trainprec = np.sum(prediction == Y_train) * 100.0 / Y_train.size

    print('Accuracy : %s TrainPrec : %s' % (precision, trainprec))


Random : 0.383838383838
LogisticRegression
Accuracy : 79.8882681564 TrainPrec : 82.0224719101
SVC
Accuracy : 77.6536312849 TrainPrec : 82.1629213483
SVC
Accuracy : 81.0055865922 TrainPrec : 84.2696629213
RandomForestClassifier
Accuracy : 83.2402234637 TrainPrec : 90.308988764
MLPClassifier
Accuracy : 79.3296089385 TrainPrec : 82.8651685393
