In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from logistic import logistic_regression
from model_eval import model_eval
%matplotlib inline 

In [18]:
# Read the Titanic train/test CSV files from the working directory.
train_data, test_data = (
    pd.read_csv(csv_name, delimiter=',') for csv_name in ('train.csv', 'test.csv')
)

In [19]:
'''
Turn categorical values into one-hot columns and convert binary values to 0/1.
Initial experiment: drop Name and Ticket, which makes linear regression easier.
The cluster analysis should include such features to increase information gain.
''' 
def clean_linear_reg(train_data):
    """Turn the raw Titanic frame into an all-numeric feature table.

    Steps, in order:
      * 'Cabin' is reduced to its first character,
      * 'Sex' becomes 1 for "male" and 0 for anything else,
      * 'Name', 'Ticket' and 'PassengerId' are dropped,
      * remaining text columns are one-hot encoded with ``pd.get_dummies``,
      * missing 'Age' values are replaced by the column mean,
      * 'Survived' is moved to the last column position.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Must contain the columns 'Cabin', 'Sex', 'Name', 'Ticket',
        'PassengerId', 'Age' and 'Survived'. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the features first and 'Survived' last.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    temp = train_data.copy()
    # Keep only the first character of the cabin string. A missing cabin that
    # is stored as NaN stringifies to 'nan', so it ends up in its own 'n' group.
    temp['Cabin'] = temp['Cabin'].apply(lambda x: str(x)[:1])
    # Plain column assignment replaces the column (and its dtype). Writing
    # through .loc[:, 'Sex'] may keep the object dtype on newer pandas, which
    # would make get_dummies split Sex into two columns.
    temp['Sex'] = temp['Sex'].apply(lambda x: 1 if x == "male" else 0)
    # axis is passed by keyword: the positional form is rejected by pandas 2.
    temp = temp.drop(['Name', 'Ticket', 'PassengerId'], axis=1)
    # Apply one hot encoding for categorical data
    temp = pd.get_dummies(temp)
    # Newer pandas emits boolean dummies; store them as 0/1 integers so that
    # arithmetic on the columns (e.g. min-max scaling) keeps working.
    bool_cols = temp.select_dtypes(include='bool').columns
    temp = temp.astype({col: 'uint8' for col in bool_cols})
    # Exclusive to the Titanic data: fill missing ages with the mean age.
    # Assign the result back instead of a chained inplace fill, which is not
    # guaranteed to write through to the frame.
    temp['Age'] = temp['Age'].fillna(temp['Age'].mean())
    # Move the label to the end by name rather than assuming it is column 0.
    cols = [col for col in temp.columns if col != 'Survived']
    cols.append('Survived')
    return temp[cols]

def normalize(df):
    """Min-max scale every column of ``df`` into the range [0, 1].

    Each column is rescaled independently as ``(x - min) / (max - min)``.
    A column whose maximum equals its minimum carries no information and
    would divide by zero, so it is mapped to 0.0 instead of NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns can all be converted to float (numeric or
        boolean). The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the same index and columns, holding floats.
    """
    result = df.copy()
    for feature_name in df.columns:
        # Cast first: boolean columns do not support subtraction.
        column = df[feature_name].astype(float)
        min_value = column.min()
        value_range = column.max() - min_value
        if value_range == 0:
            # Constant column: avoid 0/0 producing an all-NaN feature.
            result[feature_name] = 0.0
        else:
            result[feature_name] = (column - min_value) / value_range
    return result

def split_data(data, percent):
    """Cut ``data`` row-wise into a leading and a trailing part.

    ``percent`` is the share of rows (0-100) that goes into the first part;
    the row count is truncated towards zero. Row order is preserved and no
    shuffling takes place.

    Returns a ``(first_part, second_part)`` tuple of frames.
    """
    cutoff = int(len(data) * percent * 0.01)
    first_part = data.iloc[:cutoff, :]
    second_part = data.iloc[cutoff:, :]
    return first_part, second_part

In [20]:
# Pipeline: encode/clean -> min-max scale -> ordered 80/20 split.
# NOTE: scaling runs on the full frame before the split, so the hold-out rows
# share their min/max statistics with the training rows.
clean_data = clean_linear_reg(train_data)
norm_data = normalize(clean_data)
clean_train_norm, clean_test_norm = split_data(norm_data, percent=80)

In [21]:
# Sanity check: (rows, columns) of the training and hold-out partitions.
# print(...) with a single argument behaves the same on Python 2 and 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(clean_train_norm.shape)
print(clean_test_norm.shape)

(712, 19)
(179, 19)


In [23]:
# scikit-learn estimators and helpers used by the comparison cells below.
# train_test_split / cross_val_score are imported from sklearn.model_selection:
# the old sklearn.cross_validation module is no longer shipped with scikit-learn.
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn import svm
from sklearn import neural_network

In [24]:
# scikit-learn baseline: every column but the last is a feature, the last
# column is the label.
model = LogisticRegression()
model.fit(X=clean_train_norm.iloc[:, :-1], y=clean_train_norm.iloc[:, -1])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [25]:
# Score of the scikit-learn baseline on the hold-out partition.
model.score(X=clean_test_norm.iloc[:, :-1], y=clean_test_norm.iloc[:, -1])

0.82122905027932958

In [26]:
# Same training partition, this time fed to the project's own implementation
# imported from the logistic module. Note that `model` is rebound here.
X = clean_train_norm.iloc[:, :-1]
Y = clean_train_norm.iloc[:, -1]
model = logistic_regression()
model.fit(X, Y)

In [27]:
# Wrap a newly constructed logistic_regression instance in the model_eval
# helper (both come from project modules imported at the top of the notebook).
temp = model_eval(logistic_regression())

In [28]:
# Cross-validate on the training partition (X, Y).
# NOTE(review): presumably the first argument is the number of folds, and the
# 237 printed below looks like the fold size (712 // 3) - confirm in model_eval.
temp.k_cross_validation(3,X,Y)

237


0.810126582278481

Logistic regression performs well on this basic classification task after normalization and conversion of the features to numeric values.
Missing (NA) values are filled with the mean of the column. The performance of the model does not seem to be affected by the amount of missing data, which suggests that the logistic model handles noise well.

In [29]:
#Decision tree classifier
# random_state is pinned so that re-running the cell reproduces the same tree
# and score; without it the fitted tree can differ from run to run.
decision_tree_classifer = tree.DecisionTreeClassifier(random_state=0)
decision_tree_classifer.fit(clean_train_norm.iloc[:,:-1], clean_train_norm.iloc[:,-1])
decision_tree_classifer.score(clean_test_norm.iloc[:,:-1], clean_test_norm.iloc[:,-1])

0.82681564245810057

In [30]:
# k-nearest neighbours with k = 3: fit on the training partition, then
# report the score on the hold-out partition.
knn_classifer = KNeighborsClassifier(n_neighbors=3)
knn_classifer.fit(X=clean_train_norm.iloc[:, :-1], y=clean_train_norm.iloc[:, -1])
knn_classifer.score(X=clean_test_norm.iloc[:, :-1], y=clean_test_norm.iloc[:, -1])

0.83798882681564246

In [31]:
# Support vector classifier with penalty parameter C = 100: fit on the
# training partition, then report the score on the hold-out partition.
svm_classifer = svm.SVC(C=100)
svm_classifer.fit(X=clean_train_norm.iloc[:, :-1], y=clean_train_norm.iloc[:, -1])
svm_classifer.score(X=clean_test_norm.iloc[:, :-1], y=clean_test_norm.iloc[:, -1])

0.84357541899441346

In [34]:
# Small neural network with a single hidden layer of 5 units.
# random_state pins the random weight initialisation so that the score is
# repeatable between runs.
# NOTE: per the scikit-learn documentation, learning_rate is only used when
# solver='sgd'; with the default solver it has no effect. It is kept here
# unchanged from the original call.
ann_classifer = neural_network.MLPClassifier(hidden_layer_sizes=(5,), alpha=0.001,
                                             learning_rate='adaptive', random_state=0)
ann_classifer.fit(clean_train_norm.iloc[:,:-1], clean_train_norm.iloc[:,-1])
ann_classifer.score(clean_test_norm.iloc[:,:-1], clean_test_norm.iloc[:,-1])



0.81005586592178769