In [1]:
import pandas as pd
import pylab as plt
%matplotlib inline
import numpy as np
import math

# Load the training set (path is relative to the notebook's working directory).
df=pd.read_csv('data/train.csv', sep=',')

# Columns fed to the model. All of them are derived columns that
# feature_engineering() adds to the frame, so it must be run on `df` first.
features = ['Fare_s','Sex_', 'Title_s', 'Pclass_s', 'Age_s','HasAge','Parch_s']
# Alternative feature sets, currently disabled.
# NOTE(review): 'Age_s_e' is not created anywhere in the visible notebook.
#features = ['Sex_', 'Title_s', 'Pclass_s', 'Age_s']
#features = ['Sex_', 'Age_s_e']

from sklearn.linear_model import LogisticRegression
# Shared baseline classifier. Class 0 is weighted 0.7 and class 1 gets the
# complement; later cells overwrite `class_weight` on this same object.
lr_model = LogisticRegression(C=0.1, random_state=0, n_jobs=-1, class_weight={0: 0.7, 1: 1-0.7})

from sklearn.preprocessing import PolynomialFeatures
# Degree-5 polynomial expansion applied to `features` before every fit/predict.
poly = PolynomialFeatures(5)

from sklearn.metrics import accuracy_score
from sklearn.cross_validation import StratifiedKFold

from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score


## Transform features

In [2]:
def feature_engineering(_df,title_mapping):
    """Add the scaled model features to ``_df`` **in place**.

    The raw ``Fare``, ``SibSp`` and ``Parch`` columns are overwritten with
    their capped versions, and these columns are added: ``Sex_``, ``Fare_s``,
    ``Pclass_s``, ``SibSp_s``, ``Parch_s``, ``HasAge``, ``Age_``, ``Age_s``,
    ``Age_with_Sex``, ``Title`` and ``Title_s``.

    Min/max scaling uses the minimum and maximum of the frame that is passed
    in, so a column with a single distinct value yields NaN/inf.

    Parameters
    ----------
    _df : pandas.DataFrame
        Must contain the columns ``Sex``, ``Fare``, ``Pclass``, ``SibSp``,
        ``Parch``, ``Age`` and ``Name``. Modified in place.
    title_mapping : dict
        Maps a title string (e.g. ``'Mr'``) to a numeric score. Titles that
        are not in the mapping get the score 0.

    Returns
    -------
    None
    """
    def _title_of(name):
        # Second piece of the name once it is split on ',' and '.'
        return name.replace('.',',').split(',')[1].strip()

    def _age_bucket(age):
        # Collapse an age into one of four representative values.
        if age <= 16:
            return 1.0
        if age <= 29:
            return 22.0
        if age <= 46:
            return 40.0
        return 60.0

    # 'female' -> 0.1, anything else -> 1
    _df['Sex_'] = _df['Sex'].apply(lambda x: 0.1 if x=='female' else 1)

    # Missing fares become 10, then everything above 40 is capped at 40.
    _df['Fare'] = _df['Fare'].fillna(10).clip(upper=40)
    fareMax = _df['Fare'].max()
    fareMin = _df['Fare'].min()
    # float(...) keeps this a true division even for integer columns under
    # Python 2, where int / int would otherwise truncate.
    _df['Fare_s'] = (_df['Fare']-fareMin)/float(fareMax-fareMin)

    _df['Pclass_s'] = _df['Pclass']/3.0

    # Family counts are capped at 3 and scaled by the (capped) maximum.
    _df['SibSp'] = _df['SibSp'].clip(upper=3)
    _df['SibSp_s'] = _df['SibSp']/float(_df['SibSp'].max())

    _df['Parch'] = _df['Parch'].clip(upper=3)
    _df['Parch_s'] = _df['Parch']/float(_df['Parch'].max())

    # 1 when the age is known, 0 when it is missing.
    _df['HasAge'] = _df['Age'].notnull().astype(int)

    # Missing ages are treated as 18 before bucketing (so they land in the 22 bucket).
    _df['Age_'] = _df['Age'].fillna(18).apply(_age_bucket)
    ageMax = _df['Age_'].max()
    ageMin = _df['Age_'].min()
    _df['Age_s'] = (_df['Age_']-ageMin)/float(ageMax-ageMin)

    _df['Age_with_Sex'] = _df['Age_s'] + _df['Sex_']

    _df['Title'] = _df['Name'].apply(_title_of)
    _df['Title_s'] = _df['Title'].map(title_mapping).fillna(0)
    
########################



## Feature engineering

In [3]:
# Title = second piece of the name once it is split on ',' and '.'
df['Title'] = df['Name'].apply(lambda name: name.replace('.',',').split(',')[1].strip())

# For every title seen in the training data, store the fraction of
# passengers carrying it who did NOT survive.
title_mapping = {}
for title, passengers in df.groupby('Title'):
    survived = passengers['Survived']
    title_mapping[title] = 1 - survived.sum()/float(len(survived))

# Adds the derived feature columns to df in place.
feature_engineering(df, title_mapping)

title_mapping

{'Capt': 1.0,
 'Col': 0.5,
 'Don': 1.0,
 'Dr': 0.5714285714285714,
 'Jonkheer': 1.0,
 'Lady': 0.0,
 'Major': 0.5,
 'Master': 0.42500000000000004,
 'Miss': 0.30219780219780223,
 'Mlle': 0.0,
 'Mme': 0.0,
 'Mr': 0.8433268858800773,
 'Mrs': 0.20799999999999996,
 'Ms': 0.0,
 'Rev': 1.0,
 'Sir': 0.0,
 'the Countess': 0.0}

## Study Model

### Random shuffle validation - (don't use this, use the KFold CV below instead)

In [4]:
def correct_rate(lr_model, number_run=10):
    """Average accuracy of ``lr_model`` over repeated random ~80/20 splits.

    Reads the module-level ``df``, ``features`` and ``poly``. The splits come
    from ``np.random.rand`` without a seed, so results differ between calls.
    ``lr_model`` is refitted on every split and is left fitted on the last one.

    Parameters
    ----------
    lr_model : estimator exposing ``fit`` and ``score``
    number_run : int, optional
        Number of random splits to average over (default 10).

    Returns
    -------
    tuple
        ``(mean validation accuracy, mean training accuracy)``.
    """
    validation_total = 0.0
    train_total = 0.0

    for _ in range(number_run):
        # Each row goes to the training part with probability 0.8.
        msk = np.random.rand(len(df)) < 0.8
        _train = df[msk]
        _validation = df[~msk]

        X_train = poly.fit_transform(_train[features])
        lr_model.fit(X_train ,_train['Survived'])

        # Reuse the expansion fitted on the training split rather than
        # refitting the transformer on validation data.
        X_validation = poly.transform(_validation[features])

        validation_total += lr_model.score(X_validation, _validation['Survived'])
        train_total += lr_model.score(X_train, _train['Survived'])

    return (validation_total/number_run, train_total/number_run)

In [5]:
# correct_rate(lr_model)

## K Fold cross validation

In [6]:
def correct_rate_KFold(_df, _lr_model,features, label, quiet=False):
    """Evaluate ``_lr_model`` with stratified 10-fold cross-validation.

    The ``features`` columns of ``_df`` are expanded with the module-level
    ``poly`` transformer, then the model is fitted on each training fold and
    scored on the held-out fold. ``_lr_model`` is left fitted on the last fold.

    Parameters
    ----------
    _df : pandas.DataFrame
        Data containing the ``features`` columns and the ``label`` column.
    _lr_model : estimator exposing ``fit``, ``score`` and ``predict``
    features : list of str
        Column names used as model inputs.
    label : str
        Name of the target column (non-negative integer classes).
    quiet : bool, optional
        When True, nothing is printed.

    Returns
    -------
    numpy.ndarray
        Means over the folds of (validation accuracy, training accuracy,
        precision, recall, F1), in that order.
    """
    # call it as: correct_rate_KFold(df, lr_model, features, 'Survived')
    train_data =  poly.fit_transform(_df[features])
    # Fold indices are positional, so index a plain array: this selects the
    # right rows whatever the index of _df looks like.
    y = _df[label].values
    kfold = StratifiedKFold(y=y, n_folds=10, random_state=2)

    scores = []

    for k, (train, test) in enumerate(kfold):
        _lr_model.fit(train_data[train], y[train])

        score_t = _lr_model.score(train_data[train], y[train])

        prediction_test = _lr_model.predict(train_data[test])

        score = accuracy_score(y_true=y[test], y_pred=prediction_test)
        p = precision_score(y_true=y[test], y_pred=prediction_test)
        r = recall_score(y_true=y[test], y_pred=prediction_test)
        f1 = f1_score(y_true=y[test], y_pred=prediction_test)

        scores.append((score, score_t, p, r, f1))
        if not quiet:
            # Label distribution of the frame that was passed in (not the global df).
            print('Fold: %s, Label dist.: %s, Acc: %.3f, Train_acc: %.3f, P: %.3f, R: %.3f, F1: %.3f' % (k+1,
                                                          np.bincount(y[train]),
                                                          score, score_t, p, r, f1
                                                         ))
    mean_scores = np.mean(np.array(scores),axis=0)
    if not quiet:
        print('Total accuracy: %.3f Train_acc: %.3f, P: %.3f, R: %.3f, F1: %.3f' % tuple(mean_scores))
    return mean_scores

In [7]:
# Stratified 10-fold evaluation of the baseline model: prints per-fold metrics
# and displays the mean (accuracy, train accuracy, precision, recall, F1).
correct_rate_KFold(df, lr_model, features, 'Survived')

Fold: 1, Label dist.: [494 307], Acc: 0.756, Train_acc: 0.805, P: 0.933, R: 0.400, F1: 0.560
Fold: 2, Label dist.: [494 307], Acc: 0.744, Train_acc: 0.797, P: 1.000, R: 0.343, F1: 0.511
Fold: 3, Label dist.: [494 308], Acc: 0.742, Train_acc: 0.805, P: 0.824, R: 0.412, F1: 0.549
Fold: 4, Label dist.: [494 308], Acc: 0.865, Train_acc: 0.789, P: 0.893, R: 0.735, F1: 0.806
Fold: 5, Label dist.: [494 308], Acc: 0.831, Train_acc: 0.792, P: 1.000, R: 0.559, F1: 0.717
Fold: 6, Label dist.: [494 308], Acc: 0.809, Train_acc: 0.794, P: 0.947, R: 0.529, F1: 0.679
Fold: 7, Label dist.: [494 308], Acc: 0.820, Train_acc: 0.791, P: 1.000, R: 0.529, F1: 0.692
Fold: 8, Label dist.: [494 308], Acc: 0.753, Train_acc: 0.803, P: 1.000, R: 0.353, F1: 0.522
Fold: 9, Label dist.: [494 308], Acc: 0.820, Train_acc: 0.793, P: 0.950, R: 0.559, F1: 0.704
Fold: 10, Label dist.: [495 308], Acc: 0.818, Train_acc: 0.791, P: 0.950, R: 0.559, F1: 0.704
Total accuracy: 0.796 Train_acc: 0.796, P: 0.950, R: 0.498, F1: 0.644

array([ 0.79586313,  0.7959864 ,  0.94970883,  0.49781513,  0.64437902])

In [None]:
# Histogram of the Survived label (3 bins) to eyeball the class balance.
plt.hist(df['Survived'], 3)

## Precision and Recall evaluation

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Single random ~80/20 split; unseeded, so the numbers change on every run.
msk = np.random.rand(len(df)) < 0.8
_train = df[msk]
_validation = df[~msk]

X_train = poly.fit_transform(_train[features])

# NOTE: this refits the shared lr_model on the training part of the split only.
lr_model.fit(X_train ,_train['Survived'])

# Reuse the expansion fitted on the training split instead of refitting it.
X_validation = poly.transform(_validation[features])
predict = lr_model.predict(X_validation)

confmat = confusion_matrix(y_true=_validation['Survived'], y_pred=predict)

print('One test on a 8/2 split validation data:')
print('')
print(confmat)

print('Accuracy: %.3f' % accuracy_score(y_true=_validation['Survived'], y_pred=predict))
print('Precision: %.3f' % precision_score(y_true=_validation['Survived'], y_pred=predict))
print('Recall: %.3f' % recall_score(y_true=_validation['Survived'], y_pred=predict))
print('F1: %.3f' % f1_score(y_true=_validation['Survived'], y_pred=predict))

## Example - using cross_validation with f1 score

In [None]:
from sklearn import cross_validation
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# The polynomial expansion does not depend on the class weights: build it once
# instead of once per loop iteration.
X_poly = poly.fit_transform(df[features])

threshold = []
scores = []
for t in np.linspace(0.3, 0.8, num=20):

    # t is the weight given to class 0; class 1 gets the complement.
    lr_model.class_weight = {0:t, 1:1-t}
    #f1_s = cross_validation.cross_val_score(lr_model,
    #                             X_poly,
    #                             df['Survived'],
    #                             scoring='f1',
    #                             cv=8)

    # Out-of-fold predictions for every row (8 folds), scored against the labels.
    predicted = cross_validation.cross_val_predict(lr_model, X_poly,
                                                   df['Survived'], cv=8, n_jobs=-1)
    acc_s = accuracy_score(df['Survived'], predicted)
    p_s = precision_score(df['Survived'], predicted)
    r_s = recall_score(df['Survived'], predicted)
    f1_s = f1_score(df['Survived'], predicted)

    print('Threshold: %.3f, Acc: %.3f, P: %.3f, R: %.3f, F1: %.3f' % (t, acc_s, p_s, r_s, f1_s))
    threshold.append(t)
    scores.append(np.mean(f1_s))

    #http://stackoverflow.com/questions/19984957/scikit-predict-default-threshold

## Choose Threshold as 0.721

In [None]:
# Class weights used from here on: class 0 -> 0.721, class 1 -> the complement.
class_weight={0: 0.721,
              1: 1-0.721}
lr_model.class_weight = class_weight
print(lr_model)

## Good results above

## Choosing C:

In [None]:
C_para, score = [], []

for c in np.arange(-5, 8):
    # Float base on purpose: an integer raised to a negative NumPy integer
    # raises ValueError on NumPy >= 1.12 (and was unreliable before that).
    C = 3.0**c
    lr = LogisticRegression(C=C, random_state=0, n_jobs=-1, class_weight=class_weight)
    rate = correct_rate_KFold(df, lr, features, 'Survived', quiet=True)
    C_para.append(C)
    score.append(rate)

    print('C: %f, acc: %.3f' % (C, rate[0]))

# One row per C value; columns are (accuracy, train accuracy, precision, recall, F1).
score = np.array(score)


In [None]:
# Column order of `score` follows what correct_rate_KFold returns.
curve_labels = ['validation correct rate',
                'training correct rate',
                'validation precision',
                'validation recall',
                'validation F1']
for column, curve_label in enumerate(curve_labels):
    plt.plot(C_para, score[:,column], label=curve_label)
plt.legend(loc='lower right')
plt.xscale('log')
print(lr.class_weight)

In [None]:
# Look at one point of the sweep: the C value and its row of mean scores.
n=3
print(C_para[n])
print(score[n])

## Choose C=0.1

## Train Model

In [None]:
# Note, df has already been feature engineered above
X = df[features]
X = poly.fit_transform(X)

lr_model_pro = LogisticRegression(C=0.1, random_state=0, n_jobs=-1, class_weight=class_weight)

lr_model_pro.fit(X ,df['Survived'])

In [None]:
# Accuracy on the training rows themselves (the model has already seen them).
pre = lr_model_pro.predict(X)
print(1 - (pre != df['Survived']).sum()/float(len(df)))

#alternatively: 8-fold cross-validated accuracy, then precision

from sklearn import cross_validation
for scoring in ('accuracy', 'precision'):
    scores = cross_validation.cross_val_score(lr_model_pro,
                                     poly.fit_transform(df[features]),
                                     df['Survived'],
                                     scoring=scoring,
                                     cv=8)
    print(scores.mean())

In [None]:
# Number of coefficients the fitted model holds.
print(len(lr_model_pro.coef_[0]))

In [None]:
# (rows, expanded feature columns) of the training matrix.
print(X.shape)

## Predict test

In [None]:
test = pd.read_csv('data/test.csv', sep=',')

# Reuse the title scores computed from the training data.
feature_engineering(test, title_mapping)

# Apply the expansion as already fitted on the training features; do not refit on test data.
X_test = poly.transform(test[features])

# Predict with lr_model_pro, the model fitted on ALL training rows.
# (lr_model was last fitted on a random ~80% split in the precision/recall cell.)
test['Survived'] = lr_model_pro.predict(X_test).astype(int)

In [None]:
# Histogram of the predicted Survived values for the test set.
plt.hist(test['Survived'] )

In [None]:
# Show only the first rows instead of dumping the whole frame into the notebook.
test.head()

In [None]:
# Distinct titles found in the test set. Titles absent from title_mapping
# were given Title_s = 0 by feature_engineering().
np.unique(test['Title'])

In [None]:
import csv as csv
import sys

submission_path = "LRmodel.Od9.C0.1-thre0.721-acc0.79-p0.94.csv"

# The csv module wants a binary file on Python 2 and a text file opened with
# newline="" on Python 3.
if sys.version_info[0] < 3:
    predictions_file = open(submission_path, "wb")
else:
    predictions_file = open(submission_path, "w", newline="")

# `with` guarantees the file is closed even if writing a row fails.
with predictions_file:
    predictions_file_object = csv.writer(predictions_file)
    predictions_file_object.writerow(["PassengerId", "Survived"])	# write the column headers
    # One row per test passenger: its id and the predicted label.
    predictions_file_object.writerows(zip(test['PassengerId'], test['Survived']))