In [1]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
import pandas as pd
import sklearn
import seaborn as sns
from sklearn import metrics
from sklearn.cross_validation import train_test_split 
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn import svm
%matplotlib inline
style.use("ggplot")

accuracy = metrics.accuracy_score



### Instantiate Models

In [2]:
m1 = rforest = RandomForestClassifier()
m2 = logreg = LogisticRegression()
m3 = knn = KNeighborsClassifier()
m4 = gnb = GaussianNB()
m5 = multi = MultinomialNB()
m6 = bernoulli = BernoulliNB()

m7 = poly = svm.SVC(kernel='poly', C=1,gamma='auto')
m8 = rbf = svm.SVC(kernel='rbf', C=1,gamma='auto')
m9 = linear = svm.SVC(kernel='linear', C=1,gamma='auto')
m10 = sigmoid = svm.SVC(kernel='sigmoid', C=1,gamma='auto')

models = [m1,m2,m3,m4,m5,m6]

In [3]:
m = [m7,m8,m9,m10] # Looping through these SVM models lag the script

### Merge Data

In [4]:
train = pd.read_csv("titanic_training.csv", header = 0) 
test = pd.read_csv("titanic_test.csv", header = 0)
ID = test['PassengerId'] # Extract ID Names From Test Set
data = pd.concat([train, test], axis = 0)
data.shape

(1309, 12)

### Delete Columns

In [5]:
del data['Name'], data['Ticket'], data['Cabin'], data['PassengerId']

### Convert Data

In [6]:
def transform_category(category): # Convert each string to a categorical value
    if category == 'Q': return 0
    if category == 'S': return 1
    if category == 'C': return 2

data['Embarked'] = data['Embarked'].apply(transform_category)
data['Sex'] = pd.get_dummies(data['Sex'])

In [7]:
print(train.shape)
print(test.shape)

(891, 12)
(418, 11)


In [8]:
data.fillna(method = 'ffill', inplace = True)
data = data.astype(float)
train = data[0:891]
test = data[891:]

### Separate Target From Training Data | Delete Survived Column From Test Data

In [9]:
# Grab Location of Survived
print("Shape of Data:", train.shape)
print("Index Location of Target:", train.columns.get_loc("Survived"))

Shape of Data: (891, 8)
Index Location of Target: 7


In [10]:
target = train.ix[:,7] # Separate Target
X = pd.DataFrame(train.ix[:, 0:7]) # Join All Other Data

In [11]:
del test['Survived'] # Delete Target From Testing Set to Match Shape of Training Set

In [12]:
print(target.shape)
print(X.shape)
print(test.shape)

(891,)
(891, 7)
(418, 7)


### Cross Validation 

In [13]:
from sklearn.cross_validation import cross_val_score
for model in models:
    cv_score = cross_val_score(model, X, target, cv=3, scoring = 'precision')
    print("CV Score",cv_score)
    print('Mean CV Score:',np.mean(cv_score))
    print(model, "\n")

CV Score [ 0.75789474  0.79807692  0.75700935]
Mean CV Score: 0.770993668571
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False) 

CV Score [ 0.71153846  0.69421488  0.76344086]
Mean CV Score: 0.723064732596
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False) 

CV Score [ 0.58333333  0.62857143  0.64473684]
Mean CV Score: 0.61888053467
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
       

### Split Data to Test Accuracy on Model

In [14]:
X_train, X_test, target_train, target_test = train_test_split(X, target, test_size = 0.35)

In [15]:
print ("Features For Training Set: ", X_train.shape)
print ("Target Training Set: ", target_train.shape)
print ("Features For Testing Set: ", X_test.shape)
print ("Target For Testing Set: ", target_test.shape)

Features For Training Set:  (579, 7)
Target Training Set:  (579,)
Features For Testing Set:  (312, 7)
Target For Testing Set:  (312,)


### Fit Model

In [16]:
for model in models: 
    model.fit(X_train, target_train)
    target_pred = model.predict(X_test)
    print(round(accuracy(target_test, target_pred)*100, 2), "% Accuracy")
    print(model, "\n")

77.56 % Accuracy
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False) 

76.28 % Accuracy
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False) 

66.03 % Accuracy
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform') 

75.0 % Accuracy
GaussianNB(priors=None) 

63.78 % Accuracy
MultinomialNB(alpha=1.0, class_pri

In [17]:
model = m1

In [18]:
model.fit(X_train, target_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [19]:
target_pred = model.predict(X_test)

In [20]:
print (round(accuracy(target_test, target_pred)*100, 2), "% Accuracy")

79.17 % Accuracy


# Test on Actual Testing Set

In [21]:
target_pred = model.predict(test)
predictions = pd.DataFrame(ID)
def predict(predictions):
    predictions["Survived"] = target_pred
    predictions = predictions.astype(int)
    return predictions

predict(predictions)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
5,897,0
6,898,0
7,899,0
8,900,1
9,901,0


In [22]:
# predictions.to_csv('titanic_submission.csv',index = False)