In [35]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
import pandas as pd
import sklearn
from sklearn import metrics
from sklearn.cross_validation import train_test_split 
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn import svm
from sklearn.preprocessing import OneHotEncoder 
enc  = OneHotEncoder()
from sklearn import preprocessing

%matplotlib inline
style.use("ggplot")

accuracy = metrics.accuracy_score

### Instantiate Models

In [36]:
m1 = rforest = RandomForestClassifier()
m2 = logreg = LogisticRegression()
m3 = knn = KNeighborsClassifier()
m4 = gnb = GaussianNB()
m5 = multi = MultinomialNB()
m6 = bernoulli = BernoulliNB()

m7 = poly = svm.SVC(kernel='poly', C=1,gamma='auto')
m8 = rbf = svm.SVC(kernel='rbf', C=1,gamma='auto')
m9 = linear = svm.SVC(kernel='linear', C=1,gamma='auto')
m10 = sigmoid = svm.SVC(kernel='sigmoid', C=1,gamma='auto')

models = [m1,m2,m3,m4,m5,m6]
svm_models = [m7,m8,m9,m10] # Looping Through These SVM Models Lag The Script

### Merge Data

In [37]:
train = pd.read_csv("titanic_training.csv", header = 0) 
test = pd.read_csv("titanic_test.csv", header = 0)
ID = test['PassengerId'] # Extract ID Names From Test Set
data = pd.concat([train, test], axis = 0)
data.shape

(1309, 12)

### Delete Columns

In [38]:
del data['Name'], data['Ticket'], data['Cabin'], data['PassengerId']

### Convert Data

##### Manually Transforming

In [39]:
# def transform_category(category): # Convert each string to a categorical value
#     if category == 'Q': return 0
#     if category == 'S': return 1
#     if category == 'C': return 2

# data['Embarked'] = data['Embarked'].apply(transform_category)

# data.drop(data.index[data["Embarked"] == 0])
# data['Embarked'] = pd.get_dummies(data['Embarked'])
# data['Sex'] = pd.get_dummies(data['Sex'])
# # data.head()

##### Transforming Using a Dictionary

In [40]:
# data['Sex'] = data.Sex.map({'male':1, 'female':0})
# data['Embarked'] = data.Embarked.map({'Q':0, 'S':1, 'C':2})
# data.drop(data.index[data["Embarked"] == 2])

##### Pandas Get Dummies 

In [41]:
print(data.Embarked.value_counts(), "\n")
print(data.Sex.value_counts())

S    914
C    270
Q    123
0      2
Name: Embarked, dtype: int64 

male      843
female    466
Name: Sex, dtype: int64


##### pd.get_dummies and drop_first
- Drops One Category to Avoid Collinearity

In [42]:
data = pd.get_dummies(data, columns = ['Sex', 'Embarked'], drop_first = True)
data.head()

Unnamed: 0,Age,Fare,Parch,Pclass,SibSp,Survived,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,22.0,7.25,0,3,1,0.0,1,0,0,1
1,38.0,71.2833,0,1,1,1.0,0,1,0,0
2,26.0,7.925,0,3,0,1.0,0,0,0,1
3,35.0,53.1,0,1,1,1.0,0,0,0,1
4,35.0,8.05,0,3,0,0.0,1,0,0,1


In [43]:
del data["Embarked_C"]
data.head()

Unnamed: 0,Age,Fare,Parch,Pclass,SibSp,Survived,Sex_male,Embarked_Q,Embarked_S
0,22.0,7.25,0,3,1,0.0,1,0,1
1,38.0,71.2833,0,1,1,1.0,0,0,0
2,26.0,7.925,0,3,0,1.0,0,0,1
3,35.0,53.1,0,1,1,1.0,0,0,1
4,35.0,8.05,0,3,0,0.0,1,0,1


##### Label Encoding

In [44]:
# le = preprocessing.LabelEncoder()
# le.fit(np.array(data.Embarked))
# data["Embarked"] = le.transform(data.Embarked)
# le.fit(np.array(data.Sex))
# data["Sex"] = le.transform(data.Sex)

### Impute Missing Data | Split Data | Normalize Data

In [45]:
data.fillna(method = 'ffill', inplace = True) # Impute Missing Data
data = data.astype(float) # Convert DF Type to Float

"""Split Data"""
train = data[0:len(train)]
test = data[len(train):]

"""Normalize Data for Faster Computation"""
train = train/train.max().astype(np.float64)
test = test/test.max().astype(np.float64)

In [46]:
print(train.shape)
print(test.shape)

(891, 9)
(418, 9)


### Separate Target From Training Data | Delete Survived Column From Test Data

In [47]:
# Grab Location of Survived
print("Shape of Data:", train.shape)
print("Index Location of Target:", train.columns.get_loc("Survived"))

Shape of Data: (891, 9)
Index Location of Target: 5


In [48]:
target = train.ix[:,5] # Separate Target
# X = pd.DataFrame(train.ix[:, 0:7]) # Join All Other Data

X = pd.DataFrame.join(train.ix[:, :5], train.ix[:, 6:]) # Used If Target is Between Data

In [49]:
"""Delete Target From Testing Set to Match Shape of Training Set"""
del test['Survived'] 

In [50]:
print(target.shape)
print(X.shape)
print(test.shape)

(891,)
(891, 8)
(418, 8)


### Split Data to Test Accuracy on Model

In [51]:
from sklearn.cross_validation import cross_val_score
X_train, X_test, target_train, target_test = train_test_split(X, target, test_size = 0.35, random_state = 1)

print ("Features For Training Set: ", X_train.shape)
print ("Target Training Set: ", target_train.shape)
print ("Features For Testing Set: ", X_test.shape)
print ("Target For Testing Set: ", target_test.shape)

Features For Training Set:  (579, 8)
Target Training Set:  (579,)
Features For Testing Set:  (312, 8)
Target For Testing Set:  (312,)


In [66]:
m1.get_params()

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': 1,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [73]:
##### GRID SEARCH
from sklearn.model_selection import GridSearchCV
from sklearn import svm 
ranges = range(1,31)
param_grid = dict(max_features = ranges, min_impurity_decrease = ranges)
Model = m1
clf = GridSearchCV(Model, param_grid)
clf.fit(X_train, target_train)
clf.grid_scores_

ValueError: max_features must be in (0, n_features]

### Cross Validation on Models

In [18]:
for model in models:
    model.fit(X_train, target_train)
    target_pred = model.predict(X_test)
    cv_score = cross_val_score(model, X_train, target_train, cv=5, scoring = 'precision')
    print(model, "\n")
    print("CV Score:",cv_score, "\n")
    print('Mean CV Score:',np.mean(cv_score), "\n")
    print(metrics.classification_report(target_pred, target_test))
    print("____________________________________________________________________________________________", "\n")

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False) 

CV Score: [ 0.71428571  0.80555556  0.78947368  0.78947368  0.74418605] 

Mean CV Score: 0.768594936955 

             precision    recall  f1-score   support

        0.0       0.89      0.76      0.82       215
        1.0       0.59      0.78      0.68        97

avg / total       0.80      0.77      0.77       312

____________________________________________________________________________________________ 

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          

### Cross Validation on SVM Models

In [19]:
for model in svm_models:
    model.fit(X_train, target_train)
    target_pred = model.predict(X_test)
    cv_score = cross_val_score(model, X_train, target_train, cv=5, scoring = 'precision')
    print(model, "\n")
    print("CV Score:",cv_score, "\n")
    print('Mean CV Score:',np.mean(cv_score), "\n")
    print(metrics.classification_report(target_pred, target_test))
    print("____________________________________________________________________________________________", "\n")

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='poly',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False) 

CV Score: [ 0.74285714  0.71153846  0.7         0.80555556  0.64705882] 

Mean CV Score: 0.721401996696 

             precision    recall  f1-score   support

        0.0       0.72      0.79      0.75       167
        1.0       0.73      0.64      0.68       145

avg / total       0.72      0.72      0.72       312

____________________________________________________________________________________________ 

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False) 

CV Score: [ 0.77419355  0.74418605  0.72916667  0.87096774  0.68888889] 

Mean CV Score: 0.761480578478 

             precision    r

### Quick Accuracy Look: 

###### SVM Models

In [20]:
svm_kernels = ['poly','rbf','linear','sigmoid']
for i in svm_kernels:
    mod = svm.SVC(kernel = i)
    mod.fit(X_train, target_train)
    target_pred = mod.predict(X_test)
    print(round(accuracy(target_test, target_pred)*100, 2), "% Accuracy")
    print(mod, "\n")
    print(metrics.classification_report(target_pred, target_test), "\n")
    print("____________________________________________________________________________________________", "\n")

72.12 % Accuracy
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='poly',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False) 

             precision    recall  f1-score   support

        0.0       0.72      0.79      0.75       167
        1.0       0.73      0.64      0.68       145

avg / total       0.72      0.72      0.72       312
 

____________________________________________________________________________________________ 

75.64 % Accuracy
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False) 

             precision    recall  f1-score   support

        0.0       0.83      0.78      0.80       196
        1.0       0.66      0.72      0.69       116

avg / total       0.76      0.76      

###### All Other Models

In [21]:
for model in models: 
    model.fit(X_train, target_train)
    target_pred = model.predict(X_test)
    print(round(accuracy(target_test, target_pred)*100, 2), "% Accuracy")
    print(model, "\n")
    print(metrics.classification_report(target_pred, target_test))
    print("____________________________________________________________________________________________", "\n")

75.32 % Accuracy
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False) 

             precision    recall  f1-score   support

        0.0       0.86      0.75      0.81       211
        1.0       0.59      0.75      0.66       101

avg / total       0.78      0.75      0.76       312

____________________________________________________________________________________________ 

75.96 % Accuracy
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
       

### Interact SVM Models to Test Model Accuracy

In [22]:
"""SVM Model: poly"""
from ipywidgets import interact
ranges = [1,100,1]
@interact(c_range = ranges, degree = ranges, gamma = ranges)
def acc1(c_range, degree, gamma):
    model = svm.SVC(kernel = 'poly', C = c_range, degree = degree, gamma = gamma)
    model.fit(X_train, target_train)
    print(round(accuracy(target_test, target_pred)*100, 2), "% Accuracy")
    print(model, "\n")
    print(metrics.classification_report(target_pred, target_test), "\n")

75.32 % Accuracy
SVC(C=50, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=50, gamma=50, kernel='poly',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False) 

             precision    recall  f1-score   support

        0.0       0.82      0.78      0.80       193
        1.0       0.66      0.71      0.69       119

avg / total       0.76      0.75      0.75       312
 



In [23]:
"""SVM Model: rbf"""
from ipywidgets import interact
ranges = [1,100,1]
@interact(c_range = ranges, degree = ranges, gamma = ranges)
def acc2(c_range, degree, gamma):
    model = svm.SVC(kernel = 'rbf', C = c_range, degree = degree, gamma = gamma)
    model.fit(X_train, target_train)
    print(round(accuracy(target_test, target_pred)*100, 2), "% Accuracy")
    print(model, "\n")
    print(metrics.classification_report(target_pred, target_test), "\n")

75.32 % Accuracy
SVC(C=50, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=50, gamma=50, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False) 

             precision    recall  f1-score   support

        0.0       0.82      0.78      0.80       193
        1.0       0.66      0.71      0.69       119

avg / total       0.76      0.75      0.75       312
 



In [24]:
"""SVM Model: linear"""
from ipywidgets import interact
ranges = [1,100,1]
@interact(c_range = ranges, degree = ranges, gamma = ranges)
def acc3(c_range, degree, gamma):
    model = svm.SVC(kernel = 'linear', C = c_range, degree = degree, gamma = gamma)
    model.fit(X_train, target_train)
    print(round(accuracy(target_test, target_pred)*100, 2), "% Accuracy")
    print(model, "\n")
    print(metrics.classification_report(target_pred, target_test), "\n")

75.32 % Accuracy
SVC(C=50, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=50, gamma=50, kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False) 

             precision    recall  f1-score   support

        0.0       0.82      0.78      0.80       193
        1.0       0.66      0.71      0.69       119

avg / total       0.76      0.75      0.75       312
 



In [25]:
"""SVM Model: sigmoid"""
from ipywidgets import interact
ranges = [1,100,1]
@interact(c_range = ranges, degree = ranges, gamma = ranges)
def acc4(c_range, degree, gamma):
    model = svm.SVC(kernel = 'sigmoid', C = c_range, degree = degree, gamma = gamma)
    model.fit(X_train, target_train)
    print(round(accuracy(target_test, target_pred)*100, 2), "% Accuracy")
    print(model, "\n")
    print(metrics.classification_report(target_pred, target_test), "\n")

75.32 % Accuracy
SVC(C=50, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=50, gamma=50, kernel='sigmoid',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False) 

             precision    recall  f1-score   support

        0.0       0.82      0.78      0.80       193
        1.0       0.66      0.71      0.69       119

avg / total       0.76      0.75      0.75       312
 



In [26]:
"""K Nearest Neighbors"""
from ipywidgets import interact
@interact(neighbors = [1,100,1])
def acc5(neighbors):
    model = KNeighborsClassifier(n_neighbors = neighbors)
    model.fit(X_train, target_train)
    print(round(accuracy(target_test, target_pred)*100, 2), "% Accuracy")

75.32 % Accuracy


In [76]:
KNeighborsClassifier().get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': 1,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [74]:
"""Get Parameters of a Model: KNeighborsClassifier().get_params()"""
"""Grid Search"""
from sklearn.model_selection import GridSearchCV
"""Specify a Range"""
ranges = range(1,31)
"""Set a Model's Parameter to the Ranges to Try in GridSearch Using a Dictionary"""
param_grid = dict(n_neighbors = ranges)
"""Specify the Model"""
Model = KNeighborsClassifier()
"""Instantiate the GridSearchModel With Model, Parameter Grid, and Proper Parameters of Grid"""
clf = GridSearchCV(Model, param_grid, cv = 10, scoring = 'accuracy')
"""Fit Data"""
clf.fit(X_train, target_train)
"""Output Scores"""
clf.grid_scores_



[mean: 0.75648, std: 0.02596, params: {'n_neighbors': 1},
 mean: 0.80484, std: 0.03547, params: {'n_neighbors': 2},
 mean: 0.81693, std: 0.03191, params: {'n_neighbors': 3},
 mean: 0.82729, std: 0.02258, params: {'n_neighbors': 4},
 mean: 0.82038, std: 0.04223, params: {'n_neighbors': 5},
 mean: 0.83938, std: 0.03534, params: {'n_neighbors': 6},
 mean: 0.82729, std: 0.04176, params: {'n_neighbors': 7},
 mean: 0.83074, std: 0.03583, params: {'n_neighbors': 8},
 mean: 0.81693, std: 0.03941, params: {'n_neighbors': 9},
 mean: 0.82556, std: 0.04310, params: {'n_neighbors': 10},
 mean: 0.81693, std: 0.04180, params: {'n_neighbors': 11},
 mean: 0.82556, std: 0.03259, params: {'n_neighbors': 12},
 mean: 0.81347, std: 0.03614, params: {'n_neighbors': 13},
 mean: 0.83074, std: 0.02968, params: {'n_neighbors': 14},
 mean: 0.82556, std: 0.03208, params: {'n_neighbors': 15},
 mean: 0.83247, std: 0.03506, params: {'n_neighbors': 16},
 mean: 0.83074, std: 0.03000, params: {'n_neighbors': 17},
 mean:

In [27]:
"""Random Forest"""
from ipywidgets import interact
ranges = [2,100,1]
@interact(n_estimators = ranges, min_samples_split = ranges)
def acc5(n_estimators, min_samples_split):
    model = RandomForestClassifier(n_estimators = n_estimators, min_samples_split = min_samples_split)
    model.fit(X_train, target_train)
    print(round(accuracy(target_test, target_pred)*100, 2), "% Accuracy", "\n")
    print(model)

75.32 % Accuracy 

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=51,
            min_weight_fraction_leaf=0.0, n_estimators=51, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


### Model Accuracy on a Single Model 

In [28]:
model = m1
model.fit(X_train, target_train)
target_pred = model.predict(X_test)
print (round(accuracy(target_test, target_pred)*100, 2), "% Accuracy")
print(model, "\n")
print(metrics.classification_report(target_pred, target_test))

77.88 % Accuracy
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False) 

             precision    recall  f1-score   support

        0.0       0.90      0.76      0.83       217
        1.0       0.60      0.81      0.69        95

avg / total       0.81      0.78      0.79       312



# Test on Actual Testing Set

In [29]:
# Note: Update Parameters For Better Accuracy
estimator = m1
y_pred = estimator.predict(test)
predictions = pd.DataFrame(ID)
def predict(predictions):
    predictions["Survived"] = y_pred
    predictions = predictions.astype(int)
    return predictions

predict(predictions)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
5,897,0
6,898,0
7,899,0
8,900,1
9,901,0


In [30]:
# predictions.to_csv('titanic_submission.csv',index = False)