In [1]:
# !open . <--- Opens File Location 

In [2]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
import pandas as pd
import sklearn
from sklearn import metrics
from sklearn.cross_validation import train_test_split 
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn import svm
from sklearn import preprocessing
%matplotlib inline
style.use("ggplot")
accuracy = metrics.accuracy_score

import warnings
warnings.filterwarnings("ignore")



### Instantiate Models

In [3]:
m1 = rforest = RandomForestClassifier()
m2 = logreg = LogisticRegression()
m3 = knn = KNeighborsClassifier()
m4 = gnb = GaussianNB()
m5 = multi = MultinomialNB()
m6 = bernoulli = BernoulliNB()

m7 = poly = svm.SVC(kernel='poly', C=1,gamma='auto')
m8 = rbf = svm.SVC(kernel='rbf', C=1,gamma='auto')
m9 = linear = svm.SVC(kernel='linear', C=1,gamma='auto')
m10 = sigmoid = svm.SVC(kernel='sigmoid', C=1,gamma='auto')

models = [m1,m2,m3,m4,m5,m6]
svm_models = [m7,m8,m9,m10] # Looping Through These SVM Models Lag The Script

### Merge Data

In [4]:
train = pd.read_csv("titanic_training.csv", header = 0) 
test = pd.read_csv("titanic_test.csv", header = 0)
ID = test['PassengerId'] # Extract ID Names From Test Set
data = pd.concat([train, test], axis = 0)
data.shape

(1309, 12)

# Preprocessing

In [5]:
cols = data.columns.tolist()
cols.insert(0, cols.pop(cols.index('Survived')))
data = data.reindex(columns= cols)
data.head()

Unnamed: 0,Survived,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Ticket
0,0.0,22.0,0,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,A/5 21171
1,1.0,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,PC 17599
2,1.0,26.0,0,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,STON/O2. 3101282
3,1.0,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,113803
4,0.0,35.0,0,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,373450


In [6]:
data['Title'] = 'Other'
data.loc[['Master.' in title for title in data.Name], 'Title'] = 'Master'
data.loc[['Miss.' in title for title in data.Name], 'Title'] = 'Miss'
data.loc[['Mr.' in title for title in data.Name], 'Title'] = 'Mr'
data.loc[['Mrs.' in title for title in data.Name], 'Title'] = 'Mrs'
data.head()

Unnamed: 0,Survived,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Ticket,Title
0,0.0,22.0,0,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,A/5 21171,Mr
1,1.0,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,PC 17599,Mrs
2,1.0,26.0,0,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,STON/O2. 3101282,Miss
3,1.0,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,113803,Mrs
4,0.0,35.0,0,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,373450,Mr


In [7]:
data['Sex'] = data.Sex.map({'male':1, 'female':0})

In [8]:
data = pd.get_dummies(data, columns = ['Sex', 'Embarked', 'Pclass', 'Title'], drop_first = True)
data.head()

Unnamed: 0,Survived,Age,Cabin,Fare,Name,Parch,PassengerId,SibSp,Ticket,Sex_1,Embarked_C,Embarked_Q,Embarked_S,Pclass_2,Pclass_3,Title_Miss,Title_Mr,Title_Mrs,Title_Other
0,0.0,22.0,0,7.25,"Braund, Mr. Owen Harris",0,1,1,A/5 21171,1,0,0,1,0,1,0,1,0,0
1,1.0,38.0,C85,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,PC 17599,0,1,0,0,0,0,0,0,1,0
2,1.0,26.0,0,7.925,"Heikkinen, Miss. Laina",0,3,0,STON/O2. 3101282,0,0,0,1,0,1,1,0,0,0
3,1.0,35.0,C123,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,113803,0,0,0,1,0,0,0,0,1,0
4,0.0,35.0,0,8.05,"Allen, Mr. William Henry",0,5,0,373450,1,0,0,1,0,1,0,1,0,0


In [9]:
del data["Embarked_C"]
data.head()

Unnamed: 0,Survived,Age,Cabin,Fare,Name,Parch,PassengerId,SibSp,Ticket,Sex_1,Embarked_Q,Embarked_S,Pclass_2,Pclass_3,Title_Miss,Title_Mr,Title_Mrs,Title_Other
0,0.0,22.0,0,7.25,"Braund, Mr. Owen Harris",0,1,1,A/5 21171,1,0,1,0,1,0,1,0,0
1,1.0,38.0,C85,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,PC 17599,0,0,0,0,0,0,0,1,0
2,1.0,26.0,0,7.925,"Heikkinen, Miss. Laina",0,3,0,STON/O2. 3101282,0,0,1,0,1,1,0,0,0
3,1.0,35.0,C123,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,113803,0,0,1,0,0,0,0,1,0
4,0.0,35.0,0,8.05,"Allen, Mr. William Henry",0,5,0,373450,1,0,1,0,1,0,1,0,0


In [10]:
def transform(num): 
    if num <= 15: return 0
    if num > 15 and num <= 40: return 1
    if num > 50: return 2

data['Age'] = data['Age'].apply(transform)
data.head()

Unnamed: 0,Survived,Age,Cabin,Fare,Name,Parch,PassengerId,SibSp,Ticket,Sex_1,Embarked_Q,Embarked_S,Pclass_2,Pclass_3,Title_Miss,Title_Mr,Title_Mrs,Title_Other
0,0.0,1.0,0,7.25,"Braund, Mr. Owen Harris",0,1,1,A/5 21171,1,0,1,0,1,0,1,0,0
1,1.0,1.0,C85,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,PC 17599,0,0,0,0,0,0,0,1,0
2,1.0,1.0,0,7.925,"Heikkinen, Miss. Laina",0,3,0,STON/O2. 3101282,0,0,1,0,1,1,0,0,0
3,1.0,1.0,C123,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,113803,0,0,1,0,0,0,0,1,0
4,0.0,1.0,0,8.05,"Allen, Mr. William Henry",0,5,0,373450,1,0,1,0,1,0,1,0,0


In [11]:
def transform(num): 
    if num <= 100: return 0
    else: return num

data['Fare'] = data['Fare'].apply(transform)
data.head()

Unnamed: 0,Survived,Age,Cabin,Fare,Name,Parch,PassengerId,SibSp,Ticket,Sex_1,Embarked_Q,Embarked_S,Pclass_2,Pclass_3,Title_Miss,Title_Mr,Title_Mrs,Title_Other
0,0.0,1.0,0,0.0,"Braund, Mr. Owen Harris",0,1,1,A/5 21171,1,0,1,0,1,0,1,0,0
1,1.0,1.0,C85,0.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,PC 17599,0,0,0,0,0,0,0,1,0
2,1.0,1.0,0,0.0,"Heikkinen, Miss. Laina",0,3,0,STON/O2. 3101282,0,0,1,0,1,1,0,0,0
3,1.0,1.0,C123,0.0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,113803,0,0,1,0,0,0,0,1,0
4,0.0,1.0,0,0.0,"Allen, Mr. William Henry",0,5,0,373450,1,0,1,0,1,0,1,0,0


In [12]:
data = pd.get_dummies(data, columns = ['Age'], drop_first = True)

In [13]:
del data['Name'], data['Ticket'], data['Cabin'], data['PassengerId']

In [68]:
data.head()

Unnamed: 0,Survived,Fare,Parch,SibSp,Sex_1,Embarked_Q,Embarked_S,Pclass_2,Pclass_3,Title_Miss,Title_Mr,Title_Mrs,Title_Other,Age_1.0,Age_2.0
0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
3,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0


### Convert Data

##### Manually Transforming

In [14]:
# def transform_category(category): # Convert each string to a categorical value
#     if category == 'Q': return 0
#     if category == 'S': return 1
#     if category == 'C': return 2

# data['Embarked'] = data['Embarked'].apply(transform_category)

# data.drop(data.index[data["Embarked"] == 0])
# data['Embarked'] = pd.get_dummies(data['Embarked'])
# data['Sex'] = pd.get_dummies(data['Sex'])
# # data.head()

##### Transforming Using a Dictionary

In [15]:
# data['Sex'] = data.Sex.map({'male':1, 'female':0})
# data['Embarked'] = data.Embarked.map({'Q':0, 'S':1, 'C':2})
# data.drop(data.index[data["Embarked"] == 2])

##### Pandas Get Dummies 

In [16]:
# print(data.Embarked.value_counts(), "\n")
# print(data.Sex.value_counts())

##### pd.get_dummies and drop_first
- Drops One Category to Avoid Collinearity

In [17]:
# data = pd.get_dummies(data, columns = ['Sex', 'Embarked'], drop_first = True)
# data.head()

In [18]:
# del data["Embarked_C"]
# data.head()

##### Label Encoding

In [19]:
# le = preprocessing.LabelEncoder()
# le.fit(np.array(data.Embarked))
# data["Embarked"] = le.transform(data.Embarked)
# le.fit(np.array(data.Sex))
# data["Sex"] = le.transform(data.Sex)

### Impute Missing Data | Split Data | Normalize Data

In [20]:
data.fillna(method = 'ffill', inplace = True) # Impute Missing Data
data = data.astype(float) # Convert DF Type to Float

"""Split Data"""
train = data[0:len(train)]
test = data[len(train):]

"""Normalize Data for Faster Computation"""
train = train/train.max().astype(np.float64)
test = test/test.max().astype(np.float64)

In [21]:
print(train.shape)
print(test.shape)

(891, 15)
(418, 15)


### Separate Target From Training Data | Delete Survived Column From Test Data

In [22]:
# Grab Location of Survived
print("Shape of Data:", train.shape)
print("Index Location of Target:", train.columns.get_loc("Survived"))

Shape of Data: (891, 15)
Index Location of Target: 0


In [23]:
target = train.ix[:,0]
X = pd.DataFrame(train.ix[:,1:])

In [24]:
"""Delete Target From Testing Set to Match Shape of Training Set"""
del test['Survived'] 

In [25]:
print(target.shape)
print(X.shape)
print(test.shape)

(891,)
(891, 14)
(418, 14)


### Split Data to Test Accuracy on Model

In [26]:
from sklearn.cross_validation import cross_val_score
X_train, X_test, target_train, target_test = train_test_split(X, target, test_size = 0.35, random_state = 1)

print ("Features For Training Set: ", X_train.shape)
print ("Target Training Set: ", target_train.shape)
print ("Features For Testing Set: ", X_test.shape)
print ("Target For Testing Set: ", target_test.shape)

Features For Training Set:  (579, 14)
Target Training Set:  (579,)
Features For Testing Set:  (312, 14)
Target For Testing Set:  (312,)


### Cross Validation on Models

In [27]:
for model in models:
    model.fit(X_train, target_train)
    target_pred = model.predict(X_test)
    cv_score = cross_val_score(model, X_train, target_train, cv=5, scoring = 'precision')
    print(model, "\n")
    print("CV Score:",cv_score, "\n")
    print('Mean CV Score:',np.mean(cv_score), "\n")
    print(metrics.classification_report(target_pred, target_test))
    print("____________________________________________________________________________________________", "\n")

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False) 

CV Score: [ 0.85294118  0.83333333  0.78947368  0.91176471  0.73170732] 

Mean CV Score: 0.823844043394 

             precision    recall  f1-score   support

        0.0       0.86      0.78      0.82       205
        1.0       0.64      0.77      0.70       107

avg / total       0.79      0.77      0.78       312

____________________________________________________________________________________________ 

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          

### Cross Validation on SVM Models

In [28]:
for model in svm_models:
    cv_score = cross_val_score(model, X_train, target_train, cv=5, scoring = 'precision')
    print(model, "\n")
    print("CV Score:",cv_score, "\n")
    print('Mean CV Score:',np.mean(cv_score), "\n")
    print("Classification Report:", "\n")
    model.fit(X_train, target_train)
    target_pred = model.predict(X_test)
    print(metrics.classification_report(target_pred, target_test))
    print("____________________________________________________________________________________________", "\n")

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='poly',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False) 

CV Score: [ 1.          0.75        0.76470588  0.88888889  0.84615385] 

Mean CV Score: 0.849949723479 

Classification Report: 

             precision    recall  f1-score   support

        0.0       0.86      0.77      0.82       206
        1.0       0.63      0.76      0.69       106

avg / total       0.79      0.77      0.77       312

____________________________________________________________________________________________ 

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False) 

CV Score: [ 0.77419355  0.74418605  0.73469388  0.76315789  0.70454545] 

Mean CV Score: 0.744155364346 

Cl

### Quick Accuracy Look: 

###### SVM Models

In [29]:
svm_kernels = ['poly','rbf','linear','sigmoid']
for i in svm_kernels:
    mod = svm.SVC(kernel = i)
    mod.fit(X_train, target_train)
    target_pred = mod.predict(X_test)
    print(round(accuracy(target_test, target_pred)*100, 2), "% Accuracy")
    print(mod, "\n")
    print(metrics.classification_report(target_pred, target_test), "\n")
    print("____________________________________________________________________________________________", "\n")

76.92 % Accuracy
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='poly',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False) 

             precision    recall  f1-score   support

        0.0       0.86      0.77      0.82       206
        1.0       0.63      0.76      0.69       106

avg / total       0.79      0.77      0.77       312
 

____________________________________________________________________________________________ 

76.6 % Accuracy
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False) 

             precision    recall  f1-score   support

        0.0       0.84      0.78      0.81       199
        1.0       0.66      0.74      0.70       113

avg / total       0.77      0.77      0

###### All Other Models

In [30]:
for model in models: 
    model.fit(X_train, target_train)
    target_pred = model.predict(X_test)
    print(round(accuracy(target_test, target_pred)*100, 2), "% Accuracy")
    print(model, "\n")
    print(metrics.classification_report(target_pred, target_test))
    print("____________________________________________________________________________________________", "\n")

78.53 % Accuracy
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False) 

             precision    recall  f1-score   support

        0.0       0.89      0.78      0.83       209
        1.0       0.64      0.80      0.71       103

avg / total       0.80      0.79      0.79       312

____________________________________________________________________________________________ 

77.24 % Accuracy
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
       

### Interact SVM Models to Test Model Accuracy

In [31]:
"""SVM Model: poly"""
from ipywidgets import interact
"""Specify Ranges"""
ranges = [1,100,1]
"""Create Interact Bars For Adjusting Parameters"""
"""Create Function That Takes the Interact Arguments"""
@interact(c_range = ranges, degree = ranges, gamma = ranges)
def acc1(c_range, degree, gamma):
    """Set Model and Parameters"""
    model = svm.SVC(kernel = 'poly', C = c_range, degree = degree, gamma = gamma)
    """Fit Data"""
    model.fit(X_train, target_train)
    """Print Accuracy"""
    print(round(accuracy(target_test, target_pred)*100, 2), "% Accuracy")
    """Print Specifications of Model"""
    print(model, "\n")
    """Print Metrics"""
    print(metrics.classification_report(target_pred, target_test), "\n")

75.64 % Accuracy
SVC(C=50, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=50, gamma=50, kernel='poly',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False) 

             precision    recall  f1-score   support

        0.0       0.80      0.79      0.80       188
        1.0       0.69      0.71      0.70       124

avg / total       0.76      0.76      0.76       312
 



In [32]:
"""SVM Model: rbf"""
from ipywidgets import interact
ranges = [1,100,1]
@interact(c_range = ranges, degree = ranges, gamma = ranges)
def acc2(c_range, degree, gamma):
    model = svm.SVC(kernel = 'rbf', C = c_range, degree = degree, gamma = gamma)
    model.fit(X_train, target_train)
    print(round(accuracy(target_test, target_pred)*100, 2), "% Accuracy")
    print(model, "\n")
    print(metrics.classification_report(target_pred, target_test), "\n")

75.64 % Accuracy
SVC(C=50, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=50, gamma=50, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False) 

             precision    recall  f1-score   support

        0.0       0.80      0.79      0.80       188
        1.0       0.69      0.71      0.70       124

avg / total       0.76      0.76      0.76       312
 



In [33]:
"""SVM Model: linear"""
from ipywidgets import interact
ranges = [1,100,1]
@interact(c_range = ranges, degree = ranges, gamma = ranges)
def acc3(c_range, degree, gamma):
    model = svm.SVC(kernel = 'linear', C = c_range, degree = degree, gamma = gamma)
    model.fit(X_train, target_train)
    print(round(accuracy(target_test, target_pred)*100, 2), "% Accuracy")
    print(model, "\n")
    print(metrics.classification_report(target_pred, target_test), "\n")

75.64 % Accuracy
SVC(C=50, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=50, gamma=50, kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False) 

             precision    recall  f1-score   support

        0.0       0.80      0.79      0.80       188
        1.0       0.69      0.71      0.70       124

avg / total       0.76      0.76      0.76       312
 



In [34]:
"""SVM Model: sigmoid"""
from ipywidgets import interact
ranges = [1,100,1]
@interact(c_range = ranges, degree = ranges, gamma = ranges)
def acc4(c_range, degree, gamma):
    model = svm.SVC(kernel = 'sigmoid', C = c_range, degree = degree, gamma = gamma)
    model.fit(X_train, target_train)
    print(round(accuracy(target_test, target_pred)*100, 2), "% Accuracy")
    print(model, "\n")
    print(metrics.classification_report(target_pred, target_test), "\n")

75.64 % Accuracy
SVC(C=50, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=50, gamma=50, kernel='sigmoid',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False) 

             precision    recall  f1-score   support

        0.0       0.80      0.79      0.80       188
        1.0       0.69      0.71      0.70       124

avg / total       0.76      0.76      0.76       312
 



In [35]:
"""K Nearest Neighbors"""
from ipywidgets import interact
@interact(neighbors = [1,100,1])
def acc5(neighbors):
    model = KNeighborsClassifier(n_neighbors = neighbors)
    model.fit(X_train, target_train)
    print(round(accuracy(target_test, target_pred)*100, 2), "% Accuracy")

75.64 % Accuracy


In [36]:
"""Random Forest"""
from ipywidgets import interact
ranges = [2,100,1]
@interact(n_estimators = ranges, min_samples_split = ranges)
def acc5(n_estimators, min_samples_split):
    model = RandomForestClassifier(n_estimators = n_estimators, min_samples_split = min_samples_split)
    model.fit(X_train, target_train)
    print(round(accuracy(target_test, target_pred)*100, 2), "% Accuracy", "\n")
    print(model)

75.64 % Accuracy 

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=51,
            min_weight_fraction_leaf=0.0, n_estimators=51, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


### Model Accuracy on a Single Model 

In [37]:
model = m1
model.fit(X_train, target_train)
target_pred = model.predict(X_test)
print (round(accuracy(target_test, target_pred)*100, 2), "% Accuracy")
print(model, "\n")
print(metrics.classification_report(target_pred, target_test))

78.85 % Accuracy
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False) 

             precision    recall  f1-score   support

        0.0       0.88      0.79      0.83       204
        1.0       0.66      0.79      0.72       108

avg / total       0.80      0.79      0.79       312



### Grid Search

In [38]:
KNeighborsClassifier().get_params

<bound method BaseEstimator.get_params of KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')>

In [39]:
"""Get Parameters of a Model Using: Model().get_params()"""
from sklearn.model_selection import GridSearchCV
"""Specify a Range"""
ranges = range(1,50)
"""Set a Model's Parameter to the Ranges to Try in GridSearch Using a Dictionary"""
param_grid = dict(n_neighbors = ranges)
"""Specify the Model"""
Model = KNeighborsClassifier()
"""Instantiate the GridSearchModel With Model, Parameter Grid, and Proper Parameters of Grid"""
grid = GridSearchCV(Model, param_grid, cv = 10, scoring = 'accuracy')
"""Fit Unsplit Data"""
grid.fit(X, target)
"""Output Scores"""
grid.grid_scores_

print(grid.best_score_, "\n")
print(grid.best_params_, "\n")
print(grid.best_estimator_)

### Grid Search On SVM Models

In [40]:
LogisticRegression()

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [41]:
# """Logistic Regression"""
# ranges = range(1,30)
# param_grid = dict(C = ranges, intercept_scaling = ranges)
# Model = LogisticRegression()
# grid = GridSearchCV(Model, param_grid, cv = 10, scoring = "accuracy")
# grid.fit(X, target)
# grid.grid_scores_

# print(grid.best_score_, "\n")
# print(grid.best_params_, "\n")
# print(grid.best_estimator_)

In [42]:
MultinomialNB().get_params

<bound method BaseEstimator.get_params of MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)>

In [43]:
# """MultinomalNB"""
# ranges = (0.0001,0.001, 0.01, 1)
# param_grid = dict(alpha = ranges)
# Model = MultinomialNB()
# grid = GridSearchCV(Model, param_grid, cv = 10, scoring = 'accuracy')
# grid.fit(X, target)
# grid.grid_scores_

# print(grid.best_score_, "\n")
# print(grid.best_params_, "\n")
# print(grid.best_estimator_)

In [44]:
svm.SVC().get_params

<bound method BaseEstimator.get_params of SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)>

In [45]:
# """Grid Search on Multiple SVM Models"""

# """Specify a Range of Kernels to Try"""
# kernel_options = ['poly','rbf','linear','sigmoid']
# """Specify a Range to Try"""
# ranges = range(1,15)
# """Set a Model's Parameter to the Ranges to Try in GridSearch Using a Dictionary"""
# param_grid = dict(kernel = kernel_options)
# """Specify the Model"""
# Model = svm.SVC()
# """Instantiate the GridSearchModel With Model, Parameter Grid, and Proper Parameters of Grid"""
# grid = GridSearchCV(Model, param_grid, cv = 10, scoring = 'accuracy')
# """Fit Unsplit Data"""
# grid.fit(X, target)
# """Output Scores"""
# grid.grid_scores_

# print(grid.best_score_, "\n")
# print(grid.best_params_, "\n")
# print(grid.best_estimator_)

In [46]:
# """Grid Search on Single SVM Model With Multiple Parameters"""
# kernel_options = ['rbf']
# """Specify a Range to Try"""
# ranges = range(1,15)
# """Set a Model's Parameter to the Ranges to Try in GridSearch Using a Dictionary"""
# param_grid = dict(kernel = kernel_options, C = ranges, gamma = ranges)
# """Specify the Model"""
# Model = svm.SVC()
# """Instantiate the GridSearchModel With Model, Parameter Grid, and Proper Parameters of Grid"""
# grid = GridSearchCV(Model, param_grid, cv = 10, scoring = 'accuracy')
# """Fit Unsplit Data"""
# grid.fit(X, target)
# """Output Scores"""
# grid.grid_scores_

# print(grid.best_score_, "\n")
# print(grid.best_params_, "\n")
# print(grid.best_estimator_)

### Random Search for Parameter Tuning

In [47]:
# from sklearn.grid_search import RandomizedSearchCV
# kernel_options = ['poly','rbf','linear','sigmoid']
# """Specify a Range"""
# ranges = range(1,5)
# """Set a Model's Parameter to the Ranges to Try in GridSearch Using a Dictionary"""
# param_rand = dict(kernel = kernel_options, C = ranges, gamma = ranges)
# """Specify the Model"""
# Model2 = svm.SVC()
# """Instantiate the RandomizedSearchModel With Model, Parameter Grid, Scoring, etc, and n_iter"""
# randsearch = RandomizedSearchCV(Model2, param_rand, cv = 10, scoring = 'accuracy', n_iter = 4)
# """Fit Unsplit Data"""
# randsearch.fit(X, target)
# """Output Scores"""
# randsearch.grid_scores_

# print(randsearch.best_score_, "\n")
# print(randsearch.best_params_, "\n")
# print(randsearch.best_estimator_)

## Ensemble Methods

##### Bagging

In [48]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier
est = KNeighborsClassifier(n_neighbors = 10)
bag = BaggingClassifier(base_estimator=est, 
                        n_estimators = 10, 
                        max_samples = 0.35,
                        max_features = 8,
                        random_state = 1, 
                        oob_score = True)
bag.fit(X_train, target_train)
print(bag.score(X_test, target_test))
print(bag.oob_score_)
print(bag.score(X_train, target_train))

0.782051282051
0.846286701209
0.842832469775


In [49]:
est = KNeighborsClassifier(n_neighbors = 10)
bag = BaggingClassifier(base_estimator=est, 
                        n_estimators = 100, 
                        max_samples = 0.35,
                        max_features = 8,
                        random_state = 1, 
                        oob_score = True)
results = cross_val_score(bag, X, target, cv=5)
print(results.mean())

0.806959071731


##### Random Forest

In [50]:
from sklearn.ensemble import RandomForestClassifier
est = RandomForestClassifier()
rf = RandomForestClassifier(n_estimators = 10, oob_score = True)
rf.fit(X_train, target_train)
print(rf.score(X_test, target_test))
print(rf.oob_score_)
print(rf.score(X_train, target_train))

0.772435897436
0.823834196891
0.891191709845


In [51]:
est = RandomForestClassifier()
rf = RandomForestClassifier(n_estimators = 10, oob_score = True)
results = cross_val_score(rf, X, target, cv=5)
print(results.mean())

0.79240875286


##### AdaBoost

In [52]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

est = AdaBoostClassifier()
model = RandomForestClassifier()
ada = AdaBoostClassifier(base_estimator = model, n_estimators = 100)
ada.fit(X_train, target_train)
print(ada.score(X_test, target_test))
print(ada.score(X_train, target_train))

0.762820512821
0.894645941278


In [67]:
est = RandomForestClassifier(n_estimators = 100)
ada = AdaBoostClassifier(base_estimator = est, n_estimators = 10)
results = cross_val_score(ada, X, target, cv=5)
print(results.mean())

0.796915830877


In [54]:
"""Choose A Model for AdaBoost That Does Not OverFit Data"""
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

est = AdaBoostClassifier()
model = svm.SVC(kernel='rbf', C=1,gamma='auto')
ada = AdaBoostClassifier(base_estimator = model, n_estimators = 100, algorithm='SAMME')
ada.fit(X_train, target_train)
print(ada.score(X_test, target_test))
print(ada.score(X_train, target_train))

0.589743589744
0.630397236615


#### Gradient Tree Boosting

In [55]:
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier(n_estimators = 100)
gbc.fit(X_train, target_train)
print(gbc.score(X_test, target_test))
print(gbc.score(X_train, target_train))

0.782051282051
0.877374784111


In [56]:
gbc = GradientBoostingClassifier(n_estimators = 100)
results = cross_val_score(gbc, X, target, cv=10)
print(results.mean())

0.806999489275


#### Voting Classifier

In [66]:
>>> from sklearn.model_selection import cross_val_score
>>> from sklearn.linear_model import LogisticRegression
>>> from sklearn.naive_bayes import GaussianNB
>>> from sklearn.ensemble import RandomForestClassifier
>>> from sklearn.ensemble import VotingClassifier

>>> clf1 = LogisticRegression()
>>> clf2 = RandomForestClassifier(n_estimators = 100)
>>> clf3 = KNeighborsClassifier(n_neighbors = 5)
>>> clf4 = GaussianNB()
>>> eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3), ('knn', clf4)], voting='hard')


for clf, label in zip([clf1, clf2, clf3, clf4, eclf], ['Logistic Regression', 'Random Forest', 'Naive Bayes', 'KNN', 'Ensemble']):
    scores = cross_val_score(clf, X, target, cv=5, scoring='accuracy')
    print("Accuracy: %0.2f, (+/- %0.2f), [%s]" % (scores.mean(), scores.std(), label))

Accuracy: 0.81, (+/- 0.01), [Logistic Regression]
Accuracy: 0.80, (+/- 0.02), [Random Forest]
Accuracy: 0.81, (+/- 0.02), [Naive Bayes]
Accuracy: 0.77, (+/- 0.02), [KNN]
Accuracy: 0.81, (+/- 0.01), [Ensemble]


##### Grid Search With Ensemble

In [58]:
from sklearn.model_selection import GridSearchCV

In [59]:
clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()

eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='soft')

params = {'lr__C': [1.0, 100.0], 'rf__n_estimators': [20, 200],}
grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5)
grid = grid.fit(X , target)

print(grid.best_score_, "\n")
print(grid.best_params_, "\n")
# print(grid.best_estimator_)

0.811447811448 

{'lr__C': 100.0, 'rf__n_estimators': 200} 



In [60]:
>>> clf1 = LogisticRegression()
>>> clf2 = RandomForestClassifier(n_estimators = 100)
>>> clf3 = KNeighborsClassifier(n_neighbors = 30)

>>> eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='soft')
results = cross_val_score(eclf, X, target, cv=5)
print(results.mean())

0.809174664611


# Test on Actual Testing Set

In [61]:
print(X.shape)
print(test.shape)
print(target.shape)

(891, 14)
(418, 14)
(891,)


In [62]:
# Note: Update Parameters For Better Accuracy
model = RandomForestClassifier()
estimator = AdaBoostClassifier(base_estimator = model, n_estimators = 100)
estimator.fit(X, target)
y_pred = estimator.predict(test)
predictions = pd.DataFrame(ID)

def predict(predictions):
    predictions["Survived"] = y_pred
    predictions = predictions.astype(int)
    return predictions

answers = predict(predictions)

In [63]:
answers.to_csv('answers.csv',index = False)