In [152]:
# Basics
import sys
import numpy as np
import pandas as pd

# Sklearn Models
import sklearn
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Sklearn Metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline

import warnings
# Suppress every warning category for the rest of the kernel session, so
# warnings raised by library calls in later cells are not displayed.
warnings.filterwarnings('ignore')

In [153]:
# Read the raw CSV text in one go. The context manager closes the file handle
# as soon as the contents are in memory instead of leaving it open for the
# lifetime of the kernel.
with open("./test_sets/titanic_data.csv", 'r') as raw_data:
    data = raw_data.read()

In [154]:
# Inspect what kind of object the raw file read produced.
type(data)

str

In [155]:
# Parse the Titanic CSV into a DataFrame for tabular processing.
frame = pd.read_csv("./test_sets/titanic_data.csv")

In [156]:
# Preview the first rows to see which columns were loaded.
frame.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [157]:
# Discard the columns that are not used as model inputs, leaving only
# Survived, Pclass, Age and Fare.
columns_to_drop = ["PassengerId", "Ticket", "Cabin", "Embarked", "Sex", "Name", "Parch", "SibSp"]
frame = frame.drop(labels=columns_to_drop, axis=1)
frame.head()

Unnamed: 0,Survived,Pclass,Age,Fare
0,0,3,22.0,7.25
1,1,1,38.0,71.2833
2,1,3,26.0,7.925
3,1,1,35.0,53.1
4,0,3,35.0,8.05


In [158]:
# Drop all rows with NaN values, then show how many rows are left.
# Only the last expression of a cell is displayed, so the row count is the
# single output of this cell.
frame = frame.dropna()
len(frame.index)

714

In [159]:
# Persist the cleaned frame to disk; this is the file the test run below
# loads through data_file_path.
frame.to_csv('./test_sets/titanic_data_filtered.csv')

### Titanic Data Filtered Test Run

In [160]:
# Inputs
# CSV file loaded with pd.read_csv in Step 1.
data_file_path = "./test_sets/titanic_data_filtered.csv"
# Passed to train_test_split as train_size.
split_ratio = 0.5
# Whitespace-separated model names. They are lower-cased before being matched
# against the list of available models; names not in that list are skipped.
user_models = "KNN SVC MultinomialRB"
# Name of the column that is split off as the prediction target (y).
y_data_column = "Survived"

In [161]:
# Step 1: load the CSV into a DataFrame, using its first column as the row
# index, then separate the feature columns (X) from the target column (Y).
data_frame = pd.read_csv(data_file_path, index_col=0)
x_data = data_frame.drop(labels=y_data_column, axis=1)
y_data = data_frame[[y_data_column]]  # double brackets keep Y as a one-column DataFrame

data_frame.head()

Unnamed: 0,Survived,Pclass,Age,Fare
0,0,3,22.0,7.25
1,1,1,38.0,71.2833
2,1,3,26.0,7.925
3,1,1,35.0,53.1
4,0,3,35.0,8.05


In [162]:
# Preview the feature columns (everything except the target column).
x_data.head()

Unnamed: 0,Pclass,Age,Fare
0,3,22.0,7.25
1,1,38.0,71.2833
2,3,26.0,7.925
3,1,35.0,53.1
4,3,35.0,8.05


In [163]:
# Preview the target column.
y_data.head()

Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0


In [164]:
# Hold out part of the rows for evaluation: split_ratio is the training
# fraction and random_state=0 keeps the shuffle repeatable between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    x_data,
    y_data,
    train_size=split_ratio,
    random_state=0,
)

# Lower-case the whitespace-separated model names so they can be compared
# against the entries of available_models.
requested_models = [name.lower() for name in user_models.split()]
print(requested_models)

# One single-step pipeline per supported classifier.
svc = Pipeline([('clf', SVC())])
knn = Pipeline([('clf', KNeighborsClassifier(n_neighbors=3))])
decision_trees = Pipeline([('clf', DecisionTreeClassifier())])
random_forest = Pipeline([('clf', RandomForestClassifier())])
gradient_boosted = Pipeline([('clf', GradientBoostingClassifier())])

# Parallel lists: available_models[i] is the name of available_pipelines[i],
# so the two must always be kept in the same order.
available_pipelines = [svc, knn, decision_trees, random_forest, gradient_boosted]
available_models = ['svc', 'knn', 'decision-trees', 'random-forest', 'gradient-boosted']

['knn', 'svc', 'multinomialrb']


In [165]:
# Keep only the requested models that are actually supported, collecting each
# recognised name together with its pipeline (in request order). Names that do
# not appear in available_models are skipped.
requested_models_filtered = []
requested_pipelines = []
for model in requested_models:
    if model not in available_models:
        continue

    # The position in available_models doubles as the position of the
    # matching entry in available_pipelines.
    index = available_models.index(model)
    selected_pipeline = available_pipelines[index]

    requested_models_filtered.append(model)
    print(model + " " + str(index))
    print(selected_pipeline)
    requested_pipelines.append(selected_pipeline)

print("\n{0}".format(requested_models_filtered))
print(requested_pipelines)

knn 1
Pipeline(memory=None,
     steps=[('clf', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform'))])
svc 0
Pipeline(memory=None,
     steps=[('clf', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

['knn', 'svc']
[Pipeline(memory=None,
     steps=[('clf', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform'))]), Pipeline(memory=None,
     steps=[('clf', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])]


In [167]:
# Fit each requested pipeline on the training split, evaluate it on the
# held-out split and write its metrics to output.txt (overwritten on each run).
# The context manager guarantees the report file is flushed and closed even if
# fitting or prediction raises part-way through the loop.
with open('output.txt', 'w') as output_text_file:
    # requested_models_filtered and requested_pipelines are filled in lockstep,
    # so zipping them pairs every model name with its own pipeline.
    for model_name, pipeline in zip(requested_models_filtered, requested_pipelines):
        print(pipeline)

        pipeline.fit(X_train, Y_train)
        Y_predict = pipeline.predict(X_test)

        report = classification_report(Y_test, Y_predict)
        matrix = confusion_matrix(Y_test, Y_predict)

        output_text_file.write("Model: %s\n" % model_name)
        output_text_file.write("Classification Report: %s\n" % report)
        output_text_file.write("Confusion Matrix: %s\n" % matrix)
        output_text_file.write("\n\n")

Pipeline(memory=None,
     steps=[('clf', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform'))])
Pipeline(memory=None,
     steps=[('clf', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])
