In [1]:
import itertools #iteration tools
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # nice plots
import matplotlib.pyplot as plt # plots
from ipy_table import *

#preprocessing
from sklearn.preprocessing import LabelEncoder
# machine learning
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression 
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

# Import and suppress warnings
import warnings
warnings.filterwarnings('ignore')

### Get train Data

In [2]:
train_df = pd.read_csv(r'C:/Users/ernest.chocholowski/Desktop/Datasets/Titanic/train.csv')
test_df = pd.read_csv(r'C:/Users/ernest.chocholowski/Desktop/Datasets/Titanic/test.csv')
both_df = [train_df, test_df]
train_df.head()
super_table=[]
super_table.append(['name', 'reg_function', 'trainset_acc', 'testset_acc', 'r2score', 'conf_matrx'])
df_load = pd.read_csv(r'C:/Users/ernest.chocholowski/Desktop/GIT/Titanic/table_eCh.csv')
for row in df_load.values.tolist():
    super_table.append(row)

In [3]:
#labelling
var_mod = ['Name', 'Sex', 'Ticket','Cabin', 'Embarked']
le = LabelEncoder()
for i in var_mod:
    mask = ~train_df[i].isnull()
    train_df[i][mask] = le.fit_transform(train_df[i][mask])
    mask2 = ~test_df[i].isnull()
    test_df[i][mask2] = le.fit_transform(test_df[i][mask2])
train_df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [4]:
train_df = train_df.drop(['Cabin'], axis=1)
test_df = test_df.drop(['Cabin'], axis=1)

In [6]:
#perform imputation!
#imputation of age with imputer
from sklearn.preprocessing import Imputer
imp=Imputer(missing_values="NaN", strategy="mean" )
train_df["Age"]=imp.fit_transform(train_df[["Age"]]).ravel()
test_df["Age"]=imp.fit_transform(test_df[["Age"]]).ravel()

#impute embarked as the most frequent one
freq_port = train_df.Embarked.dropna().mode()[0]
train_df['Embarked'] = train_df['Embarked'].fillna(freq_port)

#impute missing Fare value
test_df['Fare'].fillna(test_df['Fare'].dropna().median(), inplace=True)

In [7]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,108,1,22.0,1,0,523,7.25,2
1,2,1,1,190,0,38.0,1,0,596,71.2833,0
2,3,1,3,353,0,26.0,0,0,669,7.925,2
3,4,1,1,272,0,35.0,1,0,49,53.1,2
4,5,0,3,15,1,35.0,0,0,472,8.05,2


In [9]:
def get_regr_function(logreg, y_selection, x_selection):
    coef = logreg.coef_[0]
    intercept = "{:.2f}".format(logreg.intercept_[0])
    output = y_selection + ' = ' + str(intercept) + ' + '
    for coeff, feature in zip(coef, x_selection):
        coeff_str = "{:.2f}".format(coeff)
        output += coeff_str + "*" + feature + " + "
    return output[:-3]

In [10]:
def write_date():
    from time import gmtime, strftime
    return strftime("%Y-%m-%d %H %M %S", gmtime())

In [11]:
def list2str(datalist):
    return " ".join([(str(var)+',') for var in datalist])[:-1]

In [12]:
def R2_score(logreg, X, y, sample_weight=None):
    from sklearn.metrics import r2_score
    return r2_score(y, logreg.predict(X), sample_weight=sample_weight)

In [13]:
def confusion_matrix(model, X, y):
    from sklearn.metrics import confusion_matrix
    return confusion_matrix(y, model.predict(X))

In [14]:
def logist_reg_test (train_df, test_df, x_selection, submission_name = None, x_test_drop = "PassengerId"
                     , y_selection = 'Survived'):
    to_drop = [category for category in train_df.columns.values if category not in x_selection]
    # learning set
    X_train = train_df.drop(to_drop, axis=1)
    # answers for learning set
    Y_train = train_df[y_selection]
    # testing set
    to_drop.remove(y_selection)
    to_drop.append(x_test_drop)
    X_test  = test_df.drop(to_drop, axis=1).copy()
    # Logistic Regression
    logreg = LogisticRegression()
    logreg.fit(X_train, Y_train)
    
    #Calc parameters
    function_str = get_regr_function(logreg, y_selection, x_selection)
    trainset_acc = round(logreg.score(X_train, Y_train) * 100, 2)
    r2_score = R2_score(logreg, X_train, Y_train)

    conf_matrix = confusion_matrix(logreg,X_train,Y_train)
    conf_matrix = 'TN: '+str(conf_matrix[0][0])+', FP: '+str(conf_matrix[0][1])+ \
                      ', FN: '+str(conf_matrix[1][0])+', TP: '+str(conf_matrix[1][1])
    #prints
    print('Regression function:\n', function_str)
    print('Accuracy on train set:', trainset_acc)
    print("R2 score:", r2_score)
    print("Confusion matrix:\n", conf_matrix)
    
    #make predictions
    Y_pred = logreg.predict(X_test)
    #export predictions
    submission = pd.DataFrame({
        "PassengerId": test_df[x_test_drop],
        "Survived": Y_pred
    })
    if submission_name == None:
        submission_name = "regression"+write_date()
    submission.to_csv(r'C:/Users/ernest.chocholowski/Desktop/Datasets/Titanic/output/'+submission_name+'.csv', index=False)
    
    return [submission_name, function_str, str(trainset_acc), r2_score, conf_matrix]

In [15]:
x_test_drop = "PassengerId"
y_selection = 'Survived'
x_selection = train_df.columns.values
x_selection = np.delete(x_selection, [0,1])

In [16]:
output = logist_reg_test(train_df, test_df, x_selection, submission_name='logisticReg_basicImputator')

Regression function:
 Survived = 4.04 + -0.72*Pclass + -0.00*Name + -2.48*Sex + -0.03*Age + -0.29*SibSp + -0.09*Parch + -0.00*Ticket + 0.01*Fare + -0.15*Embarked
Accuracy on train set: 80.25
R2 score: 0.164797238999
Confusion matrix:
 TN: 479, FP: 70, FN: 106, TP: 236


In [17]:
testset_acc = str('75.120')
output.insert(3, testset_acc)
super_table.append(output)

In [18]:
def get_regr_fun_lin(logreg, y_selection, x_selection):
    coef = logreg.coef_
    intercept = "{:.2f}".format(logreg.intercept_)
    output = y_selection + ' = ' + str(intercept) + ' + '
    for coeff, feature in zip(coef, x_selection):
        coeff_str = "{:.2f}".format(coeff)
        output += coeff_str + "*" + feature + " + "
    return output[:-3]

In [19]:
def linear_reg_test (train_df, test_df, x_selection, submission_name = None, x_test_drop = "PassengerId"
                     , y_selection = 'Survived'):
    to_drop = [category for category in train_df.columns.values if category not in x_selection]
    # learning set
    X_train = train_df.drop(to_drop, axis=1)
    # answers for learning set
    Y_train = train_df[y_selection]
    # testing set
    to_drop.remove(y_selection)
    to_drop.append(x_test_drop)
    X_test  = test_df.drop(to_drop, axis=1).copy()
    # Logistic Regression
    linreg = LinearRegression()
    linreg.fit(X_train, Y_train)
    #Calc parameters
    function_str = get_regr_fun_lin(linreg, y_selection, x_selection)
    trainset_acc = round(linreg.score(X_train, Y_train) * 100, 2)
    r2_score = R2_score(linreg, X_train, Y_train)

    conf_matrix = "NA"
    
    #prints
    print('Regression function:\n', function_str)
    print('Accuracy on train set:', trainset_acc)
    print("R2 score:", r2_score)
    print("Confusion matrix:\n", conf_matrix)
    
    #make predictions
    Y_pred = linreg.predict(X_test)
    #export predictions
    submission = pd.DataFrame({
        "PassengerId": test_df[x_test_drop],
        "Survived": Y_pred
    })
    if submission_name == None:
        submission_name = "regression"+write_date()
    submission.to_csv(r'C:/Users/ernest.chocholowski/Desktop/Datasets/Titanic/output/'+submission_name+'.csv', 
                      index=False)
    
    return [submission_name, function_str, str(trainset_acc), r2_score, conf_matrix]

In [20]:
x_test_drop = "PassengerId"
y_selection = 'Survived'
x_selection = train_df.columns.values
x_selection = np.delete(x_selection, [0,1])

In [21]:
output = linear_reg_test(train_df, test_df, x_selection, submission_name='linearReg_basicImputator')

Regression function:
 Survived = 1.37 + -0.15*Pclass + -0.00*Name + -0.51*Sex + -0.01*Age + -0.04*SibSp + -0.02*Parch + -0.00*Ticket + 0.00*Fare + -0.04*Embarked
Accuracy on train set: 40.08
R2 score: 0.400771185732
Confusion matrix:
 NA


In [22]:
testset_acc = str('failed_evl')
output.insert(3, testset_acc)
super_table.append(output)

In [23]:
def other_tests (model, train_df, test_df, x_selection, submission_name = None, x_test_drop = "PassengerId"
                     , y_selection = 'Survived'):
    to_drop = [category for category in train_df.columns.values if category not in x_selection]
    # learning set
    X_train = train_df.drop(to_drop, axis=1)
    # answers for learning set
    Y_train = train_df[y_selection]
    # testing set
    to_drop.remove(y_selection)
    to_drop.append(x_test_drop)
    X_test  = test_df.drop(to_drop, axis=1).copy()
    # Logistic Regression
    model.fit(X_train, Y_train)
    #Calc parameters
    trainset_acc = round(model.score(X_train, Y_train) * 100, 2)
    r2_score = R2_score(model, X_train, Y_train)
    function_str = 'NA'
    conf_matrix = confusion_matrix(model, X_train, Y_train)
    conf_matrix = 'TN: '+str(conf_matrix[0][0])+', FP: '+str(conf_matrix[0][1])+ \
                      ', FN: '+str(conf_matrix[1][0])+', TP: '+str(conf_matrix[1][1])
    
    #prints
    print('Regression function:\n', function_str)
    print('Accuracy on train set:', trainset_acc)
    print("R2 score:", r2_score)
    print("Confusion matrix:\n", conf_matrix)
    
    #make predictions
    Y_pred = model.predict(X_test)
    #export predictions
    submission = pd.DataFrame({
        "PassengerId": test_df[x_test_drop],
        "Survived": Y_pred
    })
    if submission_name == None:
        submission_name = "regression"+write_date()
    submission.to_csv(r'C:/Users/ernest.chocholowski/Desktop/Datasets/Titanic/output/'+submission_name+'.csv', 
                      index=False)
    
    return [submission_name, function_str, str(trainset_acc), r2_score, conf_matrix]

In [24]:
x_test_drop = "PassengerId"
y_selection = 'Survived'
x_selection = train_df.columns.values
x_selection = np.delete(x_selection, [0,1])

In [8]:
# model = SVC()
# model = LinearSVC()
# model = KNeighborsClassifier(n_neighbors = 3)
# model = GaussianNB()
# model = Perceptron()
# model = SGDClassifier()
# model = DecisionTreeClassifier()
# model = RandomForestClassifier(n_estimators=100)

In [38]:
model = Perceptron()
output = other_tests(model, train_df, test_df, x_selection, submission_name='Perceptron_basicImputator')

Regression function:
 NA
Accuracy on train set: 67.45
R2 score: -0.376186367558
Confusion matrix:
 TN: 533, FP: 16, FN: 274, TP: 68


In [39]:
testset_acc = str('65.072')
output.insert(3, testset_acc)

In [40]:
super_table.append(output)

In [29]:
make_table(super_table)
apply_theme('basic')

0,1,2,3,4,5
name,reg_function,trainset_acc,testset_acc,r2score,conf_matrx
logisticReg_basic,Survived = 3.99 + -0.85*Pclass + -0.00*Name + -2.35*Sex + -0.03*Age + -0.32*SibSp + -0.06*Parch + -0.00*Ticket + 0.01*Fare,79.97,failed_evl,0.1696324007807416,"TN: 369, FP: 55, FN: 88, TP: 202"
linearReg_basic,Survived = 1.40 + -0.19*Pclass + -0.00*Name + -0.49*Sex + -0.01*Age + -0.06*SibSp + -0.01*Parch + -0.00*Ticket + 0.00*Fare,40.25,failed_evl,0.4025252345231439,
SVC_basic,,99.86,failed_evl,0.9941932335718932,"TN: 424, FP: 0, FN: 1, TP: 289"
KNC_basic,,82.49,failed_evl,0.2741541964866623,"TN: 356, FP: 68, FN: 57, TP: 233"
GaussianNB_basic,,78.57,failed_evl,0.11156473649967458,"TN: 352, FP: 72, FN: 81, TP: 209"
Perceptron_basic,,62.46,failed_evl,-0.5562134027325961,"TN: 417, FP: 7, FN: 261, TP: 29"
LinearSVC_basic,,49.44,failed_evl,-1.0962426805465193,"TN: 94, FP: 330, FN: 31, TP: 259"
SGDClassifier_basic,,65.69,failed_evl,-0.42265777488614203,"TN: 330, FP: 94, FN: 151, TP: 139"
DecisionTreeClassifier_basic,,100.0,failed_evl,1.0,"TN: 424, FP: 0, FN: 0, TP: 290"


In [866]:
df = pd.DataFrame(super_table[1:], columns=super_table[0])
df.to_csv(r'C:/Users/ernest.chocholowski/Desktop/GIT/Titanic/table_eCh.csv', 
                      index=False)