In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import plot_confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from ipywidgets import interact, widgets

In [2]:
# Load the Titanic training set from the local data directory.
df_train = pd.read_csv(filepath_or_buffer='./data/train.csv')

In [3]:
# Data Cleansing

In [4]:
# Peek at the first five rows to check that the file parsed as expected.
df_train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df_train.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [6]:
# Tally fully duplicated rows (True) against first occurrences (False).
df_train.duplicated(keep='first').value_counts()

False    891
dtype: int64

In [7]:
# Drop the columns judged, up front, to add no analytical value to the models.
# The choice is based on the data dictionary (https://www.kaggle.com/c/titanic/data)
# and on the describe() statistics.
df_train_curated = df_train.drop(columns=['PassengerId', 'Name', 'Ticket', 'Fare', 'Cabin'])
df_train_curated.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0,3,male,22.0,1,0,S
1,1,1,female,38.0,1,0,C
2,1,3,female,26.0,0,0,S
3,1,1,female,35.0,1,0,S
4,0,3,male,35.0,0,0,S


In [8]:
# Number of missing values in each column.
df_train_curated.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Embarked      2
dtype: int64

In [9]:
# Impute the missing Age values with the median age, then re-check the
# summary statistics and the remaining null counts.
df_train_curated = df_train_curated.fillna({'Age': df_train_curated['Age'].median()})
display(df_train_curated.describe(), df_train_curated.isna().sum())

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch
count,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.361582,0.523008,0.381594
std,0.486592,0.836071,13.019697,1.102743,0.806057
min,0.0,1.0,0.42,0.0,0.0
25%,0.0,2.0,22.0,0.0,0.0
50%,0.0,3.0,28.0,0.0,0.0
75%,1.0,3.0,35.0,1.0,0.0
max,1.0,3.0,80.0,8.0,6.0


Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Embarked    2
dtype: int64

In [10]:
# Impute the missing Embarked values with the most frequent port (the mode),
# then re-check the summary statistics and the remaining null counts.
df_train_curated = df_train_curated.fillna({'Embarked': df_train_curated['Embarked'].mode().iloc[0]})
display(df_train_curated.describe(), df_train_curated.isna().sum())

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch
count,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.361582,0.523008,0.381594
std,0.486592,0.836071,13.019697,1.102743,0.806057
min,0.0,1.0,0.42,0.0,0.0
25%,0.0,2.0,22.0,0.0,0.0
50%,0.0,3.0,28.0,0.0,0.0
75%,1.0,3.0,35.0,1.0,0.0
max,1.0,3.0,80.0,8.0,6.0


Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Embarked    0
dtype: int64

In [11]:
# One-hot encode the categorical (string) columns into separate indicator columns.
df_train_curated = pd.get_dummies(data=df_train_curated)
df_train_curated.head(n=5)

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,3,22.0,1,0,0,1,0,0,1
1,1,1,38.0,1,0,1,0,1,0,0
2,1,3,26.0,0,0,1,0,0,0,1
3,1,1,35.0,1,0,1,0,0,0,1
4,0,3,35.0,0,0,0,1,0,0,1


In [12]:
# Training and Testing

In [13]:
# Separate the feature matrix from the target column.
X = df_train_curated.drop(columns='Survived')
y = df_train_curated['Survived']

# Hold out 20% of the rows for evaluation. The seed is fixed so that a
# Restart & Run All reproduces the same split, and therefore the same scores
# and charts; without it every run shuffles the rows differently.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=41)

In [14]:
def plot_predicted_values(y_pred):
    """Scatter predicted and actual survival labels against passenger age.

    Reads the notebook-level ``X_test`` and ``y_test``; the predictions are
    drawn as large red crosses and the actual labels as blue dots.

    Parameters
    ----------
    y_pred : array-like
        Predicted labels, plotted against ``X_test['Age']``.
    """
    _, ax = plt.subplots(figsize=(12, 9))
    ages = X_test['Age']

    # A single scalar size applies to every marker.
    ax.scatter(ages, y_pred, label='predicted value', marker='x', color='red', s=200)
    ax.scatter(ages, y_test, label='actual value', color='blue')

    ax.set_title('Classification Comparison', fontsize=15)
    ax.set_xlabel('Age', fontsize=12)
    ax.set_ylabel('Survived (1=Yes, 0=No)', fontsize=12)
    ax.legend()

    plt.show()

In [15]:
def perceptron_model(max_iter, learn_rate, rand_state, show):
    """Fit a Perceptron on the training split and show its scores or charts.

    Uses the notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.

    Parameters
    ----------
    max_iter : int
        Passed to ``Perceptron(max_iter=...)``.
    learn_rate : float
        Passed to ``Perceptron(eta0=...)``.
    rand_state : int
        Passed to ``Perceptron(random_state=...)``.
    show : str
        ``'Charts'`` plots the predictions, confusion matrix and ROC curve for
        the test split; ``'Score'`` prints the accuracy on both splits.
    """
    perceptron = Perceptron(max_iter=max_iter, eta0=learn_rate, random_state=rand_state)
    perceptron.fit(X_train, y_train)

    if show == 'Charts':
        y_pred = perceptron.predict(X_test)
        plot_predicted_values(y_pred)
        # NOTE(review): plot_confusion_matrix / plot_roc_curve were deprecated in
        # scikit-learn 1.0 and removed in 1.2; on newer versions use
        # ConfusionMatrixDisplay.from_estimator / RocCurveDisplay.from_estimator.
        plot_confusion_matrix(perceptron, X_test, y_test)
        plot_roc_curve(perceptron, X_test, y_test)  # Receiver Operating Characteristic
    elif show == 'Score':
        # The training score alone overstates performance because the model was
        # fitted on those rows, so the held-out score is reported next to it.
        print("Train score: " + str(perceptron.score(X_train, y_train)))
        print("Test score: " + str(perceptron.score(X_test, y_test)))
    

In [16]:
# Interactive controls for the Perceptron hyper-parameters and the output mode.
interact(
    perceptron_model,
    max_iter=widgets.IntSlider(value=41, min=20, max=10000, step=10),
    learn_rate=widgets.FloatSlider(value=.5, min=.05, max=1, step=.05),
    rand_state=widgets.IntSlider(value=41, min=1, max=100, step=1),
    show=['Score', 'Charts'],
)

interactive(children=(IntSlider(value=41, description='max_iter', max=10000, min=20, step=10), FloatSlider(val…

<function __main__.perceptron_model(max_iter, learn_rate, rand_state, show)>

In [17]:
def sgd_model(show, max_iter):
    """Fit an SGDClassifier on the training split and show its scores or charts.

    Uses the notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.

    Parameters
    ----------
    show : str
        ``'Charts'`` plots the predictions, confusion matrix and ROC curve for
        the test split; ``'Score'`` prints the accuracy on both splits.
    max_iter : int
        Passed to ``SGDClassifier(max_iter=...)``.
    """
    # NOTE(review): no random_state is passed here, unlike the other two models,
    # so repeated calls may give different fits - confirm whether that is intended.
    sgd = SGDClassifier(max_iter=max_iter)
    sgd.fit(X_train, y_train)

    if show == 'Charts':
        y_pred = sgd.predict(X_test)
        plot_predicted_values(y_pred)
        # NOTE(review): plot_confusion_matrix / plot_roc_curve were deprecated in
        # scikit-learn 1.0 and removed in 1.2; on newer versions use
        # ConfusionMatrixDisplay.from_estimator / RocCurveDisplay.from_estimator.
        plot_confusion_matrix(sgd, X_test, y_test)
        plot_roc_curve(sgd, X_test, y_test)  # Receiver Operating Characteristic
    elif show == 'Score':
        # The training score alone overstates performance because the model was
        # fitted on those rows, so the held-out score is reported next to it.
        print("Train score: " + str(sgd.score(X_train, y_train)))
        print("Test score: " + str(sgd.score(X_test, y_test)))
    

In [18]:
# Interactive controls for the SGD classifier: output mode and iteration cap.
interact(
    sgd_model,
    show=['Score', 'Charts'],
    max_iter=widgets.IntSlider(value=8040, min=1000, max=10000, step=10),
)

interactive(children=(Dropdown(description='show', options=('Score', 'Charts'), value='Score'), IntSlider(valu…

<function __main__.sgd_model(show, max_iter)>

In [19]:
def logistic_regression_model(show, max_iter, rand_state):
    """Fit a LogisticRegression on the training split and show scores or charts.

    Uses the notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.

    Parameters
    ----------
    show : str
        ``'Charts'`` displays the ten most confident predictions for each
        class, then the prediction scatter, confusion matrix and ROC curve for
        the test split; ``'Score'`` prints the accuracy on both splits.
    max_iter : int
        Passed to ``LogisticRegression(max_iter=...)``.
    rand_state : int
        Passed to ``LogisticRegression(random_state=...)``.
    """
    lr = LogisticRegression(max_iter=max_iter, random_state=rand_state)
    lr.fit(X_train, y_train)

    if show == 'Charts':
        y_pred = lr.predict(X_test)
        y_pred_prob = lr.predict_proba(X_test)

        print("Test Dataset Forecasted - Top 10 values sorted descending by its forecast likelihoodness to be true")
        df_pred = X_test.copy()
        df_pred['survived_actual'] = y_test
        df_pred['survived_pred'] = y_pred

        # predict_proba returns one column per class; slice the columns directly
        # instead of splitting each row in a Python loop.
        df_pred['likelihood_0'] = y_pred_prob[:, 0]
        df_pred['likelihood_1'] = y_pred_prob[:, 1]

        display(
            df_pred.sort_values(by=['likelihood_1'], ascending=False).head(10)
        )

        print("Test Dataset Forecasted - Top 10 values sorted descending by its forecast likelihoodness to be false")

        display(
            df_pred.sort_values(by=['likelihood_0'], ascending=False).head(10)
        )

        plot_predicted_values(y_pred)
        # NOTE(review): plot_confusion_matrix / plot_roc_curve were deprecated in
        # scikit-learn 1.0 and removed in 1.2; on newer versions use
        # ConfusionMatrixDisplay.from_estimator / RocCurveDisplay.from_estimator.
        plot_confusion_matrix(lr, X_test, y_test)
        plot_roc_curve(lr, X_test, y_test)  # Receiver Operating Characteristic
    elif show == 'Score':
        # The training score alone overstates performance because the model was
        # fitted on those rows, so the held-out score is reported next to it.
        print("Train score: " + str(lr.score(X_train, y_train)))
        print("Test score: " + str(lr.score(X_test, y_test)))
    

In [1]:
# Interactive controls for the logistic regression: output mode, iteration cap and seed.
# Needs the import cell (interact, widgets) to have been run in the current kernel.
display(
    interact(
        logistic_regression_model,
        show=['Score', 'Charts'],
        max_iter=widgets.IntSlider(value=200, min=200, max=100000, step=10),
        rand_state=widgets.IntSlider(value=41, min=1, max=100, step=1),
    )
)

NameError: name 'interact' is not defined