In [1]:
import random

import numpy as ns
import pandas as pd

import seaborn as sns

from sklearn.datasets import make_moons
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

import statsmodels.discrete.discrete_model as sm


# Logistic regresion

\begin{equation*}
\hat{p}=\frac{1}{1 + e^{-(\hat{w_{0}} + \hat{w_{1}} * x_{1})...}}
\end{equation*}

### Prepare dataset

In [2]:
df = pd.read_csv('input_data/train.csv', index_col=0)

# Check dataset size
df.shape

(891, 11)

In [3]:
# Show some data
df.head(3)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [4]:
# Check unique data count
df.agg(['nunique'])

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
nunique,2,3,891,2,88,7,7,681,248,147,3


In [5]:
# Check NaN
df.isnull().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [6]:
# Drop not needed columns
# Name, Ticket ---> unique values
# Cabin        ---> too much missing data
df = df.drop(columns=['Name', 'Cabin', 'Ticket'])

In [7]:
# Fill in missing data
# Age      ---> use mediane
# Embarked ---> use random value
df['Age'] = df['Age'].fillna(df.groupby('Sex')['Age'].transform('median'))
df['Embarked'] = df['Embarked'].fillna(random.choice(df['Embarked'].unique()))

In [8]:
# Change text data to numeric
df['Sex_num'] = df['Sex'].replace(['female', 'male'], [0, 1])
df['Embarked_num'] = df['Embarked'].replace(['S', 'C', 'Q'], [0, 1, 2])

### Prepare train and test set

In [9]:
# Prepare train and test data
X_train, X_test, y_train, y_test = train_test_split(
    df[['Pclass', 'Sex_num', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_num']],
    df['Survived'], 
    test_size=0.25
)

def get_X(column_names):
    return X_train[column_names], X_test[column_names]


MODELS = [
    ['Pclass', 'Sex_num', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_num'],
    ['Pclass', 'Sex_num', 'Age', 'SibSp', 'Parch'],
    ['Pclass', 'Sex_num', 'Age'],
    ['Pclass'],
    ['Sex_num'],
    ['Age'],
    ['SibSp'],
    ['Parch'],
    ['Fare'],
    ['Embarked_num'],     
]


### Fit with statsmodels

In [10]:
def fit_model(columns):
    print(f'Predict based on: {" - ".join(columns)}')
    print('\n')
    
    X_train, X_test = get_X(columns)
    logit = sm.Logit(y_train, X_train)
    f = logit.fit()
    
    y_train_pred = f.predict(X_train)
    acc_train = accuracy_score(
        y_true=y_train,
        y_pred=y_train_pred.apply(lambda x: 1 if x > 0.5 else 0)
    )
    y_test_pred = f.predict(X_test)
    acc_test = accuracy_score(
        y_true=y_test,
        y_pred=y_test_pred.apply(lambda x: 1 if x > 0.5 else 0)
    )
    
    print(f.params)
    print(f.summary())
    print('\n')
    print(f'Acc train {acc_train}')
    print(f'Acc test {acc_test}')
    print('-' * 20)
    print('\n')

    
for model in MODELS:
    fit_model(model)


Predict based on: Pclass - Sex_num - Age - SibSp - Parch - Fare - Embarked_num


Optimization terminated successfully.
         Current function value: 0.497270
         Iterations 6
Pclass          0.031799
Sex_num        -2.129206
Age             0.005367
SibSp          -0.228764
Parch          -0.001034
Fare            0.015342
Embarked_num    0.278353
dtype: float64
                           Logit Regression Results                           
Dep. Variable:               Survived   No. Observations:                  668
Model:                          Logit   Df Residuals:                      661
Method:                           MLE   Df Model:                            6
Date:                Mon, 24 May 2021   Pseudo R-squ.:                  0.2521
Time:                        21:38:19   Log-Likelihood:                -332.18
converged:                       True   LL-Null:                       -444.16
Covariance Type:            nonrobust   LLR p-value:                 1.486

Optimization terminated successfully.
         Current function value: 0.689846
         Iterations 4
Fare    0.002764
dtype: float64
                           Logit Regression Results                           
Dep. Variable:               Survived   No. Observations:                  668
Model:                          Logit   Df Residuals:                      667
Method:                           MLE   Df Model:                            0
Date:                Mon, 24 May 2021   Pseudo R-squ.:                -0.03751
Time:                        21:38:19   Log-Likelihood:                -460.82
converged:                       True   LL-Null:                       -444.16
Covariance Type:            nonrobust   LLR p-value:                       nan
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Fare           0.0028      0.001      2.034      0.042       0.000       0.0

### Fit with sklearn

In [11]:
def fit_model(columns):
    print(f'Predict based on: {" - ".join(columns)}')
    
    X_train, X_test = get_X(columns)
    m = LogisticRegression(C=1e5, max_iter=10_000)
    f = m.fit(X_train, y_train)
    
    y_train_pred = f.predict(X_train)
    acc_train = accuracy_score(
        y_true=y_train,
        y_pred=y_train_pred
    )
    y_test_pred = f.predict(X_test)
    acc_test = accuracy_score(
        y_true=y_test,
        y_pred=y_test_pred
    )

    print(f'Acc train {acc_train}')
    print(f'Acc test {acc_test}')
    print('-' * 20)
    print('\n')

    
for model in MODELS:
    fit_model(model)


Predict based on: Pclass - Sex_num - Age - SibSp - Parch - Fare - Embarked_num
Acc train 0.7949101796407185
Acc test 0.7982062780269058
--------------------


Predict based on: Pclass - Sex_num - Age - SibSp - Parch
Acc train 0.7919161676646707
Acc test 0.7937219730941704
--------------------


Predict based on: Pclass - Sex_num - Age
Acc train 0.7844311377245509
Acc test 0.7892376681614349
--------------------


Predict based on: Pclass
Acc train 0.687125748502994
Acc test 0.6547085201793722
--------------------


Predict based on: Sex_num
Acc train 0.7904191616766467
Acc test 0.7757847533632287
--------------------


Predict based on: Age
Acc train 0.6182634730538922
Acc test 0.6098654708520179
--------------------


Predict based on: SibSp
Acc train 0.6182634730538922
Acc test 0.6098654708520179
--------------------


Predict based on: Parch
Acc train 0.6137724550898204
Acc test 0.5919282511210763
--------------------


Predict based on: Fare
Acc train 0.6661676646706587
Acc test 0.