In [2]:
# global imports 
import streamlit as st 
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

In [37]:
# function definitions
def capitalize_columns(data):
    """Capitalize every column label of ``data`` in place and return it.

    Each label is rewritten with an upper-case first character and a
    lower-case remainder (so ``'passengerId'`` becomes ``'Passengerid'``).
    The frame that was passed in is itself relabelled; the returned object
    is that same frame, not a copy.
    """
    capitalized_labels = data.columns.str.capitalize()
    data.columns = capitalized_labels
    return data
def manipulate_data(data):
    """Build the modelling frame from a DataFrame with capitalized columns.

    Transformations applied:
      * ``Sex`` is encoded as 0 for ``'male'`` and 1 for ``'female'``; any
        other value becomes NaN because ``Series.map`` has no entry for it.
      * ``Pclass`` is expanded into the boolean indicator columns
        ``FirstClass``, ``SecondClass`` and ``ThirdClass`` (values 1, 2, 3).
      * Missing ``Age`` values are replaced by the mean ``Age`` of the rows
        passed in.

    All three class indicators are always produced, so a frame that happens
    to contain no passenger of some class does not raise ``KeyError``. The
    caller's frame is never modified, which makes repeated calls on the same
    raw frame safe.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``Age``, ``Sex``, ``Pclass`` and
        ``Survived``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns ``Age``, ``Sex``, ``FirstClass``,
        ``SecondClass``, ``ThirdClass`` and ``Survived``, in that order.

    Raises
    ------
    KeyError
        If one of the required input columns is missing.
    """
    sex_codes = {'male': 0, 'female': 1}
    class_columns = {1: 'FirstClass', 2: 'SecondClass', 3: 'ThirdClass'}

    # Compare against each known class explicitly instead of relying on
    # get_dummies, which only emits columns for values present in the data.
    class_flags = {
        column: data['Pclass'].eq(value)
        for value, column in class_columns.items()
    }

    # assign() returns a new frame, so the input keeps its original 'Sex'
    # strings and 'Age' gaps.
    prepared = data.assign(
        Sex=data['Sex'].map(sex_codes),
        Age=data['Age'].fillna(data['Age'].mean()),
        **class_flags,
    )
    return prepared[['Age', 'Sex', 'FirstClass', 'SecondClass', 'ThirdClass', 'Survived']]

In [38]:
# Load the raw training table from the CSV file under data/.
train_df = pd.read_csv(filepath_or_buffer='data/train.csv')

In [39]:
# Normalize the column labels first, then derive the modelling columns.
# NOTE: this rebinds train_df to the prepared frame, so the raw table has to
# be reloaded before this cell can be run a second time.
train_df = manipulate_data(capitalize_columns(train_df))
print(train_df.head())

    Age  Sex  FirstClass  SecondClass  ThirdClass  Survived
0  22.0    0       False        False        True         0
1  38.0    1        True        False       False         1
2  26.0    1       False        False        True         1
3  35.0    1        True        False       False         1
4  35.0    0       False        False        True         0


In [40]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
features = ['Age', 'Sex', 'FirstClass', 'SecondClass', 'ThirdClass']
X_train, X_test, y_train, y_test = train_test_split(
    train_df[features],
    train_df['Survived'],
    test_size=0.2,
    random_state=42,
)

In [41]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits.
scaler = StandardScaler().fit(X_train)
train_features = scaler.transform(X_train)
test_features = scaler.transform(X_test)

In [42]:
# Fit a logistic regression with default settings on the scaled training
# features, then report mean accuracy on both splits.
model = LogisticRegression().fit(train_features, y_train)
train_score = model.score(train_features, y_train)
test_score = model.score(test_features, y_test)
print(f'Train score: {train_score}')
print(f'Test score: {test_score}')

Train score: 0.7949438202247191
Test score: 0.8044692737430168


In [45]:
# Predict on the held-out split and tabulate true labels (rows) against
# predicted labels (columns).
y_pred = model.predict(test_features)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[91 14]
 [21 53]]
