# Logistic Regression

Optimisations made:
- Adding a field that sums the number of family members a passenger had on board
- Using a random over sampler on the training split due to the imbalanced data

In [1]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

In [2]:
# Load the cleaned Titanic passenger data from its CSV file
csv_path = Path("data_files/titanic_cleaned.csv")
titanic_df = pd.read_csv(csv_path)

# Preview the first rows to confirm the file loaded as expected
titanic_df.head()

Unnamed: 0,passenger_id,pclass,survived,name,sex,age,sibsp,parch,fare,cabin,embarked,age_updated,embarked_updated,fare_updated,deck,pclass_updated
0,1,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,211.3375,B5,S,29.0,Southampton (UK),211.3375,B,First class
1,2,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,151.55,C22 C26,S,0.9167,Southampton (UK),151.55,C,First class
2,3,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,151.55,C22 C26,S,2.0,Southampton (UK),151.55,C,First class
3,4,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,151.55,C22 C26,S,30.0,Southampton (UK),151.55,C,First class
4,5,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,151.55,C22 C26,S,25.0,Southampton (UK),151.55,C,First class


In [3]:
# Create a copy of the dataframe so the preparation steps below leave `titanic_df` as loaded
titanic = titanic_df.copy()

## Prepare the data

In [4]:
# Drop the identifier/name columns plus the raw age, embarked, fare and cabin columns,
# which are superseded by the age_updated, embarked_updated, fare_updated and deck columns.
# The result is derived from the untouched `titanic_df` (not from `titanic` itself) so that
# re-running this cell does not raise a KeyError for columns that were already dropped.
titanic = titanic_df.drop(columns=['passenger_id', 'name', 'age', 'embarked', 'fare', 'cabin'])
titanic.head()

Unnamed: 0,pclass,survived,sex,sibsp,parch,age_updated,embarked_updated,fare_updated,deck,pclass_updated
0,1,1,female,0,0,29.0,Southampton (UK),211.3375,B,First class
1,1,1,male,1,2,0.9167,Southampton (UK),151.55,C,First class
2,1,0,female,1,2,2.0,Southampton (UK),151.55,C,First class
3,1,0,male,1,2,30.0,Southampton (UK),151.55,C,First class
4,1,0,female,1,2,25.0,Southampton (UK),151.55,C,First class


### Adding field that calculates the number of family members they were travelling with

In [5]:
# Add sibsp and parch together into a single family_members column, then remove
# the two source columns because their information now lives in the new column
titanic = (
    titanic
    .assign(family_members=lambda frame: frame['sibsp'] + frame['parch'])
    .drop(columns=['sibsp', 'parch'])
)

titanic.head()

Unnamed: 0,pclass,survived,sex,age_updated,embarked_updated,fare_updated,deck,pclass_updated,family_members
0,1,1,female,29.0,Southampton (UK),211.3375,B,First class,0
1,1,1,male,0.9167,Southampton (UK),151.55,C,First class,3
2,1,0,female,2.0,Southampton (UK),151.55,C,First class,3
3,1,0,male,30.0,Southampton (UK),151.55,C,First class,3
4,1,0,female,25.0,Southampton (UK),151.55,C,First class,3


### Convert categorical data

In [6]:
# One-hot encode each categorical column with get_dummies. drop_first=True omits the
# first category of every variable; it is implied when all remaining indicators are 0,
# so keeping it would only add a redundant column.
sex_dummies, embarked_dummies, deck_dummies, pclass_dummies = (
    pd.get_dummies(titanic[column], drop_first=True)
    for column in ('sex', 'embarked_updated', 'deck', 'pclass_updated')
)

In [7]:
# Append the indicator columns to the frame, then remove the text columns they were built from.
# NOTE(review): the numeric `pclass` column is kept alongside the pclass_updated dummies,
# so passenger class is encoded twice in the features - confirm this is intended.
df_titanic_transformed = (
    pd.concat(
        [titanic, sex_dummies, embarked_dummies, deck_dummies, pclass_dummies],
        axis='columns',
    )
    .drop(columns=['sex', 'embarked_updated', 'deck', 'pclass_updated'])
)
df_titanic_transformed.head()

Unnamed: 0,pclass,survived,age_updated,fare_updated,family_members,male,Queenstown (IE),Southampton (UK),B,C,D,E,F,G,T,Unknown,Second class,Third class
0,1,1,29.0,211.3375,0,0,0,1,1,0,0,0,0,0,0,0,0,0
1,1,1,0.9167,151.55,3,1,0,1,0,1,0,0,0,0,0,0,0,0
2,1,0,2.0,151.55,3,0,0,1,0,1,0,0,0,0,0,0,0,0
3,1,0,30.0,151.55,3,1,0,1,0,1,0,0,0,0,0,0,0,0
4,1,0,25.0,151.55,3,0,0,1,0,1,0,0,0,0,0,0,0,0


### Separate the data into labels and features

In [8]:
# Target (y): the survived column
y = df_titanic_transformed["survived"]

# Features (X): every column except the target
X = df_titanic_transformed.drop('survived', axis='columns')

In [9]:
# Review the first rows of the y target Series
y.head()

0    1
1    1
2    0
3    0
4    0
Name: survived, dtype: int64

In [10]:
# Review the first rows of the X feature DataFrame
X.head()

Unnamed: 0,pclass,age_updated,fare_updated,family_members,male,Queenstown (IE),Southampton (UK),B,C,D,E,F,G,T,Unknown,Second class,Third class
0,1,29.0,211.3375,0,0,0,1,1,0,0,0,0,0,0,0,0,0
1,1,0.9167,151.55,3,1,0,1,0,1,0,0,0,0,0,0,0,0
2,1,2.0,151.55,3,0,0,1,0,1,0,0,0,0,0,0,0,0
3,1,30.0,151.55,3,1,0,1,0,1,0,0,0,0,0,0,0,0
4,1,25.0,151.55,3,0,0,1,0,1,0,0,0,0,0,0,0,0


In [11]:
# Check the balance of our target values: number of passengers per `survived` class
y.value_counts()

0    809
1    500
Name: survived, dtype: int64

## Split data into training and testing datasets using train_test_split

In [12]:
# Hold out a test set. Stratifying on y keeps the survived/perished ratio the same
# in both splits; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=1)

# Rows and columns available for training
X_train.shape


(981, 17)

## Apply random over sampler due to the imbalanced data

In [13]:
# Seeded random over-sampler, applied to the training split only
ros = RandomOverSampler(random_state=1)

# Resample the training features and labels together so they stay aligned
X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)

# Confirm how many training rows each class has after resampling
y_train_ros.value_counts()

1    606
0    606
Name: survived, dtype: int64

## Create logistic regression model with the resampled training data

In [14]:
# Configure the logistic regression estimator: lbfgs solver, up to 800 iterations, fixed seed
classifier = LogisticRegression(
    solver='lbfgs',
    max_iter=800,
    random_state=1,
)

# Train the model on the over-sampled training data
classifier.fit(X_train_ros, y_train_ros)

LogisticRegression(max_iter=800, random_state=1)

In [15]:
# Make predictions for the held-out test features (the test split is not resampled)
predictions = classifier.predict(X_test)

## Evaluate the model's performance

In [16]:
# Balanced accuracy of the test-set predictions, reported in a sentence
balanced_acc = balanced_accuracy_score(y_test, predictions)
print(f"The balanced accuracy score of the model is {balanced_acc}")

The balanced accuracy score of the model is 0.77471921182266


In [17]:
# Confusion matrix: rows are the actual outcome, columns are the predicted outcome.
# labels=[0, 1] pins the row/column order to (perished, survived) explicitly, so the
# headings below always line up with the counts instead of relying on whichever
# labels happen to be found in the data.
cm = confusion_matrix(y_test, predictions, labels=[0, 1])
cm_df = pd.DataFrame(
    cm,
    index=['Actual - perished', 'Actual - survived'],
    columns=['Predicted - perished', 'Predicted - survived'],
)
print(cm)
display(cm_df)

[[170  33]
 [ 36  89]]


Unnamed: 0,Predicted - perished,Predicted - survived
Actual - perished,170,33
Acutal - survived,36,89


In [18]:
# Per-class precision, recall and F1 for the test-set predictions.
# target_names labels class 0 as "Perished" and class 1 as "Survived" in the report.
target_names = ["Perished", "Survived"]
report = classification_report(y_test, predictions, target_names=target_names)
print(report)


              precision    recall  f1-score   support

    Perished       0.83      0.84      0.83       203
    Survived       0.73      0.71      0.72       125

    accuracy                           0.79       328
   macro avg       0.78      0.77      0.78       328
weighted avg       0.79      0.79      0.79       328



The model has a balanced accuracy score of 77%, which is held down by the lower recall score for predicting survivors (balanced accuracy is the average of the two recall scores).

Precision:
When the model predicts a survivor, it is correct 73% of the time, indicating that some passengers who perished are incorrectly being classified as survivors.
When the model predicts that someone perished, it is correct 83% of the time.

Recall:
This looks at how often the model correctly predicts a passenger as surviving when they actually survived. This model classifies a survivor correctly just 71% of the time.
When a passenger actually perished, the model classifies them correctly as perishing 84% of the time.

Overall, the model is more accurate at predicting those who perished than those who survived, on both precision and recall.