# Logistic Regression

In [262]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

### Read in the data

In [263]:
# Load the cleaned Titanic passenger data from the project's data folder
csv_path = Path("data_files/titanic_cleaned.csv")
titanic_df = pd.read_csv(csv_path)

In [264]:
# Preview the first five rows to confirm the file loaded with the expected columns
titanic_df.head()

Unnamed: 0,survived,name,sex,sibsp,parch,ticket,fare,age_updated,embarked_updated,pclass_updated
0,1,"Allen, Miss. Elisabeth Walton",female,0,0,24160,211.3375,29.0,Southampton (UK),1st
1,1,"Allison, Master. Hudson Trevor",male,1,2,113781,151.55,0.9167,Southampton (UK),1st
2,0,"Allison, Miss. Helen Loraine",female,1,2,113781,151.55,2.0,Southampton (UK),1st
3,0,"Allison, Mr. Hudson Joshua Creighton",male,1,2,113781,151.55,30.0,Southampton (UK),1st
4,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,1,2,113781,151.55,25.0,Southampton (UK),1st


### Prepare the data

In [265]:
# The passenger name and ticket number are not used as features, so remove them
titanic = titanic_df.drop(columns=['name', 'ticket'])
titanic.head()

Unnamed: 0,survived,sex,sibsp,parch,fare,age_updated,embarked_updated,pclass_updated
0,1,female,0,0,211.3375,29.0,Southampton (UK),1st
1,1,male,1,2,151.55,0.9167,Southampton (UK),1st
2,0,female,1,2,151.55,2.0,Southampton (UK),1st
3,0,male,1,2,151.55,30.0,Southampton (UK),1st
4,0,female,1,2,151.55,25.0,Southampton (UK),1st


#### Changing categorical data into dummies

In [266]:
# One-hot encode each categorical column with get_dummies.
# drop_first=True omits the first level of each column: it is implied when all the
# remaining dummies are 0, so keeping it would only add a redundant (collinear) column.
sex_dummies, embarked_dummies, pclass_dummies = (
    pd.get_dummies(titanic[column], drop_first=True)
    for column in ('sex', 'embarked_updated', 'pclass_updated')
)


In [267]:
# Append the dummy columns to the frame, then remove the original categorical
# columns they replace
df_titanic_transformed = (
    pd.concat([titanic, sex_dummies, embarked_dummies, pclass_dummies], axis=1)
    .drop(columns=['sex', 'embarked_updated', 'pclass_updated'])
)
df_titanic_transformed.head()

Unnamed: 0,survived,sibsp,parch,fare,age_updated,male,Queenstown (IE),Southampton (UK),2nd,3rd
0,1,0,0,211.3375,29.0,0,0,1,0,0
1,1,1,2,151.55,0.9167,1,0,1,0,0
2,0,1,2,151.55,2.0,0,0,1,0,0
3,0,1,2,151.55,30.0,1,0,1,0,0
4,0,1,2,151.55,25.0,0,0,1,0,0


### Separate the data into labels and features

In [268]:
# Split the frame into the target column (y) and the remaining feature columns (X)
target_column = "survived"
X = df_titanic_transformed.drop(target_column, axis=1)
y = df_titanic_transformed[target_column]

In [269]:
# Review the y variable series (first five target labels)
y.head()

0    1
1    1
2    0
3    0
4    0
Name: survived, dtype: int64

In [270]:
# Review the X variable dataframe (first five rows of the feature columns)
X.head()

Unnamed: 0,sibsp,parch,fare,age_updated,male,Queenstown (IE),Southampton (UK),2nd,3rd
0,0,0,211.3375,29.0,0,0,1,0,0
1,1,2,151.55,0.9167,1,0,1,0,0
2,1,2,151.55,2.0,0,0,1,0,0
3,1,2,151.55,30.0,1,0,1,0,0
4,1,2,151.55,25.0,0,0,1,0,0


In [271]:
# Check the balance of our target values (number of rows for each value of `survived`)
y.value_counts()

0    809
1    500
Name: survived, dtype: int64

### Split data into training and testing datasets using train_test_split

In [273]:
# Hold out a test set using train_test_split's default split size.
# stratify=y keeps the survived/perished proportions the same in both splits,
# and random_state=1 makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=1)
X_train.shape


(981, 9)

### Standardise data

In [274]:
# Learn the scaling parameters from the training split only, then apply that same
# fitted transformation to both splits so the test data never influences the scaler.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# https://towardsdatascience.com/feature-scaling-and-normalisation-in-a-nutshell-5319af86f89b

In [275]:
# NOTE: abandoned experiment, kept commented out so that X_train_scaled from the
# scaling cell is not overwritten. The idea was to standardise only the numeric
# columns and leave the 0/1 dummy columns unscaled.

# # Standardise training set
# X_train_scaled = StandardScaler().fit_transform(X_train[['sibsp', 'parch', 'fare', 'age_updated']])
# X_train_scaled[0:3]

# # Create a DataFrame with the scaled data
# X_train_scaled = pd.DataFrame(
#     X_train_scaled,
#     columns=['sibsp', 'parch', 'fare', 'age_updated'])
# X_train_scaled

# X_train_transformed = pd.concat([X_train_scaled, X_train['male'], X_train['Queenstown (IE)'], X_train['Southampton (UK)'], X_train['2nd'], X_train['3rd']], axis=1)
# X_train_transformed

# the above code works until I try and concat it together - not sure what I'm doing wrong
# Likely cause: pd.DataFrame(X_train_scaled, ...) is given a fresh 0..n-1 index, while
# X_train still carries the shuffled row labels from train_test_split. pd.concat(axis=1)
# aligns rows on index labels, so the two pieces do not line up and NaNs appear.
# Passing index=X_train.index to pd.DataFrame(...) should make the rows align.

### Create logistic regression model with the original data

In [276]:
# Create a logistic regression model using the lbfgs solver.
# max_iter is left at scikit-learn's default (100): it only caps the number of solver
# iterations, so it needs raising (e.g. max_iter=200) only if fit() emits a
# ConvergenceWarning.
classifier = LogisticRegression(solver='lbfgs', random_state=1)

# Fit and train the model using the scaled training data
classifier.fit(X_train_scaled, y_train)

LogisticRegression(random_state=1)

In [277]:
# Make predictions for the scaled test features with the fitted classifier
predictions = classifier.predict(X_test_scaled)

### Evaluate the model's performance

In [278]:
# Balanced accuracy is the average of the recall obtained on each class
balanced_accuracy = balanced_accuracy_score(y_test, predictions)
print(f"The balanced accuracy score of the model is {balanced_accuracy}")

The balanced accuracy score of the model is 0.7553497536945812


In [279]:
# Confusion matrix: rows are the actual classes, columns are the predicted classes.
# labels=[0, 1] pins the class order (0 = perished, 1 = survived) so the matrix
# always matches the hard-coded row/column names below.
cm = confusion_matrix(y_test, predictions, labels=[0, 1])
cm_df = pd.DataFrame(
    cm,
    index=['Actual - perished', 'Actual - survived'],
    columns=['Predicted - perished', 'Predicted - survived'],
)
print(cm)
display(cm_df)

[[180  23]
 [ 47  78]]


Unnamed: 0,Predicted - perished,Predicted - survived
Actual - perished,180,23
Acutal - survived,47,78


In [280]:
# Per-class precision, recall and F1 on the held-out test set
target_names = ["Perished", "Survived"]  # display names for classes 0 and 1
report = classification_report(y_test, predictions, target_names=target_names)
print(report)


              precision    recall  f1-score   support

    Perished       0.79      0.89      0.84       203
    Survived       0.77      0.62      0.69       125

    accuracy                           0.79       328
   macro avg       0.78      0.76      0.76       328
weighted avg       0.79      0.79      0.78       328



The model has a balanced accuracy score of 76%. Balanced accuracy is the average of the recall for each class, so it is pulled down by the low recall score for predicting survivors.

Precision:
When the model predicts a survivor, it is correct 77% of the time (78 of 101 predictions), indicating that some passengers who perished are incorrectly being classified as survivors.
When the model predicts that someone perished, it is correct 79% of the time (180 of 227 predictions).

Recall:
This looks at when the passenger actually survived, how often the model correctly predicts them as surviving. This model will classify a survivor correctly just 62% of the time (78 of 125).
When a passenger actually perished, the model will classify them correctly as perishing 89% of the time (180 of 203).

Overall, the model is more accurate at predicting those who perished (89% recall) than those who survived (62% recall).