# Logistic Regression - optimisation 3

Optimisations made:
- removing embarked location

In [1]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

In [2]:
# Load the cleaned Titanic passenger data from the project's data folder
titanic_csv = Path("data_files") / "titanic_cleaned.csv"
titanic_df = pd.read_csv(titanic_csv)
titanic_df.head()  # preview the first rows of the loaded frame

Unnamed: 0,passenger_id,pclass,survived,name,sex,age,sibsp,parch,fare,cabin,embarked,age_updated,embarked_updated,fare_updated,deck,pclass_updated
0,1,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,211.3375,B5,S,29.0,Southampton (UK),211.3375,B,First class
1,2,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,151.55,C22 C26,S,0.9167,Southampton (UK),151.55,C,First class
2,3,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,151.55,C22 C26,S,2.0,Southampton (UK),151.55,C,First class
3,4,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,151.55,C22 C26,S,30.0,Southampton (UK),151.55,C,First class
4,5,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,151.55,C22 C26,S,25.0,Southampton (UK),151.55,C,First class


In [3]:
# Work on an independent deep copy so the frame read from the CSV stays untouched
titanic = titanic_df.copy(deep=True)

## Prepare the data

In [4]:
# Drop the identifier, the raw columns superseded by their "*_updated" versions,
# and the cabin/deck columns
superseded_columns = ['passenger_id', 'pclass', 'age', 'embarked', 'fare', 'cabin', 'deck']
titanic = titanic.drop(columns=superseded_columns)
titanic.head()

Unnamed: 0,survived,name,sex,sibsp,parch,age_updated,embarked_updated,fare_updated,pclass_updated
0,1,"Allen, Miss. Elisabeth Walton",female,0,0,29.0,Southampton (UK),211.3375,First class
1,1,"Allison, Master. Hudson Trevor",male,1,2,0.9167,Southampton (UK),151.55,First class
2,0,"Allison, Miss. Helen Loraine",female,1,2,2.0,Southampton (UK),151.55,First class
3,0,"Allison, Mr. Hudson Joshua Creighton",male,1,2,30.0,Southampton (UK),151.55,First class
4,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,1,2,25.0,Southampton (UK),151.55,First class


## Adding field that includes passenger title that has been extracted from name

In [5]:
# Extract each passenger's title from the name and store it in a new column.
# Names look like "Allen, Miss. Elisabeth Walton", so the title is the text
# between the comma and the first full stop.
titanic['title'] = (
    titanic['name']
    .str.split(',', expand=True)[1]   # keep the part after the surname
    .str.split('.', expand=True)[0]   # keep the part before the first full stop
    .str.strip()                      # remove the leading space left by the split
)

In [6]:
# Count how often each extracted title occurs (dropna=False also counts missing titles)
titanic['title'].value_counts(dropna=False)

Mr              757
Miss            260
Mrs             197
Master           61
Dr                8
Rev               8
Col               4
Mlle              2
Ms                2
Major             2
Lady              1
Capt              1
Sir               1
Mme               1
Dona              1
the Countess      1
Don               1
Jonkheer          1
Name: title, dtype: int64

In [7]:
# Creating a function to classify the titles into different groups
def title_grouped(title):
    """Map a raw passenger title to a broader title group.

    Parameters
    ----------
    title : str
        Title extracted from the passenger name, e.g. 'Mlle', 'Rev', 'Capt'.

    Returns
    -------
    str
        One of 'Miss', 'Mrs', 'Mr', 'male_child', 'Military' or 'Nobility'.
        A title that is not listed below is returned unchanged.
    """
    # Lookup table: raw title -> group label
    title_groups = {
        'Miss': 'Miss', 'Mlle': 'Miss', 'Ms': 'Miss',
        'Mrs': 'Mrs', 'Mme': 'Mrs',
        'Mr': 'Mr', 'Dr': 'Mr', 'Rev': 'Mr',
        'Master': 'male_child',
        'Col': 'Military', 'Major': 'Military', 'Capt': 'Military',
        'Don': 'Nobility', 'Jonkheer': 'Nobility', 'Sir': 'Nobility',
        'the Countess': 'Nobility', 'Lady': 'Nobility', 'Dona': 'Nobility',
    }
    # Unknown titles fall through unchanged
    return title_groups.get(title, title)



In [8]:
# Group every passenger's title and keep the result in a new column
grouped_titles = titanic['title'].apply(title_grouped)
titanic['title_grouped'] = grouped_titles

# The raw title and the full name are no longer needed once the group exists
titanic = titanic.drop(columns=['title', 'name'])
titanic.head()

Unnamed: 0,survived,sex,sibsp,parch,age_updated,embarked_updated,fare_updated,pclass_updated,title_grouped
0,1,female,0,0,29.0,Southampton (UK),211.3375,First class,Miss
1,1,male,1,2,0.9167,Southampton (UK),151.55,First class,male_child
2,0,female,1,2,2.0,Southampton (UK),151.55,First class,Miss
3,0,male,1,2,30.0,Southampton (UK),151.55,First class,Mr
4,0,female,1,2,25.0,Southampton (UK),151.55,First class,Mrs


In [9]:
# Check the size of each title group after grouping (dropna=False also counts missing values)
titanic['title_grouped'].value_counts(dropna=False)

Mr            773
Miss          264
Mrs           198
male_child     61
Military        7
Noblility       6
Name: title_grouped, dtype: int64

### Adding field that calculates the number of family members they were travelling with

In [10]:
# Combine sibsp and parch into a single count of family members travelling with the passenger
titanic['travel_companions'] = titanic['sibsp'].add(titanic['parch'])

# The two source columns are now covered by travel_companions, so remove them
titanic = titanic.drop(columns=['sibsp', 'parch'])

titanic.head()

Unnamed: 0,survived,sex,age_updated,embarked_updated,fare_updated,pclass_updated,title_grouped,travel_companions
0,1,female,29.0,Southampton (UK),211.3375,First class,Miss,0
1,1,male,0.9167,Southampton (UK),151.55,First class,male_child,3
2,0,female,2.0,Southampton (UK),151.55,First class,Miss,3
3,0,male,30.0,Southampton (UK),151.55,First class,Mr,3
4,0,female,25.0,Southampton (UK),151.55,First class,Mrs,3


### Convert categorical data

In [11]:
# One-hot encode the categorical columns with get_dummies. drop_first=True drops
# the first category of each column, so that category becomes the implicit baseline.
# NOTE(review): the notebook intro lists "removing embarked location" as the
# optimisation, yet embarked_updated is still encoded and kept as a feature here — confirm.
categorical_columns = ['sex', 'title_grouped', 'pclass_updated', 'embarked_updated']
encoded_frames = [
    pd.get_dummies(titanic[column], drop_first=True)
    for column in categorical_columns
]

# Append the dummy columns to the frame, then remove the original text columns
df_titanic_transformed = pd.concat([titanic, *encoded_frames], axis=1)
df_titanic_transformed = df_titanic_transformed.drop(columns=categorical_columns)

df_titanic_transformed.head()

Unnamed: 0,survived,age_updated,fare_updated,travel_companions,male,Miss,Mr,Mrs,Noblility,male_child,Second class,Third class,Queenstown (IE),Southampton (UK)
0,1,29.0,211.3375,0,0,1,0,0,0,0,0,0,0,1
1,1,0.9167,151.55,3,1,0,0,0,0,1,0,0,0,1
2,0,2.0,151.55,3,0,1,0,0,0,0,0,0,0,1
3,0,30.0,151.55,3,1,0,1,0,0,0,0,0,0,1
4,0,25.0,151.55,3,0,0,0,1,0,0,0,0,0,1


### Separate the data into labels and features

In [12]:
# Split the encoded frame into the feature matrix (X) and the target series (y)
target_column = "survived"
X = df_titanic_transformed.drop(target_column, axis=1)
y = df_titanic_transformed[target_column]

In [13]:
# Review the y variable series (the "survived" target)
y.head()

0    1
1    1
2    0
3    0
4    0
Name: survived, dtype: int64

In [14]:
# Review the X variable dataframe (every column except "survived")
X.head()

Unnamed: 0,age_updated,fare_updated,travel_companions,male,Miss,Mr,Mrs,Noblility,male_child,Second class,Third class,Queenstown (IE),Southampton (UK)
0,29.0,211.3375,0,0,1,0,0,0,0,0,0,0,1
1,0.9167,151.55,3,1,0,0,0,0,1,0,0,0,1
2,2.0,151.55,3,0,1,0,0,0,0,0,0,0,1
3,30.0,151.55,3,1,0,1,0,0,0,0,0,0,1
4,25.0,151.55,3,0,0,0,1,0,0,0,0,0,1


In [15]:
# Check the balance of our target values (number of rows per "survived" class)
y.value_counts()

0    809
1    500
Name: survived, dtype: int64

### Split data into training and testing datasets using train_test_split

In [16]:
# Hold out a test set. stratify=y keeps the class proportions of y in both
# parts; random_state=1 makes the split repeatable.
split_parts = train_test_split(X, y, stratify=y, random_state=1)
X_train, X_test, y_train, y_test = split_parts
X_train.shape


(981, 13)

## Apply random over sampler due to the imbalanced data

In [17]:
# Create the random over-sampler with a fixed seed so the resampling is repeatable
ros = RandomOverSampler(random_state=1)

# Resample the training split only; the test split is left as it is
resampled_training = ros.fit_resample(X_train, y_train)
X_train_ros, y_train_ros = resampled_training


In [18]:
# Count the distinct values of the resampled labels data
# (shows the class sizes of the training labels after over-sampling)
y_train_ros.value_counts()

1    606
0    606
Name: survived, dtype: int64

## Create logistic regression model with the resampled training data

In [19]:
# Configure the logistic regression model
model_settings = {'solver': 'lbfgs', 'max_iter': 700, 'random_state': 1}
classifier = LogisticRegression(**model_settings)

# Train on the over-sampled training data
classifier.fit(X_train_ros, y_train_ros)

LogisticRegression(max_iter=700, random_state=1)

In [20]:
# Make predictions for the held-out test features
predictions = classifier.predict(X_test)

## Evaluate the model's performance

In [21]:
# Compute the balanced accuracy on the test set, then report it
balanced_accuracy = balanced_accuracy_score(y_test, predictions)
print(f"The balanced accuracy score of the model is {balanced_accuracy}")

The balanced accuracy score of the model is 0.7756453201970444


In [22]:
# Confusion matrix for the test set: rows are labelled as actual outcomes,
# columns as predicted outcomes
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm,
    index=['Actual - perished', 'Actual - survived'],
    columns=['Predicted - perished', 'Predicted - survived'],
)
print(cm)       # raw array
display(cm_df)  # labelled table

[[172  31]
 [ 37  88]]


Unnamed: 0,Predicted - perished,Predicted - survived
Actual - perished,172,31
Acutal - survived,37,88


In [23]:
# Per-class precision, recall and f1-score, labelled with readable class names
class_labels = ["Perished", "Survived"]
report_text = classification_report(y_test, predictions, target_names=class_labels)
print(report_text)


              precision    recall  f1-score   support

    Perished       0.82      0.85      0.83       203
    Survived       0.74      0.70      0.72       125

    accuracy                           0.79       328
   macro avg       0.78      0.78      0.78       328
weighted avg       0.79      0.79      0.79       328



The model has a balanced accuracy score of 78%, which is held back by the lower recall score for predicting survivors (70%, compared with 85% for those who perished).

Precision:
When the model predicts a survivor, it is correct 74% of the time, indicating that some passengers who perished are being incorrectly classified as survivors (31 of the 119 predicted survivors).
When the model predicts that someone perished, it is correct 82% of the time.

Recall:
This looks at passengers who actually survived and how often the model correctly predicts them as surviving. This model classifies a survivor correctly 70% of the time (88 of 125).
When a passenger actually perished, the model classifies them correctly as perishing 85% of the time (172 of 203).

The model seems slightly more accurate at predicting those who perished than those who survived.