In [13]:
%matplotlib inline
import matplotlib.pyplot as plt 
import numpy as np
import pandas as pd 
import seaborn as sns

train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")
train.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [16]:
train.info()
train.describe()
train.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [28]:
#Some columns aren't useful for prediction or have too many missing values
columns_to_drop = ['PassengerId', 'Name', 'Ticket', 'Cabin']
train.drop(columns=[col for col in columns_to_drop if col in train.columns], inplace=True)

#fill missing data andconvert categorical variables to numeric
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
Sex_feature = ["Sex"]
Sex_transformer = Pipeline(steps=[
    
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
Embarked_feature = ["Embarked"]
Embarked_transformer = Pipeline(steps=[
    
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
Age_feature = ["Age"]
Age_transformer = Pipeline(steps=[
    
    ('imputer', SimpleImputer(strategy='mean'))
])
Preprocessor = ColumnTransformer(
    transformers=[
        ('sex' , Sex_transformer,Sex_feature),
        ('embarked' , Embarked_transformer,Embarked_feature),
        ('age' , Age_transformer,Age_feature)])

#import the models Logistic Regression, Random Forest
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score ,confusion_matrix, classification_report, recall_score, precision_score
# Separate features and target
X = train.drop('Survived', axis=1)
y = train['Survived']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# check the shape of the data
X_train.shape, X_test.shape, y_train.shape, y_test.shape
#create a dictionary to store the models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100)
}
# Create a dictionary to store the results
results = {}
# Loop through the models
for model_name, model in models.items():
    # Create a pipeline with preprocessing and the model
    pipeline = Pipeline(steps=[('preprocessor', Preprocessor),
                               ('model', model)])
    
    # Fit the model
    pipeline.fit(X_train, y_train)
    
    # Make predictions
    y_pred = pipeline.predict(X_test)

    print(" Accuracy:", accuracy_score(y_test, y_pred))
    print(" Precision:", precision_score(y_test, y_pred))
    print(" Recall:", recall_score(y_test, y_pred))
    print(" Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print(" Classification Report:\n", classification_report(y_test, y_pred))
    
    # Calculate accuracy
    #accuracy = accuracy_score(y_test, y_pred)
    
    # Store the results
   # results[model_name] = accuracy
# Print the results
#for model_name, accuracy in results.items():
   # print(f"{model_name}: {accuracy:.4f}")

#evaluate use confusion matrix and precison and recall

#hyperparameter tuning
from sklearn.model_selection import GridSearchCV
#create a pipeline with preprocessing and the model
pipeline = Pipeline(steps=[('preprocessor', Preprocessor),
                               ('model', RandomForestClassifier(random_state=42))])
# Define the parameter grid
param_grid = {
    'model__n_estimators': [50, 100, 200],
    'model__max_depth': [None, 10, 20, 30],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4]
}

# Create a GridSearchCV object
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, verbose=2 , scoring='accuracy')
# Fit the grid search to the data
grid_search.fit(X_train, y_train)
# Get the best parameters and best score 
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)
best_model = grid_search.best_estimator_
# Make predictions on the test set
y_pred = best_model.predict(X_test)
# Evaluate the model    
print(" Accuracy:", accuracy_score(y_test, y_pred))
print(" Precision:", precision_score(y_test, y_pred))
print(" Recall:", recall_score(y_test, y_pred))
print(" Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

#save and load the model
from joblib import dump, load
# Save the model    
dump(best_model, 'best_model.joblib')
# Load the model
loaded_model = load('best_model.joblib')
# Make predictions on the test set
y_pred = loaded_model.predict(X_test)
# Evaluate the model
print(" Accuracy:", accuracy_score(y_test, y_pred))
print(" Precision:", precision_score(y_test, y_pred))
print(" Recall:", recall_score(y_test, y_pred))




 Accuracy: 0.7821229050279329
 Precision: 0.7536231884057971
 Recall: 0.7027027027027027
 Confusion Matrix:
 [[88 17]
 [22 52]]
 Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.84      0.82       105
           1       0.75      0.70      0.73        74

    accuracy                           0.78       179
   macro avg       0.78      0.77      0.77       179
weighted avg       0.78      0.78      0.78       179

 Accuracy: 0.7150837988826816
 Precision: 0.6619718309859155
 Recall: 0.6351351351351351
 Confusion Matrix:
 [[81 24]
 [27 47]]
 Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.77      0.76       105
           1       0.66      0.64      0.65        74

    accuracy                           0.72       179
   macro avg       0.71      0.70      0.70       179
weighted avg       0.71      0.72      0.71       179

Fitting 5 folds for each of 108 candidat

In [29]:
print(loaded_model)


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('sex',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['Sex']),
                                                 ('embarked',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['Embarked']),
                                                 ('age',
                                                  Pipeline(steps=[('imputer',
                           