# Model Deployment and Prediction Pipeline
`Objective:`

Create a simple prediction pipeline to deploy your tuned Random Forest model from Day 6 (with or without SMOTE) and use it to predict survival for new, unseen passenger data. Save the model and evaluate its performance on a test set.

`Save the Best Model:`
Use joblib or pickle to save the best Random Forest model from Day 6.

`Create a Prediction Function:1`
Build a function that preprocesses new data and makes predictions using the saved model.

`Test the Pipeline:`
Simulate new passenger data and use the function to predict survival.

`Evaluate on Test Set:`

Load the model and evaluate its performance on the X_test and y_test from Day 6.

Create a basic web interface or script to input passenger data manually.



In [1]:
# Import required libraries
import pandas as pd
import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

In [2]:
def wrangle(filepath):
    df = pd.read_csv(filepath)

    return df

In [3]:
df = wrangle(r"C:\Users\User\Desktop\100DayOfCode\Titanic_clean.csv")

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [5]:
def preprocessing(df):
    # Keep original data
    df_processed = df.copy()
    # Drop unwanted column
    df_processed.drop(columns=["Unnamed: 0"],inplace = True)
    # Subset the data
    df_processed["FamilySize"] = df_processed["SibSp"] + df_processed["Parch"] + 1
    df_processed["Title"] = df_processed["Name"].str.extract(" ([A-Za-z]+)\.", expand = False)
    # Drop columns after subset
    df_processed.drop(columns=["SibSp", "Parch", "Name"], inplace = True)
    # Drop multicolinearlity columns
    df_processed.drop(columns=["PassengerId", "Ticket"], inplace = True)
    # Encode categorical features using OneHotEncoding
    cat_cols = ["Sex","Embarked","Title"]
    df_processed = pd.get_dummies(df_processed, columns=cat_cols, drop_first = 1)
    # covert bool to int of the encoded variables
    for col in df_processed.columns:
        if df_processed[col].dtype == "bool":
            df_processed[col] = df_processed[col].astype(int)

    # scale numerical features
    numerical_cols = ["Pclass", "Age","Age", "FamilySize"]
    scaler=StandardScaler()
    df_processed[numerical_cols] = scaler.fit_transform(df_processed[numerical_cols])
             
    return df_processed, scaler

In [6]:
df_clean, scaler = preprocessing(df)

In [7]:
df_clean.head()

Unnamed: 0,Survived,Pclass,Age,Fare,FamilySize,Sex_male,Embarked_Q,Embarked_S,Title_Col,Title_Countess,...,Title_Major,Title_Master,Title_Miss,Title_Mlle,Title_Mme,Title_Mr,Title_Mrs,Title_Ms,Title_Rev,Title_Sir
0,0,0.827377,-0.565736,7.25,0.05916,1,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
1,1,-1.566107,0.663861,71.2833,0.05916,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,1,0.827377,-0.258337,7.925,-0.560975,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
3,1,-1.566107,0.433312,53.1,0.05916,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0,0.827377,0.433312,8.05,-0.560975,1,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0


In [8]:
# Sepparate features and targets
features = (col for col in df_clean.columns if col not in("Survived"))
X = df_clean[features]
y = df_clean["Survived"]

# train_test spit
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 42)
# Apply smote
# smote = SMOTE(random_state = 42)
# X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Train and tune using Random forest
rf_model = RandomForestClassifier(random_state=42)
params = {
    "n_estimators": [50,100, 200],
    "max_depth": [None, 10, 100]
}
grid_search_smote = GridSearchCV(rf_model,params, cv=5, scoring= "accuracy")
grid_search_smote.fit(X_train, y_train)

In [9]:
print(y_train.value_counts())

Survived
0    444
1    268
Name: count, dtype: int64


In [10]:
X.head()

Unnamed: 0,Pclass,Age,Fare,FamilySize,Sex_male,Embarked_Q,Embarked_S,Title_Col,Title_Countess,Title_Don,...,Title_Major,Title_Master,Title_Miss,Title_Mlle,Title_Mme,Title_Mr,Title_Mrs,Title_Ms,Title_Rev,Title_Sir
0,0.827377,-0.565736,7.25,0.05916,1,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,-1.566107,0.663861,71.2833,0.05916,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0.827377,-0.258337,7.925,-0.560975,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,-1.566107,0.433312,53.1,0.05916,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0.827377,0.433312,8.05,-0.560975,1,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [11]:
# Save the best model
best_rf_model = grid_search_smote.best_estimator_
joblib.dump(best_rf_model, "Titanic_rf_model.joblib")
joblib.dump(scaler, "Titanic_scaler.joblib")

['Titanic_scaler.joblib']

In [12]:
# Create a Prediction Function
def predict_survival(new_data, model_path = "Titanic_rf_model.joblib", scaler_path = "Titanic_scaler.joblib"):
    # Load model and scaler
    loaded_model = joblib.load(model_path)
    loaded_scaler = joblib.load(scaler_path)

    # Convert new data to dataframe incase its list or dictionaries
    if not isinstance(new_data, pd.DataFrame):# Check if the new_data is a Dataframe
        new_data = pd.DataFrame([new_data])
    # Prepocess new data to match model
    new_data = new_data[["Pclass",	"Age",	"Fare", "Sex", "Name","SibSp",	"Parch", "Embarked"]]
    new_data["FamilySize"] = new_data["SibSp"] + new_data["Parch"] + 1
    new_data["Title"] = new_data["Name"].str.extract(" ([A-Za-z]+)\.", expand = False)
    # Encode categorical features using OneHotEncoding
    cat_cols = ["Sex","Embarked","Title"]
    new_data = pd.get_dummies(new_data, columns=cat_cols, drop_first = 1)
    # covert bool to int of the encoded variables
    for col in new_data.columns:
        if new_data[col].dtype == "bool":
            new_data[col] = new_data[col].astype(int)
            
    #Align columns with training data
    missing_cols = set(X_train.columns) - set(new_data.columns)
    for col in missing_cols:
        new_data[col] = 0
    new_data = new_data[X_train.columns]

    # Scale numerical features
    numerical_cols = ["Pclass", "Age","Age", "FamilySize"]
    new_data[numerical_cols] = loaded_scaler.fit_transform(new_data[numerical_cols])

    # To make prediction
    prediction = loaded_model.predict(new_data)
    probability = loaded_model.predict_proba(new_data)[:,1]

    return prediction[0], probability[0]
    

In [13]:
# To test the pipeline with new_data
new_passenger ={
    "Pclass": 3,
    "Sex": "male",
    "Age": 23,	
    "SibSp": 0,
    "Parch": 0,
    "Fare": 18.5,
    "Embarked": "S",
    "Name": "Mrs. Terry Marco"
    
}

prediction, probability = predict_survival(new_passenger)
print(f"Predicted Survival: {prediction} (1 = Survived, 0 = Not Survived)")
print(f"Survival Probability: {probability:.2f}")

Predicted Survival: 1 (1 = Survived, 0 = Not Survived)
Survival Probability: 0.86


In [14]:
# Evaluate on Test Set
loaded_model = joblib.load("Titanic_rf_model.joblib")
y_pred = loaded_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy: .2f}")
print("Classification report: \n", classification_report(y_test, y_pred))

Accuracy:  0.84
Classification report: 
               precision    recall  f1-score   support

           0       0.85      0.89      0.87       105
           1       0.83      0.77      0.80        74

    accuracy                           0.84       179
   macro avg       0.84      0.83      0.83       179
weighted avg       0.84      0.84      0.84       179



In [15]:
import streamlit as st
import pandas as pd
import joblib

In [23]:
import streamlit as st
import pandas as pd
import joblib

# Load the saved model and scaler
model_path = 'titanic_rf_model.joblib'
scaler_path = 'titanic_scaler.joblib'
loaded_model = joblib.load(model_path)
loaded_scaler = joblib.load(scaler_path)

# Create a Prediction Function
def predict_survival(new_data, model_path = "Titanic_rf_model.joblib", scaler_path = "Titanic_scaler.joblib"):
    # Load model and scaler
    loaded_model = joblib.load(model_path)
    loaded_scaler = joblib.load(scaler_path)

    # Convert new data to dataframe incase its list or dictionaries
    if not isinstance(new_data, pd.DataFrame):# Check if the new_data is a Dataframe
        new_data = pd.DataFrame([new_data])
    # Prepocess new data to match model
    new_data = new_data[["Pclass",	"Age",	"Fare", "Sex", "Name","SibSp",	"Parch", "Embarked"]]
    new_data["FamilySize"] = new_data["SibSp"] + new_data["Parch"] + 1
    new_data["Title"] = new_data["Name"].str.extract(" ([A-Za-z]+)\.", expand = False)
    # Encode categorical features using OneHotEncoding
    cat_cols = ["Sex","Embarked","Title"]
    new_data = pd.get_dummies(new_data, columns=cat_cols, drop_first = 1)
    # covert bool to int of the encoded variables
    for col in new_data.columns:
        if new_data[col].dtype == "bool":
            new_data[col] = new_data[col].astype(int)
            
    #Align columns with training data
    missing_cols = set(X_train.columns) - set(new_data.columns)
    for col in missing_cols:
        new_data[col] = 0
    new_data = new_data[X_train.columns]

    # Scale numerical features
    numerical_cols = ["Pclass", "Age","Age", "FamilySize"]
    new_data[numerical_cols] = loaded_scaler.fit_transform(new_data[numerical_cols])

    # To make prediction
    prediction = loaded_model.predict(new_data)
    probability = loaded_model.predict_proba(new_data)[:,1]

    return prediction[0], probability[0]
    

# Hardcode X_train.columns (replace with saved columns if available)
# Hardcode X_train.columns (you might need to pass this or save it with the model)
X_train_columns = ['Pclass', 'Age', 'FamilySize', 'Fare', 'Sex_male', 'Embarked_Q', 'Embarked_S', 'Title_Mrs']  # Adjust based on your training data
X_train = pd.DataFrame(columns=X_train_columns)  # Placeholder, replace with actual X_train.columns if saved

# Streamlit app
st.title("Titanic Survival Predictor")

# Input fields
Pclass = st.selectbox("Passenger Class", options=[1, 2, 3], index=2)
Sex = st.selectbox("Sex", options=["male", "female"], index=0)
aAge = st.slider("Age", min_value=0, max_value=100, value=30)
Sibsp = st.number_input("Number of Siblings/Spouses", min_value=0, value=0)
Parch = st.number_input("Number of Parents/Children", min_value=0, value=0)
Fare = st.number_input("Fare", min_value=0.0, value=7.25, step=0.01)
Embarked = st.selectbox("Port of Embarkation", options=["C", "Q", "S"], index=2)
Name = st.text_input("Name", value="Mr. John Doe")

# Create new_passenger dictionary
new_passenger = {
    "pclass": Pclass,
    "Sex": Sex,
    #"Age": Age,
    "Sibsp": Sibsp,
    "Parch": Parch,
    "Fare": Fare,
    "Embarked": Embarked,
    "Name": Name
}

# Predict button
if st.button("Predict Survival"):
    prediction, probability = predict_survival(new_passenger)
    st.write(f"**Predicted Survival:** {prediction} (1 = Survived, 0 = Not Survived)")
    st.write(f"**Survival Probability:** {probability:.2f}")

# Optional: Add a note or styling
st.markdown("---")
st.write("Enter passenger details and click 'Predict Survival' to see the result.")




In [20]:
# Hardcode X_train.columns (you might need to pass this or save it with the model)
X_train_columns = ['Pclass', 'Age', 'FamilySize', 'Fare', 'Sex_male', 'Embarked_Q', 'Embarked_S', 'Title_Mrs']  # Adjust based on your training data
X_train = pd.DataFrame(columns=X_train_columns)  # Placeholder, replace with actual X_train.columns if saved

# Streamlit app
st.title("Titanic Survival Predictor")

# Input fields
Pclass = st.selectbox("Passenger Class", options=[1, 2, 3], index=2)
Sex = st.selectbox("Sex", options=["male", "female"], index=0)
aAge = st.slider("Age", min_value=0, max_value=100, value=30)
Sibsp = st.number_input("Number of Siblings/Spouses", min_value=0, value=0)
Parch = st.number_input("Number of Parents/Children", min_value=0, value=0)
Fare = st.number_input("Fare", min_value=0.0, value=7.25, step=0.01)
Embarked = st.selectbox("Port of Embarkation", options=["C", "Q", "S"], index=2)
Name = st.text_input("Name", value="Mr. John Doe")

# Create new_passenger dictionary
new_passenger = {
    "pclass": Pclass,
    "Sex": Sex,
    "Age": Age,
    "Sibsp": Sibsp,
    "Parch": Parch,
    "Fare": Fare,
    "Embarked": Embarked,
    "Name": Name
}

# Predict button
if st.button("Predict Survival"):
    prediction, probability = predict_survival(new_passenger)
    st.write(f"**Predicted Survival:** {prediction} (1 = Survived, 0 = Not Survived)")
    st.write(f"**Survival Probability:** {probability:.2f}")

# Optional: Add a note or styling
st.markdown("---")
st.write("Enter passenger details and click 'Predict Survival' to see the result.")

