In [None]:
CAPSTONE TWO MODELING - HECTOR SANCHEZ

In [9]:
# Begin by importing necessary libraries for modeling

import pandas as pd  # for data manipulation and analysis
from sklearn.model_selection import train_test_split, cross_val_score # contains tools for model selection and evaluation
from sklearn.linear_model import LogisticRegression # contains linear models like Logistic Regression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier # contains ensemble algorithms like Random Forest and Gradient Boosting
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix # provides metrics to evaluate model performance
from sklearn.preprocessing import StandardScaler 

In [11]:
# Next, we will load the preprocessed data that we worked on in the previous preprocessing step
# We saved our preprocessed data as CSV files in the previous step

# Load features (X) and target (y) for training and testing sets

# All four preprocessed CSVs live in the same folder, so the folder is named once.
# To run this notebook on another machine, change DATA_DIR only.
DATA_DIR = 'C:/Users/hecsa/Springboard/Springboard Github/Springboard/DataScienceCapstoneTwo'

# Features (X) and target (y) for the training and testing splits.
# The y files are read as DataFrames here, like any other CSV.
X_train = pd.read_csv(f'{DATA_DIR}/X_train_preprocessed.csv')
X_test = pd.read_csv(f'{DATA_DIR}/X_test_preprocessed.csv')
y_train = pd.read_csv(f'{DATA_DIR}/y_train_preprocessed.csv')
y_test = pd.read_csv(f'{DATA_DIR}/y_test_preprocessed.csv')

In [13]:
# If y_train and y_test are DataFrames, we need to convert them to Series

# Take the first column of each target frame so the models receive a 1-D target.
# The isinstance guard makes this cell safe to re-run: once y_train / y_test are
# already Series, `.iloc[:, 0]` would raise an IndexingError ("Too many indexers").
if isinstance(y_train, pd.DataFrame):
    y_train = y_train.iloc[:, 0]  # Assuming the target variable is the first column
if isinstance(y_test, pd.DataFrame):
    y_test = y_test.iloc[:, 0]

In [15]:
# Verify the shape of the data by calling .shape on the training and test sets.
# This ensures that the data was loaded correctly
# We will also check the head of the datasets by calling .head()

# Report the shape of every split, one line per split, in a fixed order.
splits = {
    "X_train": X_train,
    "X_test": X_test,
    "y_train": y_train,
    "y_test": y_test,
}
for split_name, split_data in splits.items():
    print(f"{split_name} shape:", split_data.shape)

X_train shape: (815, 68)
X_test shape: (204, 68)
y_train shape: (815,)
y_test shape: (204,)


In [17]:
# Use .head() to print the first five rows of X_train

# Heading and preview are emitted by one print call, separated by a newline.
print("\nFirst five rows of X_train:", X_train.head(), sep="\n")


First five rows of X_train:
   Sexo_-2.1316282072803005e-16  Sexo_0.7828813612588126  \
0                          True                    False   
1                          True                    False   
2                          True                    False   
3                          True                    False   
4                          True                    False   

   Faixa_etaria_-1.0206207261596576  Faixa_etaria_0.0  \
0                             False             False   
1                             False             False   
2                             False             False   
3                             False             False   
4                             False             False   

   Faixa_etaria_3.1086244689504386e-17  Faixa_etaria_1.0206207261596576  \
0                                 True                            False   
1                                 True                            False   
2                                 True    

In [19]:
# Use .head() to print the first five rows of y_train

# Take the preview first, then print the heading followed by the preview.
y_train_preview = y_train.head()
print("\nFirst five rows of y_train:")
print(y_train_preview)


First five rows of y_train:
0    False
1    False
2    False
3    False
4    False
Name: Desfecho_final_3.0, dtype: bool


In [None]:
# We are going to initialize and Train three Models:
# 1. Logistic Regression
# 2. Random Forest Classifier
# 3. Gradient Boosting Classifier

In [21]:
# MODEL 1: LOGISTIC REGRESSION
# A linear classifier, used here for the binary (bool) target in y_train.
#
# max_iter=1000 gives the solver more iterations to reach convergence. It does not
# by itself guarantee convergence, so watch for a ConvergenceWarning when fitting.
# random_state=42 is the same seed passed to the other two models in this notebook.
#
# This cell only constructs the estimator; fitting happens in the next cell.

log_model = LogisticRegression(max_iter=1000, random_state=42)

In [23]:
# Fit the logistic regression on the training split only.
# X_test / y_test are not touched here; they are used later by evaluate_model.

log_model.fit(X_train, y_train)

In [25]:
# MODEL 2: RANDOM FOREST CLASSIFIER
# An ensemble method that combines the votes of multiple decision trees.
#
# n_estimators=100 sets the number of trees in the forest.
# random_state=42 is the same seed passed to the other two models in this notebook.
#
# This cell only constructs the estimator; fitting happens in the next cell.

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

In [29]:
# Fit the random forest on the same training split used for the logistic regression,
# so the three models are compared on identical data.

rf_model.fit(X_train, y_train)

In [31]:
# MODEL 3: GRADIENT BOOSTING CLASSIFIER
# An ensemble method that builds trees sequentially, each one correcting errors
# made by the trees before it.
#
# n_estimators=100 sets the number of boosting stages.
# random_state=42 is the same seed passed to the other two models in this notebook.
#
# This cell only constructs the estimator; fitting happens in the next cell.

gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)

In [33]:
# Fit the gradient boosting model on the same training split as the other two models.

gb_model.fit(X_train, y_train)

In [None]:
# Now that we've worked through our 3 models, we will Evaluate the models on the test set

In [35]:
# We will create a function to evaluate the performance of each model

def evaluate_model(model, X_test, y_test):
    """
    Score a fitted classifier on a held-out set.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_test : features passed to ``model.predict``.
    y_test : true labels the predictions are compared against.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)`` in that order, where
        accuracy is the share of predictions that match ``y_test``,
        precision is the share of predicted positives that are truly positive,
        recall is the share of true positives that were found, and
        f1 is the harmonic mean of precision and recall.
    """
    y_pred = model.predict(X_test)

    # zero_division=0 makes an undefined ratio (for example, no positive predictions)
    # score as 0 instead of triggering an undefined-metric warning.
    undefined_as_zero = {"zero_division": 0}

    return (
        accuracy_score(y_test, y_pred),
        precision_score(y_test, y_pred, **undefined_as_zero),
        recall_score(y_test, y_pred, **undefined_as_zero),
        f1_score(y_test, y_pred, **undefined_as_zero),
    )

In [None]:
# We will now use this function to individually evaluate each model

In [37]:
# Evaluate the Logistic Regression model on the test split.
# evaluate_model returns (accuracy, precision, recall, f1) in that order;
# the log_ prefix keeps these scores distinct from the other models' scores.

log_accuracy, log_precision, log_recall, log_f1 = evaluate_model(log_model, X_test, y_test)

In [39]:
# Evaluate the Random Forest model on the test split.
# evaluate_model returns (accuracy, precision, recall, f1) in that order;
# the rf_ prefix keeps these scores distinct from the other models' scores.

rf_accuracy, rf_precision, rf_recall, rf_f1 = evaluate_model(rf_model, X_test, y_test)

In [41]:
# Evaluate the Gradient Boosting model on the test split.
# evaluate_model returns (accuracy, precision, recall, f1) in that order;
# the gb_ prefix keeps these scores distinct from the other models' scores.

gb_accuracy, gb_precision, gb_recall, gb_f1 = evaluate_model(gb_model, X_test, y_test)

In [43]:
# Next, we need to display the results

# One table drives the whole report, so adding a model or a metric is a one-line
# change instead of another copy-pasted block of print statements.
# Each tuple follows the order in METRIC_LABELS.
METRIC_LABELS = ("Accuracy", "Precision", "Recall", "F1 Score")
DIVIDER = "--------------------------------------------------"

scores_by_model = {
    "Logistic Regression": (log_accuracy, log_precision, log_recall, log_f1),
    "Random Forest": (rf_accuracy, rf_precision, rf_recall, rf_f1),
    "Gradient Boosting": (gb_accuracy, gb_precision, gb_recall, gb_f1),
}

print("\nModel Evaluation Metrics:")
print(DIVIDER)
for model_label, model_scores in scores_by_model.items():
    print(f"{model_label}:")
    for metric_label, metric_value in zip(METRIC_LABELS, model_scores):
        # Four decimal places, matching the precision used throughout the report.
        print(f"{metric_label}: {metric_value:.4f}")
    print(DIVIDER)


Model Evaluation Metrics:
--------------------------------------------------
Logistic Regression:
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000
--------------------------------------------------
Random Forest:
Accuracy: 0.9951
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000
--------------------------------------------------
Gradient Boosting:
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000
--------------------------------------------------


In [None]:
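# We must now choose the best model based on these evaluation metrics.
# Caveat: the Random Forest scores above (accuracy 0.9951, i.e. 203 of 204 rows, with recall 0.0000),
# next to a recall of 1.0000 for the other two models, suggest that the test set holds a single
# positive case. If so, precision, recall and F1 each hinge on one row, so treat the ranking as provisional.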

In [45]:
# We are choosing the model that has the highest F1 Score

# Test-set F1 score for each candidate, keyed by the model's display name.
model_performance = dict(
    zip(
        ('Logistic Regression', 'Random Forest', 'Gradient Boosting'),
        (log_f1, rf_f1, gb_f1),
    )
)

# max() keeps the first maximal entry it meets, so models with equal F1 scores
# resolve in the order they are listed above.
best_model_name = max(model_performance.items(), key=lambda entry: entry[1])[0]
print(f"\nBest model based on F1 Score: {best_model_name}")


Best model based on F1 Score: Logistic Regression


In [47]:
# Now let's save the best model for future use

import joblib

# Turn the winning display name back into the fitted estimator.
# Any name other than the two listed falls through to the Gradient Boosting model.
best_model = (
    log_model if best_model_name == 'Logistic Regression'
    else rf_model if best_model_name == 'Random Forest'
    else gb_model
)

In [49]:
# Save the best model.
# 'best_model.pkl' is a relative path, so the file is written to the current working
# directory. Re-running this cell overwrites any existing file of that name.
# NOTE: joblib files are pickle-based; only load .pkl files from sources you trust.
joblib.dump(best_model, 'best_model.pkl')

# The file name is repeated in this message; keep the two in sync if it changes.
print("\nBest model saved as 'best_model.pkl'")


Best model saved as 'best_model.pkl'


In [None]:
# At this point, would it be best for me to perform some hyperparameter tuning, such as grid search or random search with cross-validation?
# Also, should I have started by combining my preprocessed data, or is the approach in this notebook acceptable?