In [1]:
import pickle

# Load the preprocessed train/test splits from the pickle file.
# NOTE(security): pickle.load can execute arbitrary code during
# deserialization -- only open 'preprocessed_data.pkl' when it was produced
# by a trusted preprocessing step.
with open('preprocessed_data.pkl', 'rb') as f:
    X_train, X_test, y_train, y_test = pickle.load(f)

# Fail fast if the splits are inconsistent, instead of hitting an obscure
# error (or silently wrong metrics) in a later modelling cell.
assert X_train.shape[0] == y_train.shape[0], "X_train / y_train row counts differ"
assert X_test.shape[0] == y_test.shape[0], "X_test / y_test row counts differ"
assert X_train.shape[1:] == X_test.shape[1:], "train / test feature shapes differ"

# Verify the shapes
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


X_train shape: (8000, 8)
X_test shape: (2000, 8)
y_train shape: (8000,)
y_test shape: (2000,)


In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Baseline model: a 100-tree random forest trained on the original
# (non-resampled) training split, with class_weight='balanced' and a fixed
# seed so the run is repeatable.
rf_model = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
)

# fit() returns the fitted estimator, so training and test-set prediction
# can be chained in one expression.
y_pred_rf = rf_model.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 on the held-out test split.
print(
    "RandomForest Classification Report:",
    classification_report(y_test, y_pred_rf),
    sep="\n",
)


RandomForest Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1939
           1       0.92      0.38      0.53        61

    accuracy                           0.98      2000
   macro avg       0.95      0.69      0.76      2000
weighted avg       0.98      0.98      0.98      2000



In [3]:
from imblearn.over_sampling import SMOTE

# Oversample the training split with SMOTE; the test split is never
# resampled and is only used for scoring further down.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print("After SMOTE, training set shape:", X_train_smote.shape)

# Same forest configuration as the baseline cell, now trained on the
# resampled data.
# NOTE(review): class_weight='balanced' is presumably redundant once SMOTE
# has evened out the class counts -- confirm before removing it.
rf_model_smote = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
)
y_pred_rf_smote = rf_model_smote.fit(X_train_smote, y_train_smote).predict(X_test)

# Score against the original, untouched test labels.
print(
    "RandomForest with SMOTE Classification Report:",
    classification_report(y_test, y_pred_rf_smote),
    sep="\n",
)


After SMOTE, training set shape: (15444, 8)
RandomForest with SMOTE Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.97      0.98      1939
           1       0.48      0.75      0.59        61

    accuracy                           0.97      2000
   macro avg       0.74      0.86      0.79      2000
weighted avg       0.98      0.97      0.97      2000



In [6]:
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Two-step imblearn pipeline: SMOTE oversampling followed by a random forest.
# Both steps share the same seed so the search is repeatable.
pipeline = Pipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('rf', RandomForestClassifier(class_weight='balanced', random_state=42)),
])

# Forest hyper-parameters to search, addressed as '<step>__<param>'.
# 2 * 3 * 2 * 2 = 24 candidate combinations.
param_grid_pipeline = dict(
    rf__n_estimators=[100, 200],
    rf__max_depth=[None, 10, 20],
    rf__min_samples_split=[2, 5],
    rf__min_samples_leaf=[1, 2],
)

# Exhaustive 5-fold search scored on F1 (a balance between precision and
# recall, suited to imbalanced data), using all available cores.
grid_search_pipeline = GridSearchCV(
    pipeline,
    param_grid_pipeline,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=2,
)

# Run the search on the original (non-resampled) training split; resampling
# is handled by the pipeline's own 'smote' step.
grid_search_pipeline.fit(X_train, y_train)

print("Best parameters (with SMOTE):", grid_search_pipeline.best_params_)
print("Best cross-validation F1 score (with SMOTE):", grid_search_pipeline.best_score_)


Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best parameters (with SMOTE): {'rf__max_depth': 20, 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 2, 'rf__n_estimators': 200}
Best cross-validation F1 score (with SMOTE): 0.6181977626439449


In [7]:
# Take the winning SMOTE + RandomForest pipeline from the grid search and
# use it to label the held-out test split.
best_rf_pipeline = grid_search_pipeline.best_estimator_
y_pred_best_rf_pipeline = best_rf_pipeline.predict(X_test)

# Per-class precision / recall / F1 for the tuned pipeline.
print(
    "Classification Report for Best RandomForest Model (with SMOTE):",
    classification_report(y_test, y_pred_best_rf_pipeline),
    sep="\n",
)


Classification Report for Best RandomForest Model (with SMOTE):
              precision    recall  f1-score   support

           0       0.99      0.98      0.98      1939
           1       0.49      0.75      0.59        61

    accuracy                           0.97      2000
   macro avg       0.74      0.86      0.79      2000
weighted avg       0.98      0.97      0.97      2000

