In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# dataset
df = pd.read_csv("C:/Users/harsh/ensemble-learning-project/data/Ola_dataset.csv")
df.head()

Unnamed: 0,no_of_reportings,Driver_ID,Age,Gender,City,Education_Level,Grade,Total Business Value,Income,Joining Designation,Quarterly Rating,joining_month,joining_year,target,quaterly_rating_raise,Income_raised
0,3,1,28,0,C23,2,1,1715580,172161,1,2,12,2018,1,0,0
1,2,2,31,0,C7,2,2,0,134032,2,1,11,2020,0,0,0
2,5,4,43,0,C13,2,2,350000,328015,2,1,12,2019,1,0,0
3,3,5,29,0,C9,0,1,120360,139104,1,1,1,2019,1,0,0
4,5,6,31,1,C11,1,3,1265000,393640,3,2,7,2020,0,1,0


In [None]:
# Check for missing values
df.isnull().sum()



#  fill missing values 
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Income'] = df['Income'].fillna(df['Income'].median())


In [3]:
from sklearn.preprocessing import LabelEncoder

# Label encoding for binary columns like 'Gender'
le = LabelEncoder()
df['Gender'] = le.fit_transform(df['Gender'])


In [4]:
df = pd.get_dummies(df, columns=['Education_Level', 'City'], drop_first=True)


In [5]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Select the numerical columns to scale
numerical_cols = ['Age', 'Income', 'Total Business Value', 'Quarterly Rating']

# Apply scaling
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])


In [6]:
df = df.drop(['Driver_ID'], axis=1)  # Drop columns that are irrelevant


In [7]:
df.columns

Index(['no_of_reportings', 'Age', 'Gender', 'Grade', 'Total Business Value',
       'Income', 'Joining Designation', 'Quarterly Rating', 'joining_month',
       'joining_year', 'target', 'quaterly_rating_raise', 'Income_raised',
       'Education_Level_1', 'Education_Level_2', 'City_C10', 'City_C11',
       'City_C12', 'City_C13', 'City_C14', 'City_C15', 'City_C16', 'City_C17',
       'City_C18', 'City_C19', 'City_C2', 'City_C20', 'City_C21', 'City_C22',
       'City_C23', 'City_C24', 'City_C25', 'City_C26', 'City_C27', 'City_C28',
       'City_C29', 'City_C3', 'City_C4', 'City_C5', 'City_C6', 'City_C7',
       'City_C8', 'City_C9'],
      dtype='object')

In [8]:
from sklearn.model_selection import train_test_split

# Define features (X) and target (y)
X = df.drop('target', axis=1)  # Drop the target column for features
y = df['target']  # Target column

# Split data into training and testing sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [None]:
# Check the distribution of target classes
y.value_counts()

# class imbalance is severe, consider using SMOTE (Synthetic Minority Over-sampling Technique) 
from imblearn.over_sampling import SMOTE

smote = SMOTE()
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)


In [10]:
df['target'].value_counts(normalize=True)

1    0.678706
0    0.321294
Name: target, dtype: float64

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Initialize Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model on the resampled (SMOTE) data
rf_model.fit(X_train_smote, y_train_smote)

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.9266247379454927
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.87      0.88       150
           1       0.94      0.95      0.95       327

    accuracy                           0.93       477
   macro avg       0.92      0.91      0.91       477
weighted avg       0.93      0.93      0.93       477

Confusion Matrix:
 [[131  19]
 [ 16 311]]


In [19]:
from sklearn.model_selection import GridSearchCV

# Hyperparameters for Random Forest
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}

# Grid Search with cross-validation
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train_smote, y_train_smote)

# Best parameters and model
print("Best Parameters:", grid_search.best_params_)

# Evaluate the best model
best_model_rf = grid_search.best_estimator_
y_pred_best = best_model_rf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred_best))
print("Classification Report:\n", classification_report(y_test, y_pred_best))


Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Accuracy: 0.9224318658280922
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.87      0.88       150
           1       0.94      0.95      0.94       327

    accuracy                           0.92       477
   macro avg       0.91      0.91      0.91       477
weighted avg       0.92      0.92      0.92       477



In [14]:
from sklearn.ensemble import GradientBoostingClassifier

# Initialize Gradient Boosting model
gb_model = GradientBoostingClassifier()

# Train the model on the resampled (SMOTE) data
gb_model.fit(X_train_smote, y_train_smote)

# Make predictions on the test set
y_pred_gb = gb_model.predict(X_test)

# Evaluate the Gradient Boosting model
print("Gradient Boosting Accuracy:", accuracy_score(y_test, y_pred_gb))
print("Classification Report:\n", classification_report(y_test, y_pred_gb))


Gradient Boosting Accuracy: 0.9203354297693921
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.88      0.87       150
           1       0.94      0.94      0.94       327

    accuracy                           0.92       477
   macro avg       0.91      0.91      0.91       477
weighted avg       0.92      0.92      0.92       477



In [15]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

# Initialize Gradient Boosting model
gb_model = GradientBoostingClassifier()

# Define hyperparameters for tuning
param_grid_gb = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}

# Perform Grid Search with cross-validation
grid_search_gb = GridSearchCV(estimator=gb_model, param_grid=param_grid_gb, cv=5, n_jobs=-1, verbose=2)
grid_search_gb.fit(X_train_smote, y_train_smote)

# Get the best hyperparameters
print("Best Gradient Boosting Hyperparameters:", grid_search_gb.best_params_)

# Best Gradient Boosting model
best_gb_model = grid_search_gb.best_estimator_


Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best Gradient Boosting Hyperparameters: {'learning_rate': 0.1, 'max_depth': 7, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}


In [16]:
y_pred_gb = best_gb_model.predict(X_test)

# Print the accuracy of the tuned Gradient Boosting model
accuracy = accuracy_score(y_test, y_pred_gb)
print("Accuracy of Hyperparameter-Tuned Gradient Boosting Model:", accuracy)

Accuracy of Hyperparameter-Tuned Gradient Boosting Model: 0.9140461215932913


In [17]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

# Initialize XGBoost model
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

# Define hyperparameters for tuning
param_grid_xgb = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
}

# Perform Grid Search with cross-validation
grid_search_xgb = GridSearchCV(estimator=xgb_model, param_grid=param_grid_xgb, cv=5, n_jobs=-1, verbose=2)
grid_search_xgb.fit(X_train_smote, y_train_smote)

# Get the best hyperparameters
print("Best XGBoost Hyperparameters:", grid_search_xgb.best_params_)

# Best XGBoost model
best_xgb_model = grid_search_xgb.best_estimator_


Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best XGBoost Hyperparameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'subsample': 0.8}


In [18]:
from sklearn.metrics import accuracy_score, classification_report

# Evaluate the best XGBoost model on the test set
y_pred_xgb = best_xgb_model.predict(X_test)

# Print the accuracy of the tuned XGBoost model
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print("Accuracy of Hyperparameter-Tuned XGBoost Model:", accuracy_xgb)

# Print the classification report for the tuned XGBoost model
print("Classification Report for Hyperparameter-Tuned XGBoost Model:")
print(classification_report(y_test, y_pred_xgb))


Accuracy of Hyperparameter-Tuned XGBoost Model: 0.9182389937106918
Classification Report for Hyperparameter-Tuned XGBoost Model:
              precision    recall  f1-score   support

           0       0.87      0.87      0.87       150
           1       0.94      0.94      0.94       327

    accuracy                           0.92       477
   macro avg       0.91      0.90      0.91       477
weighted avg       0.92      0.92      0.92       477



In [20]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, classification_report

# Create a Voting Classifier using the best models from Random Forest, Gradient Boosting, and XGBoost
voting_clf = VotingClassifier(estimators=[
    ('rf', best_model_rf),  # Random Forest
    ('gb', best_gb_model),  # Gradient Boosting
    ('xgb', best_xgb_model)  # XGBoost
], voting='hard')  # Use 'hard' voting (majority class voting)

# Train the ensemble model
voting_clf.fit(X_train_smote, y_train_smote)

# Evaluate the ensemble model on the test data
y_pred_voting = voting_clf.predict(X_test)

# Print the accuracy of the ensemble Voting Classifier
accuracy_voting = accuracy_score(y_test, y_pred_voting)
print("Accuracy of Voting Classifier (Random Forest, Gradient Boosting, XGBoost):", accuracy_voting)

# Print the classification report for the ensemble Voting Classifier
print("Classification Report for Voting Classifier (Random Forest, Gradient Boosting, XGBoost):")
print(classification_report(y_test, y_pred_voting))


Accuracy of Voting Classifier (Random Forest, Gradient Boosting, XGBoost): 0.9203354297693921
Classification Report for Voting Classifier (Random Forest, Gradient Boosting, XGBoost):
              precision    recall  f1-score   support

           0       0.88      0.87      0.87       150
           1       0.94      0.94      0.94       327

    accuracy                           0.92       477
   macro avg       0.91      0.91      0.91       477
weighted avg       0.92      0.92      0.92       477



In [22]:
import joblib
joblib.dump(le, 'C:/Users/harsh/ensemble-learning-project/models/label_encoder.pkl')
#joblib.dump(columns_used_for_dummies, 'models/one_hot_columns.pkl')
joblib.dump(scaler, 'C:/Users/harsh/ensemble-learning-project/models/scaler.pkl')
joblib.dump(smote, 'C:/Users/harsh/ensemble-learning-project/models/smote.pkl')

joblib.dump(best_model_rf, 'C:/Users/harsh/ensemble-learning-project/models/random_forest_model.pkl')
joblib.dump(best_gb_model, 'C:/Users/harsh/ensemble-learning-project/models/gradient_boosting_model.pkl')
joblib.dump(best_xgb_model, 'C:/Users/harsh/ensemble-learning-project/models/xgboost_model.pkl')
joblib.dump(voting_clf, 'C:/Users/harsh/ensemble-learning-project/models/voting_classifier_model.pkl')


['C:/Users/harsh/ensemble-learning-project/models/voting_classifier_model.pkl']

In [23]:
# Save column names used for pd.get_dummies()
columns_used_for_dummies = ['Education_Level', 'City']  # List of columns that were one-hot encoded
joblib.dump(columns_used_for_dummies, 'C:/Users/harsh/ensemble-learning-project/models/one_hot_columns.pkl')


['C:/Users/harsh/ensemble-learning-project/models/one_hot_columns.pkl']