In [6]:
# pip3 install pandas os numpy scikit-learn matplotlib seaborn torch 

In [3]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Models

In [5]:
x_train = pd.read_csv("../data/x_train.csv")
x_test = pd.read_csv("../data/x_test.csv")
y_train = pd.read_csv("../data/y_train.csv")
y_test = pd.read_csv("../data/y_test.csv")

In [81]:
# Random Forest Classifier

rf = RandomForestClassifier(random_state=83, n_estimators=100)

param_dist = {
    'n_estimators': [100, 250],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5], 
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2']
}

random_search = RandomizedSearchCV(estimator=rf, param_distribution=param_dist, cv=3, n_jobs=-1, scoring='accuracy')
random_search.fit(x_train, y_train)

print(random_search.best_params_)

best_rf = random_search.best_estimator_
y_pred_rf = best_rf.predict(x_test)

print(f"Accuracy: {accuracy_score(y_test, y_pred_rf)*100:.2f}%")
print("Recall Score: ", recall_score(y_test, y_pred_rf))
print("Precision Score: ", precision_score(y_test, y_pred_rf))
print("F1 Score: ", f1_score(y_test, y_pred_rf))
print("Confusion Matrix: \n ", confusion_matrix(y_test, y_pred_rf))

{'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 250}
Accuracy: 78.47%
Recall Score:  0.8101886574741822
Precision Score:  0.6664293849658315
F1 Score:  0.7313110272099987
Confusion Matrix [[23577  7029]
 [ 3290 14043]]


In [115]:
# Gradient Boosting Classifier 

gb = GradientBoostingClassifier(random_state=92)

param_dist = {
    'n_estimators': [100, 250],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1],
    'subsample': [0.6, 1.0]
}

random_search = RandomizedSearchCV(estimator=gb, param_distributions=param_dist, cv=5, n_jobs=-1, scoring='accuracy')

random_search.fit(x_train, y_train)

print(random_search.best_params_)

best_gb = random_search.best_estimator_
y_pred_gb = best_gb.predict(x_test)

print(f"Accuracy: {accuracy_score(y_test, y_pred_gb)*100:.2f}%")
print("Recall Score: ", recall_score(y_test, y_pred_gb))
print("Precision Score: ", precision_score(y_test, y_pred_gb))
print("F1 Score: ", f1_score(y_test, y_pred_gb))
print("Confusion Matrix", confusion_matrix(y_test, y_pred_gb))

{'subsample': 0.6, 'n_estimators': 100, 'max_depth': 7, 'learning_rate': 0.1}
Accuracy: 78.69%
Recall Score:  0.6716021260440395
Precision Score:  0.811049343801937
F1 Score:  0.7347680485968693
Confusion Matrix [[23570  3297]
 [ 6920 14152]]


In [75]:
# MLP Classifier 

mlp = MLPClassifier(random_state=75)
mlp.fit(x_train, y_train)
y_pred_mlp = mlp.predict(x_test)

print(f"Accuracy: {accuracy_score(y_test, y_pred_mlp)*100:.2f}%")
print("Recall Score: ", recall_score(y_test, y_pred_mlp))
print("Precision Score: ", precision_score(y_test, y_pred_mlp))
print("F1 Score: ", f1_score(y_test, y_pred_mlp))
print("Confusion Matrix", confusion_matrix(y_test, y_pred_mlp))

Accuracy: 79.10%
Recall Score:  0.8261998937363481
Precision Score:  0.6641514806378133
F1 Score:  0.7363657888505959
Confusion Matrix [[23923  7077]
 [ 2944 13995]]


In [112]:
# Logistic Regression 

log = LogisticRegression(random_state=4, max_iter=1000)

param_dist = {
    'penalty': ['none', 'l1', 'l2'],
    'C': [0.1, 1, 10],
    'solver': ['liblinear', 'lbfgs', 'saga']
}

random_search = RandomizedSearchCV(estimator=log, param_distributions=param_dist, cv=5, scoring='accuracy')
random_search.fit(x_train, y_train)

print(random_search.best_params_)

y_pred_log = random_search.predict(x_test)

print(f"Accuracy: {accuracy_score(y_test, y_pred_log)*100:.2f}%")
print("Recall Score: ", recall_score(y_test, y_pred_log))
print("Precision Score: ", precision_score(y_test, y_pred_log))
print("F1 Score: ", f1_score(y_test, y_pred_log))
print("Confusion Matrix", confusion_matrix(y_test, y_pred_log))

15 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 1356, in wrapper
    estimator._validate_params()
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/sklearn/base.py", line 469, in _validate_params
    validate_parameter_constraints(
  File "/Library/Frameworks/Python.framework/V

{'solver': 'liblinear', 'penalty': 'l1', 'C': 10}
Accuracy: 76.99%
Recall Score:  0.6739749430523918
Precision Score:  0.7733188129594337
F1 Score:  0.7202373405685016
Confusion Matrix [[22704  4163]
 [ 6870 14202]]


# Ensemble Model

In [35]:
X = dataset.drop(columns=["target"])
y = dataset["target"]

x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=20, test_size=0.2)

In [36]:
numerical_features = x_train.select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_features = x_train.select_dtypes(include=["object", "category"]).columns.tolist()

In [37]:
# Defining core models

log_reg = LogisticRegression(max_iter=500)
svm = SVC(probability=True)
rf = RandomForestClassifier(n_estimators=300)
xgb_clf = xgb.XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='logloss'
)

In [38]:
preprocessor_ohe = ColumnTransformer([
    ("num", StandardScaler(), numerical_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
])

preprocessor_oe = ColumnTransformer([
    ("num", "passthrough", numerical_features),
    ("cat", OrdinalEncoder(), categorical_features)
])

In [39]:
# Creating pipelines 

log_reg_pipeline = Pipeline([
    ("preprocess", preprocessor_ohe),
    ("model", log_reg)
])
svm_pipeline = Pipeline([
    ("preprocess", preprocessor_ohe),
    ("model", svm)
])

rf_pipeline = Pipeline([
    ("preprocess", preprocessor_oe),
    ("model", rf)
])
xgb_pipeline = Pipeline([
    ("preprocess", preprocessor_oe),
    ("model", xgb_clf)
])

In [40]:
rf_pipeline.fit(x_train, y_train)
log_reg_pipeline.fit(x_train, y_train)
# svm_pipeline.fit(x_train, y_train)
xgb_pipeline.fit(x_train, y_train)

0,1,2
,steps,"[('preprocess', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [42]:
voting_clf = VotingClassifier(
    estimators=[
        ('rf', rf_pipeline),
        ('log_reg', log_reg_pipeline),
        # ('svm', svm_pipeline)
        ('xgb_clf', xgb_pipeline)
    ],
    voting='soft'
)

In [43]:
voting_clf.fit(x_train, y_train)

0,1,2
,estimators,"[('rf', ...), ('log_reg', ...), ...]"
,voting,'soft'
,weights,
,n_jobs,
,flatten_transform,True
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,n_estimators,300
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,500

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [44]:
v_pred = voting_clf.predict(x_test)
print("Voting Accuracy:", accuracy_score(y_test, v_pred))
print("Voting F1:", f1_score(y_test, v_pred))

Voting Accuracy: 0.7856859759277415
Voting F1: 0.7323642805043242


parameter tuning