In [1]:
#Count-based approach
import pandas as pd

df = pd.read_csv("data/dataframe.csv")
df.head()

X = df.drop("y", axis=1).values
y = df["y"].values

In [2]:
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
from imblearn.over_sampling import BorderlineSMOTE

# Apply SMOTE to balance the dataset
sm = BorderlineSMOTE(k_neighbors=4, random_state=42)
X_res, y_res = sm.fit_resample(X, y)

#oversampler = RandomOverSampler(random_state=42)
#X_res, y_res = oversampler.fit_resample(X,y)

In [3]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, random_state = 42, test_size = 0.33)

In [4]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(criterion = "entropy", n_estimators = 40, max_features=7, min_samples_split=3, min_samples_leaf=1)
model.fit(X_train, y_train)

print(model.score(X_test, y_test))

0.8954510703363915


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score
import joblib

# Define the scoring metric
scorer = make_scorer(f1_score)

model = RandomForestClassifier()

grid_space={'max_depth':[3,5,10,None],
              'n_estimators':[10,20,30,40,50],
              'max_features':[1,3,5,7],
              'min_samples_leaf':[1,2,3],
              'min_samples_split':[1,2,3],
              'criterion': ["entropy","gini","log_loss"]
           }

grid = GridSearchCV(model,param_grid=grid_space,cv=3,scoring="f1_micro",verbose=10)

# Enable verbose logging using joblib
#joblib.parallel_backend('threading')

model_grid = grid.fit(X_train,y_train)

In [None]:
print('Best hyperparameters are: '+str(model_grid.best_params_))
print('Best score is: '+str(model_grid.best_score_))

In [5]:
y_pred = model.predict(X_test)

In [6]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           7       0.89      0.92      0.90       451
          15       0.89      0.95      0.92       427
          16       0.96      0.98      0.97       454
          21       0.89      0.98      0.94       446
          33       0.92      0.82      0.87       453
          34       1.00      1.00      1.00       420
          39       0.98      0.97      0.98       433
          56       0.90      0.87      0.88       444
          68       0.72      0.80      0.76       438
          95       0.94      0.97      0.95       409
        1000       0.90      0.73      0.81       408
        1001       0.78      0.75      0.77       449

    accuracy                           0.90      5232
   macro avg       0.90      0.90      0.90      5232
weighted avg       0.90      0.90      0.89      5232



In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

cm = confusion_matrix(y_test, y_pred, labels=model_grid.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,display_labels=model.classes_)
disp.plot()
plt.show()