In [1]:
from sklearn.preprocessing import OneHotEncoder
import csv
import numpy as np
import pandas as pd

# Read data for X and y
X = pd.read_csv("data/X.csv").values
y = pd.read_csv("data/y.csv").values

# Flatten list y
y = [item for sublist in y for item in sublist]

print(len(X))

8168


In [2]:
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
from imblearn.over_sampling import SMOTE

# Apply SMOTE to balance the dataset
sm = SMOTE(k_neighbors=4, random_state=42)
X_res, y_res = sm.fit_resample(X, y)

#oversampler = RandomOverSampler(random_state=42)
#X_res, y_res = oversampler.fit_resample(X,y)

In [3]:
# One-hot-encoding for sequences in X
import numpy as np

list_of_sequences = X_res

# Create a set of unique IDs
unique_ids = set()
for sequence in list_of_sequences:
    unique_ids.update(sequence)

# Convert the set to a sorted list
sorted_unique_ids = sorted(unique_ids)

# Create a dictionary mapping each ID to its index
id_to_index = {id: index for index, id in enumerate(sorted_unique_ids)}

# One-hot encode each sequence separately
encoded_sequences = []
for sequence in list_of_sequences:
    encoded_sequence = np.zeros((len(sequence), len(sorted_unique_ids)), dtype=int)
    for i, id in enumerate(sequence):
        index = id_to_index[id]
        encoded_sequence[i, index] = 1
    encoded_sequences.append(encoded_sequence)

X_res = encoded_sequences

In [None]:
print(len(unique_ids))

In [4]:
# Flatten nested list in order to pass it to the model

X_flattened = np.array([np.array(row).flatten() for row in X_res])

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_flattened, y_res, random_state = 42, test_size = 0.33)

In [None]:
#from sklearn.ensemble import RandomForestClassifier

#model = RandomForestClassifier(criterion = "entropy", n_estimators = 30)
#model.fit(X_train, y_train)

#print(model.score(X_test, y_test))

In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import joblib

model = RandomForestClassifier()

grid_space={'max_depth':[3,5,10,None],
              'n_estimators':[10,20,30,40,50],
              'max_features':[1,3,5,7],
              'min_samples_leaf':[1,2,3],
              'min_samples_split':[1,2,3],
              'criterion': ["entropy","gini","log_loss"]
           }

grid = GridSearchCV(model,param_grid=grid_space,cv=3,scoring="f1_micro",verbose=10)

model_grid = grid.fit(X_flattened,y_res)

Fitting 3 folds for each of 2160 candidates, totalling 6480 fits
[CV 1/3; 1/2160] START criterion=entropy, max_depth=3, max_features=1, min_samples_leaf=1, min_samples_split=1, n_estimators=10
[CV 1/3; 1/2160] END criterion=entropy, max_depth=3, max_features=1, min_samples_leaf=1, min_samples_split=1, n_estimators=10;, score=nan total time=   1.7s
[CV 2/3; 1/2160] START criterion=entropy, max_depth=3, max_features=1, min_samples_leaf=1, min_samples_split=1, n_estimators=10
[CV 2/3; 1/2160] END criterion=entropy, max_depth=3, max_features=1, min_samples_leaf=1, min_samples_split=1, n_estimators=10;, score=nan total time=   3.5s
[CV 3/3; 1/2160] START criterion=entropy, max_depth=3, max_features=1, min_samples_leaf=1, min_samples_split=1, n_estimators=10
[CV 3/3; 1/2160] END criterion=entropy, max_depth=3, max_features=1, min_samples_leaf=1, min_samples_split=1, n_estimators=10;, score=nan total time=   2.0s
[CV 1/3; 2/2160] START criterion=entropy, max_depth=3, max_features=1, min_sampl

KeyboardInterrupt: 

In [8]:
print('Best hyperparameters are: '+str(model_grid.best_params_))
print('Best score is: '+str(model_grid.best_score_))

Best hyperparameters are: {'criterion': 'entropy', 'max_depth': None, 'max_features': 7, 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 40}
Best score is: 0.6738385376999237


In [None]:
y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,display_labels=model.classes_)
disp.plot()
plt.show()