In [8]:
import pandas as pd
import numpy as np

# Read the clustered article table; per the preview below it carries an
# "article" text column and an integer "cluster" label per row.
CLUSTERED_CSV = "clusterized_df.csv"
df = pd.read_csv(CLUSTERED_CSV, encoding="ISO-8859-1")

# Bare last expression -> rich preview of the first five rows.
df.head()

Unnamed: 0,article,cluster
0,"['musician', 'tackle', 'u', 'red', 'tape', 'mu...",1
1,"['u2s', 'desire', 'number', 'one', 'u2', 'thre...",2
2,"['rocker', 'doherty', 'onstage', 'fight', 'roc...",2
3,"['snicket', 'top', 'u', 'box', 'office', 'char...",2
4,"['ocean', 'twelve', 'raid', 'box', 'office', '...",2


In [9]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

from sklearn.feature_extraction.text import CountVectorizer

# Model-comparison cell: turn the clustered articles into bag-of-words counts,
# fit three classifiers with fixed hyper-parameters, and print a per-class
# classification report for each one on a held-out 20% split.
# `df` is expected to exist already (it is read from CSV in an earlier cell).

# One seed shared by the split and by the stochastic estimators so that a
# Restart & Run All reproduces the printed reports.
SEED = 42

# extract the feature and target columns
X = df["article"]
y = df["cluster"]

# split the dataframe into training and test sets; stratify=y keeps the
# cluster proportions the same in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y
)

# convert the "article" column into a bag of words representation.
# The vocabulary is fitted on the training split only and merely applied to
# the test split, so no test-set information leaks into the features.
# NOTE(review): the articles look like stringified token lists; with
# CountVectorizer's default token pattern only tokens of 2+ word characters
# are kept, so one-letter tokens such as 'u' are dropped -- confirm intended.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Hyper-parameter search spaces. They are not searched in this cell; the
# *_best_params dicts below presumably come from an earlier 5-fold run of
# e.g. GridSearchCV(KNeighborsClassifier(), param_grid_knn, cv=5), which was
# only present as commented-out code here. Re-run such a search if the data
# or the vectorizer changes.
k_range = list(range(1, 21, 2))
# NOTE(review): 'cityblock', 'l1' and 'manhattan' are aliases of one metric,
# as are 'euclidean' and 'l2'; searching all six repeats work. Left unchanged
# so the grid still matches whatever search produced the values below.
param_grid_knn = {'n_neighbors': k_range, 'algorithm': ['brute'], 'weights': ['uniform', 'distance'],
              'metric': ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan']}

param_grid_dt = {'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}

param_grid_rf = {'n_estimators' : [5, 10, 20, 30, 50, 100, 150, 200],
                'max_depth': [None, 10, 15, 20, 25, 50, 100],
                'max_samples': [None, 0.2, 0.35, 0.5, 0.8],
                'max_leaf_nodes': [None, 5, 10, 20, 30, 50, 100]}

knn_best_params = {'algorithm': 'brute', 'metric': 'cosine', 'n_neighbors': 3, 'weights': 'distance'}
dt_best_params = {'max_depth': 10}
rf_best_params = {'max_depth': None, 'max_leaf_nodes': None, 'max_samples': None, 'n_estimators': 150}

# initialize the classifiers with the fixed parameters.
# The grid_* names are historical (these are plain estimators, not
# GridSearchCV objects) and are kept because other cells may refer to them.
# The tree-based models are randomized, so they get an explicit random_state;
# the seed is passed separately to leave the *_best_params dicts untouched.
grid_knn = KNeighborsClassifier(**knn_best_params)
grid_dt = DecisionTreeClassifier(random_state=SEED, **dt_best_params)
grid_rf = RandomForestClassifier(random_state=SEED, **rf_best_params)

# train the classifiers on the training data
for clf in (grid_knn, grid_dt, grid_rf):
    clf.fit(X_train, y_train)

# make predictions on the test data
y_pred_knn = grid_knn.predict(X_test)
y_pred_dt = grid_dt.predict(X_test)
y_pred_rf = grid_rf.predict(X_test)

# evaluate the classifiers' performance (same order and headings as before)
for title, y_pred in (
    ("KNN:", y_pred_knn),
    ("Decision Tree:", y_pred_dt),
    ("Random Forest:", y_pred_rf),
):
    print(title)
    print(classification_report(y_test, y_pred))

KNN:
              precision    recall  f1-score   support

           0       0.68      0.98      0.80        48
           1       0.96      0.80      0.87       154
           2       0.90      0.91      0.91        70
           3       0.90      0.95      0.92        75
           4       0.99      0.99      0.99        98

    accuracy                           0.90       445
   macro avg       0.89      0.93      0.90       445
weighted avg       0.92      0.90      0.90       445

Decision Tree:
              precision    recall  f1-score   support

           0       0.80      0.83      0.82        48
           1       0.72      0.82      0.77       154
           2       0.79      0.80      0.79        70
           3       0.89      0.68      0.77        75
           4       0.87      0.83      0.85        98

    accuracy                           0.80       445
   macro avg       0.82      0.79      0.80       445
weighted avg       0.80      0.80      0.80       445

Ra