# Uploading the Dataset and Converting to Data Frame

In [1]:
import pandas as pd
from google.colab import files

Upload = files.upload()

df_train = pd.read_csv('train.csv')
df_valid = pd.read_csv('valid.csv')
df_test = pd.read_csv('test.csv')

display(df_train.head())

Saving test.csv to test.csv
Saving train.csv to train.csv
Saving valid.csv to valid.csv


Unnamed: 0,label,text
0,2,"2-D STUDY,1. Mild aortic stenosis, widely calc..."
1,1,"PREOPERATIVE DIAGNOSES: , Dysphagia and esopha..."
2,2,"CHIEF COMPLAINT:, The patient comes for three..."
3,1,"PROCEDURE: , Bilateral L5, S1, S2, and S3 radi..."
4,2,"DISCHARGE DIAGNOSES:,1. Chronic obstructive pu..."


# Checking for Missing Values

In [2]:
# Spliting label and text
x_train, y_train = df_train['text'], df_train['label']
x_valid, y_valid = df_valid['text'], df_valid['label']
x_test,  y_test = df_test['text'], df_test['label']

# Checking for null data
print("Train - text nulls:", x_train.isna().sum(), " -  label nulls:", y_train.isna().sum())
print("Valid - text nulls:", x_valid.isna().sum(), " -  label nulls:", y_valid.isna().sum())
print("Test  - text nulls:", x_test.isna().sum(),  " -  label nulls:", y_test.isna().sum())

Train - text nulls: 0  -  label nulls: 0
Valid - text nulls: 0  -  label nulls: 0
Test  - text nulls: 0  -  label nulls: 0


# Preparing the Text
 * Remove punctuations
 * Convert all text to lower case
 * Tokenize the text

In [3]:
import re
from sklearn.feature_extraction.text import CountVectorizer
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
def text_preprocessor(text):
    # This function removes punctuation and digits, and converts to lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Only keep alphabets and spaces
    text = text.lower()
    tokens = nltk.word_tokenize(text)
    return tokens


x_train_clean = x_train.apply(text_preprocessor)
x_valid_clean = x_valid.apply(text_preprocessor)
x_test_clean = x_test.apply(text_preprocessor)

print(x_train_clean)

0       [2d, study1, mild, aortic, stenosis, widely, c...
1       [preoperative, diagnoses, dysphagia, and, esop...
2       [chief, complaint, the, patient, comes, for, t...
3       [procedure, bilateral, l5, s1, s2, and, s3, ra...
4       [discharge, diagnoses1, chronic, obstructive, ...
                              ...                        
3995    [preoperative, diagnosis, recurring, bladder, ...
3996    [procedure, colonoscopypreoperative, diagnoses...
3997    [preoperative, diagnoses1, senile, nuclear, ca...
3998    [presentation, patient, 13, years, old, comes,...
3999    [preoperative, diagnosis, left, renal, mass, 5...
Name: text, Length: 4000, dtype: object


In [5]:
from collections import Counter

# Flattening the tokenized lists into one big list of words
all_texts = [word for tokens in x_train_clean for word in tokens]

# frequency of words
word_freq = Counter(all_texts)

print("Most common words:", word_freq.most_common(5))

Most common words: [('the', 117063), ('and', 66785), ('was', 56124), ('of', 48284), ('to', 40591)]


**Deliverable Vocab Format**

In [6]:
Vocab = [(word, rank, freq) for rank, (word, freq) in enumerate(word_freq.most_common(10000), start=1)]

print(Vocab[:5])

# Save Vocab as txt file
with open("Vocab.txt", "w") as f:
    for word, rank, freq in Vocab:
        f.write(f"{word} {rank} {freq}\n")

print("File saved as Vocab.txt")

[('the', 1, 117063), ('and', 2, 66785), ('was', 3, 56124), ('of', 4, 48284), ('to', 5, 40591)]
File saved as Vocab.txt


**In this cell we replace words with their repetition number (frequency) through the train data**

In [7]:
word_to_rank = {word: rank for (word, rank, freq) in Vocab}

x_train_ranked = x_train_clean.apply(lambda toks: [word_to_rank[w] for w in toks if w in word_to_rank])
x_valid_ranked = x_valid_clean.apply(lambda toks: [word_to_rank[w] for w in toks if w in word_to_rank])
x_test_ranked  = x_test_clean.apply(lambda toks: [word_to_rank[w] for w in toks if w in word_to_rank])

# Checking
print(x_train_ranked.iloc[0:3][:])        # first 20 ranks of sample 0
print("Empty rows (train):", (x_train_ranked.str.len() == 0).sum())

# Saving as .txt file
def save_ranked(series, filename):
    with open(filename, "w") as f:
        for tokens in series:
            f.write(" ".join(map(str, tokens)) + "\n")

# Save train/valid/test
save_ranked(x_train_ranked, "train_ranked.txt")
save_ranked(x_valid_ranked, "valid_ranked.txt")
save_ranked(x_test_ranked,  "test_ranked.txt")

print("Files saved: train_ranked.txt, valid_ranked.txt, test_ranked.txt")

0    [5849, 9083, 143, 384, 244, 2433, 2138, 3725, ...
1    [120, 685, 1525, 2, 1104, 685, 2534, 2, 1104, ...
2    [599, 480, 1, 10, 1341, 13, 3532, 5583, 1569, ...
Name: text, dtype: object
Empty rows (train): 0
Files saved: train_ranked.txt, valid_ranked.txt, test_ranked.txt


# BBoW

In [8]:
from tkinter.constants import X
import numpy as np

# Vocab is a list of (word, rank, freq) sorted by rank (rank starts at 1)
vocab_words = [w for (w, r, f) in Vocab]
word_to_col = {w: j for j, w in enumerate(vocab_words)}

def bbow_matrix(token, word_to_col, n_vocab=10000):
    n_docs = len(token)
    X = np.zeros((n_docs, n_vocab), dtype=np.uint8)
    for i, toks in enumerate(token):
        for w in set(toks):
            j = word_to_col.get(w)
            if j is not None:
                X[i, j] = 1
    return X

# Build BBoW for train/valid/test data
X_train_bbow = bbow_matrix(x_train_clean, word_to_col, n_vocab=len(vocab_words))
X_valid_bbow = bbow_matrix(x_valid_clean, word_to_col, n_vocab=len(vocab_words))
X_test_bbow  = bbow_matrix(x_test_clean,  word_to_col, n_vocab=len(vocab_words))

print("Train:", X_train_bbow.shape)
print("Valid:", X_valid_bbow.shape)
print("Test :", X_test_bbow.shape)

x_train_bbow = np.array(X_train_bbow)
x_valid_bbow = np.array(X_valid_bbow)
x_test_bbow  = np.array(X_test_bbow)

print("\nx_train_BBoW:\n", x_train_bbow)

Train: (4000, 10000)
Valid: (499, 10000)
Test : (500, 10000)

x_train_BBoW:
 [[1 1 0 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 ...
 [1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]]


# FBoW

In [9]:
# 1) Build vocab (top 10k by frequency on train)
all_words = [w for toks in x_train_clean for w in toks]
word_freq = Counter(all_words)
vocab = [w for w, _ in word_freq.most_common(10000)]
word_to_col = {w: j for j, w in enumerate(vocab)}

# 2) Build FBoW matrices
def fbow_matrix(token_series, word_to_col, n_vocab=10000):
    n_docs = len(token_series)
    X= np.zeros((n_docs, n_vocab), dtype=np.float32)
    for i, toks in enumerate(token_series):
        n = len(toks)
        if n == 0:
            continue
        inv_n = 1.0 / n
        counts = Counter(toks)
        for w, c in counts.items():
            j = word_to_col.get(w)
            if j is not None:
                X[i, j] = c *inv_n
    return X

x_train_fbow = fbow_matrix(x_train_clean, word_to_col, len(vocab))
x_valid_fbow = fbow_matrix(x_valid_clean, word_to_col, len(vocab))
x_test_fbow  = fbow_matrix(x_test_clean,  word_to_col, len(vocab))

print("Train:", x_train_fbow.shape, "Valid:", x_valid_fbow.shape, "Test:", x_test_fbow.shape)
print("\nTrain:\n", x_train_fbow)

Train: (4000, 10000) Valid: (499, 10000) Test: (500, 10000)

Train:
 [[0.01041667 0.02083333 0.         ... 0.         0.         0.        ]
 [0.12903225 0.03225806 0.05734767 ... 0.         0.         0.        ]
 [0.01678657 0.03597122 0.00479616 ... 0.         0.         0.        ]
 ...
 [0.10288582 0.0250941  0.05395232 ... 0.         0.         0.        ]
 [0.02644964 0.03560529 0.00813835 ... 0.         0.         0.        ]
 [0.13204509 0.01449275 0.06119163 ... 0.         0.         0.        ]]


# Logistic Regression
**BBoW**

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix


x_train, x_valid, x_test = x_train_bbow, x_valid_bbow, x_test_bbow
y_train, y_valid, y_test = np.array(y_train), np.array(y_valid), np.array(y_test)

# Train LR
LR = LogisticRegression(max_iter=2000)
LR.fit(x_train, y_train)

y_pred_train = LR.predict(x_train)
y_pred_valid = LR.predict(x_valid)
y_pred_test = LR.predict(x_test)

# Accuracy
acc_train = accuracy_score(y_train, y_pred_train)
acc_valid = accuracy_score(y_valid,y_pred_valid)
acc_test = accuracy_score(y_test, y_pred_test)

# F1 Score
f1_train_lr1 = f1_score(y_train, y_pred_train, average="macro")
f1_valid_lr1 = f1_score(y_valid, y_pred_valid, average="macro")
f1_test_lr1 = f1_score(y_test, y_pred_test, average="macro")

# Confusion Matrix
cm_train = confusion_matrix(y_train,y_pred_train)
cm_valid = confusion_matrix(y_valid,y_pred_valid)
cm_test = confusion_matrix(y_test, y_pred_test)


print("Accuracy Train", acc_train*100,"\nAccuracy Valid:", acc_valid*100,"\nAccuracy Test:",acc_test*100)
print("\nf1 Train:", f1_train_lr1, "\nf1 Valid:", f1_valid_lr1,"\nf1 Test:", f1_test_lr1)
print("\nConfusion Matrix Train:\n", cm_train, "\nConfusion Matrix Valid:\n", cm_valid, "\nConfusion Matrix Test:\n", cm_test)


Accuracy Train 83.22500000000001 
Accuracy Valid: 50.70140280561122 
Accuracy Test: 54.2

f1 Train: 0.8256598907818986 
f1 Valid: 0.48333614013409193 
f1 Test: 0.506596043285073

Confusion Matrix Train:
 [[1142   25   50   60]
 [   6  805   92  105]
 [  54  108  766    6]
 [  87   72    6  616]] 
Confusion Matrix Valid:
 [[109  14  19  23]
 [  5  63  24  26]
 [ 13  35  53   5]
 [ 42  37   3  28]] 
Confusion Matrix Test:
 [[133  10  22  33]
 [  7  48  19  28]
 [ 19  28  57   5]
 [ 25  29   4  33]]


**FBoW**

In [13]:
x_train, x_valid, x_test = x_train_fbow, x_valid_fbow, x_test_fbow
y_train, y_valid, y_test = np.array(y_train), np.array(y_valid), np.array(y_test)

# Train LR
LR = LogisticRegression(max_iter=2000)
LR.fit(x_train, y_train)

y_pred_train = LR.predict(x_train)
y_pred_valid = LR.predict(x_valid)
y_pred_test = LR.predict(x_test)

# Accuracy
acc_train = accuracy_score(y_train, y_pred_train)
acc_valid = accuracy_score(y_valid,y_pred_valid)
acc_test = accuracy_score(y_test, y_pred_test)

# F1 Score
f1_train_lr2 = f1_score(y_train, y_pred_train, average="macro")
f1_valid_lr2 = f1_score(y_valid, y_pred_valid, average="macro")
f1_test_lr2 = f1_score(y_test, y_pred_test, average="macro")

# Confusion Matrix
cm_train = confusion_matrix(y_train,y_pred_train)
cm_valid = confusion_matrix(y_valid,y_pred_valid)
cm_test = confusion_matrix(y_test, y_pred_test)


print("Accuracy Train", acc_train*100,"\nAccuracy Valid:", acc_valid*100,"\nAccuracy Test:",acc_test*100)
print("\nf1 Train:", f1_train_lr2, "\nf1 Valid:", f1_valid_lr2,"\nf1 Test:", f1_test_lr2)
print("\nConfusion Matrix Train:\n", cm_train, "\nConfusion Matrix Valid:\n", cm_valid, "\nConfusion Matrix Test:\n", cm_test)

Accuracy Train 50.775000000000006 
Accuracy Valid: 49.69939879759519 
Accuracy Test: 52.800000000000004

f1 Train: 0.3291584343682517 
f1 Valid: 0.32303404842731515 
f1 Test: 0.3295054637099849

Confusion Matrix Train:
 [[1137  137    3    0]
 [ 119  878   11    0]
 [ 392  526   16    0]
 [ 364  417    0    0]] 
Confusion Matrix Valid:
 [[147  18   0   0]
 [ 15  99   4   0]
 [ 38  66   2   0]
 [ 53  57   0   0]] 
Confusion Matrix Test:
 [[175  23   0   0]
 [ 14  87   1   0]
 [ 60  47   2   0]
 [ 46  45   0   0]]


# Decision Tree
**BBoW**

In [54]:
from sklearn.tree import DecisionTreeClassifier

# ===== Decision Tree on BBoW =====
x_train, x_valid, x_test = x_train_bbow, x_valid_bbow, x_test_bbow
y_train, y_valid, y_test = np.array(y_train), np.array(y_valid), np.array(y_test)

DT = DecisionTreeClassifier(max_depth=4, random_state=42)
DT.fit(x_train, y_train)

y_pred_train = DT.predict(x_train)
y_pred_valid = DT.predict(x_valid)
y_pred_test  = DT.predict(x_test)

acc_train = accuracy_score(y_train, y_pred_train)
acc_valid = accuracy_score(y_valid, y_pred_valid)
acc_test  = accuracy_score(y_test,  y_pred_test)

f1_train_dt1 = f1_score(y_train, y_pred_train, average="macro")
f1_valid_dt1 = f1_score(y_valid, y_pred_valid, average="macro")
f1_test_dt1  = f1_score(y_test,  y_pred_test,  average="macro")

cm_train = confusion_matrix(y_train, y_pred_train)
cm_valid = confusion_matrix(y_valid, y_pred_valid)
cm_test  = confusion_matrix(y_test,  y_pred_test)

print("Decision Tree (BBoW)")
print("Accuracy Train:", acc_train*100, "\nAccuracy Valid:", acc_valid*100, "\nAccuracy Test:", acc_test*100)
print("\nf1 Train:", f1_train_dt1, "\nf1 Valid:", f1_valid_dt1, "\nf1 Test:", f1_test_dt1)
print("\nConfusion Matrix Train:\n", cm_train, "\nConfusion Matrix Valid:\n", cm_valid, "\nConfusion Matrix Test:\n", cm_test)

Decision Tree (BBoW)
Accuracy Train: 54.1 
Accuracy Valid: 53.50701402805611 
Accuracy Test: 55.60000000000001

f1 Train: 0.457960155217277 
f1 Valid: 0.45474329281852427 
f1 Test: 0.4455270525227616

Confusion Matrix Train:
 [[997   7 271   2]
 [ 31 361 615   1]
 [140  21 772   1]
 [263  32 452  34]] 
Confusion Matrix Valid:
 [[130   4  31   0]
 [  6  45  67   0]
 [ 15   4  87   0]
 [ 47   6  52   5]] 
Confusion Matrix Test:
 [[151   1  46   0]
 [  2  35  65   0]
 [ 18   1  90   0]
 [ 35   3  51   2]]


**FBoW**

In [55]:
x_train, x_valid, x_test = x_train_fbow, x_valid_fbow, x_test_fbow
y_train, y_valid, y_test = np.array(y_train), np.array(y_valid), np.array(y_test)

DT = DecisionTreeClassifier(max_depth=4, random_state=42)
DT.fit(x_train, y_train)

y_pred_train = DT.predict(x_train)
y_pred_valid = DT.predict(x_valid)
y_pred_test  = DT.predict(x_test)

acc_train = accuracy_score(y_train, y_pred_train)
acc_valid = accuracy_score(y_valid, y_pred_valid)
acc_test  = accuracy_score(y_test,  y_pred_test)

f1_train_dt2 = f1_score(y_train, y_pred_train, average="macro")
f1_valid_dt2 = f1_score(y_valid, y_pred_valid, average="macro")
f1_test_dt2  = f1_score(y_test,  y_pred_test,  average="macro")

cm_train = confusion_matrix(y_train, y_pred_train)
cm_valid = confusion_matrix(y_valid, y_pred_valid)
cm_test  = confusion_matrix(y_test,  y_pred_test)

print("\nDecision Tree (FBoW)")
print("Accuracy Train:", acc_train*100, "\nAccuracy Valid:", acc_valid*100, "\nAccuracy Test:", acc_test*100)
print("\nf1 Train:", f1_train_dt2, "\nf1 Valid:", f1_valid_dt2, "\nf1 Test:", f1_test_dt2)
print("\nConfusion Matrix Train:\n", cm_train, "\nConfusion Matrix Valid:\n", cm_valid, "\nConfusion Matrix Test:\n", cm_test)


Decision Tree (FBoW)
Accuracy Train: 54.825 
Accuracy Valid: 51.503006012024045 
Accuracy Test: 55.60000000000001

f1 Train: 0.43307619145604337 
f1 Valid: 0.40444653476265013 
f1 Test: 0.43686854453478374

Confusion Matrix Train:
 [[1008  240   29    0]
 [  39  894   75    0]
 [ 210  433  291    0]
 [ 288  472   21    0]] 
Confusion Matrix Valid:
 [[127  32   6   0]
 [  7 101  10   0]
 [ 26  51  29   0]
 [ 49  59   2   0]] 
Confusion Matrix Test:
 [[148  47   3   0]
 [  6  91   5   0]
 [ 24  46  39   0]
 [ 36  52   3   0]]


# Random Forrest
**BBoW**

In [61]:
from sklearn.ensemble import RandomForestClassifier

x_train, x_valid, x_test = x_train_bbow, x_valid_bbow, x_test_bbow
y_train, y_valid, y_test = np.array(y_train), np.array(y_valid), np.array(y_test)

RF = RandomForestClassifier(n_estimators=100, random_state=42)
RF.fit(x_train, y_train)

y_pred_train = RF.predict(x_train)
y_pred_valid = RF.predict(x_valid)
y_pred_test  = RF.predict(x_test)

acc_train = accuracy_score(y_train, y_pred_train)
acc_valid = accuracy_score(y_valid, y_pred_valid)
acc_test  = accuracy_score(y_test,  y_pred_test)

f1_train_rf1 = f1_score(y_train, y_pred_train, average="macro")
f1_valid_rf1 = f1_score(y_valid, y_pred_valid, average="macro")
f1_test_rf1  = f1_score(y_test,  y_pred_test,  average="macro")

cm_train = confusion_matrix(y_train, y_pred_train)
cm_valid = confusion_matrix(y_valid, y_pred_valid)
cm_test  = confusion_matrix(y_test,  y_pred_test)

print("Random Forest (BBoW)")
print("Accuracy Train:", acc_train*100, "\nAccuracy Valid:", acc_valid*100, "\nAccuracy Test:", acc_test*100)
print("\nf1 Train:", f1_train_rf1, "\nf1 Valid:", f1_valid_rf1, "\nf1 Test:", f1_test_rf1)
print("\nConfusion Matrix Train:\n", cm_train, "\nConfusion Matrix Valid:\n", cm_valid, "\nConfusion Matrix Test:\n", cm_test)

Random Forest (BBoW)
Accuracy Train: 83.25 
Accuracy Valid: 29.458917835671343 
Accuracy Test: 33.4

f1 Train: 0.8243069498994473 
f1 Valid: 0.26276219750388635 
f1 Test: 0.26945890676631706

Confusion Matrix Train:
 [[1178   17   39   43]
 [  14  803   98   93]
 [  66  101  761    6]
 [ 103   83    7  588]] 
Confusion Matrix Valid:
 [[83 14 38 30]
 [ 5 35 37 41]
 [28 60 17  1]
 [52 40  6 12]] 
Confusion Matrix Test:
 [[115  14  30  39]
 [ 13  27  31  31]
 [ 42  45  14   8]
 [ 36  38   6  11]]


**FBoW**

In [60]:
x_train, x_valid, x_test = x_train_fbow, x_valid_fbow, x_test_fbow
y_train, y_valid, y_test = np.array(y_train), np.array(y_valid), np.array(y_test)

RF = RandomForestClassifier(n_estimators=100, random_state=42)
RF.fit(x_train, y_train)

y_pred_train = RF.predict(x_train)
y_pred_valid = RF.predict(x_valid)
y_pred_test  = RF.predict(x_test)

acc_train = accuracy_score(y_train, y_pred_train)
acc_valid = accuracy_score(y_valid, y_pred_valid)
acc_test  = accuracy_score(y_test,  y_pred_test)

f1_train_rf2 = f1_score(y_train, y_pred_train, average="macro")
f1_valid_rf2 = f1_score(y_valid, y_pred_valid, average="macro")
f1_test_rf2  = f1_score(y_test,  y_pred_test,  average="macro")

cm_train = confusion_matrix(y_train, y_pred_train)
cm_valid = confusion_matrix(y_valid, y_pred_valid)
cm_test  = confusion_matrix(y_test,  y_pred_test)

print("\nRandom Forest (FBoW)")
print("Accuracy Train:", acc_train*100, "\nAccuracy Valid:", acc_valid*100, "\nAccuracy Test:", acc_test*100)
print("\nf1 Train:", f1_train_rf2, "\nf1 Valid:", f1_valid_rf2, "\nf1 Test:", f1_test_rf2)
print("\nConfusion Matrix Train:\n", cm_train, "\nConfusion Matrix Valid:\n", cm_valid, "\nConfusion Matrix Test:\n", cm_test)


Random Forest (FBoW)
Accuracy Train: 84.05 
Accuracy Valid: 29.458917835671343 
Accuracy Test: 32.4

f1 Train: 0.8323051779546364 
f1 Valid: 0.26231519924743346 
f1 Test: 0.2610693407768139

Confusion Matrix Train:
 [[1186   20   29   42]
 [  12  826   84   86]
 [  62  103  765    4]
 [ 101   89    6  585]] 
Confusion Matrix Valid:
 [[79 16 39 31]
 [ 6 41 35 36]
 [29 60 16  1]
 [52 41  6 11]] 
Confusion Matrix Test:
 [[111  14  32  41]
 [ 12  29  32  29]
 [ 44  45  12   8]
 [ 36  39   6  10]]


# XGBoost
**BBoW**


In [51]:
from xgboost import XGBClassifier

# In order to avoid error in xgb (classes from 0 to 3 instead of 1 to 4)
y_train_enc = y_train - 1
y_valid_enc = y_valid - 1
y_test_enc  = y_test - 1

x_train, x_valid, x_test = x_train_bbow, x_valid_bbow, x_test_bbow

XG = XGBClassifier(
    n_estimators=100,
    learning_rate=0.01,
    max_depth=3, # keep the model simpler to prevent overfitting
    random_state=32,
)
XG.fit(x_train, y_train_enc)

y_pred_train = XG.predict(x_train)
y_pred_valid = XG.predict(x_valid)
y_pred_test  = XG.predict(x_test)

acc_train = accuracy_score(y_train_enc, y_pred_train)
acc_valid = accuracy_score(y_valid_enc, y_pred_valid)
acc_test  = accuracy_score(y_test_enc,  y_pred_test)

f1_train_xg1 = f1_score(y_train_enc, y_pred_train, average="macro")
f1_valid_xg1 = f1_score(y_valid_enc, y_pred_valid, average="macro")
f1_test_xg1  = f1_score(y_test_enc,  y_pred_test,  average="macro")

cm_train = confusion_matrix(y_train_enc, y_pred_train)
cm_valid = confusion_matrix(y_valid_enc, y_pred_valid)
cm_test  = confusion_matrix(y_test_enc,  y_pred_test)

print("XGBoost (BBoW)")
print("Accuracy Train:", acc_train*100, "\nAccuracy Valid:", acc_valid*100, "\nAccuracy Test:", acc_test*100)
print("\nf1 Train:", f1_train_xg1, "\nf1 Valid:", f1_valid_xg1, "\nf1 Test:", f1_test_xg1)
print("\nConfusion Matrix Train:\n", cm_train, "\nConfusion Matrix Valid:\n", cm_valid, "\nConfusion Matrix Test:\n", cm_test)

XGBoost (BBoW)
Accuracy Train: 65.025 
Accuracy Valid: 61.72344689378757 
Accuracy Test: 67.80000000000001

f1 Train: 0.5995773334572257 
f1 Valid: 0.5678942833699916 
f1 Test: 0.6110989373329448

Confusion Matrix Train:
 [[1140   86   42    9]
 [  39  791   92   86]
 [ 201  251  471   11]
 [ 258  278   46  199]] 
Confusion Matrix Valid:
 [[143  13   7   2]
 [  9  88   9  12]
 [ 18  34  54   0]
 [ 44  40   3  23]] 
Confusion Matrix Test:
 [[180  14   3   1]
 [  4  79   7  12]
 [ 24  29  55   1]
 [ 33  26   7  25]]


**FBoW**

In [52]:
y_train_enc = y_train - 1
y_valid_enc = y_valid - 1
y_test_enc  = y_test - 1


x_train, x_valid, x_test = x_train_fbow, x_valid_fbow, x_test_fbow

XG = XGBClassifier(
    n_estimators=100,
    learning_rate=0.01,
    max_depth=3,
    random_state=32,
)
XG.fit(x_train, y_train_enc)

y_pred_train = XG.predict(x_train)
y_pred_valid = XG.predict(x_valid)
y_pred_test  = XG.predict(x_test)

acc_train = accuracy_score(y_train_enc, y_pred_train)
acc_valid = accuracy_score(y_valid_enc, y_pred_valid)
acc_test  = accuracy_score(y_test_enc,  y_pred_test)

f1_train_xg2 = f1_score(y_train_enc, y_pred_train, average="macro")
f1_valid_xg2 = f1_score(y_valid_enc, y_pred_valid, average="macro")
f1_test_xg2  = f1_score(y_test_enc,  y_pred_test,  average="macro")

cm_train = confusion_matrix(y_train_enc, y_pred_train)
cm_valid = confusion_matrix(y_valid_enc, y_pred_valid)
cm_test  = confusion_matrix(y_test_enc,  y_pred_test)

print("XGBoost (FBoW)")
print("Accuracy Train:", acc_train*100, "\nAccuracy Valid:", acc_valid*100, "\nAccuracy Test:", acc_test*100)
print("\nf1 Train:", f1_train_xg2, "\nf1 Valid:", f1_valid_xg2, "\nf1 Test:", f1_test_xg2)
print("\nConfusion Matrix Train:\n", cm_train, "\nConfusion Matrix Valid:\n", cm_valid, "\nConfusion Matrix Test:\n", cm_test)

XGBoost (FBoW)
Accuracy Train: 64.55 
Accuracy Valid: 61.12224448897795 
Accuracy Test: 66.4

f1 Train: 0.5913344972934461 
f1 Valid: 0.5585687746823854 
f1 Test: 0.5991459257588291

Confusion Matrix Train:
 [[1145   93   36    3]
 [  35  797   89   87]
 [ 198  275  455    6]
 [ 273  285   38  185]] 
Confusion Matrix Valid:
 [[142  15   6   2]
 [  5  90   9  14]
 [ 23  31  52   0]
 [ 46  40   3  21]] 
Confusion Matrix Test:
 [[175  17   4   2]
 [  4  77   7  14]
 [ 23  29  56   1]
 [ 34  27   6  24]]


# Results

**Based on the results from 4 models, tuning hyperparameters is really crucial for preventing overfitting. in the Decision Tree and XGBoost models it is obvious that by reduction in max_depth, the model will be extremely simple and prevent overfitting. It shoud be noted that max_depth should not be that low to cause underfitting.**<br>

**In the following cell we have shown the results of the models. The XGBoost is the strongest candidate.**<br>

**In my opinion XGBoost with BBoW is the best combination. With the same amount of F-1 score compared to XGBoost (FBoW), it has far more less run time.**

In [62]:
f1_table = pd.DataFrame({
    "Logistic Regression(BBoW)": [f1_train_lr1, f1_valid_lr1, f1_test_lr1],
    "Logistic Regression(FBoW)": [f1_train_lr2, f1_valid_lr2, f1_test_lr2],
    "Decision Tree(BBoW)":       [f1_train_dt1, f1_valid_dt1, f1_test_dt1],
    "Decision Tree(FBoW)":       [f1_train_dt2, f1_valid_dt2, f1_test_dt2],
    "Random Forest(BBoW)":       [f1_train_rf1, f1_valid_rf1, f1_test_rf1],
    "Random Forest(FBoW)":       [f1_train_rf2, f1_valid_rf2, f1_test_rf2],
    "XGBoost(BBoW)":             [f1_train_xg1, f1_valid_xg1, f1_test_xg1],
    "XGBoost(FBoW)":             [f1_train_xg2, f1_valid_xg2, f1_test_xg2],
}, index=["F1 (Train)", "F1 (Valid)", "F1 (Test)"]).round(4)

display(f1_table)


Unnamed: 0,Logistic Regression(BBoW),Logistic Regression(FBoW),Decision Tree(BBoW),Decision Tree(FBoW),Random Forest(BBoW),Random Forest(FBoW),XGBoost(BBoW),XGBoost(FBoW)
F1 (Train),0.8257,0.3292,0.458,0.4331,0.8243,0.8323,0.5996,0.5913
F1 (Valid),0.4833,0.323,0.4547,0.4044,0.2628,0.2623,0.5679,0.5586
F1 (Test),0.5066,0.3295,0.4455,0.4369,0.2695,0.2611,0.6111,0.5991
