#### Connecting Google Drive

In [None]:
from google.colab import drive
# Mount at an absolute path so the mount point always matches the absolute
# "/content/gdrive/..." dataset path used when the CSV is read, regardless of
# the kernel's current working directory.
drive.mount("/content/gdrive")

#### Avoids scroll-in-the-scroll in the entire Notebook

In [None]:
from IPython.display import Javascript, display

def resize_colab_cell(*_event_args, **_event_kwargs):
  """Raise the Colab output-frame height limit before a cell runs.

  Used as a ``pre_run_cell`` callback. IPython may call such callbacks with an
  execution-info argument, so any positional/keyword arguments are accepted
  and ignored.
  """
  display(Javascript("google.colab.output.setIframeHeight(0, true, {maxHeight: 5000})"))

# Run the resize hook before every cell execution.
get_ipython().events.register("pre_run_cell", resize_colab_cell)

#### Preprocessing Dataset with `Word2Vec` encoding

In [None]:
import numpy as np
import pandas
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Read the CSV; the "Text" column supplies the inputs and "Language" the labels.
df = pandas.read_csv("/content/gdrive/MyDrive/Colab Notebooks/datasets/file1.csv")
x, y = df["Text"].values, df["Language"].values

# Map the label values to integer class ids; `le` keeps the mapping (le.classes_).
le = LabelEncoder()
y = le.fit_transform(y)

# Hold out 20% of the rows for evaluation, with a fixed seed for a repeatable split.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=42, test_size=0.2
)

def tokenize_text(text):
    """Return the lower-cased, whitespace-separated tokens of ``text``."""
    lowered = text.lower()
    return lowered.split()

def encode_text(text, model):
    """Embed ``text`` as the mean vector of its in-vocabulary tokens.

    Args:
        text: Raw string; it is tokenized with ``tokenize_text``.
        model: Object exposing ``wv`` (with ``key_to_index`` and list indexing)
            and ``vector_size``, such as the Word2Vec model trained in this notebook.

    Returns:
        The element-wise mean of the vectors of all known tokens, or a zero
        vector of length ``model.vector_size`` when no token is in the vocabulary.
    """
    known_tokens = [
        token for token in tokenize_text(text) if token in model.wv.key_to_index
    ]
    if not known_tokens:
        return np.zeros(model.vector_size)
    return np.mean(model.wv[known_tokens], axis=0)

from gensim.models import Word2Vec
from sklearn.preprocessing import MinMaxScaler

# Token lists of the training texts only; the embedding is not fitted on the test split.
x_train_tokenized = list(map(tokenize_text, x_train))

# 100-dimensional Word2Vec model; min_count=1 keeps tokens that occur only once.
W2V = Word2Vec(
    sentences=x_train_tokenized,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# One averaged word vector per training text.
x_train_encoded = np.array([encode_text(sample, W2V) for sample in x_train])

# Rescale each embedding dimension using the range seen in the training set.
scaler = MinMaxScaler()
x_train_normalized = scaler.fit(x_train_encoded).transform(x_train_encoded)

#### 1. Using `Multinomial NB`

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes fitted on the scaled training embeddings.
MNB = MultinomialNB().fit(x_train_normalized, y_train)

# Embed the held-out texts and apply the scaler fitted on the training set.
x_test_encoded_MNB = np.array([encode_text(sample, W2V) for sample in x_test])
x_test_normalized_MNB = scaler.transform(x_test_encoded_MNB)

print(f"MNB & Word2Vec: {MNB.score(x_test_normalized_MNB, y_test) * 100 :.2f}%")

In [None]:
from sklearn.metrics import (
    accuracy_score, balanced_accuracy_score, classification_report, cohen_kappa_score,
    confusion_matrix, f1_score, matthews_corrcoef, precision_score, recall_score, roc_auc_score
)
from sklearn.preprocessing import LabelBinarizer

# Hard label predictions for the held-out split.
y_pred_MNB = MNB.predict(x_test_normalized_MNB)

# Label-based scores; precision, recall and F1 are macro-averaged over the classes.
accuracy = accuracy_score(y_test, y_pred_MNB)
precision, recall, f1 = (
    metric(y_test, y_pred_MNB, average="macro")
    for metric in (precision_score, recall_score, f1_score)
)
kappa = cohen_kappa_score(y_test, y_pred_MNB)
confusion_mat = confusion_matrix(y_test, y_pred_MNB)
class_report = classification_report(y_test, y_pred_MNB)
mcc = matthews_corrcoef(y_test, y_pred_MNB)
balanced_acc = balanced_accuracy_score(y_test, y_pred_MNB)

# Binarize true and predicted labels; the AUC below is therefore computed from
# hard predictions, not from class scores.
lb = LabelBinarizer().fit(y_test)
y_test_binary = lb.transform(y_test)
y_pred_binary = lb.transform(y_pred_MNB)
roc_auc = roc_auc_score(y_test_binary, y_pred_binary, average="macro")

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1-Score: {f1 * 100:.2f}%")
print(f"Cohen's Kappa: {kappa * 100:.2f}%")
print(f"ROC AUC Score: {roc_auc * 100:.2f}%")
print("Confusion Matrix:")
print(confusion_mat)
print("Classification Report:")
print(class_report)
print(f"Matthews Correlation Coefficient: {mcc * 100:.2f}%")
print(f"Balanced Accuracy: {balanced_acc * 100:.2f}%")

#### 2. Using `SVM`

In [None]:
from sklearn.svm import SVC

# Linear-kernel support vector classifier fitted on the scaled training embeddings.
SVM = SVC(kernel="linear").fit(x_train_normalized, y_train)

# Embed the held-out texts and apply the scaler fitted on the training set.
x_test_encoded_SVM = np.array([encode_text(sample, W2V) for sample in x_test])
x_test_normalized_SVM = scaler.transform(x_test_encoded_SVM)

print(f"SVM & Word2Vec: {SVM.score(x_test_normalized_SVM, y_test) * 100 :.2f}%")

In [None]:
from sklearn.metrics import (
    accuracy_score, balanced_accuracy_score, classification_report, cohen_kappa_score,
    confusion_matrix, f1_score, matthews_corrcoef, precision_score, recall_score, roc_auc_score
)
from sklearn.preprocessing import LabelBinarizer

# Hard label predictions for the held-out split.
y_pred_SVM = SVM.predict(x_test_normalized_SVM)

# Label-based scores; precision, recall and F1 are macro-averaged over the classes.
accuracy = accuracy_score(y_test, y_pred_SVM)
precision, recall, f1 = (
    metric(y_test, y_pred_SVM, average="macro")
    for metric in (precision_score, recall_score, f1_score)
)
kappa = cohen_kappa_score(y_test, y_pred_SVM)
confusion_mat = confusion_matrix(y_test, y_pred_SVM)
class_report = classification_report(y_test, y_pred_SVM)
mcc = matthews_corrcoef(y_test, y_pred_SVM)
balanced_acc = balanced_accuracy_score(y_test, y_pred_SVM)

# Binarize true and predicted labels; the AUC below is therefore computed from
# hard predictions, not from class scores.
lb = LabelBinarizer().fit(y_test)
y_test_binary = lb.transform(y_test)
y_pred_binary = lb.transform(y_pred_SVM)
roc_auc = roc_auc_score(y_test_binary, y_pred_binary, average="macro")

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1-Score: {f1 * 100:.2f}%")
print(f"Cohen's Kappa: {kappa * 100:.2f}%")
print(f"ROC AUC Score: {roc_auc * 100:.2f}%")
print("Confusion Matrix:")
print(confusion_mat)
print("Classification Report:")
print(class_report)
print(f"Matthews Correlation Coefficient: {mcc * 100:.2f}%")
print(f"Balanced Accuracy: {balanced_acc * 100:.2f}%")

#### 3. Using `Random Forest`

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest (same seed as the train/test split) so the classifier itself
# is deterministic for given inputs instead of changing on every run.
RF = RandomForestClassifier(random_state=42)
RF.fit(x_train_normalized, y_train)

# Embed the held-out texts with the trained Word2Vec model and reuse the
# scaler fitted on the training embeddings.
x_test_encoded_RF = np.array([encode_text(i, W2V) for i in x_test])
x_test_normalized_RF = scaler.transform(x_test_encoded_RF)

print(f"Random Forest & Word2Vec: {RF.score(x_test_normalized_RF, y_test) * 100 :.2f}%")

In [None]:
from sklearn.metrics import (
    accuracy_score, balanced_accuracy_score, classification_report, cohen_kappa_score,
    confusion_matrix, f1_score, matthews_corrcoef, precision_score, recall_score, roc_auc_score
)
from sklearn.preprocessing import LabelBinarizer

# Hard label predictions for the held-out split.
y_pred_RF = RF.predict(x_test_normalized_RF)

# Label-based scores; precision, recall and F1 are macro-averaged over the classes.
accuracy = accuracy_score(y_test, y_pred_RF)
precision, recall, f1 = (
    metric(y_test, y_pred_RF, average="macro")
    for metric in (precision_score, recall_score, f1_score)
)
kappa = cohen_kappa_score(y_test, y_pred_RF)
confusion_mat = confusion_matrix(y_test, y_pred_RF)
class_report = classification_report(y_test, y_pred_RF)
mcc = matthews_corrcoef(y_test, y_pred_RF)
balanced_acc = balanced_accuracy_score(y_test, y_pred_RF)

# Binarize true and predicted labels; the AUC below is therefore computed from
# hard predictions, not from class scores.
lb = LabelBinarizer().fit(y_test)
y_test_binary = lb.transform(y_test)
y_pred_binary = lb.transform(y_pred_RF)
roc_auc = roc_auc_score(y_test_binary, y_pred_binary, average="macro")

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1-Score: {f1 * 100:.2f}%")
print(f"Cohen's Kappa: {kappa * 100:.2f}%")
print(f"ROC AUC Score: {roc_auc * 100:.2f}%")
print("Confusion Matrix:")
print(confusion_mat)
print("Classification Report:")
print(class_report)
print(f"Matthews Correlation Coefficient: {mcc * 100:.2f}%")
print(f"Balanced Accuracy: {balanced_acc * 100:.2f}%")

#### 4. Using `Gradient Boosting Classifier`

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Seed the booster (same seed as the train/test split) so the classifier itself
# is deterministic for given inputs instead of changing on every run.
GB = GradientBoostingClassifier(random_state=42)
GB.fit(x_train_normalized, y_train)

# Embed the held-out texts with the trained Word2Vec model and reuse the
# scaler fitted on the training embeddings.
x_test_encoded_GB = np.array([encode_text(text, W2V) for text in x_test])
x_test_normalized_GB = scaler.transform(x_test_encoded_GB)

print(f"Gradient Boosting Classifier & Word2Vec: {GB.score(x_test_normalized_GB, y_test) * 100 :.2f}%")

In [None]:
from sklearn.metrics import (
    accuracy_score, balanced_accuracy_score, classification_report, cohen_kappa_score,
    confusion_matrix, f1_score, matthews_corrcoef, precision_score, recall_score, roc_auc_score
)
from sklearn.preprocessing import LabelBinarizer

# Hard label predictions for the held-out split.
y_pred_GB = GB.predict(x_test_normalized_GB)

# Label-based scores; precision, recall and F1 are macro-averaged over the classes.
accuracy = accuracy_score(y_test, y_pred_GB)
precision, recall, f1 = (
    metric(y_test, y_pred_GB, average="macro")
    for metric in (precision_score, recall_score, f1_score)
)
kappa = cohen_kappa_score(y_test, y_pred_GB)
confusion_mat = confusion_matrix(y_test, y_pred_GB)
class_report = classification_report(y_test, y_pred_GB)
mcc = matthews_corrcoef(y_test, y_pred_GB)
balanced_acc = balanced_accuracy_score(y_test, y_pred_GB)

# Binarize true and predicted labels; the AUC below is therefore computed from
# hard predictions, not from class scores.
lb = LabelBinarizer().fit(y_test)
y_test_binary = lb.transform(y_test)
y_pred_binary = lb.transform(y_pred_GB)
roc_auc = roc_auc_score(y_test_binary, y_pred_binary, average="macro")

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1-Score: {f1 * 100:.2f}%")
print(f"Cohen's Kappa: {kappa * 100:.2f}%")
print(f"ROC AUC Score: {roc_auc * 100:.2f}%")
print("Confusion Matrix:")
print(confusion_mat)
print("Classification Report:")
print(class_report)
print(f"Matthews Correlation Coefficient: {mcc * 100:.2f}%")
print(f"Balanced Accuracy: {balanced_acc * 100:.2f}%")

#### 5. Using `Decision Tree Classifier`

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Seed the tree (same seed as the train/test split) so the classifier itself
# is deterministic for given inputs instead of changing on every run.
DT = DecisionTreeClassifier(random_state=42)
DT.fit(x_train_normalized, y_train)

# Embed the held-out texts with the trained Word2Vec model and reuse the
# scaler fitted on the training embeddings.
x_test_encoded_DT = np.array([encode_text(i, W2V) for i in x_test])
x_test_normalized_DT = scaler.transform(x_test_encoded_DT)

print(f"Decision Tree Classifier & Word2Vec: {DT.score(x_test_normalized_DT, y_test) * 100 :.2f}%")

In [None]:
from sklearn.metrics import (
    accuracy_score, balanced_accuracy_score, classification_report, cohen_kappa_score,
    confusion_matrix, f1_score, matthews_corrcoef, precision_score, recall_score, roc_auc_score
)
from sklearn.preprocessing import LabelBinarizer

# Hard label predictions for the held-out split.
y_pred_DT = DT.predict(x_test_normalized_DT)

# Label-based scores; precision, recall and F1 are macro-averaged over the classes.
accuracy = accuracy_score(y_test, y_pred_DT)
precision, recall, f1 = (
    metric(y_test, y_pred_DT, average="macro")
    for metric in (precision_score, recall_score, f1_score)
)
kappa = cohen_kappa_score(y_test, y_pred_DT)
confusion_mat = confusion_matrix(y_test, y_pred_DT)
class_report = classification_report(y_test, y_pred_DT)
mcc = matthews_corrcoef(y_test, y_pred_DT)
balanced_acc = balanced_accuracy_score(y_test, y_pred_DT)

# Binarize true and predicted labels; the AUC below is therefore computed from
# hard predictions, not from class scores.
lb = LabelBinarizer().fit(y_test)
y_test_binary = lb.transform(y_test)
y_pred_binary = lb.transform(y_pred_DT)
roc_auc = roc_auc_score(y_test_binary, y_pred_binary, average="macro")

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1-Score: {f1 * 100:.2f}%")
print(f"Cohen's Kappa: {kappa * 100:.2f}%")
print(f"ROC AUC Score: {roc_auc * 100:.2f}%")
print("Confusion Matrix:")
print(confusion_mat)
print("Classification Report:")
print(class_report)
print(f"Matthews Correlation Coefficient: {mcc * 100:.2f}%")
print(f"Balanced Accuracy: {balanced_acc * 100:.2f}%")

#### 6. Using `K-Nearest Neighbors Classifier`

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier (library defaults) on the scaled training embeddings.
KNN = KNeighborsClassifier().fit(x_train_normalized, y_train)

# Embed the held-out texts and apply the scaler fitted on the training set.
x_test_encoded_KNN = np.array([encode_text(sample, W2V) for sample in x_test])
x_test_normalized_KNN = scaler.transform(x_test_encoded_KNN)

print(f"K-Nearest Neighbors Classifier & Word2Vec: {KNN.score(x_test_normalized_KNN, y_test) * 100 :.2f}%")

In [None]:
from sklearn.metrics import (
    accuracy_score, balanced_accuracy_score, classification_report, cohen_kappa_score,
    confusion_matrix, f1_score, matthews_corrcoef, precision_score, recall_score, roc_auc_score
)
from sklearn.preprocessing import LabelBinarizer

# Hard label predictions for the held-out split.
y_pred_KNN = KNN.predict(x_test_normalized_KNN)

# Label-based scores; precision, recall and F1 are macro-averaged over the classes.
accuracy = accuracy_score(y_test, y_pred_KNN)
precision, recall, f1 = (
    metric(y_test, y_pred_KNN, average="macro")
    for metric in (precision_score, recall_score, f1_score)
)
kappa = cohen_kappa_score(y_test, y_pred_KNN)
confusion_mat = confusion_matrix(y_test, y_pred_KNN)
class_report = classification_report(y_test, y_pred_KNN)
mcc = matthews_corrcoef(y_test, y_pred_KNN)
balanced_acc = balanced_accuracy_score(y_test, y_pred_KNN)

# Binarize true and predicted labels; the AUC below is therefore computed from
# hard predictions, not from class scores.
lb = LabelBinarizer().fit(y_test)
y_test_binary = lb.transform(y_test)
y_pred_binary = lb.transform(y_pred_KNN)
roc_auc = roc_auc_score(y_test_binary, y_pred_binary, average="macro")

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1-Score: {f1 * 100:.2f}%")
print(f"Cohen's Kappa: {kappa * 100:.2f}%")
print(f"ROC AUC Score: {roc_auc * 100:.2f}%")
print("Confusion Matrix:")
print(confusion_mat)
print("Classification Report:")
print(class_report)
print(f"Matthews Correlation Coefficient: {mcc * 100:.2f}%")
print(f"Balanced Accuracy: {balanced_acc * 100:.2f}%")

#### 7. Using `AdaBoost Classifier`

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Seed the ensemble (same seed as the train/test split) so the classifier itself
# is deterministic for given inputs instead of changing on every run.
ADB = AdaBoostClassifier(random_state=42)
ADB.fit(x_train_normalized, y_train)

# Embed the held-out texts with the trained Word2Vec model and reuse the
# scaler fitted on the training embeddings.
x_test_encoded_ADB = np.array([encode_text(i, W2V) for i in x_test])
x_test_normalized_ADB = scaler.transform(x_test_encoded_ADB)

print(f"AdaBoost Classifier & Word2Vec: {ADB.score(x_test_normalized_ADB, y_test) * 100 :.2f}%")

In [None]:
from sklearn.metrics import (
    accuracy_score, balanced_accuracy_score, classification_report, cohen_kappa_score,
    confusion_matrix, f1_score, matthews_corrcoef, precision_score, recall_score, roc_auc_score
)
from sklearn.preprocessing import LabelBinarizer

# Hard label predictions for the held-out split.
y_pred_ADB = ADB.predict(x_test_normalized_ADB)

# Label-based scores; precision, recall and F1 are macro-averaged over the classes.
accuracy = accuracy_score(y_test, y_pred_ADB)
precision, recall, f1 = (
    metric(y_test, y_pred_ADB, average="macro")
    for metric in (precision_score, recall_score, f1_score)
)
kappa = cohen_kappa_score(y_test, y_pred_ADB)
confusion_mat = confusion_matrix(y_test, y_pred_ADB)
class_report = classification_report(y_test, y_pred_ADB)
mcc = matthews_corrcoef(y_test, y_pred_ADB)
balanced_acc = balanced_accuracy_score(y_test, y_pred_ADB)

# Binarize true and predicted labels; the AUC below is therefore computed from
# hard predictions, not from class scores.
lb = LabelBinarizer().fit(y_test)
y_test_binary = lb.transform(y_test)
y_pred_binary = lb.transform(y_pred_ADB)
roc_auc = roc_auc_score(y_test_binary, y_pred_binary, average="macro")

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1-Score: {f1 * 100:.2f}%")
print(f"Cohen's Kappa: {kappa * 100:.2f}%")
print(f"ROC AUC Score: {roc_auc * 100:.2f}%")
print("Confusion Matrix:")
print(confusion_mat)
print("Classification Report:")
print(class_report)
print(f"Matthews Correlation Coefficient: {mcc * 100:.2f}%")
print(f"Balanced Accuracy: {balanced_acc * 100:.2f}%")

#### 8. Using `Logistic Regression`

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with a raised iteration cap (max_iter=10000).
LR = LogisticRegression(max_iter=10000).fit(x_train_normalized, y_train)

# Embed the held-out texts and apply the scaler fitted on the training set.
x_test_encoded_LR = np.array([encode_text(sample, W2V) for sample in x_test])
x_test_normalized_LR = scaler.transform(x_test_encoded_LR)

print(f"Logistic Regression & Word2Vec: {LR.score(x_test_normalized_LR, y_test) * 100 :.2f}%")

In [None]:
from sklearn.metrics import (
    accuracy_score, balanced_accuracy_score, classification_report, cohen_kappa_score,
    confusion_matrix, f1_score, matthews_corrcoef, precision_score, recall_score, roc_auc_score
)
from sklearn.preprocessing import LabelBinarizer

# Hard label predictions for the held-out split.
y_pred_LR = LR.predict(x_test_normalized_LR)

# Label-based scores; precision, recall and F1 are macro-averaged over the classes.
accuracy = accuracy_score(y_test, y_pred_LR)
precision, recall, f1 = (
    metric(y_test, y_pred_LR, average="macro")
    for metric in (precision_score, recall_score, f1_score)
)
kappa = cohen_kappa_score(y_test, y_pred_LR)
confusion_mat = confusion_matrix(y_test, y_pred_LR)
class_report = classification_report(y_test, y_pred_LR)
mcc = matthews_corrcoef(y_test, y_pred_LR)
balanced_acc = balanced_accuracy_score(y_test, y_pred_LR)

# Binarize true and predicted labels; the AUC below is therefore computed from
# hard predictions, not from class scores.
lb = LabelBinarizer().fit(y_test)
y_test_binary = lb.transform(y_test)
y_pred_binary = lb.transform(y_pred_LR)
roc_auc = roc_auc_score(y_test_binary, y_pred_binary, average="macro")

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1-Score: {f1 * 100:.2f}%")
print(f"Cohen's Kappa: {kappa * 100:.2f}%")
print(f"ROC AUC Score: {roc_auc * 100:.2f}%")
print("Confusion Matrix:")
print(confusion_mat)
print("Classification Report:")
print(class_report)
print(f"Matthews Correlation Coefficient: {mcc * 100:.2f}%")
print(f"Balanced Accuracy: {balanced_acc * 100:.2f}%")

#### 9. Using `Extra Trees Classifier`

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Seed the ensemble (same seed as the train/test split) so the classifier itself
# is deterministic for given inputs instead of changing on every run.
ET = ExtraTreesClassifier(random_state=42)
ET.fit(x_train_normalized, y_train)

# Embed the held-out texts with the trained Word2Vec model and reuse the
# scaler fitted on the training embeddings.
x_test_encoded_ET = np.array([encode_text(i, W2V) for i in x_test])
x_test_normalized_ET = scaler.transform(x_test_encoded_ET)

print(f"Extra Trees Classifier & Word2Vec: {ET.score(x_test_normalized_ET, y_test) * 100 :.2f}%")

In [None]:
from sklearn.metrics import (
    accuracy_score, balanced_accuracy_score, classification_report, cohen_kappa_score,
    confusion_matrix, f1_score, matthews_corrcoef, precision_score, recall_score, roc_auc_score
)
from sklearn.preprocessing import LabelBinarizer

# Hard label predictions for the held-out split.
y_pred_ET = ET.predict(x_test_normalized_ET)

# Label-based scores; precision, recall and F1 are macro-averaged over the classes.
accuracy = accuracy_score(y_test, y_pred_ET)
precision, recall, f1 = (
    metric(y_test, y_pred_ET, average="macro")
    for metric in (precision_score, recall_score, f1_score)
)
kappa = cohen_kappa_score(y_test, y_pred_ET)
confusion_mat = confusion_matrix(y_test, y_pred_ET)
class_report = classification_report(y_test, y_pred_ET)
mcc = matthews_corrcoef(y_test, y_pred_ET)
balanced_acc = balanced_accuracy_score(y_test, y_pred_ET)

# Binarize true and predicted labels; the AUC below is therefore computed from
# hard predictions, not from class scores.
lb = LabelBinarizer().fit(y_test)
y_test_binary = lb.transform(y_test)
y_pred_binary = lb.transform(y_pred_ET)
roc_auc = roc_auc_score(y_test_binary, y_pred_binary, average="macro")

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1-Score: {f1 * 100:.2f}%")
print(f"Cohen's Kappa: {kappa * 100:.2f}%")
print(f"ROC AUC Score: {roc_auc * 100:.2f}%")
print("Confusion Matrix:")
print(confusion_mat)
print("Classification Report:")
print(class_report)
print(f"Matthews Correlation Coefficient: {mcc * 100:.2f}%")
print(f"Balanced Accuracy: {balanced_acc * 100:.2f}%")

#### 10. Using `Gaussian Naive Bayes`

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes on the scaled training embeddings. The model is stored
# under the name `GPR` although it is a GaussianNB instance.
GPR = GaussianNB().fit(x_train_normalized, y_train)

# Embed the held-out texts and apply the scaler fitted on the training set.
x_test_encoded_GPR = np.array([encode_text(sample, W2V) for sample in x_test])
x_test_normalized_GPR = scaler.transform(x_test_encoded_GPR)

print(f"Gaussian Naive Bayes & Word2Vec: {GPR.score(x_test_normalized_GPR, y_test) * 100 :.2f}%")

In [None]:
from sklearn.metrics import (
    accuracy_score, balanced_accuracy_score, classification_report, cohen_kappa_score,
    confusion_matrix, f1_score, matthews_corrcoef, precision_score, recall_score, roc_auc_score
)
from sklearn.preprocessing import LabelBinarizer

# Hard label predictions for the held-out split.
y_pred_GPR = GPR.predict(x_test_normalized_GPR)

# Label-based scores; precision, recall and F1 are macro-averaged over the classes.
accuracy = accuracy_score(y_test, y_pred_GPR)
precision, recall, f1 = (
    metric(y_test, y_pred_GPR, average="macro")
    for metric in (precision_score, recall_score, f1_score)
)
kappa = cohen_kappa_score(y_test, y_pred_GPR)
confusion_mat = confusion_matrix(y_test, y_pred_GPR)
class_report = classification_report(y_test, y_pred_GPR)
mcc = matthews_corrcoef(y_test, y_pred_GPR)
balanced_acc = balanced_accuracy_score(y_test, y_pred_GPR)

# Binarize true and predicted labels; the AUC below is therefore computed from
# hard predictions, not from class scores.
lb = LabelBinarizer().fit(y_test)
y_test_binary = lb.transform(y_test)
y_pred_binary = lb.transform(y_pred_GPR)
roc_auc = roc_auc_score(y_test_binary, y_pred_binary, average="macro")

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1-Score: {f1 * 100:.2f}%")
print(f"Cohen's Kappa: {kappa * 100:.2f}%")
print(f"ROC AUC Score: {roc_auc * 100:.2f}%")
print("Confusion Matrix:")
print(confusion_mat)
print("Classification Report:")
print(class_report)
print(f"Matthews Correlation Coefficient: {mcc * 100:.2f}%")
print(f"Balanced Accuracy: {balanced_acc * 100:.2f}%")

#### 11. Using `Ridge Classifier`

In [None]:
from sklearn.linear_model import RidgeClassifier

# Ridge classifier (library defaults) fitted on the scaled training embeddings.
RR = RidgeClassifier().fit(x_train_normalized, y_train)

# Embed the held-out texts and apply the scaler fitted on the training set.
x_test_encoded_RR = np.array([encode_text(sample, W2V) for sample in x_test])
x_test_normalized_RR = scaler.transform(x_test_encoded_RR)

print(f"Ridge Classifier & Word2Vec: {RR.score(x_test_normalized_RR, y_test) * 100 :.2f}%")

In [None]:
from sklearn.metrics import (
    accuracy_score, balanced_accuracy_score, classification_report, cohen_kappa_score,
    confusion_matrix, f1_score, matthews_corrcoef, precision_score, recall_score, roc_auc_score
)
from sklearn.preprocessing import LabelBinarizer

# Hard label predictions for the held-out split.
y_pred_RR = RR.predict(x_test_normalized_RR)

# Label-based scores; precision, recall and F1 are macro-averaged over the classes.
accuracy = accuracy_score(y_test, y_pred_RR)
precision, recall, f1 = (
    metric(y_test, y_pred_RR, average="macro")
    for metric in (precision_score, recall_score, f1_score)
)
kappa = cohen_kappa_score(y_test, y_pred_RR)
confusion_mat = confusion_matrix(y_test, y_pred_RR)
class_report = classification_report(y_test, y_pred_RR)
mcc = matthews_corrcoef(y_test, y_pred_RR)
balanced_acc = balanced_accuracy_score(y_test, y_pred_RR)

# Binarize true and predicted labels; the AUC below is therefore computed from
# hard predictions, not from class scores.
lb = LabelBinarizer().fit(y_test)
y_test_binary = lb.transform(y_test)
y_pred_binary = lb.transform(y_pred_RR)
roc_auc = roc_auc_score(y_test_binary, y_pred_binary, average="macro")

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1-Score: {f1 * 100:.2f}%")
print(f"Cohen's Kappa: {kappa * 100:.2f}%")
print(f"ROC AUC Score: {roc_auc * 100:.2f}%")
print("Confusion Matrix:")
print(confusion_mat)
print("Classification Report:")
print(class_report)
print(f"Matthews Correlation Coefficient: {mcc * 100:.2f}%")
print(f"Balanced Accuracy: {balanced_acc * 100:.2f}%")

#### 12. Using `Elastic Net` (regressor; predictions rounded to class labels)

In [None]:
from sklearn.linear_model import ElasticNet
from sklearn.metrics import accuracy_score

# ElasticNet is a regressor: it is fitted directly on the integer class ids,
# i.e. the ids are treated as an ordered numeric target.
EN = ElasticNet()
EN.fit(x_train_normalized, y_train)

# Embed the held-out texts with the trained Word2Vec model and reuse the
# scaler fitted on the training embeddings.
x_test_encoded_EN = np.array([encode_text(i, W2V) for i in x_test])
x_test_normalized_EN = scaler.transform(x_test_encoded_EN)

# Round the continuous output to the nearest class id and clamp it to the valid
# id range [0, n_classes - 1]; without the clamp an extrapolated value could
# become a label that does not exist in the encoder.
y_pred_EN = EN.predict(x_test_normalized_EN)
y_pred_EN = np.clip(np.round(y_pred_EN), 0, len(le.classes_) - 1).astype(int)

print(f"Elastic Net Classifier & Word2Vec: {accuracy_score(y_test, y_pred_EN) * 100 :.2f}%")

In [None]:
from sklearn.metrics import (
    accuracy_score, balanced_accuracy_score, classification_report, cohen_kappa_score,
    confusion_matrix, f1_score, matthews_corrcoef, precision_score, recall_score, roc_auc_score
)
from sklearn.preprocessing import LabelBinarizer

# Label-based scores for the rounded Elastic Net predictions; precision, recall
# and F1 are macro-averaged over the classes.
accuracy = accuracy_score(y_test, y_pred_EN)
precision, recall, f1 = (
    metric(y_test, y_pred_EN, average="macro")
    for metric in (precision_score, recall_score, f1_score)
)
kappa = cohen_kappa_score(y_test, y_pred_EN)
confusion_mat = confusion_matrix(y_test, y_pred_EN)
class_report = classification_report(y_test, y_pred_EN)
mcc = matthews_corrcoef(y_test, y_pred_EN)
balanced_acc = balanced_accuracy_score(y_test, y_pred_EN)

# Binarize true and predicted labels; the AUC below is therefore computed from
# hard predictions, not from class scores.
lb = LabelBinarizer().fit(y_test)
y_test_binary = lb.transform(y_test)
y_pred_binary = lb.transform(y_pred_EN)
roc_auc = roc_auc_score(y_test_binary, y_pred_binary, average="macro")

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1-Score: {f1 * 100:.2f}%")
print(f"Cohen's Kappa: {kappa * 100:.2f}%")
print(f"ROC AUC Score: {roc_auc * 100:.2f}%")
print("Confusion Matrix:")
print(confusion_mat)
print("Classification Report:")
print(class_report)
print(f"Matthews Correlation Coefficient: {mcc * 100:.2f}%")
print(f"Balanced Accuracy: {balanced_acc * 100:.2f}%")

#### 13. Using `Multilayer Perceptron`

In [None]:
from sklearn.neural_network import MLPClassifier

# Seed the network (same seed as the train/test split) so the classifier itself
# is deterministic for given inputs instead of changing on every run.
MLP = MLPClassifier(max_iter=10000, random_state=42)
MLP.fit(x_train_normalized, y_train)

# Embed the held-out texts with the trained Word2Vec model and reuse the
# scaler fitted on the training embeddings.
x_test_encoded_MLP = np.array([encode_text(i, W2V) for i in x_test])
x_test_normalized_MLP = scaler.transform(x_test_encoded_MLP)

print(f"Multilayer Perceptron & Word2Vec: {MLP.score(x_test_normalized_MLP, y_test) * 100 :.2f}%")

In [None]:
from sklearn.metrics import (
    accuracy_score, balanced_accuracy_score, classification_report, cohen_kappa_score,
    confusion_matrix, f1_score, matthews_corrcoef, precision_score, recall_score, roc_auc_score
)
from sklearn.preprocessing import LabelBinarizer

# Hard label predictions for the held-out split.
y_pred_MLP = MLP.predict(x_test_normalized_MLP)

# Label-based scores; precision, recall and F1 are macro-averaged over the classes.
accuracy = accuracy_score(y_test, y_pred_MLP)
precision, recall, f1 = (
    metric(y_test, y_pred_MLP, average="macro")
    for metric in (precision_score, recall_score, f1_score)
)
kappa = cohen_kappa_score(y_test, y_pred_MLP)
confusion_mat = confusion_matrix(y_test, y_pred_MLP)
class_report = classification_report(y_test, y_pred_MLP)
mcc = matthews_corrcoef(y_test, y_pred_MLP)
balanced_acc = balanced_accuracy_score(y_test, y_pred_MLP)

# Binarize true and predicted labels; the AUC below is therefore computed from
# hard predictions, not from class scores.
lb = LabelBinarizer().fit(y_test)
y_test_binary = lb.transform(y_test)
y_pred_binary = lb.transform(y_pred_MLP)
roc_auc = roc_auc_score(y_test_binary, y_pred_binary, average="macro")

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1-Score: {f1 * 100:.2f}%")
print(f"Cohen's Kappa: {kappa * 100:.2f}%")
print(f"ROC AUC Score: {roc_auc * 100:.2f}%")
print("Confusion Matrix:")
print(confusion_mat)
print("Classification Report:")
print(class_report)
print(f"Matthews Correlation Coefficient: {mcc * 100:.2f}%")
print(f"Balanced Accuracy: {balanced_acc * 100:.2f}%")

#### 14. Using `Lasso Regressor`

In [None]:
from sklearn.linear_model import Lasso
from sklearn.metrics import accuracy_score

# Lasso is a regressor: it is fitted directly on the integer class ids,
# i.e. the ids are treated as an ordered numeric target.
LASSO = Lasso()
LASSO.fit(x_train_normalized, y_train)

# Embed the held-out texts with the trained Word2Vec model and reuse the
# scaler fitted on the training embeddings.
x_test_encoded_LASSO = np.array([encode_text(i, W2V) for i in x_test])
x_test_normalized_LASSO = scaler.transform(x_test_encoded_LASSO)

# Round the continuous output to the nearest class id and clamp it to the valid
# id range [0, n_classes - 1]; without the clamp an extrapolated value could
# become a label that does not exist in the encoder.
y_pred_LASSO = LASSO.predict(x_test_normalized_LASSO)
y_pred_LASSO = np.clip(np.round(y_pred_LASSO), 0, len(le.classes_) - 1).astype(int)

print(f"Lasso Regressor & Word2Vec: {accuracy_score(y_test, y_pred_LASSO) * 100 :.2f}%")

In [None]:
from sklearn.metrics import (
    accuracy_score, balanced_accuracy_score, classification_report, cohen_kappa_score,
    confusion_matrix, f1_score, matthews_corrcoef, precision_score, recall_score, roc_auc_score
)
from sklearn.preprocessing import LabelBinarizer

# Label-based scores for the rounded Lasso predictions; precision, recall and
# F1 are macro-averaged over the classes.
accuracy = accuracy_score(y_test, y_pred_LASSO)
precision, recall, f1 = (
    metric(y_test, y_pred_LASSO, average="macro")
    for metric in (precision_score, recall_score, f1_score)
)
kappa = cohen_kappa_score(y_test, y_pred_LASSO)
confusion_mat = confusion_matrix(y_test, y_pred_LASSO)
class_report = classification_report(y_test, y_pred_LASSO)
mcc = matthews_corrcoef(y_test, y_pred_LASSO)
balanced_acc = balanced_accuracy_score(y_test, y_pred_LASSO)

# Binarize true and predicted labels; the AUC below is therefore computed from
# hard predictions, not from class scores.
lb = LabelBinarizer().fit(y_test)
y_test_binary = lb.transform(y_test)
y_pred_binary = lb.transform(y_pred_LASSO)
roc_auc = roc_auc_score(y_test_binary, y_pred_binary, average="macro")

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1-Score: {f1 * 100:.2f}%")
print(f"Cohen's Kappa: {kappa * 100:.2f}%")
print(f"ROC AUC Score: {roc_auc * 100:.2f}%")
print("Confusion Matrix:")
print(confusion_mat)
print("Classification Report:")
print(class_report)
print(f"Matthews Correlation Coefficient: {mcc * 100:.2f}%")
print(f"Balanced Accuracy: {balanced_acc * 100:.2f}%")

#### 15. Using `XG Boost`

In [None]:
import xgboost as xgb

# XGBoost classifier (library defaults) fitted on the scaled training embeddings.
XGB = xgb.XGBClassifier().fit(x_train_normalized, y_train)

# Embed the held-out texts and apply the scaler fitted on the training set.
x_test_encoded_XGB = np.array([encode_text(sample, W2V) for sample in x_test])
x_test_normalized_XGB = scaler.transform(x_test_encoded_XGB)

print(f"XG Boost & Word2Vec: {XGB.score(x_test_normalized_XGB, y_test) * 100 :.2f}%")

In [None]:
from sklearn.metrics import (
    accuracy_score, balanced_accuracy_score, classification_report, cohen_kappa_score,
    confusion_matrix, f1_score, matthews_corrcoef, precision_score, recall_score, roc_auc_score
)
from sklearn.preprocessing import LabelBinarizer

# Hard label predictions for the held-out split.
y_pred_XGB = XGB.predict(x_test_normalized_XGB)

# Label-based scores; precision, recall and F1 are macro-averaged over the classes.
accuracy = accuracy_score(y_test, y_pred_XGB)
precision, recall, f1 = (
    metric(y_test, y_pred_XGB, average="macro")
    for metric in (precision_score, recall_score, f1_score)
)
kappa = cohen_kappa_score(y_test, y_pred_XGB)
confusion_mat = confusion_matrix(y_test, y_pred_XGB)
class_report = classification_report(y_test, y_pred_XGB)
mcc = matthews_corrcoef(y_test, y_pred_XGB)
balanced_acc = balanced_accuracy_score(y_test, y_pred_XGB)

# Binarize true and predicted labels; the AUC below is therefore computed from
# hard predictions, not from class scores.
lb = LabelBinarizer().fit(y_test)
y_test_binary = lb.transform(y_test)
y_pred_binary = lb.transform(y_pred_XGB)
roc_auc = roc_auc_score(y_test_binary, y_pred_binary, average="macro")

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1-Score: {f1 * 100:.2f}%")
print(f"Cohen's Kappa: {kappa * 100:.2f}%")
print(f"ROC AUC Score: {roc_auc * 100:.2f}%")
print("Confusion Matrix:")
print(confusion_mat)
print("Classification Report:")
print(class_report)
print(f"Matthews Correlation Coefficient: {mcc * 100:.2f}%")
print(f"Balanced Accuracy: {balanced_acc * 100:.2f}%")

#### 16. Using `CNN`

In [None]:
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout

# Append a trailing channel axis so each embedding becomes a
# (vector_length, 1) sequence. The unscaled embeddings are used here.
x_train_reshaped_CNN = x_train_encoded.reshape(x_train_encoded.shape + (1,))

# Conv -> global max pool -> dense -> dropout -> softmax over the encoded classes.
CNN = Sequential([
    Conv1D(filters=64, kernel_size=3, padding="same", activation="relu", input_shape=(x_train_encoded.shape[1], 1)),
    GlobalMaxPooling1D(),
    Dense(units=64, activation="relu"),
    Dropout(rate=0.2),
    Dense(units=len(le.classes_), activation="softmax"),
])

# Integer class ids are used directly as targets (sparse cross-entropy);
# 10% of the training rows are held back for validation during fitting.
CNN.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
CNN.fit(x_train_reshaped_CNN, y_train, batch_size=32, epochs=100, validation_split=0.1)

# Prepare the held-out texts the same way as the training inputs (no scaling).
x_test_tokenized_CNN = list(map(tokenize_text, x_test))
x_test_encoded_CNN = np.array([encode_text(sample, W2V) for sample in x_test])
x_test_reshaped_CNN = x_test_encoded_CNN.reshape(x_test_encoded_CNN.shape + (1,))

loss, accuracy = CNN.evaluate(x_test_reshaped_CNN, y_test)
print(f"CNN & Word2Vec: {accuracy * 100:.2f}%")

In [None]:
from sklearn.metrics import (
    accuracy_score, balanced_accuracy_score, classification_report, cohen_kappa_score,
    confusion_matrix, f1_score, matthews_corrcoef, precision_score, recall_score, roc_auc_score
)
from sklearn.preprocessing import LabelBinarizer

# Most probable class per sample, taken from the softmax output.
y_pred_CNN = CNN.predict(x_test_reshaped_CNN).argmax(axis=1)

# Label-based scores; precision, recall and F1 are macro-averaged over the classes.
accuracy = accuracy_score(y_test, y_pred_CNN)
precision, recall, f1 = (
    metric(y_test, y_pred_CNN, average="macro")
    for metric in (precision_score, recall_score, f1_score)
)
kappa = cohen_kappa_score(y_test, y_pred_CNN)
confusion_mat = confusion_matrix(y_test, y_pred_CNN)
class_report = classification_report(y_test, y_pred_CNN)
mcc = matthews_corrcoef(y_test, y_pred_CNN)
balanced_acc = balanced_accuracy_score(y_test, y_pred_CNN)

# Binarize true and predicted labels; the AUC below is therefore computed from
# hard predictions, not from the softmax scores.
lb = LabelBinarizer().fit(y_test)
y_test_binary = lb.transform(y_test)
y_pred_binary = lb.transform(y_pred_CNN)
roc_auc = roc_auc_score(y_test_binary, y_pred_binary, average="macro")

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1-Score: {f1 * 100:.2f}%")
print(f"Cohen's Kappa: {kappa * 100:.2f}%")
print(f"ROC AUC Score: {roc_auc * 100:.2f}%")
print("Confusion Matrix:")
print(confusion_mat)
print("Classification Report:")
print(class_report)
print(f"Matthews Correlation Coefficient: {mcc * 100:.2f}%")
print(f"Balanced Accuracy: {balanced_acc * 100:.2f}%")

#### 17. Using `RNN (LSTM)`

In [None]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout

# Present each scaled embedding as a sequence of length 1: (samples, 1, features).
x_train_reshaped_RNN = x_train_normalized.reshape((x_train_normalized.shape[0], 1, x_train_normalized.shape[1]))

# Single LSTM layer -> dropout -> softmax over the encoded classes.
RNN = Sequential()
RNN.add(LSTM(units=64, input_shape=(x_train_reshaped_RNN.shape[1], x_train_reshaped_RNN.shape[2])))
RNN.add(Dropout(rate=0.2))
RNN.add(Dense(units=len(le.classes_), activation="softmax"))

RNN.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
RNN.fit(x_train_reshaped_RNN, y_train, epochs=100, batch_size=32, validation_split=0.1)

# The network was trained on MinMax-scaled embeddings, so the test embeddings
# must go through the same fitted scaler before being reshaped; feeding the raw
# embeddings would evaluate the model on a different feature scale.
x_test_encoded_RNN = np.array([encode_text(i, W2V) for i in x_test])
x_test_normalized_RNN = scaler.transform(x_test_encoded_RNN)
x_test_reshaped_RNN = x_test_normalized_RNN.reshape((x_test_normalized_RNN.shape[0], 1, x_test_normalized_RNN.shape[1]))

# evaluate() returns [loss, accuracy] for the metrics compiled above.
accuracy = RNN.evaluate(x_test_reshaped_RNN, y_test)[1]
print(f"RNN (LSTM) & Word2Vec: {accuracy * 100:.2f}%")

In [None]:
from sklearn.metrics import (
    accuracy_score, balanced_accuracy_score, classification_report, cohen_kappa_score,
    confusion_matrix, f1_score, matthews_corrcoef, precision_score, recall_score, roc_auc_score
)
from sklearn.preprocessing import LabelBinarizer

# Most probable class per sample, taken from the softmax output.
y_pred_RNN = RNN.predict(x_test_reshaped_RNN).argmax(axis=1)

# Label-based scores; precision, recall and F1 are macro-averaged over the classes.
accuracy = accuracy_score(y_test, y_pred_RNN)
precision, recall, f1 = (
    metric(y_test, y_pred_RNN, average="macro")
    for metric in (precision_score, recall_score, f1_score)
)
kappa = cohen_kappa_score(y_test, y_pred_RNN)
confusion_mat = confusion_matrix(y_test, y_pred_RNN)
class_report = classification_report(y_test, y_pred_RNN)
mcc = matthews_corrcoef(y_test, y_pred_RNN)
balanced_acc = balanced_accuracy_score(y_test, y_pred_RNN)

# Binarize true and predicted labels; the AUC below is therefore computed from
# hard predictions, not from the softmax scores.
lb = LabelBinarizer().fit(y_test)
y_test_binary = lb.transform(y_test)
y_pred_binary = lb.transform(y_pred_RNN)
roc_auc = roc_auc_score(y_test_binary, y_pred_binary, average="macro")

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1-Score: {f1 * 100:.2f}%")
print(f"Cohen's Kappa: {kappa * 100:.2f}%")
print(f"ROC AUC Score: {roc_auc * 100:.2f}%")
print("Confusion Matrix:")
print(confusion_mat)
print("Classification Report:")
print(class_report)
print(f"Matthews Correlation Coefficient: {mcc * 100:.2f}%")
print(f"Balanced Accuracy: {balanced_acc * 100:.2f}%")