In [None]:
import sys
import os

import pandas as pd
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '../')))

from lib.data_preparation import DataPreparation
from lib.preprocessing_data import Preprocessing
import sys
import os
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression


def train(preprocessing_method, vectorizer, model, description):
    """Fit one preprocessing/vectorizer/model combination and report its accuracy.

    The data set is loaded with ``DataPreparation().load_data()``, passed
    through ``preprocessing_method``, shuffled with a fixed seed and split
    80/20 (by position, without stratification) into train and test parts.

    Parameters
    ----------
    preprocessing_method : callable
        Receives the loaded data set and must return a DataFrame that has a
        ``processed_tweet`` column (an iterable of string tokens per row) and
        a ``label`` column whose values can be cast to ``int``.
    vectorizer : object
        Must provide ``fit_transform`` and ``transform``. It is fitted in
        place on the training text only.
    model : object
        Must provide ``fit`` and ``predict``. It is fitted in place.
    description : str
        Label used in the printed summary line; returned unchanged.

    Returns
    -------
    tuple
        ``(description, accuracy)`` where ``accuracy`` is the unrounded value
        returned by ``accuracy_score`` on the held-out 20 %.
    """
    data_set = preprocessing_method(DataPreparation().load_data())
    # Fixed seed keeps the shuffle, and therefore the split, reproducible.
    data_set = data_set.sample(frac=1, random_state=42).reset_index(drop=True)

    split_index = int(len(data_set) * 0.8)
    train_df = data_set.iloc[:split_index]
    test_df = data_set.iloc[split_index:]

    # Each row holds a sequence of tokens; they are re-joined into a single
    # space-separated string before being handed to the vectorizer.
    train_text = train_df['processed_tweet'].map(' '.join)
    test_text = test_df['processed_tweet'].map(' '.join)
    y_train = train_df['label'].to_numpy().astype(int)
    y_test = test_df['label'].to_numpy().astype(int)

    # The vocabulary is learned from the training text only.
    X_train_vector = vectorizer.fit_transform(train_text)
    X_test_vector = vectorizer.transform(test_text)

    model.fit(X_train_vector, y_train)
    y_pred = model.predict(X_test_vector)

    accuracy = accuracy_score(y_test, y_pred)
    # The builtin round() works for a plain float as well as for a NumPy
    # scalar, whereas the `.round()` method only exists on the latter.
    print(f"Accuracy for {description}: {round(float(accuracy), 3)}")

    return description, accuracy





In [9]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer



# Experiment grid. Every dictionary maps a display name to the object that is
# handed to train(); the names are only used to build the printed description.
preprocessing_methods = dict(
    lemmatization=Preprocessing().lemmatization,
    tokenization=Preprocessing().tokenization,
    stemming=Preprocessing().stemming,
)
vectorizers = dict([
    ("TF-IDF", TfidfVectorizer()),
    ("CountVectorizer (binary=False)", CountVectorizer(binary=False)),
    ("CountVectorizer (binary=True)", CountVectorizer(binary=True)),
])
models = dict([
    ("Logistic Regression", LogisticRegression()),
    ("MultinomialNB", MultinomialNB()),
    ("Linear SVC", LinearSVC()),
])

# Evaluate the full cross product: preprocessing x vectorizer x model.
# The same vectorizer/model objects are passed to train() for every
# preprocessing method; train() fits them again on each call.
for preprocessing_name in preprocessing_methods:
    preprocessing_method = preprocessing_methods[preprocessing_name]
    for vectorizer_name in vectorizers:
        vectorizer = vectorizers[vectorizer_name]
        for model_name in models:
            model = models[model_name]
            description = f"{vectorizer_name} with {preprocessing_name} using {model_name}"
            train(preprocessing_method, vectorizer, model, description)

Accuracy for TF-IDF with lemmatization using Logistic Regression: 0.8539325842696629
Accuracy for TF-IDF with lemmatization using MultinomialNB: 0.7921348314606742
Accuracy for TF-IDF with lemmatization using Linear SVC: 0.8258426966292135
Accuracy for CountVectorizer (binary=False) with lemmatization using Logistic Regression: 0.848314606741573
Accuracy for CountVectorizer (binary=False) with lemmatization using MultinomialNB: 0.8174157303370787
Accuracy for CountVectorizer (binary=False) with lemmatization using Linear SVC: 0.797752808988764
Accuracy for CountVectorizer (binary=True) with lemmatization using Logistic Regression: 0.848314606741573
Accuracy for CountVectorizer (binary=True) with lemmatization using MultinomialNB: 0.8174157303370787
Accuracy for CountVectorizer (binary=True) with lemmatization using Linear SVC: 0.7893258426966292
Accuracy for TF-IDF with tokenization using Logistic Regression: 0.8370786516853933
Accuracy for TF-IDF with tokenization using MultinomialNB:

In [12]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer



# Rebuild the experiment grid with fresh Preprocessing, vectorizer and model
# instances. Each dictionary maps a display name to the object passed to train().
preprocessing_methods = dict(
    lemmatization=Preprocessing().lemmatization,
    tokenization=Preprocessing().tokenization,
    stemming=Preprocessing().stemming,
)
vectorizers = dict([
    ("TF-IDF", TfidfVectorizer()),
    ("CountVectorizer (binary=False)", CountVectorizer(binary=False)),
    ("CountVectorizer (binary=True)", CountVectorizer(binary=True)),
])
models = dict([
    ("Logistic Regression", LogisticRegression()),
    ("MultinomialNB", MultinomialNB()),
    ("Linear SVC", LinearSVC()),
])

# Run train() once for every preprocessing x vectorizer x model combination;
# train() prints one accuracy line per call.
for preprocessing_name in preprocessing_methods:
    preprocessing_method = preprocessing_methods[preprocessing_name]
    for vectorizer_name in vectorizers:
        vectorizer = vectorizers[vectorizer_name]
        for model_name in models:
            model = models[model_name]
            description = f"{vectorizer_name} with {preprocessing_name} using {model_name}"
            train(preprocessing_method, vectorizer, model, description)

Accuracy for TF-IDF with lemmatization using Logistic Regression: 0.8721910112359551
Accuracy for TF-IDF with lemmatization using MultinomialNB: 0.8216292134831461
Accuracy for TF-IDF with lemmatization using Linear SVC: 0.8637640449438202
Accuracy for CountVectorizer (binary=False) with lemmatization using Logistic Regression: 0.8651685393258427
Accuracy for CountVectorizer (binary=False) with lemmatization using MultinomialNB: 0.8469101123595506
Accuracy for CountVectorizer (binary=False) with lemmatization using Linear SVC: 0.8370786516853933
Accuracy for CountVectorizer (binary=True) with lemmatization using Logistic Regression: 0.8693820224719101
Accuracy for CountVectorizer (binary=True) with lemmatization using MultinomialNB: 0.8441011235955056
Accuracy for CountVectorizer (binary=True) with lemmatization using Linear SVC: 0.8412921348314607
Accuracy for TF-IDF with tokenization using Logistic Regression: 0.8707865168539326
Accuracy for TF-IDF with tokenization using Multinomial

# Visualize output

In [None]:

# Re-run every preprocessing / vectorizer / model combination, this time
# collecting the (description, accuracy) pair that train() returns.
results = []
for preprocessing_name, preprocessing_method in preprocessing_methods.items():
    for vectorizer_name, vectorizer in vectorizers.items():
        for model_name, model in models.items():
            description = f"{vectorizer_name} + {preprocessing_name} + {model_name}"
            results.append(train(preprocessing_method, vectorizer, model, description))

# Build the ranked table in a single expression rather than sorting in place,
# so the frame is never observable in an unsorted intermediate state.
results_df = (
    pd.DataFrame(results, columns=["Description", "Accuracy"])
    .sort_values(by="Accuracy", ascending=False)
)

fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(results_df["Description"], results_df["Accuracy"], color="skyblue")
# barh draws the first row at the bottom of the axes; flipping the y-axis puts
# the highest-accuracy combination at the top so the ranking reads top-down.
ax.invert_yaxis()
ax.set_xlabel("Accuracy")
ax.set_title("Accuracy of Different Preprocessing, Vectorizer, and Model Combinations")
fig.tight_layout()
plt.show()
