In [33]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import os
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC


from scipy.sparse import csr_matrix
import pandas as pd

In [34]:


# def train(train_df, test_df, vectorizer, model, description):
#     raw_train_df = pd.read_csv('../data/raw_splits/train.csv')
#     raw_test_df = pd.read_csv('../data/raw_splits/test.csv')

#     y_train = raw_train_df['label'].to_numpy().astype(int)
#     y_test = raw_test_df['label'].to_numpy().astype(int)
    

#     X_train_vector = csr_matrix(train_df.values)
#     X_test_vector = csr_matrix(test_df.values)
    
#     model.fit(X_train_vector, y_train)
#     y_pred = model.predict(X_test_vector)

#     accuracy = accuracy_score(y_test, y_pred)
#     return accuracy



from sklearn.model_selection import GridSearchCV
from scipy.sparse import csr_matrix
from sklearn.metrics import accuracy_score
import pandas as pd

def train_with_gridsearch(train_df, test_df, vectorizer, model, param_grid, description):
    """Tune ``model`` with a 5-fold grid search and score the winner on the test split.

    Features are taken from the already-vectorized ``train_df`` / ``test_df``.
    Labels are read from the ``label`` column of ``../data/raw_splits/train.csv``
    and ``../data/raw_splits/test.csv``.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Numeric feature tables; their ``.values`` are converted to CSR matrices.
        Rows are assumed to be in the same order as the raw split CSVs
        (NOTE(review): nothing here can verify the ordering, only the row count).
    vectorizer : object
        Not used by this function; kept so existing callers keep working.
    model : estimator
        Estimator handed to ``GridSearchCV``.
    param_grid : dict
        Hyperparameter grid handed to ``GridSearchCV``.
    description : str
        Label inserted into the printed summary line.

    Returns
    -------
    tuple
        ``(accuracy, best_params)`` where ``accuracy`` is the test-set accuracy of
        the best estimator and ``best_params`` is ``grid_search.best_params_``.

    Raises
    ------
    ValueError
        If a feature table and its label vector have different numbers of rows.
    """
    with open('../data/raw_splits/train.csv', 'r') as f:
        raw_train_df = pd.read_csv(f)

    with open('../data/raw_splits/test.csv', 'r') as f:
        raw_test_df = pd.read_csv(f)

    y_train = raw_train_df['label'].to_numpy().astype(int)
    y_test = raw_test_df['label'].to_numpy().astype(int)

    X_train_vector = csr_matrix(train_df.values)
    X_test_vector = csr_matrix(test_df.values)

    # Fail before the (expensive) search rather than after it: a test-side
    # mismatch would otherwise only surface inside accuracy_score.
    for split_name, features, labels in (
        ("train", X_train_vector, y_train),
        ("test", X_test_vector, y_test),
    ):
        if features.shape[0] != labels.shape[0]:
            raise ValueError(
                f"{split_name} features have {features.shape[0]} rows "
                f"but {labels.shape[0]} labels were loaded"
            )

    # refit=True (the default, spelled out here) retrains the best configuration
    # on the full training set, so best_estimator_ needs no further fit() call.
    grid_search = GridSearchCV(
        model, param_grid, cv=5, scoring='accuracy', n_jobs=-1, refit=True
    )
    grid_search.fit(X_train_vector, y_train)

    y_pred = grid_search.best_estimator_.predict(X_test_vector)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"Best Parameters for \033[95m{description}\033[00m: \033[93m{grid_search.best_params_}\033[00m with accuracy: \033[92m{accuracy}\033[00m")
    return accuracy, grid_search.best_params_


In [36]:
# Candidate estimators, keyed by display name. Iteration order here is the
# order in which the models are trained and reported.
models = {
    "Logistic Regression": LogisticRegression(),
    "MultinomialNB": MultinomialNB(),
    "SVC": SVC(),
}

# Search space per estimator; keys must match those of `models`.
param_grids = {
    # C is the inverse of regularization strength (smaller C = stronger penalty).
    "Logistic Regression": {"C": [0.1, 1, 10], "max_iter": [100, 200, 500]},
    # alpha is the additive smoothing parameter.
    "MultinomialNB": {"alpha": [0.1, 0.5, 1.0]},
    # C as above; two kernel functions are compared.
    "SVC": {"C": [0.1, 1, 10], "kernel": ["linear", "rbf"]},
}

# Vectorizers keyed by the suffix that appears in the preprocessed file names.
vectorizers = {
    "TF-IDF": TfidfVectorizer(),
    "Bow": CountVectorizer(binary=False),
    "BinaryVectorizer": CountVectorizer(binary=True),
}


train_dir = "../data/prepocessed_training_data"
test_dir = "../data/prepocessed_testing_data"

# Only CSV feature files take part; any other directory entry (e.g. a hidden
# file) would break the vectorizer-name lookup or the pairing below.
train_files = sorted(name for name in os.listdir(train_dir) if name.endswith(".csv"))
test_files = sorted(name for name in os.listdir(test_dir) if name.endswith(".csv"))

# zip() would silently drop the surplus files if the two directories disagree.
if len(train_files) != len(test_files):
    raise ValueError(
        f"Found {len(train_files)} training files in {train_dir!r} "
        f"but {len(test_files)} testing files in {test_dir!r}"
    )

# Collect one record per (file, model) run and build the frame once at the end,
# instead of re-copying the whole frame with pd.concat on every iteration.
result_rows = []

# Files are paired purely by sorted position.
# NOTE(review): this assumes both directories use the same naming scheme - confirm.
for train_file, test_file in zip(train_files, test_files):
    # File names end in "_<vectorizer name>.csv".
    vectorizer_name = train_file.split("_")[-1].replace(".csv", "")

    train_df = pd.read_csv(os.path.join(train_dir, train_file))
    test_df = pd.read_csv(os.path.join(test_dir, test_file))

    for model_name, model in models.items():
        param_grid = param_grids.get(model_name, {})  # Get hyperparameter grid for the model
        description = f"{train_file} + {model_name}"

        accuracy, best_params = train_with_gridsearch(
            train_df, test_df, vectorizers[vectorizer_name], model, param_grid, description
        )

        result_rows.append({
            "model": model_name,
            "processing_method": train_file,
            "Accuracy": accuracy,
            "Best Params": str(best_params),
        })

# Explicit columns keep the schema stable even when no files were found.
results = pd.DataFrame(
    result_rows,
    columns=["model", "processing_method", "Accuracy", "Best Params"],
)


Best Parameters for [95mlemmatization_BinaryVectorizer.csv + Logistic Regression[00m: [93m{'C': 1, 'max_iter': 100}[00m with accuracy: [92m0.8567415730337079[00m
Best Parameters for [95mlemmatization_BinaryVectorizer.csv + MultinomialNB[00m: [93m{'alpha': 0.5}[00m with accuracy: [92m0.8609550561797753[00m
Best Parameters for [95mlemmatization_BinaryVectorizer.csv + SVC[00m: [93m{'C': 1, 'kernel': 'rbf'}[00m with accuracy: [92m0.8525280898876404[00m
Best Parameters for [95mlemmatization_Bow.csv + Logistic Regression[00m: [93m{'C': 1, 'max_iter': 100}[00m with accuracy: [92m0.8553370786516854[00m
Best Parameters for [95mlemmatization_Bow.csv + MultinomialNB[00m: [93m{'alpha': 0.5}[00m with accuracy: [92m0.8595505617977528[00m
Best Parameters for [95mlemmatization_Bow.csv + SVC[00m: [93m{'C': 0.1, 'kernel': 'linear'}[00m with accuracy: [92m0.848314606741573[00m
Best Parameters for [95mlemmatization_TF-IDF.csv + Logistic Regression[00m: [93m{'C': 10, 