In [None]:
import os, random, re

# Root directory of the dataset on disk (relative to the working directory);
# the "train" and "test" split folders are joined onto it further down.
DATA_PATH  = "data/aclImdb/"

def load_inidv_dataset(set_path: str) -> tuple:
    """Load one labelled split (e.g. the train or the test folder) from disk.

    The split directory must contain a ``pos`` and a ``neg`` subdirectory;
    every entry inside them is read as one text document.

    Args:
        set_path: Path of the split directory.

    Returns:
        A ``(texts, labels)`` tuple of two equally long lists. ``labels[i]``
        is 1 when ``texts[i]`` was read from ``pos`` and 0 when it was read
        from ``neg``. All ``pos`` documents come first, and each group is
        ordered by file name.

    Raises:
        OSError: If a label directory or one of its files cannot be read.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    texts = []
    labels = []

    for label in ('pos', 'neg'):
        label_id = 0 if label == 'neg' else 1
        cat_path = os.path.join(set_path, label)
        # os.listdir returns entries in arbitrary, filesystem-dependent order;
        # sorting keeps the result (and any seeded shuffle applied afterwards)
        # reproducible across machines.
        for file_name in sorted(os.listdir(cat_path)):
            file_path = os.path.join(cat_path, file_name)
            # Decode explicitly as UTF-8: the default encoding of open()
            # depends on the platform/locale.
            with open(file_path, 'r', encoding='utf-8') as file:
                texts.append(file.read())
            labels.append(label_id)

    return (texts, labels)


def load_dataset(train_path: str, test_path: str, seed=1) -> tuple:
    """Load the train and test splits, shuffle them and strip HTML tags.

    Args:
        train_path: Directory of the training split (with ``pos``/``neg``
            subdirectories).
        test_path: Directory of the test split (same layout).
        seed: Seed for the shuffle; the same seed always yields the same order.

    Returns:
        ``((train_texts, train_labels), (test_texts, test_labels))`` where
        every texts/labels pair is shuffled with one shared permutation, so
        ``labels[i]`` always belongs to ``texts[i]``.

    Raises:
        ValueError: If a split has a different number of texts and labels.
    """
    (train_texts, train_labels) = load_inidv_dataset(set_path=train_path)
    (test_texts, test_labels) = load_inidv_dataset(set_path=test_path)

    # Private generator: gives the same sequence as random.seed(seed) on the
    # module-level generator, but leaves the caller's global RNG state alone.
    rng = random.Random(seed)

    def shuffle_together(texts, labels):
        """Apply one random permutation to both lists and return the copies."""
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels differ in length: {len(texts)} != {len(labels)}"
            )
        # Shuffling an index list and applying it to both sequences keeps the
        # pairs aligned by construction instead of relying on two separately
        # re-seeded shuffles consuming the generator identically.
        order = list(range(len(texts)))
        rng.shuffle(order)
        return [texts[i] for i in order], [labels[i] for i in order]

    # Train first, then test, so both permutations come from one seeded stream.
    train_texts, train_labels = shuffle_together(train_texts, train_labels)
    test_texts, test_labels = shuffle_together(test_texts, test_labels)

    # Replace HTML tags with a space rather than nothing, so that words
    # separated only by a tag ("great<br />movie") are not glued together.
    tag_pattern = re.compile('<.*?>')
    train_texts = [tag_pattern.sub(' ', text) for text in train_texts]
    test_texts = [tag_pattern.sub(' ', text) for text in test_texts]

    return ((train_texts, train_labels), (test_texts, test_labels))

In [None]:
# Resolve the two split directories underneath the dataset root.
train_path, test_path = (
    os.path.join(DATA_PATH, split_name) for split_name in ("train", "test")
)

# Load both splits with load_dataset's default seed and unpack the
# (texts, labels) pair of each split.
(train_texts, train_labels), (test_texts, test_labels) = load_dataset(
    train_path=train_path,
    test_path=test_path,
)

In [None]:
def get_smaller_dataset(size: int, texts: list[str], labels: list[int], seed=10) -> tuple:
    """Draw a random subset of ``size`` aligned (text, label) pairs.

    Args:
        size: Number of pairs to draw (without replacement).
        texts: The documents to sample from.
        labels: The labels; ``labels[i]`` must belong to ``texts[i]``.
        seed: Seed for the draw; the same seed always selects the same rows.

    Returns:
        A ``(smaller_texts, smaller_labels)`` tuple of two lists of length
        ``size`` in which ``smaller_labels[i]`` belongs to ``smaller_texts[i]``.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length, or if
            ``size`` is negative or larger than the number of pairs.
    """
    if len(texts) != len(labels):
        raise ValueError(
            f"texts and labels differ in length: {len(texts)} != {len(labels)}"
        )

    # Private generator: same sequence as random.seed(seed) on the module-level
    # generator, without resetting the caller's global RNG state.
    rng = random.Random(seed)

    # Sample row indices once and use them for both lists; this keeps the
    # pairs aligned by construction instead of depending on two re-seeded
    # random.sample calls happening to pick the same positions.
    chosen = rng.sample(range(len(texts)), size)

    smaller_texts = [texts[i] for i in chosen]
    smaller_labels = [labels[i] for i in chosen]

    return (smaller_texts, smaller_labels)

In [None]:
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.feature_extraction.text import TfidfVectorizer

# Re-seed from OS entropy and draw a fresh subset seed, so each run works on a
# different random subset of the training data.
# NOTE(review): pin `seed` to a constant if the run must be reproducible.
random.seed()
seed = random.randint(1, 1000)

(sm_texts, sm_labels) = get_smaller_dataset(10000, train_texts, train_labels, seed=seed)

selector = SelectKBest(score_func=f_classif, k=10000)
vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=5)

y_sm = np.array(sm_labels)

# Decide the train/test partition on row indices *before* fitting anything.
# The partition depends only on the number of rows, test_size and
# random_state, so it matches splitting the feature matrix directly.
sm_train_idx, sm_test_idx = train_test_split(
    np.arange(len(sm_texts)), test_size=0.33, random_state=42
)

# Fit the vocabulary/IDF weights and the label-based feature selection on the
# training rows only. Fitting them on all rows would leak information from
# the held-out rows (including their labels) into the features and inflate
# the measured test accuracy.
vectorizer.fit([sm_texts[i] for i in sm_train_idx])
X_sm_tfidf = vectorizer.transform(sm_texts)
selector.fit(X_sm_tfidf[sm_train_idx], y_sm[sm_train_idx])

# Whole subset in its original row order (aligned with y_sm), transformed
# with the training-fitted vectorizer and selector.
X_sm = selector.transform(X_sm_tfidf)

X_sm_train, X_sm_test = X_sm[sm_train_idx], X_sm[sm_test_idx]
y_sm_train, y_sm_test = y_sm[sm_train_idx], y_sm[sm_test_idx]

In [None]:
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import accuracy_score

# Train a Complement Naive Bayes classifier on the training rows; fit()
# returns the estimator itself, so construction and fitting can be chained.
clf = ComplementNB().fit(X_sm_train, y_sm_train)

# Predict the held-out rows.
y_pred = clf.predict(X_sm_test)

# Accuracy on the held-out rows, displayed as the cell's output.
accuracy_score(y_true=y_sm_test, y_pred=y_pred)

In [None]:
from torch import nn