In [None]:
import os, random, re

# Directories of the aclImdb train and test splits; both are relative paths,
# so they are resolved against the notebook's current working directory.
train_path = "data/aclImdb/train/"
test_path  = "data/aclImdb/test/"

#adapted from https://developers.google.com/machine-learning/guides/text-classification/step-2
def load_train_dataset(path: str) -> tuple:
    """Load the labelled reviews stored under ``path``.

    ``path`` must contain a ``pos`` and a ``neg`` sub-directory with one
    review per file. Every review is read, paired with its label, shuffled
    with a fixed seed and finally stripped of HTML tags.

    Args:
        path: Directory that holds the ``pos`` and ``neg`` folders.

    Returns:
        A ``(texts, labels)`` tuple of two equally long lists, where
        ``labels[i]`` is 1 if ``texts[i]`` came from ``pos`` and 0 if it
        came from ``neg``.

    Raises:
        OSError: If a category directory or a review file cannot be read.
        UnicodeDecodeError: If a review file is not valid UTF-8.
    """
    samples = []

    for label in ['pos', 'neg']:
        cat_path = os.path.join(path, label)
        label_id = 0 if label == 'neg' else 1
        # os.listdir returns entries in arbitrary order; sorting them makes
        # the seeded shuffle below reproducible across machines.
        for file_name in sorted(os.listdir(cat_path)):
            file_path = os.path.join(cat_path, file_name)
            # Explicit encoding so decoding does not depend on the platform
            # locale (assumes the review files are UTF-8 -- TODO confirm).
            with open(file_path, 'r', encoding='utf-8') as file:
                samples.append((file.read(), label_id))

    # Shuffle texts and labels together as pairs so they can never drift out
    # of alignment. A private Random(1) yields the same permutation as seeding
    # the global generator with 1, without disturbing global RNG state.
    random.Random(1).shuffle(samples)

    #remove html tags from the texts
    # NOTE: tags are deleted without inserting whitespace, so two words that
    # were separated only by a tag end up joined.
    train_texts = [re.sub('<.*?>', '', text) for text, _ in samples]
    train_labels = [label_id for _, label_id in samples]

    return (train_texts, train_labels)

# Read the whole training split into memory once; the two lists are index-aligned.
train_texts, train_labels = load_train_dataset(train_path)

In [None]:
def get_smaller_dataset(size: int, texts: list[str], labels: list[int], seed=10) -> tuple:
    """Draw a random subset of ``size`` aligned (text, label) pairs.

    The positions to keep are sampled once and applied to both sequences, so
    the returned texts and labels stay aligned for every seed, including
    ``seed=None``.

    Args:
        size: Number of examples to draw (sampling is without replacement).
        texts: The full list of texts.
        labels: Labels aligned with ``texts``; must have the same length.
        seed: Seed for the private random generator used for sampling.

    Returns:
        A ``(smaller_texts, smaller_labels)`` tuple of two lists of length
        ``size``.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length, or if
            ``size`` is negative or larger than the number of examples.
    """
    if len(texts) != len(labels):
        raise ValueError(
            f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
        )

    # A private generator keeps the caller's global random state untouched.
    # Sampling indices yields the same selection the seeded random.sample on
    # the list itself would produce.
    rng = random.Random(seed)
    chosen = rng.sample(range(len(texts)), size)

    smaller_texts = [texts[i] for i in chosen]
    smaller_labels = [labels[i] for i in chosen]

    return (smaller_texts, smaller_labels)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Draw a fresh sampling seed on every run so repeated executions evaluate
# different subsets; the value stays available in `seed` so a particular run
# can be reproduced by passing it explicitly.
random.seed()
seed = random.randint(1, 1000)

(sm_texts, sm_labels) = get_smaller_dataset(2000, train_texts, train_labels, seed=seed)
y_sm = np.array(sm_labels)

# Split the raw texts *before* vectorizing: fitting TF-IDF on all samples
# would let vocabulary and IDF statistics from the held-out reviews leak into
# the training features and inflate the reported accuracy.
sm_texts_train, sm_texts_test, y_sm_train, y_sm_test = train_test_split(
    sm_texts, y_sm, test_size=0.33, random_state=42
)

vectorizer = TfidfVectorizer(max_features=20000, ngram_range=(1,2))
X_sm_train = vectorizer.fit_transform(sm_texts_train)  # learn vocabulary/IDF on train only
X_sm_test = vectorizer.transform(sm_texts_test)        # reuse the fitted vocabulary/IDF

# Full feature matrix, kept for any later cell that still refers to `X_sm`.
X_sm = vectorizer.transform(sm_texts)

In [None]:
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import accuracy_score

# Fit a Complement Naive Bayes baseline on the training features
# (fit returns the estimator itself, so construction and fitting can be chained).
clf = ComplementNB().fit(X_sm_train, y_sm_train)

# Predict the held-out part and show its accuracy as the cell output.
y_pred = clf.predict(X_sm_test)
accuracy_score(y_true=y_sm_test, y_pred=y_pred)