In [1]:
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import LatentDirichletAllocation, NMF
from dataset.dataset import Dataset
from constants import CLEANED_DATASET_PATH

In [2]:
RANDOM_SEED = 0

In [3]:
dataset = Dataset(full_data_path=CLEANED_DATASET_PATH,
                  from_scratch=False,
                  split_sizes=[10000, 4232, 4232])
dataset.build()

X_train = dataset.get_features(split_type="train")
Y_train = dataset.get_labels(split_type="train")
X_val = dataset.get_features(split_type="val")
Y_val = dataset.get_labels(split_type="val")
X_test = dataset.get_features(split_type="test")
Y_test = dataset.get_labels(split_type="test")

Data loaded from dataset/cleaned_dataset.pkl


# Classifier only Pipeline

## Count Vectorizer

In [4]:
# Define the CountVectorizer
count_vectorizer = CountVectorizer(max_df=0.95, min_df=2)
X_train_counts = count_vectorizer.fit_transform(X_train)
X_val_counts = count_vectorizer.transform(X_val)
X_test_counts = count_vectorizer.transform(X_test)

In [5]:
# Classifier Training
rf = RandomForestClassifier(n_estimators=511, random_state=RANDOM_SEED)
rf.fit(X_train_counts, Y_train)
Y_pred = rf.predict(X_test_counts)
score = accuracy_score(Y_test, Y_pred)
print(f"Best Counts-RF only Model Accuracy: {score}")

Best Counts-RF only Model Accuracy: 0.7374763705103969


## TF-IDF Vectorizer

In [6]:
# Define the CountVectorizer
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf = tfidf_vectorizer.transform(X_val)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [7]:
# Classifier Training
rf = RandomForestClassifier(n_estimators=511, random_state=RANDOM_SEED)
rf.fit(X_train_tfidf, Y_train)
Y_pred = rf.predict(X_test_tfidf)
score = accuracy_score(Y_test, Y_pred)
print(f"Best TFIDF-RF only Model Accuracy: {score}")

Best TFIDF-RF only Model Accuracy: 0.7318052930056711


# LDA/NMF-Classifier Pipeline

## Count Vectorizer - LDA Pipeline

In [8]:
LDA_TOPICS = 10
print(f"Performing Latent Dirichlet Allocation for {LDA_TOPICS} topics")
lda = LatentDirichletAllocation(n_components=LDA_TOPICS, random_state=RANDOM_SEED)
X_train_lda = lda.fit_transform(X_train_counts)
X_val_lda = lda.transform(X_val_counts)
X_test_lda = lda.transform(X_test_counts)
print(f"Done performing Latent Dirichlet Allocation for {LDA_TOPICS} topics")

Performing Latent Dirichlet Allocation for 10 topics
Done performing Latent Dirichlet Allocation for 10 topics


In [9]:
# Classifier Training
rf = RandomForestClassifier(n_estimators=511, random_state=RANDOM_SEED)
rf.fit(X_train_lda, Y_train)
Y_pred = rf.predict(X_test_lda)
score = accuracy_score(Y_test, Y_pred)
print(f"Best Counts-LDA-RF Model Accuracy: {score}")

Best Counts-LDA-RF Model Accuracy: 0.6422495274102079


## Count Vectorizer - NMF Pipeline

In [10]:
NMF_TOPICS = 150
print(f"Performing Non-negative Matrix Factorization for {NMF_TOPICS} topics")
nmf = NMF(n_components=NMF_TOPICS, random_state=RANDOM_SEED)
X_train_nmf = nmf.fit_transform(X_train_counts)
X_val_nmf = nmf.transform(X_val_counts)
X_test_nmf = nmf.transform(X_test_counts)
print(f"Done performing Non-negative Matrix Factorization for {NMF_TOPICS} topics")

Performing Non-negative Matrix Factorization for 150 topics
Done performing Non-negative Matrix Factorization for 150 topics


In [11]:
# Classifier Training
rf = RandomForestClassifier(n_estimators=511, random_state=RANDOM_SEED)
rf.fit(X_train_nmf, Y_train)
Y_pred = rf.predict(X_test_nmf)
score = accuracy_score(Y_test, Y_pred)
print(f"Best Counts-NMF-RF Model Accuracy: {score}")

Best Counts-NMF-RF Model Accuracy: 0.7162098298676749


## TF-IDF Vectorizer - LDA Pipeline

In [12]:
LDA_TOPICS = 10
print(f"Performing Latent Dirichlet Allocation for {LDA_TOPICS} topics")
lda = LatentDirichletAllocation(n_components=LDA_TOPICS, random_state=RANDOM_SEED)
X_train_lda = lda.fit_transform(X_train_tfidf)
X_val_lda = lda.transform(X_val_tfidf)
X_test_lda = lda.transform(X_test_tfidf)
print(f"Done performing Latent Dirichlet Allocation for {LDA_TOPICS} topics")

Performing Latent Dirichlet Allocation for 10 topics
Done performing Latent Dirichlet Allocation for 10 topics


In [13]:
# Classifier Training
rf = RandomForestClassifier(n_estimators=511, random_state=RANDOM_SEED)
rf.fit(X_train_lda, Y_train)
Y_pred = rf.predict(X_test_lda)
score = accuracy_score(Y_test, Y_pred)
print(f"Best TFIDF-LDA-RF Model Accuracy: {score}")

Best TFIDF-LDA-RF Model Accuracy: 0.599952741020794


## TF-IDF Vectorizer - NMF Pipeline

In [14]:
NMF_TOPICS = 150
print(f"Performing Non-negative Matrix Factorization for {NMF_TOPICS} topics")
nmf = NMF(n_components=NMF_TOPICS, random_state=RANDOM_SEED)
X_train_nmf = nmf.fit_transform(X_train_tfidf)
X_val_nmf = nmf.transform(X_val_tfidf)
X_test_nmf = nmf.transform(X_test_tfidf)
print(f"Done performing Non-negative Matrix Factorization for {NMF_TOPICS} topics")

Performing Non-negative Matrix Factorization for 150 topics
Done performing Non-negative Matrix Factorization for 150 topics


In [15]:
# Classifier Training
rf = RandomForestClassifier(n_estimators=511, random_state=RANDOM_SEED)
rf.fit(X_train_nmf, Y_train)
Y_pred = rf.predict(X_test_nmf)
score = accuracy_score(Y_test, Y_pred)
print(f"Best TFIDF-NMF-RF Model Accuracy: {score}")

Best TFIDF-NMF-RF Model Accuracy: 0.7232986767485823


# FastText - Classifier Pipeline

In [16]:
from gensim.models import FastText
from nltk.tokenize import word_tokenize
import numpy as np

# Tokenize the text
tokenized_train = [word_tokenize(doc.lower()) for doc in X_train]
tokenized_val = [word_tokenize(doc.lower()) for doc in X_val]
tokenized_test = [word_tokenize(doc.lower()) for doc in X_test]

# Train FastText model on your training dataset
model = FastText(vector_size=200, window=10)  # Adjust parameters as needed
model.build_vocab(tokenized_train)
model.train(tokenized_train, total_examples=len(tokenized_train), epochs=10)

# Function to convert text documents to a mean vector
def document_vector(doc, model):
    # Remove out-of-vocabulary words
    doc = [word for word in doc if word in model.wv.key_to_index]
    if not doc:
        return np.zeros(model.vector_size)
    return np.mean(model.wv[doc], axis=0)

# Vectorize the documents
X_train_embeddings = np.array([document_vector(doc, model) for doc in tokenized_train])
X_val_embeddings = np.array([document_vector(doc, model) for doc in tokenized_val])
X_test_embeddings = np.array([document_vector(doc, model) for doc in tokenized_test])

In [17]:
# Classifier Training
rf = RandomForestClassifier(n_estimators=511, random_state=RANDOM_SEED)
rf.fit(X_train_embeddings, Y_train)
Y_pred = rf.predict(X_test_embeddings)
score = accuracy_score(Y_test, Y_pred)
print(f"Best FastText-RF Model Accuracy: {score}")

Best FastText-RF Model Accuracy: 0.6713137996219282
