In [2]:
import numpy as np
import umap
from sklearn.manifold import TSNE
from sklearn.decomposition import KernelPCA

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import LatentDirichletAllocation, NMF, PCA, SparsePCA
from sklearn.pipeline import Pipeline
from dataset.dataset import Dataset
from constants import CLEANED_DATASET_PATH

In [3]:
RANDOM_SEED = 0
N_ESTIMATORS = 511

In [4]:
dataset = Dataset(full_data_path=CLEANED_DATASET_PATH,
                  from_scratch=False,
                  split_sizes=[10000, 4232, 4232])
dataset.build()

X_train = dataset.get_features(split_type="train")
Y_train = dataset.get_labels(split_type="train")
X_val = dataset.get_features(split_type="val")
Y_val = dataset.get_labels(split_type="val")
X_test = dataset.get_features(split_type="test")
Y_test = dataset.get_labels(split_type="test")

Data loaded from dataset/cleaned_dataset.pkl


# Classifier only Pipeline

## Count Vectorizer

In [5]:
# Create the pipeline
N_ESTIMATORS = 511
counts_rf_pipeline = Pipeline([
    ("vectorizer", CountVectorizer(max_df=0.95, min_df=2)),
    ("rf", RandomForestClassifier(n_estimators=511, random_state=RANDOM_SEED))
])

# Fit the pipeline on the training data
counts_rf_pipeline.fit(X_train, Y_train)

# Evaluate the pipeline
Y_pred = counts_rf_pipeline.predict(X_test)
accuracy = accuracy_score(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred)
print(f"Best Counts-RF only Model Accuracy: {accuracy}")
print(f"Best Counts-RF only Model F1-Score: {f1}")

Best Counts-RF only Model Accuracy: 0.7374763705103969
Best Counts-RF only Model F1-Score: 0.7036543078154175


## TF-IDF Vectorizer

In [6]:
# Create the pipeline
N_ESTIMATORS = 511
tfidf_rf_pipeline = Pipeline([
    ("vectorizer", TfidfVectorizer(max_df=0.95, min_df=2)),
    ("rf", RandomForestClassifier(n_estimators=N_ESTIMATORS, random_state=RANDOM_SEED))
])

# Fit the pipeline on the training data
tfidf_rf_pipeline.fit(X_train, Y_train)

# Evaluate the pipeline
Y_pred = tfidf_rf_pipeline.predict(X_test)
accuracy = accuracy_score(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred)
print(f"Best TFIDF-RF only Model Accuracy: {accuracy}")
print(f"Best TFIDF-RF only Model F1-Score: {f1}")

Best TFIDF-RF only Model Accuracy: 0.7318052930056711
Best TFIDF-RF only Model F1-Score: 0.7103853023730543


# LDA/NMF-Classifier Pipeline

## Count Vectorizer - LDA Pipeline

In [7]:
# Setup
LDA_TOPICS = 10
N_ESTIMATORS = 511

# Create and fit the pipeline
counts_lda_rf_pipeline = Pipeline([
    ("vectorizer", CountVectorizer(max_df=0.95, min_df=2)),
    ("lda", LatentDirichletAllocation(n_components=LDA_TOPICS, random_state=RANDOM_SEED)),
    ("rf", RandomForestClassifier(n_estimators=511, random_state=RANDOM_SEED))
])

# Fit the pipeline
counts_lda_rf_pipeline.fit(X_train, Y_train)

# Evaluate the pipeline
Y_pred = counts_lda_rf_pipeline.predict(X_test)
accuracy = accuracy_score(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred)
print(f"Best Counts-LDA-RF Model Accuracy: {accuracy}")
print(f"Best Counts-LDA-RF Model F1-Score: {f1}")

Best Counts-LDA-RF Model Accuracy: 0.6422495274102079
Best Counts-LDA-RF Model F1-Score: 0.6172901921132457


## Count Vectorizer - NMF Pipeline

In [8]:
# Setup
NMF_TOPICS = 150

# Create and fit the pipeline
counts_nmf_rf_pipeline = Pipeline([
    ("vectorizer", CountVectorizer(max_df=0.95, min_df=2)),
    ("nmf", NMF(n_components=NMF_TOPICS, random_state=RANDOM_SEED)),
    ("rf", RandomForestClassifier(n_estimators=511, random_state=RANDOM_SEED))
])

# Fit the pipeline
counts_nmf_rf_pipeline.fit(X_train, Y_train)

# Evaluate the pipeline
Y_pred = counts_nmf_rf_pipeline.predict(X_test)
accuracy = accuracy_score(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred)
print(f"Best Counts-NMF-RF Model Accuracy: {accuracy}")
print(f"Best Counts-NMF-RF Model F1-Score: {f1}")

Best Counts-NMF-RF Model Accuracy: 0.7162098298676749
Best Counts-NMF-RF Model F1-Score: 0.6926030202201178


## TF-IDF Vectorizer - LDA Pipeline

In [9]:
# Setup
LDA_TOPICS = 10

# Create and fit the pipeline
tfidf_lda_rf_pipeline = Pipeline([
    ("vectorizer", TfidfVectorizer(max_df=0.95, min_df=2)),
    ("lda", LatentDirichletAllocation(n_components=NMF_TOPICS, random_state=RANDOM_SEED)),
    ("rf", RandomForestClassifier(n_estimators=511, random_state=RANDOM_SEED))
])

# Fit the pipeline
tfidf_lda_rf_pipeline.fit(X_train, Y_train)

# Evaluate the pipeline
Y_pred = tfidf_lda_rf_pipeline.predict(X_test)
accuracy = accuracy_score(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred)
print(f"Best TFIDF-LDA-RF Model Accuracy: {accuracy}")
print(f"Best TFIDF-LDA-RF Model F1-Score: {f1}")

Best TFIDF-LDA-RF Model Accuracy: 0.6372873345935728
Best TFIDF-LDA-RF Model F1-Score: 0.5997392438070405


## TF-IDF Vectorizer - NMF Pipeline

In [10]:
# Setup
NMF_TOPICS = 150

# Create and fit the pipeline
tfidf_lda_rf_pipeline = Pipeline([
    ("vectorizer", TfidfVectorizer(max_df=0.95, min_df=2)),
    ("nmf", NMF(n_components=NMF_TOPICS, random_state=RANDOM_SEED)),
    ("rf", RandomForestClassifier(n_estimators=511, random_state=RANDOM_SEED))
])

# Fit the pipeline
tfidf_lda_rf_pipeline.fit(X_train, Y_train)

# Evaluate the pipeline
Y_pred = tfidf_lda_rf_pipeline.predict(X_test)
accuracy = accuracy_score(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred)
print(f"Best TFIDF-NMF-RF Model Accuracy: {accuracy}")
print(f"Best TFIDF-NMF-RF Model Accuracy: {f1}")

Best TFIDF-NMF-RF Model Accuracy: 0.7232986767485823
Best TFIDF-NMF-RF Model Accuracy: 0.7049634668682287


# FastText - Classifier Pipeline

In [11]:
# from gensim.models import FastText
# from nltk.tokenize import word_tokenize
# import numpy as np
# 
# # Tokenize the text
# tokenized_train = [word_tokenize(doc.lower()) for doc in X_train]
# tokenized_val = [word_tokenize(doc.lower()) for doc in X_val]
# tokenized_test = [word_tokenize(doc.lower()) for doc in X_test]
# 
# # Train FastText model on your training dataset
# model = FastText(vector_size=200, window=10)  # Adjust parameters as needed
# model.build_vocab(tokenized_train)
# model.train(tokenized_train, total_examples=len(tokenized_train), epochs=10)
# 
# # Function to convert text documents to a mean vector
# def document_vector(doc, model):
#     # Remove out-of-vocabulary words
#     doc = [word for word in doc if word in model.wv.key_to_index]
#     if not doc:
#         return np.zeros(model.vector_size)
#     return np.mean(model.wv[doc], axis=0)
# 
# # Vectorize the documents
# X_train_embeddings = np.array([document_vector(doc, model) for doc in tokenized_train])
# X_val_embeddings = np.array([document_vector(doc, model) for doc in tokenized_val])
# X_test_embeddings = np.array([document_vector(doc, model) for doc in tokenized_test])

In [12]:
# # Classifier Training
# rf = RandomForestClassifier(n_estimators=511, random_state=RANDOM_SEED)
# rf.fit(X_train_embeddings, Y_train)
# Y_pred = rf.predict(X_test_embeddings)
# accuracy = accuracy_score(Y_test, Y_pred)
# f1 = f1_score(Y_test, Y_pred)
# print(f"Best FastText-RF Model Accuracy: {score}")

# Dimensionality Reduction - Classifier Pipeline

## Kernel PCA

In [13]:
# Create and fit the pipeline
kpca_rf_pipeline = Pipeline([
    ("vectorizer", CountVectorizer(max_df=0.95, min_df=2)),
    ("kpca", KernelPCA(n_components=50, kernel="rbf", gamma=0.01, n_jobs=-1)),
    ("rf", RandomForestClassifier(n_estimators=511, random_state=RANDOM_SEED))
])

# Fit the pipeline
kpca_rf_pipeline.fit(X_train, Y_train)

# Evaluate the pipeline
Y_pred = kpca_rf_pipeline.predict(X_test)
accuracy = accuracy_score(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred)
print(f"Best KPCA-RF Model Accuracy: {accuracy}")
print(f"Best KPCA-RF Model F1-Score: {f1}")

Best KPCA-RF Model Accuracy: 0.6720226843100189
Best KPCA-RF Model F1-Score: 0.6468193384223918


## UMAP

In [17]:
import umap.umap_ as umap

In [18]:
# Create and fit the pipeline
umap_rf_pipeline = Pipeline([
    ("vectorizer", CountVectorizer(max_df=0.95, min_df=2)),
    ("umap", umap.UMAP()),
    ("rf", RandomForestClassifier(n_estimators=511, random_state=RANDOM_SEED))
])

# Fit the pipeline
umap_rf_pipeline.fit(X_train, Y_train)

# Evaluate the pipeline
Y_pred = umap_rf_pipeline.predict(X_test)
accuracy = accuracy_score(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred)
print(f"Best UMAP-RF Model Accuracy: {accuracy}")
print(f"Best UMAP-RF Model F1-Score: {f1}")

  self._set_arrayXarray(i, j, x)


Best UMAP-RF Model Accuracy: 0.6042060491493384
Best UMAP-RF Model F1-Score: 0.4444444444444444


## Sparse PCA