In [None]:
import cudf
import cupy as cp
import dask
import dask.dataframe as dd
import numpy as np
import pandas as pd
from cuml.naive_bayes import GaussianNB
from dask.diagnostics import ProgressBar
from gensim.models import Word2Vec

In [None]:
# Load the cleaned review dataset from the working directory.
df = pd.read_csv("cleaned_dataset.csv")

# pandas reads empty CSV fields back as NaN (a float). Normalise the text
# column to real strings so the `.split()` calls in later cells never hit a
# non-string value.
df['cleaned_review'] = df['cleaned_review'].fillna("").astype(str)

X = df['cleaned_review']
y = df['sentiment']

# Wrap the frame in a 4-partition Dask DataFrame; the per-review `.map(...)`
# calls in later cells are scheduled through it.
dask_df = dd.from_pandas(df, npartitions=4)

In [None]:
from gensim.models import Word2Vec

# Tokenize and preprocess data (using Dask for parallel tokenization)
def tokenize(text):
    """Break a review into tokens, splitting on runs of whitespace."""
    tokens = text.split()
    return tokens

# Tokenize every review through Dask, with a progress bar while it computes.
with ProgressBar():
    tokenized_reviews = dask_df['cleaned_review'].map(tokenize).compute()

# Train Word2Vec on the tokenized reviews.
# A fixed seed keeps the embeddings (and every downstream metric) comparable
# between runs. Per the gensim docs, bit-for-bit reproducibility additionally
# requires workers=1 and a fixed PYTHONHASHSEED; workers=4 is kept for speed.
w2v_model = Word2Vec(
    sentences=tokenized_reviews,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
    seed=42,
)


In [None]:
import os
from tqdm import tqdm

# Load GloVe embeddings from a text file (sequential read, optional tqdm bar)
def load_glove_embeddings(glove_file_path, show_progress=True):
    """Read a GloVe-style text file into a ``{word: vector}`` dictionary.

    Every non-blank line is split on whitespace; the first field is used as
    the word and the remaining fields are parsed as its vector components.

    Args:
        glove_file_path: Path of the embeddings text file.
        show_progress: When True (the default) the line iterator is wrapped
            in ``tqdm`` to display progress; pass False to read silently.

    Returns:
        dict mapping each word to a 1-D ``float32`` NumPy array.

    Raises:
        ValueError: If a vector component cannot be parsed as a float.
    """
    embeddings = {}
    # Decode explicitly as UTF-8 instead of the platform default so that
    # non-ASCII tokens load identically on every operating system.
    with open(glove_file_path, 'r', encoding='utf-8') as file:
        lines = tqdm(file) if show_progress else file
        for line in lines:
            values = line.split()
            if not values:
                # Blank line (e.g. a trailing newline): nothing to index.
                continue
            embeddings[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings

# Read the GloVe vectors from the local text file into a word -> vector dict
glove_embeddings = load_glove_embeddings(
    glove_file_path="glove.6B.100d.txt",
)

In [None]:
def get_sentence_embedding(sentence, model):
    """Average the word vectors of a sentence's in-vocabulary tokens.

    Args:
        sentence: Iterable of tokens.
        model: Object exposing ``wv`` (supporting ``in`` and ``[]`` lookups
            by token) and an integer ``vector_size``.

    Returns:
        The element-wise mean of the vectors found in ``model.wv``. When no
        token is found, a ``float32`` zero vector of length
        ``model.vector_size`` is returned instead.
    """
    vectors = [model.wv[word] for word in sentence if word in model.wv]
    if not vectors:
        # gensim stores word vectors as float32; keep the fallback in the same
        # dtype so stacking the per-review results does not upcast the whole
        # feature matrix to float64.
        return np.zeros(model.vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0)

# Build one Word2Vec sentence vector per review, scheduled through Dask
def _embed_review_w2v(review):
    """Whitespace-split one review and average its Word2Vec vectors."""
    return get_sentence_embedding(review.split(), w2v_model)

with ProgressBar():
    w2v_vectors_lazy = dask_df['cleaned_review'].map(_embed_review_w2v)
    X_word2vec = w2v_vectors_lazy.compute()

In [None]:
def get_sentence_glove_embedding(sentence, embeddings, dim=None):
    """Average the GloVe vectors of a sentence's known tokens.

    Args:
        sentence: Iterable of tokens.
        embeddings: Mapping from word to its 1-D vector.
        dim: Length of the zero vector returned when no token is known.
            When None (the default) it is taken from a vector stored in
            ``embeddings``, falling back to 100 if the mapping is empty.

    Returns:
        The element-wise mean of the vectors of the tokens present in
        ``embeddings``, or a zero vector when none are present. The zero
        vector uses the dtype of the stored vectors (``float32`` if the
        mapping is empty or its values carry no dtype).
    """
    vectors = [embeddings[word] for word in sentence if word in embeddings]
    if vectors:
        return np.mean(vectors, axis=0)

    # No known token: size the fallback from the table itself rather than a
    # hard-coded constant, so other GloVe dimensionalities work unchanged.
    sample = next(iter(embeddings.values()), None)
    if dim is None:
        dim = 100 if sample is None else len(sample)
    dtype = getattr(sample, "dtype", np.float32)
    return np.zeros(dim, dtype=dtype)

# Build one GloVe sentence vector per review, scheduled through Dask
def _embed_review_glove(review):
    """Whitespace-split one review and average its GloVe vectors."""
    return get_sentence_glove_embedding(review.split(), glove_embeddings)

with ProgressBar():
    glove_vectors_lazy = dask_df['cleaned_review'].map(_embed_review_glove)
    X_glove = glove_vectors_lazy.compute()

In [None]:
from cuml.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

def _to_gpu_feature_frame(embeddings):
    """Turn a pandas Series of 1-D vectors into an (n_rows, dim) cuDF frame.

    `pd.DataFrame(series_of_arrays)` would yield a single object column that
    holds whole arrays, so the vectors are stacked into a 2-D matrix first.
    A value that is already a cuDF DataFrame is returned untouched, which
    keeps this cell safe to re-run.
    """
    if isinstance(embeddings, cudf.DataFrame):
        return embeddings
    matrix = np.vstack(embeddings.to_numpy()).astype(np.float32)
    return cudf.DataFrame.from_pandas(pd.DataFrame(matrix, index=embeddings.index))


# Convert to cuDF for GPU usage
X_word2vec = _to_gpu_feature_frame(X_word2vec)
X_glove = _to_gpu_feature_frame(X_glove)
# NOTE(review): assumes `sentiment` holds numeric class labels; confirm, and
# encode them first if they are strings.
y_cudf = cudf.Series(y)

# Split the data.
# The shuffle is drawn once over host-side row positions and then applied to
# every GPU object. With the same test_size/random_state this selects the same
# rows as splitting the arrays directly, guarantees both feature sets share
# one partition, and avoids handing cuDF objects to scikit-learn's indexing.
row_positions = np.arange(len(y_cudf))
train_idx, test_idx = train_test_split(row_positions, test_size=0.2, random_state=42)

X_train_w2v, X_test_w2v = X_word2vec.iloc[train_idx], X_word2vec.iloc[test_idx]
X_train_glove, X_test_glove = X_glove.iloc[train_idx], X_glove.iloc[test_idx]
y_train, y_test = y_cudf.iloc[train_idx], y_cudf.iloc[test_idx]

In [None]:
# Training and evaluation with Word2Vec embeddings.
# Averaged word vectors are real-valued and can be negative, which the
# multinomial (count-feature) Naive Bayes model is not defined for, so the
# Gaussian variant is fitted instead.
model_w2v = GaussianNB()
model_w2v.fit(X_train_w2v, y_train)
y_pred_w2v = model_w2v.predict(X_test_w2v)

# Evaluate: scikit-learn metrics need host arrays. `.to_numpy()` is the
# current cuDF device-to-host copy (`.to_array()` has been removed).
print("Word2Vec Classification Report:")
print(classification_report(y_test.to_numpy(), y_pred_w2v.to_numpy()))

In [None]:
# Training and evaluation with GloVe embeddings.
# Averaged word vectors are real-valued and can be negative, which the
# multinomial (count-feature) Naive Bayes model is not defined for, so the
# Gaussian variant is fitted instead.
model_glove = GaussianNB()
model_glove.fit(X_train_glove, y_train)
y_pred_glove = model_glove.predict(X_test_glove)

# Evaluate: scikit-learn metrics need host arrays. `.to_numpy()` is the
# current cuDF device-to-host copy (`.to_array()` has been removed).
print("GloVe Classification Report:")
print(classification_report(y_test.to_numpy(), y_pred_glove.to_numpy()))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Copy the true labels to the host once; scikit-learn metrics need host
# arrays. `.to_numpy()` is the current cuDF device-to-host copy
# (`.to_array()` has been removed).
y_true_host = y_test.to_numpy()

performance = pd.DataFrame({
    "Model": ["Word2Vec", "GloVe"],
    "Accuracy": [
        accuracy_score(y_true_host, y_pred_w2v.to_numpy()),
        accuracy_score(y_true_host, y_pred_glove.to_numpy()),
    ],
})

# Draw on an explicit Axes so the figure does not depend on pyplot's implicit
# "current axes" state.
fig, ax = plt.subplots()
sns.barplot(x="Model", y="Accuracy", data=performance, ax=ax)
ax.set_title("Word2Vec vs GloVe Model Performance")
plt.show()