In [None]:
import cudf
import cuml
import dask
import dask.dataframe as dd
from cuml.feature_extraction.text import TfidfVectorizer, CountVectorizer
from cuml.naive_bayes import MultinomialNB
from cuml.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the pre-cleaned review dataset from the working directory.
dataset_path = "cleaned_dataset.csv"
df = pd.read_csv(dataset_path)

# Leave the preview as the final expression so the notebook renders it.
df.head()

In [None]:
# Show how many rows fall under each sentiment label.
sentiment_counts = df['sentiment'].value_counts()
print(sentiment_counts)

# Wrap the pandas frame in a Dask DataFrame split into 4 partitions.
dask_df = dd.from_pandas(df, npartitions=4)

In [None]:
# Take the text and label columns straight from the in-memory pandas frame.
# Wrapping `df` in Dask only to call .compute() immediately materialises the
# same pandas data again, so that round-trip added overhead and no benefit.
X = df['cleaned_review']
y = df['sentiment']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert each split to a cudf Series so the cuML vectorizers and estimators
# in the following cells receive GPU-backed inputs.
X_train = cudf.Series(X_train)
X_test = cudf.Series(X_test)
y_train = cudf.Series(y_train)
y_test = cudf.Series(y_test)

In [None]:
def to_host_array(values):
    """Return `values` as a host (CPU) array usable by sklearn metrics.

    Tries, in order, the conversion methods exposed by the GPU containers this
    notebook may encounter: `to_numpy` (current cudf Series), `to_array`
    (older cudf releases that still ship it) and `get` (cupy ndarray).
    Objects exposing none of these are returned unchanged.
    """
    for method_name in ("to_numpy", "to_array", "get"):
        converter = getattr(values, method_name, None)
        if callable(converter):
            return converter()
    return values


print("Bag of Words Model on GPU:")
bow_vectorizer = CountVectorizer()  # cuML's GPU CountVectorizer
X_train_bow = bow_vectorizer.fit_transform(X_train)
# Reuse the vocabulary learned on the training split for the test split.
X_test_bow = bow_vectorizer.transform(X_test)

# Train and evaluate a multinomial Naive Bayes model on the BoW features.
bow_model = MultinomialNB()
bow_model.fit(X_train_bow, y_train)
y_pred_bow = bow_model.predict(X_test_bow)

# sklearn metrics need host arrays. `Series.to_array()` is no longer available
# in current cudf, and depending on the cuML version `predict` may return a
# cupy array rather than a cudf Series, so convert through the helper above.
y_pred_bow = to_host_array(y_pred_bow)
y_test_np = to_host_array(y_test)

print("BoW Classification Report:")
print(classification_report(y_test_np, y_pred_bow))
print("BoW Accuracy:", accuracy_score(y_test_np, y_pred_bow))

In [None]:
print("TF-IDF Model on GPU:")
tfidf_vectorizer = TfidfVectorizer()  # cuML's GPU TF-IDF vectorizer
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
# Reuse the vocabulary/IDF weights learned on the training split.
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Train and evaluate a multinomial Naive Bayes model on the TF-IDF features.
tfidf_model = MultinomialNB()
tfidf_model.fit(X_train_tfidf, y_train)
y_pred_tfidf = tfidf_model.predict(X_test_tfidf)

# sklearn metrics need a host array. `Series.to_array()` is no longer available
# in current cudf, and depending on the cuML version `predict` may return a
# cupy array, so use whichever host-conversion method the result exposes:
# `to_numpy` (current cudf), `to_array` (older cudf) or `get` (cupy).
for conversion_method in ("to_numpy", "to_array", "get"):
    if hasattr(y_pred_tfidf, conversion_method):
        y_pred_tfidf = getattr(y_pred_tfidf, conversion_method)()
        break

print("TF-IDF Classification Report:")
print(classification_report(y_test_np, y_pred_tfidf))
print("TF-IDF Accuracy:", accuracy_score(y_test_np, y_pred_tfidf))

In [None]:
# Pair each model label with its test-set predictions, then score them all
# against the same host-side ground truth.
model_names = ["BoW", "TF-IDF"]
model_predictions = [y_pred_bow, y_pred_tfidf]
performance = pd.DataFrame({
    "Model": model_names,
    "Accuracy": [accuracy_score(y_test_np, preds) for preds in model_predictions],
})

# One bar per model; the title is set on the Axes that seaborn returns.
ax = sns.barplot(data=performance, x="Model", y="Accuracy")
ax.set_title("BoW vs TF-IDF Model Performance")
plt.show()