In [None]:
import cudf
import cuml
import dask
import dask.dataframe as dd
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from cuml.ensemble import RandomForestClassifier
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.linear_model import LogisticRegression
from cuml.metrics import accuracy_score
from cuml.naive_bayes import MultinomialNB
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
from dask_ml.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Read the cleaned dataset with cudf, then pick out the text column (features)
# and the sentiment column (labels) used by the later cells.
df = cudf.read_csv("cleaned_dataset.csv")
X, y = df['cleaned_review'], df['sentiment']

In [None]:
# Hold out 20% of the rows for evaluation; random_state pins the shuffle.
# Every piece returned by train_test_split is wrapped in cudf.Series, in the
# order the splitter yields them: X_train, X_test, y_train, y_test.
X_train, X_test, y_train, y_test = (
    cudf.Series(part)
    for part in train_test_split(X, y, test_size=0.2, random_state=42)
)

In [None]:
# Create a LocalCUDACluster and connect a Client to it.
cluster = LocalCUDACluster()
client = Client(cluster)

# Candidate classifiers, keyed by a human-readable name; every estimator is
# built with its default hyper-parameters.
models = dict([
    ('Logistic Regression', LogisticRegression()),
    ('Naive Bayes', MultinomialNB()),
    ('Random Forest', RandomForestClassifier()),
    # Add XGBoost or any other GPU-compatible model
])

In [None]:
def _to_host(values):
    """Return ``values`` as a host (CPU) array for the CPU-side metrics.

    ``Series.to_array()`` has been removed from cuDF, and ``predict`` may
    return either a cudf Series or a CuPy array depending on the type of the
    features it was given, so both containers are handled here.
    """
    if hasattr(values, "to_numpy"):  # cudf / pandas Series
        return values.to_numpy()
    if hasattr(values, "get"):  # CuPy ndarray
        return values.get()
    return values  # already host-side


# The estimators cannot consume raw strings, so turn the review text into
# TF-IDF features first. The vectorizer is fitted on the training split only,
# so the test split contributes nothing to the vocabulary or IDF weights.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Loop-invariant: copy the ground-truth labels to host once, not per model.
y_test_cpu = _to_host(y_test)

results = {}
for model_name, model in models.items():
    if isinstance(model, RandomForestClassifier):
        # Random forest is given dense float32 features instead of the sparse
        # TF-IDF matrix.
        # NOTE(review): densifying can be very memory-hungry for a large
        # vocabulary -- consider TfidfVectorizer(max_features=...) if this
        # runs out of GPU memory.
        train_features = X_train_tfidf.toarray().astype("float32")
        test_features = X_test_tfidf.toarray().astype("float32")
    else:
        train_features, test_features = X_train_tfidf, X_test_tfidf

    # NOTE(review): assumes `sentiment` is already numerically encoded --
    # confirm, and label-encode before this cell if it holds strings.
    model.fit(train_features, y_train)
    y_pred = _to_host(model.predict(test_features))

    accuracy = accuracy_score(y_test_cpu, y_pred)
    results[model_name] = accuracy

    # Print classification report
    print(f"{model_name} Classification Report:")
    print(classification_report(y_test_cpu, y_pred))

In [None]:
# Tabulate one (model name, accuracy) row per evaluated model and print it.
accuracy_rows = list(results.items())
results_df = pd.DataFrame.from_records(accuracy_rows, columns=['Model', 'Accuracy'])
print(results_df)