In [None]:
import cudf
import cuml
import dask
import dask_cudf
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from cuml.linear_model import LogisticRegression
import numpy as np

In [None]:
# Read the pre-cleaned review dataset from the working directory.
df = pd.read_csv("cleaned_dataset.csv")

# Feature column (review text) and target column (sentiment label).
X, y = df['cleaned_review'], df['sentiment']

In [None]:
# Load the tokenizer and the encoder from the same "bert-base-uncased" checkpoint
# so that the vocabulary matches the model weights.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

# Move the encoder weights onto the GPU.
model = model.to("cuda")

In [None]:
def get_bert_embeddings(text_list, batch_size=32):
    """Return one [CLS] embedding per input text, computed in mini-batches.

    The texts are tokenized and pushed through the module-level ``model`` in
    slices of ``batch_size`` so that only one slice of (padded) token tensors
    lives on the device at a time, instead of the whole list at once.

    Args:
        text_list: A list of strings, or a single string. ``None``/NaN entries
            are embedded as the empty string instead of crashing the tokenizer.
        batch_size: Number of texts per forward pass. Must be positive.

    Returns:
        A 2-D NumPy array with one row per input text and
        ``model.config.hidden_size`` columns. For empty input the array has
        zero rows (float32) but keeps the column count so results from
        several calls can still be concatenated.

    Raises:
        ValueError: If ``batch_size`` is not a positive integer.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # A bare string would otherwise be iterated character by character below.
    if isinstance(text_list, str):
        text_list = [text_list]

    # Missing values read from CSV arrive as None/NaN; the tokenizer only accepts str.
    texts = ["" if pd.isna(text) else str(text) for text in text_list]
    if not texts:
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    # Follow the model's device rather than assuming it is always "cuda".
    device = model.device
    batch_embeddings = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(
            batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        ).to(device)
        with torch.no_grad():
            outputs = model(**inputs)
        # Position 0 of every sequence is the [CLS] token; use it as the sentence embedding.
        batch_embeddings.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())

    return np.concatenate(batch_embeddings, axis=0)

In [None]:
import dask.dataframe as dd
from dask.diagnostics import ProgressBar

# Wrap the pandas frame in a Dask DataFrame split into 4 partitions.
dask_df = dd.from_pandas(df, npartitions=4)

# Embed the review column partition by partition and materialize the result,
# reporting task progress while the graph executes.
# NOTE(review): every partition calls the same module-level model, so the
# partitions presumably contend for one GPU rather than running truly in
# parallel — confirm this is the intended use of Dask here.
with ProgressBar():
    X_embeddings = (
        dask_df['cleaned_review']
        .map_partitions(lambda part: get_bert_embeddings(part.tolist()))
        .compute()
    )

In [None]:
# Stage the embedding matrix as a pandas frame, then copy it into a cuDF DataFrame.
embeddings_frame = pd.DataFrame(X_embeddings)
X_embeddings_cudf = cudf.DataFrame.from_pandas(embeddings_frame)

# Copy the label column into a cuDF Series as well.
# NOTE(review): presumably `sentiment` is already numerically encoded — confirm
# before fitting, since the labels are passed through unchanged.
y_cudf = cudf.Series(y)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_embeddings_cudf,
    y_cudf,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit a cuML logistic-regression classifier (default settings) on the training embeddings.
clf = LogisticRegression()
clf.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out embeddings.
y_pred = clf.predict(X_test)

In [None]:
# Copy the true and predicted labels to host memory once, since the
# scikit-learn metrics below work on NumPy arrays. `Series.to_array()` was
# removed from cuDF; `to_numpy()` is its replacement.
y_test_host = y_test.to_numpy()
y_pred_host = y_pred.to_numpy()

print("Classification Report:")
print(classification_report(y_test_host, y_pred_host))
print("Accuracy:", accuracy_score(y_test_host, y_pred_host))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Build a one-row summary table holding this BERT pipeline's accuracy; rows for
# other models can be added to compare them in the same chart.
# `to_numpy()` replaces the removed cuDF `Series.to_array()` for the host copy.
performance = pd.DataFrame({
    "Model": ["BERT"],
    "Accuracy": [accuracy_score(y_test.to_numpy(), y_pred.to_numpy())]
})

# Bar chart of accuracy per model.
sns.barplot(x="Model", y="Accuracy", data=performance)
plt.title("BERT Model Performance")
plt.show()