# In [None]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import torch
import time
import tracemalloc
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from math import sqrt
from sklearn.svm import SVC
import time
import tracemalloc
import xgboost as xgb
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression

# Start the wall-clock timer and the allocation tracer; both are read back
# at the very end of the script to report total run time and memory use.
st = time.time()
tracemalloc.start()

# Load the dataset
df = pd.read_csv('restaurant.csv')

# Prepare the inputs and labels
texts = df['review'].values
labels = df['label'].values

# Load the RoBERTa tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("roberta-large")
model = AutoModel.from_pretrained("roberta-large")
# Inference only: make sure dropout is disabled so features are deterministic.
model.eval()

# RoBERTa can only attend to 512 tokens (including the special tokens);
# longer inputs make the forward pass fail on the position embeddings,
# so over-long reviews are truncated instead of crashing the whole run.
MAX_TOKENS = 512

# Extract features using RoBERTa: one row per review, holding the final
# hidden state of the first token (`<s>`, RoBERTa's CLS equivalent).
# The width is read from the model config rather than hard-coded so the
# checkpoint name above can be swapped without touching this line.
inputs = np.zeros((len(texts), model.config.hidden_size))
for i, text in enumerate(texts):
    input_ids = tokenizer.encode(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=MAX_TOKENS,
    )
    with torch.no_grad():
        last_hidden_states = model(input_ids).last_hidden_state
    # Batch index 0 (single review), token index 0, all hidden units.
    inputs[i, :] = last_hidden_states[0, 0, :].numpy()

# Hold out 20% of the samples as the final test set; the remaining 80% is
# used for the cross-validated sanity check and for the final fit.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    inputs, labels, test_size=0.2, random_state=42
)

# 5-fold splitter over the train/validation portion (shuffled, seeded).
kf = KFold(n_splits=5, shuffle=True, random_state=42)


def _new_voting_ensemble():
    """Build a fresh, unfitted soft-voting ensemble.

    Returns the three base estimators together with the VotingClassifier
    wrapping them, in the order (svc, ada, rf, voting).
    """
    linear_svc = SVC(kernel='linear', C=1.0, probability=True)
    boosted = AdaBoostClassifier(n_estimators=100)
    forest = RandomForestClassifier(n_estimators=100)
    voting = VotingClassifier(
        estimators=[('svc', linear_svc), ('ada', boosted), ('rf', forest)],
        voting='soft',
    )
    return linear_svc, boosted, forest, voting


# Report validation accuracy on each fold. No hyperparameters are changed
# between folds; every fold simply starts from a brand-new ensemble.
for train_index, val_index in kf.split(X_train_val):
    svc, ada, rf, voting_clf = _new_voting_ensemble()

    X_train, y_train = X_train_val[train_index], y_train_val[train_index]
    X_val, y_val = X_train_val[val_index], y_train_val[val_index]

    voting_clf.fit(X_train, y_train)

    val_pred = voting_clf.predict(X_val)
    val_acc = accuracy_score(y_val, val_pred)
    print("Validation accuracy:", val_acc)

# Final model: a fresh ensemble fitted on the whole train/validation portion
# and scored once on the held-out test set.
svc, ada, rf, voting_clf = _new_voting_ensemble()
voting_clf.fit(X_train_val, y_train_val)
test_pred = voting_clf.predict(X_test)
test_acc = accuracy_score(y_test, test_pred)
print("Test accuracy for Voting:", test_acc)
rms = sqrt(mean_squared_error(y_test, test_pred))
print("RMSE Error is: " + str(rms))

# Base estimators for the bagging ensembles. These names were referenced but
# never defined, which raised NameError before any bagging model was trained.
LogReg_clf = LogisticRegression()
DTree_clf = DecisionTreeClassifier()

# Each entry pairs the label printed in the report with an unfitted model.
# The base estimator is passed positionally because its keyword was renamed
# from `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed
# in 1.4); the positional form works on both sides of that change.
# `rf` is the unfitted random forest defined for the voting ensemble above;
# BaggingClassifier clones it, so sharing the instance is safe.
candidates = [
    ("Test accuracy for Bag LR:",
     BaggingClassifier(LogReg_clf, n_estimators=100, random_state=42)),
    ("Test accuracy for Bag RF:",
     BaggingClassifier(rf, n_estimators=100, random_state=42)),
    ("Test accuracy for Bag DTree:",
     BaggingClassifier(DTree_clf, n_estimators=100, random_state=42)),
    ("Test accuracy for XBoost:",
     xgb.XGBClassifier(n_estimators=70, learning_rate=0.9)),
]

for label, clf in candidates:
    clf.fit(X_train_val, y_train_val)
    # Score the model that was just fitted. Previously every section
    # predicted with `voting_clf`, so all four reports repeated the voting
    # ensemble's numbers instead of measuring these models.
    test_pred = clf.predict(X_test)
    test_acc = accuracy_score(y_test, test_pred)
    print(label, test_acc)
    # RMSE is computed directly on the class labels, so it is only
    # meaningful for numeric labels - presumably 0/1 here; TODO confirm.
    rms = sqrt(mean_squared_error(y_test, test_pred))
    print("RMSE Error is: " + str(rms))


# Wall-clock duration measured from the `st` timestamp taken at the start
# of the script.
et = time.time()
elapsed_time = et - st
print('Execution time:', elapsed_time, 'seconds')

# Read tracemalloc's counters first, then stop tracing; the byte counts are
# converted to MB only for the report line.
current, peak = tracemalloc.get_traced_memory()
tracemalloc.stop()
bytes_per_mb = 1024 * 1024
current_mb = current / bytes_per_mb
peak_mb = peak / bytes_per_mb
print("Current memory usage is", current_mb, "MB; Peak was", peak_mb, "MB")
