In [2]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from math import sqrt
from sklearn.svm import SVC
import time
import tracemalloc
import xgboost as xgb

# get the start time
st = time.time()
tracemalloc.start()

# Load the dataset
df = pd.read_csv("restaurant.csv")

# Preprocess the text
df['text'] = df['review'].apply(lambda x: x.lower().split())

# Train the CBOW model
model = Word2Vec(df['text'], size=100, window=10, min_count=1, sg=1)


# Convert the text to numerical vectors using the CBOW model
text_vectors = []
for i in range(len(df)):
    text_vectors.append(np.mean([model.wv[word] for word in df['text'][i]], axis=0))
text_vectors = np.array(text_vectors)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(text_vectors, df['label'], test_size=0.2, random_state=0)

# Train multiple classifiers and use a voting ensemble
LogReg_clf = LogisticRegression()
SVC_clf = SVC(probability=True, kernel='rbf')
DTree_clf = DecisionTreeClassifier()
rf = RandomForestClassifier(random_state=1200, criterion='entropy', n_estimators=200)
#ada = AdaBoostClassifier(n_estimators=200, learning_rate=2.0, algorithm='SAMME')
ada = AdaBoostClassifier()
ETree = ExtraTreesClassifier(random_state=42)


ensemble = VotingClassifier(estimators=[('svm', SVC_clf), ('AdaBoost', ada), ('RF',rf )], voting='soft')
ensemble.fit(X_train, y_train)

# Predict the labels on the test set
y_pred = ensemble.predict(X_test)

# Evaluate the accuracy of the model
acc = accuracy_score(y_test, y_pred)
rms= sqrt(mean_squared_error(y_test, y_pred))

print("Accuracy For Voting Ensemble SVM+AadBoost+RF is: " + str(acc))
print("RMSE Error is: " + str(rms))


# get the end time
et = time.time()

# get the execution time
elapsed_time = et - st
print('Execution time:', elapsed_time, 'seconds')

current, peak = tracemalloc.get_traced_memory()
# Stop tracemalloc
tracemalloc.stop()
print("Current memory usage is", current / (1024 * 1024), "MB; Peak was", peak / (1024 * 1024), "MB")

Accuracy For Voting Ensemble SVM+AadBoost+RF is: 0.9432692307692307
RMSE Error is: 0.23818221854447746
Execution time: 42.666717767715454 seconds
Current memory usage is 30.350130081176758 MB; Peak was 41.96127891540527 MB


In [5]:
# Boosting Ensemble (Adaboost classifier)

# get the start time
st = time.time()
tracemalloc.start()

clf = AdaBoostClassifier()
clf.fit(X_train, y_train)

# make predictions on the testing set
y_pred = clf.predict(X_test)

# evaluate the model's performance using accuracy and RMSE
accuracy = accuracy_score(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)

print("Accuracy for AdaBoost:", accuracy)
print("RMSE:", rmse)

# get the end time
et = time.time()

# get the execution time
elapsed_time = et - st
print('Execution time:', elapsed_time, 'seconds')

current, peak = tracemalloc.get_traced_memory()
# Stop tracemalloc
tracemalloc.stop()
print("Current memory usage is", current / (1024 * 1024), "MB; Peak was", peak / (1024 * 1024), "MB")

Accuracy for AdaBoost: 0.9360576923076923
RMSE: 0.06394230769230769
Execution time: 5.767467498779297 seconds
Current memory usage is 0.055663108825683594 MB; Peak was 0.6820220947265625 MB


In [7]:
# Train XGBoost model
# get the start time
st = time.time()
tracemalloc.start()


model = xgb.XGBClassifier()
model.fit(X_train, y_train)

# Predict on test set and calculate accuracy
y_pred = model.predict(y_test)
acc = accuracy_score(y_test, y_pred)
print('Accuracy for XGBoost:', acc)

# get the end time
et = time.time()

# get the execution time
elapsed_time = et - st
print('Execution time:', elapsed_time, 'seconds')

current, peak = tracemalloc.get_traced_memory()
# Stop tracemalloc
tracemalloc.stop()
#print(f"Current memory usage is {current / 10**6}MB; Peak was {peak / 10**6}MB")
print("Current memory usage is", current / (1024 * 1024), "MB; Peak was", peak / (1024 * 1024), "MB")

Accuracy for XGBoost: 0.7788461538461539
Execution time: 7.315021991729736 seconds
Current memory usage is 0.024686813354492188 MB; Peak was 0.26821041107177734 MB


In [5]:
#Skip-Gram
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression 
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score


# Load the dataset
df = pd.read_csv('restaurant.csv')

# Preprocess the text
df['text'] = df['review'].apply(lambda x: x.lower().split())

# Train a skip-gram word2vec model
model = Word2Vec(df['text'], size=150, window=15, min_count=1, sg=0)

# Extract features from the text using the skip-gram model
text_vectors = []
for i in range(len(df)):
    text_vectors.append(np.mean([model.wv[word] for word in df['text'][i]], axis=0))

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(text_vectors, df["label"], test_size=0.2, random_state=42)

# Train multiple classifiers and use a voting ensemble
LogReg_clf = LogisticRegression()
SVC_clf = SVC(probability=True, kernel='rbf')
DTree_clf = DecisionTreeClassifier()
rf = RandomForestClassifier(random_state=1200, criterion='entropy', n_estimators=200)
#ada = AdaBoostClassifier(n_estimators=200, learning_rate=2.0, algorithm='SAMME')
ada = AdaBoostClassifier()
ETree = ExtraTreesClassifier(random_state=42)


ensemble = VotingClassifier(estimators=[('svm', SVC_clf), ('AdaBoost', ada), ('RF',rf )], voting='soft')
ensemble.fit(X_train, y_train)

# Predict the labels on the test set
y_pred = ensemble.predict(X_test)

# Evaluate the accuracy of the model
acc = accuracy_score(y_test, y_pred)
rms= sqrt(mean_squared_error(y_test, y_pred))

print("Accuracy For Voting Ensemble SVM+AadBoost+RF is: " + str(acc))
print("RMSE Error is: " + str(rms))


# get the end time
et = time.time()

# get the execution time
elapsed_time = et - st
print('Execution time:', elapsed_time, 'seconds')

current, peak = tracemalloc.get_traced_memory()
# Stop tracemalloc
tracemalloc.stop()
print("Current memory usage is", current / (1024 * 1024), "MB; Peak was", peak / (1024 * 1024), "MB")


Accuracy For Voting Ensemble SVM+AadBoost+RF is: 0.9293269230769231
RMSE Error is: 0.2658440838594625
Execution time: 251.33450889587402 seconds
Current memory usage is 0.0 MB; Peak was 0.0 MB


In [9]:
# train the Adaboost classifier
clf = AdaBoostClassifier()
clf.fit(X_train, y_train)

# make predictions on the testing set
y_pred = clf.predict(X_test)

# evaluate the model's performance using accuracy and RMSE
accuracy = accuracy_score(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)

print("Accuracy:", accuracy)
print("RMSE:", rmse)

Accuracy: 0.9100961538461538
RMSE: 0.08990384615384615


In [12]:
# Train XGBoost model
# get the start time
st = time.time()
tracemalloc.start()


model = xgb.XGBClassifier()
model.fit(X_train, y_train)

# Predict on test set and calculate accuracy
y_pred = model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print('Accuracy for XGBoost:', acc)

# get the end time
et = time.time()

# get the execution time
elapsed_time = et - st
print('Execution time:', elapsed_time, 'seconds')

current, peak = tracemalloc.get_traced_memory()
# Stop tracemalloc
tracemalloc.stop()
#print(f"Current memory usage is {current / 10**6}MB; Peak was {peak / 10**6}MB")
print("Current memory usage is", current / (1024 * 1024), "MB; Peak was", peak / (1024 * 1024), "MB")

Accuracy for XGBoost: 0.916826923076923
Execution time: 6.85010027885437 seconds
Current memory usage is 0.023090362548828125 MB; Peak was 3.432027816772461 MB
