In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

# Load the dataset
df = pd.read_csv("GrammarandProductReviews[modified].csv")

# Preprocess the text
df['text'] = df['review'].apply(lambda x: x.lower().split())

# Train the CBOW model
model = Word2Vec(df['text'], size=100, window=5, min_count=1, sg=1)


# Convert the text to numerical vectors using the CBOW model
text_vectors = []
for i in range(len(df)):
    text_vectors.append(np.mean([model.wv[word] for word in df['text'][i]], axis=0))
text_vectors = np.array(text_vectors)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(text_vectors, df['positive_review'], test_size=0.2, random_state=0)

# Train multiple classifiers and use a voting ensemble
clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(random_state=1)
clf3= SVC()
ensemble = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2)], voting='hard')
ensemble.fit(X_train, y_train)

# Predict the labels on the test set
y_pred = ensemble.predict(X_test)

# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)



Accuracy:  0.8867041198501873


In [6]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from math import sqrt
from sklearn.svm import SVC
import time
import tracemalloc
import xgboost as xgb

# get the start time
st = time.time()
tracemalloc.start()

# Load the dataset
df = pd.read_csv("GrammarandProductReviews[modified].csv")

# Preprocess the text
df['text'] = df['review'].apply(lambda x: x.lower().split())

# Train the CBOW model
model = Word2Vec(df['text'], size=200, window=20, min_count=1, sg=1)


# Convert the text to numerical vectors using the CBOW model
text_vectors = []
for i in range(len(df)):
    text_vectors.append(np.mean([model.wv[word] for word in df['text'][i]], axis=0))
text_vectors = np.array(text_vectors)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(text_vectors, df['positive_review'], test_size=0.2, random_state=0)

# Train multiple classifiers and use a voting ensemble
LogReg_clf = LogisticRegression()
SVC_clf = SVC(probability=True, kernel='rbf')
DTree_clf = DecisionTreeClassifier()
rf = RandomForestClassifier(random_state=1200, criterion='entropy', n_estimators=200)
ada = AdaBoostClassifier(n_estimators=200, learning_rate=2.0, algorithm='SAMME')
#ada = AdaBoostClassifier()
ETree = ExtraTreesClassifier(random_state=42)


ensemble = VotingClassifier(estimators=[('svm', SVC_clf), ('AdaBoost', ada), ('RF',rf )], voting='soft')
ensemble.fit(X_train, y_train)

# Predict the labels on the test set
y_pred = ensemble.predict(X_test)

# Evaluate the accuracy of the model
acc = accuracy_score(y_test, y_pred)
rms= sqrt(mean_squared_error(y_test, y_pred))

print("Accuracy For Voting Ensemble SVM+AadBoost+RF is: " + str(acc))
print("RMSE Error is: " + str(rms))


# get the end time
et = time.time()

# get the execution time
elapsed_time = et - st
print('Execution time:', elapsed_time, 'seconds')

current, peak = tracemalloc.get_traced_memory()
# Stop tracemalloc
tracemalloc.stop()
print("Current memory usage is", current / (1024 * 1024), "MB; Peak was", peak / (1024 * 1024), "MB")

Accuracy For Voting Ensemble SVM+AadBoost+RF is: 0.901685393258427
RMSE Error is: 0.31355160140170396
Execution time: 125.90628409385681 seconds
Current memory usage is 56.38986778259277 MB; Peak was 78.3046464920044 MB


In [4]:
# Boosting Ensemble (Adaboost classifier)

# get the start time
st = time.time()
tracemalloc.start()

clf = AdaBoostClassifier()
clf.fit(X_train, y_train)

# make predictions on the testing set
y_pred = clf.predict(X_test)

# evaluate the model's performance using accuracy and RMSE
accuracy = accuracy_score(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)

print("Accuracy for AdaBoost:", accuracy)
print("RMSE:", rmse)

# get the end time
et = time.time()

# get the execution time
elapsed_time = et - st
print('Execution time:', elapsed_time, 'seconds')

current, peak = tracemalloc.get_traced_memory()
# Stop tracemalloc
tracemalloc.stop()
print("Current memory usage is", current / (1024 * 1024), "MB; Peak was", peak / (1024 * 1024), "MB")

Accuracy for AdaBoost: 0.8703183520599251
RMSE: 0.12968164794007492
Execution time: 12.649820804595947 seconds
Current memory usage is 0.05268383026123047 MB; Peak was 0.6943483352661133 MB


In [8]:
# Train XGBoost model
# get the start time
st = time.time()
tracemalloc.start()


model = xgb.XGBClassifier(n_estimators=100, learning_rate=1.0)
model.fit(X_train, y_train)

# Predict on test set and calculate accuracy
y_pred = model.predict(y_test)
acc = accuracy_score(y_test, y_pred)
print('Accuracy for XGBoost:', acc)

# get the end time
et = time.time()

# get the execution time
elapsed_time = et - st
print('Execution time:', elapsed_time, 'seconds')

current, peak = tracemalloc.get_traced_memory()
# Stop tracemalloc
tracemalloc.stop()
#print(f"Current memory usage is {current / 10**6}MB; Peak was {peak / 10**6}MB")
print("Current memory usage is", current / (1024 * 1024), "MB; Peak was", peak / (1024 * 1024), "MB")

Accuracy for XGBoost: 0.6427902621722846
Execution time: 10.921875476837158 seconds
Current memory usage is 0.024506568908691406 MB; Peak was 0.21202564239501953 MB


In [2]:
from sklearn.ensemble import BaggingClassifier
LogReg_clf = LogisticRegression(max_iter=1000)
logreg_bagging_model = BaggingClassifier(base_estimator=LogReg_clf, n_estimators=50)

#dtree_bagging_model = BaggingClassifier(base_estimator=DTree_clf, n_estimators=50)
#random_forest = RandomForestClassifier(n_estimators=100)
#extra_trees = ExtraTreesClassifier(n_estimators=100)

def bagging_ensemble(model):
    k_folds = KFold(n_splits=20)
    results = cross_val_score(model, X_train, y_train, cv=k_folds)
    print(results.mean())

bagging_ensemble(logreg_bagging_model)
#bagging_ensemble(dtree_bagging_model)
#bagging_ensemble(random_forest)
#bagging_ensemble(extra_trees)

0.869479524612051


In [5]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, f1_score, log_loss, mean_squared_error,recall_score,precision_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.svm import SVC
from math import sqrt

In [6]:
# train the Adaboost classifier
clf = AdaBoostClassifier()
clf.fit(X_train, y_train)

# make predictions on the testing set
y_pred = clf.predict(X_test)

# evaluate the model's performance using accuracy and RMSE
accuracy = accuracy_score(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)

print("Accuracy:", accuracy)
print("RMSE:", rmse)

Accuracy: 0.8703183520599251
RMSE: 0.12968164794007492


In [7]:
#Skip-Gram
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression 
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score


# Load the dataset
df = pd.read_csv('GrammarandProductReviews[modified].csv')

# Preprocess the data by tokenizing the text and creating a list of words
df['text'] = df['review'].apply(lambda x: x.split())

# Train a skip-gram word2vec model
model = Word2Vec(df['text'], size=100, window=5, min_count=1, sg=1)

# Extract features from the text using the skip-gram model
text_vectors = []
for i in range(len(df)):
    text_vectors.append(np.mean([model.wv[word] for word in df['text'][i]], axis=0))

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(text_vectors, df['positive_review'], test_size=0.2, random_state=42)

# Define the classifiers
clf1 = LogisticRegression()
clf2 = GaussianNB()
clf3 = RandomForestClassifier(random_state=1)

# Create the voting ensemble classifier
ensemble = VotingClassifier(estimators=[('lr', clf1), ('gnb', clf3)], voting='hard')

# Train the ensemble classifier
ensemble.fit(X_train, y_train)

# Predict on the test data
y_pred = ensemble.predict(X_test)

# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: ', accuracy)


Accuracy:  0.8787453183520599


In [8]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score, f1_score, log_loss, mean_squared_error,recall_score,precision_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.svm import SVC
from math import sqrt

In [11]:
LogReg_clf = LogisticRegression()
SVC_clf = SVC()
DTree_clf = DecisionTreeClassifier()

In [12]:
#Bagging Ensemble Method
#logreg_bagging_model = BaggingClassifier(base_estimator=LogReg_clf, n_estimators=50)
dtree_bagging_model = BaggingClassifier(base_estimator=DTree_clf, n_estimators=50)
#random_forest = RandomForestClassifier(n_estimators=100)
#extra_trees = ExtraTreesClassifier(n_estimators=100)

def bagging_ensemble(model):
    k_folds = KFold(n_splits=20)
    results = cross_val_score(model, X_train, y_train, cv=k_folds)
    print(results.mean())

#bagging_ensemble(logreg_bagging_model)
bagging_ensemble(dtree_bagging_model)
#bagging_ensemble(random_forest)
#bagging_ensemble(extra_trees)

0.8872751099827092


In [13]:
# train the Adaboost classifier
clf = AdaBoostClassifier()
clf.fit(X_train, y_train)

# make predictions on the testing set
y_pred = clf.predict(X_test)

# evaluate the model's performance using accuracy and RMSE
accuracy = accuracy_score(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)

print("Accuracy:", accuracy)
print("RMSE:", rmse)

Accuracy: 0.8506554307116105
RMSE: 0.14934456928838952
