In [None]:
# Log this notebook session in to the Hugging Face Hub (interactive token prompt).
# NOTE(review): nothing later in this notebook visibly uploads to the Hub, and the
# dataset/models referenced below appear to be public — confirm the login is needed.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
!pip install datasets

In [None]:
 !apt-get install git-lfs

In [None]:
import pandas as pd
import requests
from datasets import load_dataset
import string

import nltk
nltk.download('punkt')
from nltk.stem import PorterStemmer

from transformers import pipeline, tokenization_utils

In [None]:
#dataset
# Load the Yelp review dataset and pull out its train and test splits.
ds = load_dataset('yelp_review_full')

trained = ds['train']
tested = ds['test']

train_df = trained.to_pandas()
test_df = tested.to_pandas()
# Each split is numbered from 0 on its own, so a plain concat would leave duplicate
# index labels in the combined frame. Renumber so every row has a unique label.
combined_df = pd.concat([train_df, test_df], ignore_index=True)

#preprocessing
def remove_punctuation(text):
  """Return `text` with every character listed in `string.punctuation` removed."""
  # A translation table that maps each punctuation character to None deletes
  # all of them in one pass over the string.
  strip_table = str.maketrans('', '', string.punctuation)
  return text.translate(strip_table)

# Strip punctuation from every review in the combined frame.
combined_df['text'] = combined_df['text'].apply(remove_punctuation)

#tokenize
# Recent NLTK releases (3.9+) load the tokenizer data used by `word_tokenize` from the
# 'punkt_tab' resource instead of the legacy 'punkt' pickle. Fetch it here so that
# tokenization works on both old and new NLTK versions.
nltk.download('punkt_tab')
# NOTE(review): this tokenizes every row of the combined frame although only a small
# sample is used afterwards; sampling before preprocessing would be far cheaper.
combined_df['text'] = combined_df['text'].apply(nltk.word_tokenize)

In [None]:
#sample
nltk.download('stopwords')
# Fixed seed so the same reviews are drawn on every run and the results are reproducible.
combined_df_sample = combined_df.sample(n=100, random_state=42)

# Build the stopword collection once, as a set. Calling `stopwords.words('english')`
# for every token rebuilds the whole list each time, and list membership is a linear scan.
english_stopwords = set(nltk.corpus.stopwords.words('english'))

#lowercase
combined_df_sample['text'] = combined_df_sample['text'].apply(lambda x: [word.lower() for word in x])
#stopwords_remove
combined_df_sample['text'] = combined_df_sample['text'].apply(lambda x: [word for word in x if word not in english_stopwords])

In [None]:
#pipelines
# The plain 'distilbert-base-uncased' checkpoint is a pretrained language model with no
# fine-tuned classification head: in a sentiment pipeline it receives a randomly
# initialised head and emits generic LABEL_0/LABEL_1 names, which the later
# "label starts with 'pos'" conversion can never match. Use the SST-2 fine-tuned
# DistilBERT checkpoint instead, whose labels are POSITIVE/NEGATIVE.
classifier1 = pipeline(
    task = 'sentiment-analysis',
    model = 'distilbert-base-uncased-finetuned-sst-2-english',
)
# Second model for comparison.
# NOTE(review): presumably emits 'pos'/'neg' label names — verify against its model card.
classifier2 = pipeline(
    task = 'sentiment-analysis',
    model = 'aychang/roberta-base-imdb',
)

#results
prediction_classifier1 = []
prediction_classifier2 = []
labels_true = []
skipped_reviews = 0

# Classify each sampled review with both models and store the results in the lists above.
# Text and label are paired positionally: the sampled frame keeps the index labels of the
# rows that were drawn, so looking a label up by a 0..n-1 loop counter raises KeyError.
for tokens, label in zip(combined_df_sample['text'], combined_df_sample['label']):
  # The pipelines treat a list of strings as a batch of separate texts, so a token list
  # has to be joined back into one string to be scored as a single review.
  review_text = ' '.join(tokens) if isinstance(tokens, (list, tuple)) else str(tokens)
  try:
    # Run both models before storing anything so the three lists stay aligned even if
    # one model fails. Truncation keeps long reviews within the models' input limit.
    result1 = classifier1(review_text, truncation=True)
    result2 = classifier2(review_text, truncation=True)
  except Exception as error:
    # Best effort: report the failure instead of hiding it, then move on.
    skipped_reviews += 1
    print(f"Skipping review that could not be classified: {error!r}")
    continue
  prediction_classifier1.append(result1)
  prediction_classifier2.append(result2)
  labels_true.append(label)

if skipped_reviews:
  print(f"Warning: {skipped_reviews} review(s) were skipped; predictions no longer cover the whole sample.")

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# The dataset's labels are star ratings encoded 0-4 (1-5 stars), while both pipelines
# produce a binary positive/negative label. Scoring raw stars against binary predictions
# compares two different scales, so the stars are reduced to the same binary scale first:
# 1-2 stars -> 0 (neg), 4-5 stars -> 1 (pos). 3-star reviews have no clear polarity and
# are left out of the metrics.
NEUTRAL_STAR_LABEL = 2

star_labels = combined_df_sample['label'].tolist()

#conversion. pos = 1, neg = 0
# These stay full-length (one entry per prediction) for use by later cells.
labels_pred_classifier1 = [1 if prediction[0]['label'].lower().startswith('pos') else 0 for prediction in prediction_classifier1]
labels_pred_classifier2 = [1 if prediction[0]['label'].lower().startswith('pos') else 0 for prediction in prediction_classifier2]

# The filtering below pairs labels and predictions by position, and zip() would silently
# truncate on a length mismatch, so fail loudly if any review went unclassified.
if not (len(star_labels) == len(labels_pred_classifier1) == len(labels_pred_classifier2)):
  raise ValueError(
      "Predictions do not line up with the sampled reviews: "
      f"{len(star_labels)} labels, {len(labels_pred_classifier1)} / "
      f"{len(labels_pred_classifier2)} predictions"
  )

has_polarity = [star != NEUTRAL_STAR_LABEL for star in star_labels]
labels_true = [1 if star > NEUTRAL_STAR_LABEL else 0 for star, keep in zip(star_labels, has_polarity) if keep]
labels_eval_classifier1 = [pred for pred, keep in zip(labels_pred_classifier1, has_polarity) if keep]
labels_eval_classifier2 = [pred for pred, keep in zip(labels_pred_classifier2, has_polarity) if keep]

#classification report
report_classifier1 = classification_report(labels_true, labels_eval_classifier1)
report_classifier2 = classification_report(labels_true, labels_eval_classifier2)

#accuracy score
accuracy_score1 = accuracy_score(labels_true, labels_eval_classifier1)
accuracy_score2 = accuracy_score(labels_true, labels_eval_classifier2)

In [None]:
# Print each model's classification report under its heading, then the accuracy scores.
report_sections = (
    ("Model1 classification report:", report_classifier1),
    ("Model2 classification report:", report_classifier2),
)
for heading, report_text in report_sections:
  print(heading)
  print(report_text)

accuracy_lines = (
    ("Model1 accuracy score:", accuracy_score1),
    ("Model2 accuracy score:", accuracy_score2),
)
for caption, accuracy_value in accuracy_lines:
  print(caption, accuracy_value)

In [None]:
#Sentiment Results

def summarize_predictions(classifier_name, predicted_labels):
  """Print how many reviews a classifier labelled positive and negative.

  Args:
    classifier_name: Name shown in the printed summary.
    predicted_labels: Sequence of binary predictions (1 = positive, 0 = negative).

  Returns:
    Tuple `(positive, negative)` with the two counts.
  """
  # Count from the predictions themselves, not from the sample size: if any review
  # was never classified, the sample size would overstate the total and the negatives.
  total = len(predicted_labels)
  positive = sum(predicted_labels)
  negative = total - positive
  print(f"Total reviews classified by {classifier_name}: {total}")
  print(f"Positive reviews: {positive}")
  print(f"Negative reviews: {negative}")
  return positive, negative

#Classifier1:
summarize_predictions("classifier1", labels_pred_classifier1)

#Classifier2:
# As before, the module-level counters end up holding classifier2's numbers.
positive_reviews, negative_reviews = summarize_predictions("classifier2", labels_pred_classifier2)

In [None]:
# Print a structural summary of the sampled frame (columns, dtypes, non-null counts).
combined_df_sample.info()

In [None]:
import matplotlib.pyplot as plt

# The dataset's labels are star ratings encoded 0-4 (1-5 stars). Testing `label == 1`
# marked only 2-star reviews as positive and everything else (including 4-5 stars) as
# negative. Map the stars to a sentiment explicitly instead.
SENTIMENT_BY_STAR_LABEL = {
    0: 'Negative',
    1: 'Negative',
    2: 'Neutral',
    3: 'Positive',
    4: 'Positive',
}
SENTIMENT_ORDER = ['Negative', 'Neutral', 'Positive']

# Create a new column for sentiment
combined_df_sample['Sentiment'] = combined_df_sample['label'].map(SENTIMENT_BY_STAR_LABEL)

# Fixed category order (zero-filled) so the bars always appear in the same positions.
sentiment_counts = combined_df_sample['Sentiment'].value_counts().reindex(SENTIMENT_ORDER, fill_value=0)

# Create a bar chart of the sentiment distribution
fig, ax = plt.subplots()
sentiment_counts.plot(kind='bar', ax=ax)
ax.set_title('Sentiment Distribution')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Count')
plt.show()