In [None]:
# GPU check: report the number of visible CUDA devices and pick the compute device.
import torch

print(torch.cuda.device_count())  # Number of GPUs

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

In [None]:
# Mount google drive
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
# Import the necessary libraries.
import re
import string
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# NLTK imports
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
#from nltk.sentiment.vader import SentimentIntensityAnalyzer # Perform VADER Sentiment Analysis

# Download NLTK resources
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# Sklearn
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.utils import resample

In [None]:
# Display settings: widen the console output and show up to 200 characters per cell
pd.set_option('display.width', 100)
pd.set_option('display.max_colwidth', 200)

# Load the review dataset from the mounted Google Drive folder
review_data = pd.read_csv('/content/drive/MyDrive/capstone_data/data.csv')

# Preview the first two rows
review_data.head(2)

In [1]:
# List of columns
review_data.columns

NameError: name 'review_data' is not defined

In [None]:
# Check information to assess null values and data types
review_data.info()

# Data preprocessing

In [2]:
# Count rows whose review_id repeats an earlier row's review_id
# (duplicated() flags every occurrence after the first).
review_data['review_id'].duplicated().sum()

NameError: name 'review_data' is not defined

In [None]:
# Drop rows that are exact duplicates across all columns, keeping the first
# occurrence; the frame is modified in place.
# NOTE(review): rows that share only a review_id are NOT removed by this call — confirm that is intended.
review_data.drop_duplicates(inplace=True)

In [None]:
# Flag every row that is identical to at least one other row across all columns
duplicate_mask = review_data.duplicated(keep=False)
total_duplicates = duplicate_mask.sum()
print(f'Total duplicates considering all columns: {total_duplicates}')

# Show up to ten of the flagged rows, sorted so identical rows sit next to each other
all_columns = list(review_data.columns)
review_data[duplicate_mask].sort_values(by=all_columns).head(10)

In [3]:
# Pre-compiled patterns used by the text-cleaning step.

# All ASCII punctuation characters except the apostrophe, so contractions
# such as "don't" survive punctuation removal.
punctuation_to_remove = string.punctuation.replace("'", "")

# Matches any single unwanted punctuation character
regex_punctuation = re.compile('[%s]' % re.escape(punctuation_to_remove))

# Matches one or more whitespace characters (used to collapse extra spaces).
# Raw string: '\s' in a normal string literal is an invalid escape sequence.
regex_whitespace = re.compile(r'\s+')

# Matches a line break: \r\n, \n or a lone \r
regex_newline = re.compile(r'\r?\n|\r')

# Matches standalone whole numbers (digit runs delimited by word boundaries)
regex_digits = re.compile(r'\b\d+\b')


NameError: name 'string' is not defined

In [None]:
# Function for additional cleaning
def clean_text(data):
    """Normalize one review string for downstream text processing.

    Steps, in order: lower-case, un-escape ``\\'`` to ``'``, strip punctuation
    (apostrophes are kept), turn line breaks into spaces, drop standalone
    numbers, collapse runs of whitespace to a single space, and trim.

    Args:
        data: The raw text; any non-string value is converted with ``str()``.

    Returns:
        The cleaned, lower-cased string with single spaces between words.
    """
    # Convert text to lower case
    data = str(data).lower()

    # Replace escaped apostrophes (\' -> ')
    data = re.sub(r"\\'", "'", data)

    # Remove punctuation (except apostrophes)
    data = regex_punctuation.sub('', data)

    # Turn line breaks into spaces so the words on either side are not fused
    data = regex_newline.sub(' ', data)

    # Remove standalone digits
    data = regex_digits.sub('', data)

    # Collapse whitespace last: removing a number leaves two adjacent spaces,
    # so this must run after the digit removal to avoid double spaces.
    data = regex_whitespace.sub(' ', data)

    # Strip leading/trailing spaces
    return data.strip()

# Apply the function to the raw review text
review_data['cleaned_text'] = review_data['text'].apply(clean_text)

In [None]:
# Build the English stop-word lookup as a set for fast membership tests
stop_words = set(stopwords.words('english'))


def _drop_stop_words(text):
    """Return `text` with every whitespace-separated stop word removed."""
    kept_words = []
    for word in text.split():
        if word.lower() not in stop_words:
            kept_words.append(word)
    return ' '.join(kept_words)


review_data['cleaned_text'] = review_data['cleaned_text'].apply(_drop_stop_words)

In [4]:
# Lemmatization: replace each word of the cleaned text with its WordNet lemma
lemmatizer = WordNetLemmatizer()


def _lemmatize_words(text):
    """Lemmatize every whitespace-separated word and re-join with single spaces."""
    lemmas = []
    for word in text.split():
        lemmas.append(lemmatizer.lemmatize(word))
    return ' '.join(lemmas)


review_data['cleaned_text'] = review_data['cleaned_text'].apply(_lemmatize_words)

NameError: name 'WordNetLemmatizer' is not defined

In [None]:
# Length-based features computed from the raw review text
def _count_words(text):
    """Number of whitespace-separated tokens in `text`."""
    return len(text.split())


def _count_sentences(text):
    """Number of sentences found by NLTK's sentence tokenizer."""
    return len(sent_tokenize(text))


raw_text = review_data['text']
review_data['review_length'] = raw_text.apply(len)   # characters per review
review_data['word_count'] = raw_text.apply(_count_words)
review_data['char_count'] = raw_text.apply(len)      # same value as review_length
review_data['sentence_count'] = raw_text.apply(_count_sentences)

In [None]:
# New feature - business popularity: 1 when the review_count value exceeds 100, else 0
review_data['is_popular_business'] = review_data['review_count'].gt(100).astype(int)

In [None]:
# Binarize the useful field: 0 when the review has zero useful votes, 1 otherwise
review_data['label'] = review_data['useful'].ne(0).astype('int64')

In [5]:
# Import library for resample
from sklearn.utils import resample

# Split the rows by class: label 0 is treated as the majority, everything else as the minority
is_not_useful = review_data.label == 0
useful_majority = review_data[is_not_useful]
useful_minority = review_data[~is_not_useful]

# Undersample the majority class (without replacement) down to 12,100 rows
useful_majority_undersampled = resample(
    useful_majority,
    replace=False,
    n_samples=12100,
    random_state=42,
)

# Combine both classes, shuffle with a fixed seed and renumber the rows
balanced_parts = [useful_majority_undersampled, useful_minority]
resampled_df = (
    pd.concat(balanced_parts)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Display the new class counts
resampled_df.label.value_counts()

NameError: name 'review_data' is not defined

In [None]:
resampled_df.columns

# Exploratory Data Analysis

## Univariate Analysis

In [None]:
print('\nDescriptive Statistics')
review_data.describe()

In [None]:
# Calculate the distribution of the label.
# value_counts() orders by frequency, so sort by label value to get a stable
# 0, 1 order and derive each wedge's name from its own label value; hard-coded
# names would be attached to the wrong wedge whenever class 1 is the larger one.
useful_counts = review_data['label'].value_counts().sort_index()
label_names = {0: 'Not Useful', 1: 'Useful'}
wedge_labels = [label_names.get(value, str(value)) for value in useful_counts.index]

# Create a figure
plt.figure(figsize=(6, 6))
plt.pie(  # create pie chart
    useful_counts,
    labels=wedge_labels,
    colors=['#1f77b4', '#5F9ED1'],
    autopct='%1.1f%%',
    startangle=90,
    wedgeprops={'edgecolor': 'white', 'linewidth': 0.5},
    textprops={'fontsize': 12}
)

# Add title and formatting
plt.title('Overall Review Usefulness Distribution', pad=20, fontsize=14)
plt.tight_layout()

In [None]:
# Review trend over time: total useful votes per calendar month.
# Parse the date column as datetimes (safe to re-run on already-parsed values).
review_data['date'] = pd.to_datetime(review_data['date'])

# Resample on the 'date' column instead of calling set_index(..., inplace=True):
# the in-place version removes the column, so re-running the cell fails, and it
# leaves a non-unique DatetimeIndex on the frame used by every later cell.
monthly_useful_votes = review_data.resample('M', on='date')['useful'].sum()

monthly_useful_votes.plot(figsize=(12, 6))
plt.title('Useful Votes Over Time')  # title
plt.xlabel('Date')  # x label
plt.ylabel('Useful Votes')  # y label
plt.show()


In [None]:
# Box plot (outliers hidden) overlaid with the individual points to show the spread of useful votes
fig, ax = plt.subplots(figsize=(6, 4))
useful_votes = review_data['useful']
sns.boxplot(x=useful_votes, showfliers=False, width=0.3, color='skyblue', ax=ax)
sns.stripplot(x=useful_votes, color='blue', alpha=0.3, size=3, jitter=True, ax=ax)
ax.set_xlim(-1, 30)  # limit the x-axis to the -1..30 range
ax.set_xlabel('Useful Votes')
ax.set_title('Spread of "Useful" Votes')

In [None]:
# Top 5 businesses with the most reviews (head() returns the first five counts)
fig, ax = plt.subplots(figsize=(5, 4))
top_business_counts = review_data['name'].value_counts().head()
top_business_counts.plot(kind='bar', ax=ax, color='#1F77B4')
ax.set_title('Top 5 Business with the Most Reviews')
ax.set_xlabel('Business name')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Names of the five most-reviewed businesses, most frequent first
top_5 = review_data['name'].value_counts().head(5).index

# Keep only the reviews that belong to those businesses
in_top_5 = review_data['name'].isin(top_5)
top_reviews = review_data[in_top_5]

# Horizontal count plot: one bar per business and label value
plt.figure(figsize=(6, 4))
ax = sns.countplot(
    data=top_reviews,
    y='name',
    hue='label',
    order=top_5,
    palette=['#1f77b4', '#5F9ED1'],  # two shades of blue
    edgecolor='black',
    linewidth=0.5,
)

# Title, axis labels, legend and a light vertical grid
ax.set_title('Useful vs Non Useful Reviews (Top 5 Businesses)')
ax.set_xlabel('Number of Reviews')
ax.set_ylabel('')
ax.legend(['Not Useful', 'Useful'], title='Review Type')
ax.grid(axis='x', alpha=0.2)

plt.tight_layout()
plt.show()

In [None]:
# Histogram to display the distribution of review length for useful and not-useful reviews
plt.figure(figsize=(12, 5))
ax = sns.histplot(
    x='review_length',
    hue='label',
    data=review_data,
    bins=50,
    palette=['#1f77b4', '#4F9ED1'],
    alpha=0.6,
    element='step'
)
plt.title('Review Length Distribution by Usefulness', pad=15)
plt.xlabel('Review Length')
plt.ylabel('Count')

# Rename the entries of the legend seaborn already built instead of calling
# plt.legend([...]): a labels-only legend call pairs the names with the drawn
# artists in draw order, and seaborn draws the hue levels in reverse, which
# attaches 'Not Useful' to the label-1 colour and vice versa.
legend_names = {'0': 'Not Useful', '1': 'Useful'}
legend = ax.get_legend()
if legend is not None:
    legend.set_title('')
    for entry in legend.get_texts():
        entry.set_text(legend_names.get(entry.get_text(), entry.get_text()))

plt.xlim(0, 2000)  # Adjust based on your data
plt.grid(alpha=0.3)
plt.show()

## Bivariate Analysis

In [None]:
# Review length histograms, one panel per star rating
rating_grid = sns.FacetGrid(review_data, col='stars_review', height=3, aspect=0.8)
rating_grid.map(plt.hist, 'review_length', bins=50, color='#006BA4', edgecolor='white')
rating_grid.set_axis_labels('Review Length', 'Count')
rating_grid.set_titles('Star Rating: {col_name}')
plt.tight_layout()
plt.show()

In [None]:
# Cross-tabulate star rating (rows) against the usefulness label (columns)
star_counts = pd.crosstab(review_data['stars_review'], review_data['label'])

# Annotated heatmap of the review counts
fig, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(
    star_counts,
    annot=True,
    fmt='d',
    cmap='YlGnBu',
    cbar_kws={'label': 'Number of Reviews'},
    ax=ax,
)
ax.set_title('Star Rating Counts by Usefulness')
ax.set_xlabel('Usefulness Label (0=Not Useful, 1=Useful)')
ax.set_ylabel('Star Rating')
plt.yticks(rotation=0)  # keep the star-rating tick labels horizontal
plt.show()

In [None]:
# KDE of useful votes for popular vs non-popular businesses
plt.figure(figsize=(8, 5))
ax = sns.kdeplot(data=review_data, x='useful', hue='is_popular_business',
                 fill=True, common_norm=False, alpha=0.5, palette='GnBu')
plt.title('Distribution of Useful Votes by Business Popularity')
plt.xlabel('Useful Votes')

# Rename the entries of seaborn's own legend rather than calling
# plt.legend(labels=[...]): that call pairs the names with the filled curves in
# draw order, and seaborn draws the hue levels in reverse, which swaps
# 'No' and 'Yes' relative to the colours.
popularity_names = {'0': 'No', '1': 'Yes'}
legend = ax.get_legend()
if legend is not None:
    legend.set_title('Popular Business?')
    for entry in legend.get_texts():
        entry.set_text(popularity_names.get(entry.get_text(), entry.get_text()))

In [None]:
# Scatter plot with a fitted regression line: word count (x) against useful votes (y)
fig, ax = plt.subplots(figsize=(10, 5))
sns.regplot(
    data=review_data,
    x='word_count',
    y='useful',
    scatter_kws={'alpha': 0.2},
    line_kws={'color': 'blue'},
    ax=ax,
)
ax.set_title('Word Count vs Useful Votes')
ax.set_xlabel('Word Count')
ax.set_ylabel('Useful Votes')

# Statistical Analysis

In [None]:
import scipy.stats as stats

In [None]:
# Pearson correlation (coefficient and p-value) between the review rating and the business rating
review_stars = review_data['stars_review']
business_stars = review_data['stars_business']
corr_coef, p_value = stats.pearsonr(review_stars, business_stars)

# Print results
print(f"Correlation Coefficient: {corr_coef:.2f}")
print(f"P-value: {p_value:.4f}")

# Heatmap of the 2x2 correlation matrix of the same two columns
rating_columns = ['stars_review', 'stars_business']
corr = review_data[rating_columns].corr()
sns.heatmap(corr, annot=True, cmap='GnBu')
plt.title('Correlation between Customer Rating vs. Business Rating')
plt.show()

In [None]:
# Spearman rank correlation between a business's review count and its star rating
spearman_result = stats.spearmanr(review_data['review_count'], review_data['stars_business'])
corr_coef, p_value = spearman_result
print(f"Spearman Correlation (review_count vs stars_business): {corr_coef:.4f}, p-value: {p_value:.4f}")

# Word Cloud


In [None]:
# For the word cloud, remove short words (i.e., words that have 2 letters or less)
def remove_short_words(text):
    """Return `text` keeping only whitespace-separated words longer than two characters.

    The surviving words are re-joined with single spaces.
    """
    long_words = []
    for token in text.split():
        if len(token) > 2:
            long_words.append(token)
    return ' '.join(long_words)

review_data['filtered_text'] = review_data['cleaned_text'].apply(remove_short_words)

In [None]:
# 1-star word cloud: join the filtered text of every 1-star review into one string
one_star_mask = review_data['stars_review'] == 1
one_star_text = ' '.join(review_data.loc[one_star_mask, 'filtered_text'])

wordcloud = WordCloud(
    width=800,
    height=800,
    background_color='white',
    min_font_size=10,
).generate(one_star_text)

# Render the cloud as an image without axes
fig, ax = plt.subplots(figsize=(8, 8))
ax.imshow(wordcloud)
ax.axis('off')
ax.set_title('1-Star Reviews Word Cloud')
plt.show()

In [None]:
# 5-star word cloud: join the filtered text of every 5-star review into one string
five_star_mask = review_data['stars_review'] == 5
five_star_text = ' '.join(review_data.loc[five_star_mask, 'filtered_text'])

wordcloud = WordCloud(
    width=800,
    height=800,
    background_color='white',
    min_font_size=10,
).generate(five_star_text)

# Render the cloud as an image without axes
fig, ax = plt.subplots(figsize=(8, 8))
ax.imshow(wordcloud)
ax.axis('off')
ax.set_title('5-Star Reviews Word Cloud')
plt.show()

# DistilBERT

In [None]:
resampled_df['text'] = resampled_df['text'].apply(clean_text)  # Clean text

In [None]:
useful_review_df = resampled_df[['text', 'label']]
useful_review_df.head()

In [None]:
# Split the text/label frame into 80% training and 20% test rows,
# using a fixed random_state so the split is the same on every run.
# NOTE(review): no stratify= argument is passed, so the class ratio of the two parts is not forced to match.
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(useful_review_df, test_size=0.2, random_state=42)

In [None]:
from transformers import DistilBertTokenizerFast

# Load the pretrained tokenizer for the uncased DistilBERT checkpoint
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Both splits are tokenized with identical settings: truncate to at most 128 tokens and pad
tokenize_options = dict(truncation=True, padding=True, max_length=128)
train_texts = train_df['text'].tolist()
test_texts = test_df['text'].tolist()
train_encodings = tokenizer(train_texts, **tokenize_options)
test_encodings = tokenizer(test_texts, **tokenize_options)

In [None]:
pip install transformers

In [None]:
pip install torch

In [None]:
pip install datasets

In [None]:
import torch

# Define a custom Dataset class for the review data.
class ReviewDataset(torch.utils.data.Dataset):
    """PyTorch dataset pairing tokenizer encodings with their labels.

    `encodings` is a mapping from field name to a per-sample sequence, and
    `labels` is a sequence holding one label per sample.
    """

    def __init__(self, encodings, labels):
        """Store the encodings and labels; nothing is converted until a sample is requested."""
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples, i.e. the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample `idx` as a dict of tensors: every encoding field plus 'labels'."""
        item = {}
        for field_name, field_values in self.encodings.items():
            item[field_name] = torch.tensor(field_values[idx])
        # Attach the label that belongs to this sample
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Wrap each split's tokenized text and its labels (as a plain Python list) in a ReviewDataset
train_labels = train_df['label'].tolist()
test_labels = test_df['label'].tolist()
train_dataset = ReviewDataset(train_encodings, train_labels)
test_dataset = ReviewDataset(test_encodings, test_labels)

In [None]:
# Import the DistilBERT model class for sequence classification tasks.
from transformers import DistilBertForSequenceClassification

# Load the pretrained 'distilbert-base-uncased' checkpoint with a 2-label classification head
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

In [None]:
from transformers import TrainingArguments, EarlyStoppingCallback

# Training configuration for fine-tuning, grouped by purpose.
training_args = TrainingArguments(
    # Output locations and reporting
    output_dir='./results',   # checkpoints and outputs
    logging_dir='./logs',
    logging_steps=500,        # log metrics and loss every 500 steps
    report_to='none',         # do not report to external trackers (e.g., WandB)

    # Optimisation schedule
    num_train_epochs=5,               # complete passes over the training dataset
    per_device_train_batch_size=16,   # training batch size per device
    per_device_eval_batch_size=16,    # evaluation batch size per device
    warmup_steps=500,                 # learning-rate warm-up steps
    weight_decay=0.2,                 # weight-decay coefficient

    # Evaluate and checkpoint once per epoch, then restore the checkpoint with the lowest eval loss
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)


In [None]:
# Import the necessary libraries
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Metric function handed to the Trainer
def compute_metrics(eval_pred):
    """Compute accuracy plus macro-averaged precision, recall and F1.

    Args:
        eval_pred: A (logits, labels) pair; the predicted class of each sample
            is the argmax of its logits along the last axis.

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    scores = {'accuracy': accuracy_score(labels, predictions)}

    # The remaining metrics share the same call shape and macro averaging
    macro_metrics = (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
    for metric_name, metric_fn in macro_metrics:
        scores[metric_name] = metric_fn(labels, predictions, average='macro')

    return scores

In [None]:
# Build the Trainer from the model, training arguments, datasets and metric
# function, then start fine-tuning.
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    # NOTE(review): the test split is also the evaluation set used for best-checkpoint
    # selection and early stopping — consider holding out a separate validation split.
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],  # Stop after 2 consecutive evaluations without improvement
)

trainer.train()

In [None]:
# Import necessary libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score

In [None]:
#train_df, test_df = train_test_split(useful_review_df, test_size=0.2, random_state=42)

In [None]:
# Two-step pipeline: TF-IDF features followed by a multinomial Naive Bayes classifier
nb_steps = [
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB()),
]
nb_pipeline = Pipeline(nb_steps)

# Hyperparameter grid; keys use the '<step>__<parameter>' naming of the pipeline steps
param_grid = {
    # TF-IDF vectorizer
    'tfidf__max_features': [5000, 10000],     # vocabulary size
    'tfidf__ngram_range': [(1, 1), (1, 2)],   # unigrams only, or unigrams + bigrams
    'tfidf__stop_words': [None, 'english'],   # keep or drop English stop words
    'tfidf__min_df': [1, 3],                  # minimum document frequency
    'tfidf__max_df': [0.7, 1.0],              # maximum document frequency
    # Naive Bayes
    'nb__alpha': [0.1, 0.5],                  # smoothing parameter
}

In [None]:
# 3-fold cross-validated grid search over param_grid, scored by weighted F1, using all CPU cores
grid_search = GridSearchCV(
    estimator=nb_pipeline,
    param_grid=param_grid,
    scoring='f1_weighted',
    cv=3,
    n_jobs=-1,
)
grid_search.fit(train_df['text'], train_df['label'])

In [None]:
# Report the winning hyperparameter combination
print("Best Parameters:", grid_search.best_params_)

# Keep the pipeline that grid search refit with those parameters
best_model = grid_search.best_estimator_

In [None]:
# Score the tuned Naive Bayes pipeline on the held-out test rows
y_true = test_df['label']
y_pred = best_model.predict(test_df['text'])

accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Classification Report:\n", classification_report(y_true, y_pred))

In [None]:
# Pipeline: TF-IDF (unigrams + bigrams, 5000 features) feeding an RBF-kernel SVM
svm_tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    min_df=3,
    max_df=0.7,
)
svm_classifier = SVC(C=1.0, kernel='rbf', gamma='scale')
svm_pipeline = Pipeline([
    ('tfidf', svm_tfidf),
    ('svm_rbf', svm_classifier),
])

# Fit on the training text; the vectorizer is fitted inside the pipeline
svm_pipeline.fit(train_df['text'], train_df['label'])

In [None]:
# Predict labels for the test texts with the fitted SVM pipeline
predictions_svm_rbf = svm_pipeline.predict(test_df['text'])

# Accuracy, printed as a percentage
svm_true_labels = test_df['label']
accuracy = accuracy_score(svm_true_labels, predictions_svm_rbf)
print("Accuracy ", accuracy * 100)

# Per-class precision / recall / F1 report
print("Classification Report:\n", classification_report(svm_true_labels, predictions_svm_rbf))

In [None]:
#test_df.shape