# Detection of Fraud Reviews: Exploratory Data Analysis and Pre-Processing

# Loading libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns

# Pre-processing
import re
import nltk
import demoji
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('omw-1.4')


from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Reading datasets

## Fake Reviews dataset

In [2]:
# Path to your CSV file
fake_reviews_path = '/Users/ShanShan/Fake-Reviews-Detection/Dataset/fake reviews dataset.csv'

# Read the CSV file
fake_reviews_df = pd.read_csv(fake_reviews_path)

In [None]:
fake_reviews_df.info()

In [None]:
fake_reviews_df.shape

In [None]:
fake_reviews_df.columns

In [None]:
# Display the first few rows
fake_reviews_df.head()

## Yelp dataset

In [7]:
# Path to your Excel file
yelp_path = '/Users/ShanShan/Fake-Reviews-Detection/Dataset/Yelp Labelled Review Dataset with Sentiments and Features.xlsx'

# Read the Excel file
yelp_df = pd.read_excel(yelp_path, engine='openpyxl')

In [None]:
yelp_df.info()

In [None]:
yelp_df.shape

In [None]:
yelp_df.columns

In [None]:
# Display the first few rows
yelp_df.head()

# Check for missing values

## Fake Reviews dataset

In [None]:
# Check for missing values
fake_reviews_df.isnull().sum()

## Yelp dataset

In [None]:
# Check for missing values
yelp_df.isnull().sum()

# Check for duplicated values

## Fake Reviews dataset

In [None]:
fake_reviews_df_duplicates = fake_reviews_df.duplicated().sum()

print("Number of duplicates in dataset:", fake_reviews_df_duplicates)

In [None]:
# If you want to see the actual duplicate rows
fake_reviews_duplicate_rows = fake_reviews_df[fake_reviews_df.duplicated(keep=False)]

print("Duplicate rows in the dataset:\n", fake_reviews_duplicate_rows)

Drop the duplicate rows

In [None]:
# Drop duplicates while keeping the first occurrence
fake_reviews_df = fake_reviews_df.drop_duplicates()

# Check the new count of duplicates to confirm removal
print("Number of duplicates after dropping: ", fake_reviews_df.duplicated().sum())

## Yelp dataset

In [None]:
yelp_df_duplicates = yelp_df.duplicated().sum()

print("Number of duplicates in dataset: ", yelp_df_duplicates)

# Describing dataset

## Fake Reviews dataset

In [None]:
fake_reviews_df.describe()

## Yelp dataset

In [None]:
yelp_df.describe()

# Distribution of ratings

## Fake Reviews dataset

In [None]:
# Rating distribution of the Fake Reviews dataset: a count plot on the left
# and a percentage pie chart on the right, sharing one colour per rating.
plt.figure(figsize=(14, 6))

# Take five colours from the Set1 palette
set1_colors = sns.color_palette("Set1", n_colors=5)

# Fixed rating -> colour mapping so both charts use the same colours
rating_colors_fake_reviews = {
    1.0: set1_colors[0],  # First Set1 colour (red)
    2.0: set1_colors[1],  # Second Set1 colour (blue)
    3.0: set1_colors[2],  # Third Set1 colour (green)
    4.0: set1_colors[3],  # Fourth Set1 colour (purple)
    5.0: set1_colors[4]   # Fifth Set1 colour (orange)
}

# Left panel: number of reviews per rating
ax_fake_reviews = plt.subplot(1, 2, 1)  # 1 row, 2 columns, 1st subplot
sns.countplot(x='rating', data=fake_reviews_df, hue='rating', palette=rating_colors_fake_reviews)

plt.title("Distribution of Ratings (Fake Reviews Dataset)")
plt.xlabel("Ratings")
plt.ylabel("Count")

# Write the count above each bar. get_height() may return a float, so the
# value is cast to int to print "123" instead of "123.0". Patches with a
# height of zero are skipped.
for p in ax_fake_reviews.patches:
    height = p.get_height()
    if height > 0:
        ax_fake_reviews.annotate(f'{int(height)}',
                                 (p.get_x() + p.get_width() / 2., height),
                                 ha='center', va='baseline',
                                 fontsize=8, color='black', xytext=(0, 5),
                                 textcoords='offset points')

# Place the legend outside the axes, to the right
plt.legend(title='Ratings', loc='upper left', bbox_to_anchor=(1, 1))

# Right panel: share of each rating as a pie chart
plt.subplot(1, 2, 2)  # 1 row, 2 columns, 2nd subplot
rating_counts_fake_reviews = fake_reviews_df['rating'].value_counts()
wedges_fake_reviews, texts_fake_reviews, autotexts_fake_reviews = plt.pie(
    rating_counts_fake_reviews,
    autopct='%1.1f%%',
    startangle=140,
    # Look the colours up by rating so the wedges match the bars
    colors=[rating_colors_fake_reviews[rating] for rating in rating_counts_fake_reviews.index]
)

plt.title("Percentage Distribution of Ratings (Fake Reviews Dataset)")

# Legend entries follow the value_counts() order of the wedges
plt.legend(wedges_fake_reviews, rating_counts_fake_reviews.index, title="Ratings", loc="best", bbox_to_anchor=(1, 0, 0.5, 1))

# Adjust the layout to make room for the legends, then render
plt.tight_layout()
plt.show()


## Yelp dataset

In [None]:
# Rating distribution of the Yelp dataset: a count plot on the left and a
# percentage pie chart on the right, sharing one colour per rating.
plt.figure(figsize=(14, 6))

# Take five colours from the Set1 palette
set1_colors = sns.color_palette("Set1", n_colors=5)

# Fixed rating -> colour mapping so both charts use the same colours
rating_colors_yelp = {
    1.0: set1_colors[0],  # First Set1 colour (red)
    2.0: set1_colors[1],  # Second Set1 colour (blue)
    3.0: set1_colors[2],  # Third Set1 colour (green)
    4.0: set1_colors[3],  # Fourth Set1 colour (purple)
    5.0: set1_colors[4]   # Fifth Set1 colour (orange)
}

# Left panel: number of reviews per rating
ax_yelp = plt.subplot(1, 2, 1)  # 1 row, 2 columns, 1st subplot
sns.countplot(x='Rating', data=yelp_df, hue='Rating', palette=rating_colors_yelp)

plt.title("Distribution of Ratings (Yelp Dataset)")
plt.xlabel("Ratings")
plt.ylabel("Count")

# Write the count above each bar. get_height() may return a float, so the
# value is cast to int to print "123" instead of "123.0". Patches with a
# height of zero are skipped.
for p in ax_yelp.patches:
    height = p.get_height()
    if height > 0:
        ax_yelp.annotate(f'{int(height)}',
                         (p.get_x() + p.get_width() / 2., height),
                         ha='center', va='baseline',
                         fontsize=8, color='black', xytext=(0, 5),
                         textcoords='offset points')

# Place the legend outside the axes, to the right
plt.legend(title='Ratings', loc='upper left', bbox_to_anchor=(1, 1))

# Right panel: share of each rating as a pie chart
plt.subplot(1, 2, 2)  # 1 row, 2 columns, 2nd subplot
rating_counts_yelp = yelp_df['Rating'].value_counts()
wedges_yelp, texts_yelp, autotexts_yelp = plt.pie(
    rating_counts_yelp,
    autopct='%1.1f%%',
    startangle=140,
    # Look the colours up by rating so the wedges match the bars
    colors=[rating_colors_yelp[rating] for rating in rating_counts_yelp.index]
)

plt.title("Percentage Distribution of Ratings (Yelp Dataset)")

# Legend entries follow the value_counts() order of the wedges
plt.legend(wedges_yelp, rating_counts_yelp.index, title="Ratings", loc="best", bbox_to_anchor=(1, 0, 0.5, 1))

# Adjust the layout to make room for the legends, then render
plt.tight_layout()
plt.show()


# Distribution of target variable

## Fake Reviews dataset

In [None]:
# Label distribution of the Fake Reviews dataset: count plot plus pie chart.

# One Set1 colour per distinct label value
unique_labels_fake_reviews = fake_reviews_df['label'].unique()
colors_fake_review_label = sns.color_palette("Set1", n_colors=len(unique_labels_fake_reviews))

# label -> colour lookup shared by both charts
label_color_map_fake_reviews = dict(zip(unique_labels_fake_reviews, colors_fake_review_label))

plt.figure(figsize=(14, 6))

# Left panel: number of reviews per label
ax_fake_reviews = plt.subplot(1, 2, 1)  # 1 row, 2 columns, 1st subplot
sns.countplot(x='label', data=fake_reviews_df, hue='label', palette=label_color_map_fake_reviews)

plt.title("Distribution of Labels (Fake Reviews Dataset)")
plt.xlabel("Label Classification")
plt.ylabel("Count")

# Write the count above each bar. get_height() may return a float, so the
# value is cast to int to print "123" instead of "123.0". Patches with a
# height of zero are skipped.
for p in ax_fake_reviews.patches:
    height = p.get_height()
    if height > 0:
        ax_fake_reviews.annotate(f'{int(height)}',
                                 (p.get_x() + p.get_width() / 2., height),
                                 ha='center', va='baseline',
                                 fontsize=8, color='black', xytext=(0, 5),
                                 textcoords='offset points')

# Build the legend by hand: one round marker per label in its mapped colour
handles_fake_reviews = [plt.Line2D([0], [0], marker='o', color='w', label=label,
                       markerfacecolor=label_color_map_fake_reviews[label]) for label in unique_labels_fake_reviews]
plt.legend(handles=handles_fake_reviews, title='Label Classification', loc='upper left', bbox_to_anchor=(1, 1))

# Right panel: share of each label as a pie chart
plt.subplot(1, 2, 2)  # 1 row, 2 columns, 2nd subplot
label_counts = fake_reviews_df['label'].value_counts()
wedges_fake_review, texts_fake_review, autotexts_fake_review = plt.pie(
    label_counts,
    autopct='%1.1f%%',
    startangle=140,
    # Look the colours up by label so the wedges match the bars
    colors=[label_color_map_fake_reviews[label] for label in label_counts.index]
)

plt.title("Percentage Distribution of Labels (Fake Reviews Dataset)")

# Legend entries follow the value_counts() order of the wedges
plt.legend(wedges_fake_review, label_counts.index, title="Label Classification", loc="best", bbox_to_anchor=(1, 0, 0.5, 1))

# Adjust the layout to make room for the legends, then render
plt.tight_layout()
plt.show()


## Yelp dataset

In [None]:
# Spam / not-spam distribution of the Yelp dataset: count plot plus pie chart.

# One Set1 colour per distinct label value
unique_labels_yelp = yelp_df['Spam(1) and Not Spam(0)'].unique()
colors_yelp_label = sns.color_palette("Set1", n_colors=len(unique_labels_yelp))

# label -> colour lookup shared by both charts
label_color_map_yelp = dict(zip(unique_labels_yelp, colors_yelp_label))

plt.figure(figsize=(14, 6))

# Left panel: number of reviews per label
ax_yelp = plt.subplot(1, 2, 1)  # 1 row, 2 columns, 1st subplot
sns.countplot(x='Spam(1) and Not Spam(0)', data=yelp_df, hue='Spam(1) and Not Spam(0)', palette=label_color_map_yelp)

plt.title("Distribution of Spam and Not Spam (Yelp Dataset)")
plt.xlabel("Spam Classification")
plt.ylabel("Count")

# Write the count above each bar. get_height() may return a float, so the
# value is cast to int to print "123" instead of "123.0". Patches with a
# height of zero are skipped.
for p in ax_yelp.patches:
    height = p.get_height()
    if height > 0:
        ax_yelp.annotate(f'{int(height)}',
                         (p.get_x() + p.get_width() / 2., height),
                         ha='center', va='baseline',
                         fontsize=8, color='black', xytext=(0, 5),
                         textcoords='offset points')

# Build the legend by hand: one round marker per label in its mapped colour.
# Labels are converted with str() for the legend text.
handles_yelp = [plt.Line2D([0], [0], marker='o', color='w', label=str(label),
                             markerfacecolor=label_color_map_yelp[label]) for label in unique_labels_yelp]
plt.legend(handles=handles_yelp, title='Spam Classification', loc='upper left', bbox_to_anchor=(1, 1))

# Right panel: share of each label as a pie chart
plt.subplot(1, 2, 2)  # 1 row, 2 columns, 2nd subplot
spam_counts = yelp_df['Spam(1) and Not Spam(0)'].value_counts()
wedges_yelp, texts_yelp, autotexts_yelp = plt.pie(
    spam_counts,
    autopct='%1.1f%%',
    startangle=140,
    # Look the colours up by label so the wedges match the bars
    colors=[label_color_map_yelp[label] for label in spam_counts.index]
)

plt.title("Percentage Distribution of Spam and Not Spam (Yelp Dataset)")

# Legend entries follow the value_counts() order of the wedges
plt.legend(wedges_yelp, spam_counts.index, title="Spam Classification", loc="best", bbox_to_anchor=(1, 0, 0.5, 1))

# Adjust the layout to make room for the legends, then render
plt.tight_layout()
plt.show()


# Graphs between ratings and target variable

## Distribution of ratings by target variable

In [None]:
# Ratings split by target variable, one panel per dataset.
plt.figure(figsize=(12, 6))

# Left panel: Fake Reviews ratings, coloured by label
plt.subplot(1, 2, 1)  # 1 row, 2 columns, 1st subplot
sns.countplot(x='rating', hue='label', data=fake_reviews_df, palette='Set1')
plt.title("Distribution of Ratings by Target Variable (Fake Reviews Dataset)")
plt.xlabel("Ratings")
plt.ylabel("Count")
plt.legend(title='Label Classification')

# Write the count above each bar of the current axes. The height is cast to
# int so the label reads "123" rather than "123.0"; zero-height patches are
# skipped.
for p in plt.gca().patches:
    height = p.get_height()
    if height > 0:
        plt.annotate(f'{int(height)}',
                     (p.get_x() + p.get_width() / 2., height),
                     ha='center', va='baseline', fontsize=6,
                     color='black', xytext=(0, 5),
                     textcoords='offset points')

# Right panel: Yelp ratings, coloured by spam flag
plt.subplot(1, 2, 2)  # 1 row, 2 columns, 2nd subplot
sns.countplot(x='Rating', hue='Spam(1) and Not Spam(0)', data=yelp_df, palette='Set1')
plt.title("Distribution of Ratings by Target Variable (Yelp Dataset)")
plt.xlabel("Ratings")
plt.ylabel("Count")
plt.legend(title='Spam Classification')

# Same integer count labels for the Yelp panel (slightly larger font)
for p in plt.gca().patches:
    height = p.get_height()
    if height > 0:
        plt.annotate(f'{int(height)}',
                     (p.get_x() + p.get_width() / 2., height),
                     ha='center', va='baseline', fontsize=8,
                     color='black', xytext=(0, 5),
                     textcoords='offset points')

# Adjust layout and show the plot
plt.tight_layout()
plt.show()

## Distribution of target variable by ratings

In [None]:
# Target variable split by rating, one panel per dataset.
plt.figure(figsize=(12, 6))

# Left panel: Fake Reviews labels, coloured by rating
plt.subplot(1, 2, 1)  # 1 row, 2 columns, 1st subplot
sns.countplot(x='label', hue='rating', data=fake_reviews_df, palette='Set1')
plt.title("Distribution of Target Variable by Ratings (Fake Reviews Dataset)")
plt.xlabel("Label Classification")
plt.ylabel("Count")
plt.legend(title='Ratings')

# Write the count above each bar of the current axes. The height is cast to
# int so the label reads "123" rather than "123.0"; zero-height patches are
# skipped.
for p in plt.gca().patches:
    height = p.get_height()
    if height > 0:
        plt.annotate(f'{int(height)}',
                     (p.get_x() + p.get_width() / 2., height),
                     ha='center', va='baseline', fontsize=8,
                     color='black', xytext=(0, 5),
                     textcoords='offset points')

# Right panel: Yelp spam flag, coloured by rating
plt.subplot(1, 2, 2)  # 1 row, 2 columns, 2nd subplot
sns.countplot(x='Spam(1) and Not Spam(0)', hue='Rating', data=yelp_df, palette='Set1')
plt.title("Distribution of Target Variable by Ratings (Yelp Dataset)")
plt.xlabel("Spam Classification")
plt.ylabel("Count")
plt.legend(title='Ratings')

# Same integer count labels for the Yelp panel
for p in plt.gca().patches:
    height = p.get_height()
    if height > 0:
        plt.annotate(f'{int(height)}',
                     (p.get_x() + p.get_width() / 2., height),
                     ha='center', va='baseline', fontsize=8,
                     color='black', xytext=(0, 5),
                     textcoords='offset points')

# Adjust layout and show the plot
plt.tight_layout()
plt.show()

# Other distributions for Fake Reviews dataset

## Graphs between category and target variable

Distribution of category

In [None]:
# Bar chart of how many reviews fall into each product category
# (Fake Reviews dataset).

# Number of reviews per category, most frequent first
category_counts = fake_reviews_df['category'].value_counts()

# One Set1 colour per distinct category
colors_fake_review_category = sns.color_palette("Set1", n_colors=fake_reviews_df['category'].nunique())

plt.figure(figsize=(10, 6))
bars = plt.bar(category_counts.index, category_counts.values, color=colors_fake_review_category)

plt.xlabel("Categories")
plt.ylabel("Count")
plt.title("Distribution of Categories (Fake Reviews Dataset)")

# Print the integer count centred just above every bar
for bar in bars:
    yval = bar.get_height()
    x_center = bar.get_x() + bar.get_width()/2
    plt.text(x_center, yval, int(yval), ha='center', va='bottom')

# Slant the category names so they do not overlap
plt.xticks(rotation=45, ha='right')

# Legend outside the axes: one entry per bar, labelled with its category name
legend_labels = category_counts.index
plt.legend(bars, legend_labels, title="Categories", bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()


Distribution of category by target variable

In [None]:
# Review counts per category, split by label (Fake Reviews dataset).
plt.figure(figsize=(12, 6))

sns.countplot(x='category', hue='label', data=fake_reviews_df, palette='Set1')

plt.title("Distribution of Categories by Target Variable (Fake Reviews Dataset)")
plt.xlabel("Categories")
plt.ylabel("Count")

# Write the count above each bar. The height is cast to int so the label
# reads "123" rather than "123.0"; zero-height patches are skipped.
for p in plt.gca().patches:
    height = p.get_height()
    if height > 0:
        plt.annotate(f'{int(height)}',
                     (p.get_x() + p.get_width() / 2., height),
                     ha='center', va='baseline', fontsize=6,
                     color='black', xytext=(0, 5),
                     textcoords='offset points')

# Slant the category names so they do not overlap
plt.xticks(rotation=45, ha='right')

# Legend outside the axes, to the right
plt.legend(title='Label Classification', bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()

Distribution of target variable by category 

In [None]:
# Review counts per label, split by category (Fake Reviews dataset).
plt.figure(figsize=(12, 6))

sns.countplot(x='label', hue='category', data=fake_reviews_df, palette='Set1')

plt.title("Distribution of Target Variable by Categories (Fake Reviews Dataset)")
plt.xlabel("Label Classification")
plt.ylabel("Count")

# Write the count above each bar. The height is cast to int so the label
# reads "123" rather than "123.0"; zero-height patches are skipped.
for p in plt.gca().patches:
    height = p.get_height()
    if height > 0:
        plt.annotate(f'{int(height)}',
                     (p.get_x() + p.get_width() / 2., height),
                     ha='center', va='baseline', fontsize=6,
                     color='black', xytext=(0, 5),
                     textcoords='offset points')

# Slant the x tick labels
plt.xticks(rotation=45, ha='right')

# Legend outside the axes, to the right
plt.legend(title='Categories', bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()


# Other distributions for Yelp dataset

## Graphs between reviews and date

Distribution of reviews by year

In [None]:
# Number of Yelp reviews per calendar year.

# Derive the year; the .dt accessor assumes 'Date' was loaded as a datetime
# column -- TODO confirm against the Excel file.
yelp_df['Year'] = yelp_df['Date'].dt.year

plt.figure(figsize=(12, 6))

# hue is set to the same column as x so that a palette can be applied to the
# bars; NOTE(review): presumably this is what silences seaborn's
# "palette without hue" deprecation warning -- confirm for the installed version.
sns.countplot(x='Year', data=yelp_df, palette='Set1', hue='Year')

plt.title("Distribution of Reviews by Year (Yelp Dataset)")
plt.xlabel("Year")
plt.ylabel("Count")

# Write the count above each bar. The height is cast to int so the label
# reads "123" rather than "123.0"; zero-height patches are skipped.
for p in plt.gca().patches:
    height = p.get_height()
    if height > 0:
        plt.annotate(f'{int(height)}',
                     (p.get_x() + p.get_width() / 2., height),
                     ha='center', va='baseline', fontsize=8,
                     color='black', xytext=(0, 5),
                     textcoords='offset points')

plt.tight_layout()
plt.show()


Distribution of reviews by year and month

In [None]:
# Monthly review volume for the Yelp dataset, drawn as a line chart.

# Bucket every review into its calendar month (monthly period)
yelp_df['YearMonth'] = yelp_df['Date'].dt.to_period('M')

# Reviews per month in chronological order, then switch the index from
# periods to timestamps for plotting
reviews_per_month = yelp_df['YearMonth'].value_counts().sort_index()
reviews_per_month.index = reviews_per_month.index.to_timestamp()

plt.figure(figsize=(12, 6))
sns.lineplot(x=reviews_per_month.index, y=reviews_per_month.values, marker='o')

plt.xlabel("Year-Month")
plt.ylabel("Number of Reviews")
plt.title("Distribution of Reviews by Year-Month (Yelp Dataset)")

# Print the count just above every data point
for month_start, review_count in zip(reviews_per_month.index, reviews_per_month.values):
    plt.text(month_start, review_count, str(review_count), fontsize=8, ha='center', va='bottom')

# One tick per month, labelled as YYYY-MM and rotated to fit
month_labels = reviews_per_month.index.strftime('%Y-%m')
plt.xticks(reviews_per_month.index, month_labels, rotation=90)

plt.tight_layout()
plt.show()


## Graphs between date and target variable

Distribution of year by target variable and Distribution of target variable by year

In [None]:
# Year vs. spam flag for the Yelp dataset, shown both ways round.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Left panel: reviews per year, coloured by spam flag
sns.countplot(x='Year', hue='Spam(1) and Not Spam(0)', data=yelp_df, palette='Set1', ax=axes[0])
axes[0].set_title("Distribution of Year by Target Variable (Yelp Dataset)")
axes[0].set_xlabel("Year")
axes[0].set_ylabel("Count")

# Write the count above each bar. The height is cast to int so the label
# reads "123" rather than "123.0"; zero-height patches are skipped.
for p in axes[0].patches:
    height = p.get_height()
    if height > 0:
        axes[0].annotate(f'{int(height)}',
                         (p.get_x() + p.get_width() / 2., height),
                         ha='center', va='baseline', fontsize=6,
                         color='black', xytext=(0, 5),
                         textcoords='offset points')

# Keep the x tick labels horizontal
axes[0].tick_params(axis='x', rotation=0)

# Legend outside the first panel
axes[0].legend(title='Spam Classification', bbox_to_anchor=(1.05, 1), loc='upper left')

# Right panel: reviews per spam flag, coloured by year
sns.countplot(x='Spam(1) and Not Spam(0)', hue='Year', data=yelp_df, palette='Set1', ax=axes[1])
axes[1].set_title("Distribution of Target Variable by Year (Yelp Dataset)")
axes[1].set_xlabel("Spam Classification")
axes[1].set_ylabel("Count")

# Same integer count labels for the second panel
for p in axes[1].patches:
    height = p.get_height()
    if height > 0:
        axes[1].annotate(f'{int(height)}',
                         (p.get_x() + p.get_width() / 2., height),
                         ha='center', va='baseline', fontsize=6,
                         color='black', xytext=(0, 5),
                         textcoords='offset points')

# Keep the x tick labels horizontal
axes[1].tick_params(axis='x', rotation=0)

# Legend outside the second panel
axes[1].legend(title='Year', bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()


## Graphs between sentiment and target variable

Distribution of sentiment

In [None]:
# Distribution of the 'Sentiment' column in the Yelp dataset.

# Distinct sentiment values in order of first appearance
unique_sentiments = yelp_df['Sentiment'].unique()

# Use one explicit category order (missing values excluded) for the bars, the
# colours and the legend, so that each legend entry is guaranteed to describe
# the bar with the same colour, whatever the number of distinct sentiments.
sentiment_order = [sentiment for sentiment in unique_sentiments if pd.notna(sentiment)]

# Draw the bars in that order, without a palette (they are coloured below)
ax_yelp = sns.countplot(x='Sentiment', data=yelp_df, order=sentiment_order)

# One Set1 colour per sentiment, applied bar by bar
colors_yelp_sentiment = sns.color_palette("Set1", n_colors=len(sentiment_order))
for bar, bar_color in zip(ax_yelp.patches, colors_yelp_sentiment):
    bar.set_color(bar_color)

plt.title("Distribution of Sentiment in Yelp Dataset")
plt.xlabel("Sentiment")
plt.ylabel("Count")

# Write the count above each bar. The height is cast to int so the label
# reads "123" rather than "123.0"; zero-height patches are skipped.
for p in ax_yelp.patches:
    height = p.get_height()
    if height > 0:
        ax_yelp.annotate(f'{int(height)}',
                         (p.get_x() + p.get_width() / 2., height),
                         ha='center', va='baseline',
                         fontsize=8, color='black', xytext=(0, 5),
                         textcoords='offset points')

# Legend built from the same (sentiment, colour) pairs used for the bars
handles = [mpatches.Patch(color=bar_color, label=sentiment)
           for sentiment, bar_color in zip(sentiment_order, colors_yelp_sentiment)]
plt.legend(handles=handles, title='Sentiment', loc='upper left', bbox_to_anchor=(1, 1))

plt.tight_layout()
plt.show()


Distribution of sentiment by target variable and Distribution of target variable by sentiment 

In [None]:
# Sentiment vs. spam flag for the Yelp dataset, shown both ways round.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Left panel: reviews per sentiment, coloured by spam flag
sns.countplot(x='Sentiment', hue='Spam(1) and Not Spam(0)', data=yelp_df, palette='Set1', ax=axes[0])
axes[0].set_title("Distribution of Sentiment by Target Variable (Yelp Dataset)")
axes[0].set_xlabel("Sentiment")
axes[0].set_ylabel("Count")

# Write the count above each bar. The height is cast to int so the label
# reads "123" rather than "123.0"; zero-height patches are skipped.
for p in axes[0].patches:
    height = p.get_height()
    if height > 0:
        axes[0].annotate(f'{int(height)}',
                         (p.get_x() + p.get_width() / 2., height),
                         ha='center', va='baseline', fontsize=6,
                         color='black', xytext=(0, 5),
                         textcoords='offset points')

# Keep the x tick labels horizontal
axes[0].tick_params(axis='x', rotation=0)

# Legend outside the first panel
axes[0].legend(title='Spam Classification', bbox_to_anchor=(1.05, 1), loc='upper left')

# Right panel: reviews per spam flag, coloured by sentiment
sns.countplot(x='Spam(1) and Not Spam(0)', hue='Sentiment', data=yelp_df, palette='Set1', ax=axes[1])
axes[1].set_title("Distribution of Target Variable by Sentiment (Yelp Dataset)")
axes[1].set_xlabel("Spam Classification")
axes[1].set_ylabel("Count")

# Same integer count labels for the second panel
for p in axes[1].patches:
    height = p.get_height()
    if height > 0:
        axes[1].annotate(f'{int(height)}',
                         (p.get_x() + p.get_width() / 2., height),
                         ha='center', va='baseline', fontsize=6,
                         color='black', xytext=(0, 5),
                         textcoords='offset points')

# Keep the x tick labels horizontal
axes[1].tick_params(axis='x', rotation=0)

# Legend outside the second panel
axes[1].legend(title='Sentiment', bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()


# Pre-processing review text

preprocess_text:

1. Converting to Lowercase
2. Removing Emojis
3. Normalizing Contractions
4. Removing Punctuation
5. Removing Links
6. Removing Special Characters
7. Removing Mentions
8. Removing Line Breaks
9. Removing Non-ASCII Characters
10. Removing Hashtags
11. Removing Words Containing `$` or `&`
12. Removing Extra Whitespace

lemmatize_and_remove_stopwords:
1. Tokenization
2. Lemmatization
3. Removing Stopwords

In [34]:
def remove_emojis(text):
    """Return ``text`` with each emoji found by ``demoji`` replaced by an empty string."""
    emoji_free = demoji.replace(text, '')
    return emoji_free

def clean_hashtags(review):
    """Strip hashtags from a review.

    Hashtags that form a trailing run at the end of the text are removed
    completely (the literal tag ``#hashtag`` is excluded from this step).
    For any other hashtag only the ``#`` marker is dropped and the word is
    kept. Underscores are treated like ``#`` and become word breaks.
    """
    # The lookahead only allows further hashtags and whitespace up to the end
    # of the string, so this pattern matches the trailing run of tags only.
    trailing_tag = r'#(?!(?:hashtag)\b)[\w\'-]+(?=(?:\s+#[\w\'-]+)*\s*$)'
    pieces = [piece.strip() for piece in re.split(trailing_tag, review)]
    without_trailing_tags = " ".join(pieces)

    # Break the text at every remaining '#' or '_' and re-join with spaces.
    fragments = [fragment.strip() for fragment in re.split(r'#|_', without_trailing_tags)]
    return " ".join(fragments)

def filter_chars(a):
    """Blank out every space-separated token that contains ``$`` or ``&``.

    The text is split on single spaces; offending tokens are replaced by an
    empty string (not removed), so the surrounding spaces are preserved.
    """
    kept = ['' if ('$' in token or '&' in token) else token for token in a.split(' ')]
    return ' '.join(kept)

def remove_mult_spaces(text):
    """Collapse every run of two or more whitespace characters into one space."""
    whitespace_run = re.compile(r"\s\s+")
    collapsed = whitespace_run.sub(" ", text)
    return collapsed

# Contraction -> expansion table, built once at import time instead of on
# every call.
_CONTRACTIONS = {
    "can't": "cannot",
    "won't": "will not",
    "don't": "do not",
    "doesn't": "does not",
    "didn't": "did not",
    "haven't": "have not",
    "hasn't": "has not",
    "isn't": "is not",
    "aren't": "are not",
    "we're": "we are",
    "they're": "they are",
    "i'm": "I am",
    "you're": "you are",
    "it's": "it is",
    "he's": "he is",
    "she's": "she is",
    "i've": "I have",
    "you've": "you have",
    "they've": "they have",
    "that's": "that is",
    "what's": "what is",
    "where's": "where is",
    "who's": "who is",
    "let's": "let us",
    "o'clock": "of the clock",
    "wouldn't": "would not",
    "shouldn't": "should not",
    "couldn't": "could not",
    "would've": "would have",
    "i'd": "I would",
    "i'll": "I will",
    "there's": "there is",
}

# A single alternation over all contractions, anchored on word boundaries so
# that a contraction is only matched as a whole word (e.g. "let's" must not
# match inside "tablet's"). The apostrophe may be the ASCII ' or the
# typographic U+2019. Longer contractions are listed first.
_CONTRACTION_PATTERN = re.compile(
    r"\b(?:"
    + "|".join(
        "['\u2019]".join(re.escape(part) for part in contraction.split("'"))
        for contraction in sorted(_CONTRACTIONS, key=len, reverse=True)
    )
    + r")\b",
    re.IGNORECASE,
)


def normalize_contractions(text):
    """Expand common English contractions in ``text``.

    Contractions are matched as whole words, case-insensitively, with either
    a straight or a typographic apostrophe, and replaced by the expansion
    from the table (for example ``"can't"`` becomes ``"cannot"`` and
    ``"i'm"`` becomes ``"I am"``). Text that merely contains a contraction
    as a substring, such as the possessive ``"tablet's"``, is left untouched.

    Args:
        text: The string to normalise.

    Returns:
        The string with every listed contraction expanded.
    """
    def _expand(match):
        # Normalise the matched text to the table's key form (lowercase,
        # straight apostrophe). Fall back to the original text if a
        # case-insensitive match has no table entry.
        key = match.group(0).lower().replace('\u2019', "'")
        return _CONTRACTIONS.get(key, match.group(0))

    return _CONTRACTION_PATTERN.sub(_expand, text)

def normalize_punctuation(text):
    """Collapse repeated ``.``, ``!`` and ``?`` and add a space after them.

    Runs of periods and runs of the same mark are reduced to one character,
    then a space is inserted after any of these marks that is directly
    followed by a non-whitespace character.
    """
    rules = (
        (r'\.{2,}', '.'),              # ellipses -> single period
        (r'([!?.])\1+', r'\1'),        # "!!!" / "???" -> single mark
        (r'([!?.])(?=\S)', r'\1 '),    # ensure a space after the mark
    )
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text)
    return text

def preprocess_text(text: str, remove_stopwords: bool = True) -> str:
    """Clean one raw review string.

    Steps, in order: drop digits, lowercase, remove links and @mentions,
    remove emojis, expand contractions, normalise then remove punctuation,
    remove double quotes and dashes, remove a few special characters, turn
    line breaks into spaces, drop non-ASCII characters, clean hashtags, blank
    out tokens containing ``$`` or ``&``, and collapse repeated whitespace.

    Args:
        text: Raw review text. Any non-string value (for example NaN, None
            or a number) and any purely numeric string yields ``""``.
        remove_stopwords: Not used by this function; kept so existing
            callers keep working. Stop words are removed by
            ``lemmatize_and_remove_stopwords``.

    Returns:
        The cleaned, lower-cased text without leading or trailing whitespace.
    """
    # Non-text cells and purely numeric strings carry no review text.
    if not isinstance(text, str) or text.replace('.', '', 1).isdigit():
        return ""

    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Convert to lowercase
    text = text.lower()
    # Remove links and @mentions. This must run before any punctuation
    # handling: once ':' and '.' have been stripped, "https://x.com" becomes
    # "https//xcom" and the pattern can no longer recognise it.
    text = re.sub(r"(?:\@|https?\://)\S+", "", text)
    # Remove emojis
    text = remove_emojis(text)
    # Expand contractions; expansions such as "I am" reintroduce capitals,
    # so lowercase again to keep the output uniformly lower-case.
    text = normalize_contractions(text).lower()
    # Collapse repeated punctuation and make sure a space follows it, so that
    # removing the marks below does not glue two words together
    text = normalize_punctuation(text)
    text = re.sub(r'[!.,?;:]', '', text)
    # Remove double quotes
    text = re.sub(r'"', '', text)
    # Collapse runs of dashes, then replace every dash (together with any
    # surrounding whitespace) by a single space
    text = re.sub(r'-{2,}', '-', text)
    text = re.sub(r'\s*-\s*', ' ', text)
    # Remove specific special characters.
    # NOTE(review): the non-ASCII characters in this class look like mojibake
    # (possibly of the multiplication and division signs) -- confirm. The
    # result is unaffected either way because every non-ASCII character is
    # dropped two steps below.
    text = re.sub(r'[\\/ร\^\]:,.\[รท]', '', text)
    # Turn line breaks into spaces so the words on either side stay separate
    text = text.replace('\r', ' ').replace('\n', ' ')
    # Remove all non-ASCII characters
    text = re.sub(r'[^\x00-\x7f]', r'', text)
    # Clean hashtags
    text = clean_hashtags(text)
    # Blank out tokens containing '$' or '&'
    text = filter_chars(text)
    # Collapse repeated whitespace and trim the ends
    text = remove_mult_spaces(text)

    return text.strip()

def lemmatize_and_remove_stopwords(text, remove_stopwords: bool = True):
    """Tokenise ``text``, drop noise tokens and stopwords, and lemmatise the rest.

    Args:
        text: Text to process (normally the output of ``preprocess_text``).
        remove_stopwords: When False the text is returned untouched - no
            tokenisation, lemmatisation or filtering is performed.

    Returns:
        The surviving lemmas, lowercased and joined by single spaces, or the
        original ``text`` when ``remove_stopwords`` is False.
    """
    # Nothing to do: return before touching any NLTK resource.
    if not remove_stopwords:
        return text

    # Used as a plain set of characters (membership test), not as a regex.
    special_characters = r'[@_!#$%^&*()<>?/\|}{~:]'
    stop_words = set(stopwords.words('english'))
    lemmatizer = nltk.WordNetLemmatizer()

    kept = []
    for token in nltk.word_tokenize(text):
        # Skip tokens made only of special characters, and pure numbers.
        if token.isdigit() or all(ch in special_characters for ch in token):
            continue
        # Test the surface form first: the lemmatizer can rewrite a stopword
        # into something the stopword list no longer recognises (e.g. "was"
        # coming back as "wa"), which would otherwise slip through.
        if token.lower() in stop_words:
            continue
        lemma = lemmatizer.lemmatize(token)
        # Also drop tokens whose lemma is itself a stopword.
        if lemma.lower() in stop_words:
            continue
        kept.append(lemma.lower().strip())
    return ' '.join(kept)

In [35]:
# Clean the raw 'text_' column of fake_reviews_df, then lemmatize the cleaned
# text and strip stopwords from it
cleaned_fake_text = fake_reviews_df['text_'].apply(preprocess_text)
fake_reviews_df['cleaned_text'] = cleaned_fake_text

fake_reviews_df['final_cleaned_review'] = cleaned_fake_text.apply(lemmatize_and_remove_stopwords)

In [36]:
# Destination of the processed Fake Reviews dataset.
# NOTE(review): absolute, machine-specific path - adjust before running on another machine.
fake_reviews_cleaned_output_path = '/Users/ShanShan/Fake-Reviews-Detection/ShanShan_notebooks/output/processed_fake_reviews.csv'

# Save to CSV in the specified output directory (index=False: the row index is not written)
fake_reviews_df.to_csv(fake_reviews_cleaned_output_path, index=False)

In [37]:
# Clean the raw 'Review' column of yelp_df, then lemmatize the cleaned text
# and strip stopwords from it
cleaned_yelp_text = yelp_df['Review'].apply(preprocess_text)
yelp_df['cleaned_review'] = cleaned_yelp_text

yelp_df['final_cleaned_review'] = cleaned_yelp_text.apply(lemmatize_and_remove_stopwords)

In [None]:
# Destination of the processed Yelp dataset.
# NOTE(review): absolute, machine-specific path - adjust before running on another machine.
yelp_cleaned_output_path = '/Users/ShanShan/Fake-Reviews-Detection/ShanShan_notebooks/output/processed_yelp.csv'

# Save to CSV in the specified output directory (index=False: the row index is not written)
yelp_df.to_csv(yelp_cleaned_output_path, index=False)

# Reading processed datasets

## Fake Reviews dataset

In [37]:
# Read the processed Fake Reviews CSV back from fake_reviews_cleaned_output_path.
# NOTE(review): reviews whose cleaned text is empty presumably come back as NaN
# here, which is what the missing-value cells further down deal with - confirm.
fake_reviews_cleaned_df = pd.read_csv(fake_reviews_cleaned_output_path)

In [None]:
# Summary (columns, dtypes, non-null counts) of the processed Fake Reviews data
fake_reviews_cleaned_df.info()

In [None]:
# (rows, columns) of the processed Fake Reviews data
fake_reviews_cleaned_df.shape

In [None]:
# Column labels of the processed Fake Reviews data
fake_reviews_cleaned_df.columns

In [None]:
# Display the first few rows
fake_reviews_cleaned_df.head()

## Yelp dataset

In [10]:
# Read the processed Yelp CSV back from yelp_cleaned_output_path
# (the processed data is stored as CSV, not as the original Excel workbook)
yelp_cleaned_df = pd.read_csv(yelp_cleaned_output_path)

In [None]:
# Summary (columns, dtypes, non-null counts) of the processed Yelp data
yelp_cleaned_df.info()

In [None]:
# (rows, columns) of the processed Yelp data
yelp_cleaned_df.shape

In [None]:
# Column labels of the processed Yelp data
yelp_cleaned_df.columns

In [None]:
# Display the first few rows
yelp_cleaned_df.head()

# Check for missing values

## Fake Reviews dataset

In [None]:
# Number of missing values in each column of the processed Fake Reviews data
fake_reviews_cleaned_df.isna().sum()

In [43]:
# Collect every row that has a missing value in at least one column
fake_has_na = fake_reviews_cleaned_df.isna().any(axis='columns')
na_rows_fake_reviews_cleaned = fake_reviews_cleaned_df[fake_has_na]

In [None]:
# Display the Fake Reviews rows that contain at least one missing value
na_rows_fake_reviews_cleaned

In [48]:
# Take an independent copy of the processed data and keep only the rows
# without any missing value
fake_reviews_cleaned_new_df = fake_reviews_cleaned_df.copy().dropna()

In [None]:
# Per-column count of missing values after the drop
fake_reviews_cleaned_new_df.isna().sum()

## Yelp dataset

In [None]:
# Number of missing values in each column of the processed Yelp data
yelp_cleaned_df.isna().sum()

In [21]:
# Collect every row that has a missing value in at least one column
yelp_has_na = yelp_cleaned_df.isna().any(axis='columns')
na_rows_yelp_cleaned = yelp_cleaned_df[yelp_has_na]

In [None]:
# Display the Yelp rows that contain at least one missing value
na_rows_yelp_cleaned

In [23]:
# Drop rows with any NA values from the DataFrame.
# The explicit copy makes the result an independent DataFrame: feature columns
# are assigned to it later, and without the copy pandas may emit
# SettingWithCopyWarning for those assignments. This also matches how the
# Fake Reviews frame is prepared (copy + dropna).
yelp_cleaned_new_df = yelp_cleaned_df.dropna().copy()

In [None]:
# Per-column count of missing values after the drop
yelp_cleaned_new_df.isna().sum()

# Feature engineering

In [50]:
# Count the number of unique words
def count_unique_words(text):
    """Return how many distinct whitespace-separated tokens ``text`` contains.

    Tokens are compared exactly as written (case-sensitive).
    """
    distinct_tokens = {token for token in text.split()}
    return len(distinct_tokens)

In [51]:
# Function to count punctuation and special characters
def count_punctuation_special(text):
    """Return the number of characters in ``text`` that are neither word
    characters (letters, digits, underscore) nor whitespace."""
    return sum(1 for _ in re.finditer(r'[^\w\s]', text))

In [52]:
# Function to count uppercase characters
def count_uppercase_chars(text):
    """Return the number of ASCII uppercase letters (A-Z) in ``text``."""
    return sum(1 for _ in re.finditer(r'[A-Z]', text))

## Fake Reviews dataset

In [None]:
# Word count: number of whitespace-separated tokens in the cleaned text.
# split() without an argument ignores leading/trailing/repeated whitespace, so
# an empty text has 0 words. split(" ") returned [''] for it (1 "word"), which
# also made the `word_count > 0` guards of the ratio features unreachable.
fake_reviews_cleaned_new_df['word_count'] = fake_reviews_cleaned_new_df['cleaned_text'].apply(lambda x: len(str(x).split()))

# Character count: sum the number of characters of each token
fake_reviews_cleaned_new_df['char_count'] = fake_reviews_cleaned_new_df['cleaned_text'].apply(lambda x: sum(len(word) for word in str(x).split()))

# Sentence count: number of non-blank segments between periods in the raw
# text, so a trailing period or an ellipsis does not add phantom sentences
fake_reviews_cleaned_new_df['sentence_count'] = fake_reviews_cleaned_new_df['text_'].apply(
    lambda x: sum(1 for segment in str(x).split(".") if segment.strip()))

In [None]:
def _avg_word_length(row):
    """Characters per word for one row; 0 when the row has no words."""
    if row['word_count'] > 0:
        return row['char_count'] / row['word_count']
    return 0


def _avg_sentence_length(row):
    """Words per sentence for one row; 0 when the row has no sentences."""
    if row['sentence_count'] > 0:
        return row['word_count'] / row['sentence_count']
    return 0


# Average word length: char_count divided by word_count
fake_reviews_cleaned_new_df['avg_word_length'] = fake_reviews_cleaned_new_df.apply(_avg_word_length, axis=1)

# Average sentence length: word_count divided by sentence_count
fake_reviews_cleaned_new_df['avg_sentence_length'] = fake_reviews_cleaned_new_df.apply(_avg_sentence_length, axis=1)

In [None]:
def _unique_word_ratio(row):
    """Share of distinct words among all words of one row; 0 when there are no words."""
    if row['word_count'] > 0:
        return row['unique_word_count'] / row['word_count']
    return 0


# Unique word count: number of distinct words in each cleaned review
fake_reviews_cleaned_new_df['unique_word_count'] = fake_reviews_cleaned_new_df['cleaned_text'].apply(count_unique_words)

# Ratio of unique words to total words
fake_reviews_cleaned_new_df['unique_vs_words'] = fake_reviews_cleaned_new_df.apply(_unique_word_ratio, axis=1)

In [82]:
# English stopword vocabulary
stopwords_eng = set(stopwords.words('english'))

# Stopwords count: how many distinct stopwords occur in each cleaned review
fake_tokens = fake_reviews_cleaned_new_df['cleaned_text'].str.split()
fake_reviews_cleaned_new_df['stopwords_count'] = fake_tokens.apply(
    lambda tokens: len(stopwords_eng.intersection(tokens)))

# The next two features are measured on the raw (uncleaned) review text
raw_fake_text = fake_reviews_cleaned_new_df['text_']

# Punctuation and special character count
fake_reviews_cleaned_new_df['punctuation_special_count'] = raw_fake_text.apply(count_punctuation_special)

# Uppercase character count
fake_reviews_cleaned_new_df['uppercase_char_count'] = raw_fake_text.apply(count_uppercase_chars)

In [None]:
# Display the first few rows, now including the engineered feature columns
fake_reviews_cleaned_new_df.head()

## Yelp dataset

In [None]:
# Word count: number of whitespace-separated tokens in the cleaned review.
# split() without an argument ignores leading/trailing/repeated whitespace, so
# an empty text has 0 words. split(" ") returned [''] for it (1 "word"), which
# also made the `word_count > 0` guards of the ratio features unreachable.
yelp_cleaned_new_df['word_count'] = yelp_cleaned_new_df['cleaned_review'].apply(lambda x: len(str(x).split()))

# Character count: sum the number of characters of each token
yelp_cleaned_new_df['char_count'] = yelp_cleaned_new_df['cleaned_review'].apply(lambda x: sum(len(word) for word in str(x).split()))

# Sentence count: number of non-blank segments between periods in the raw
# review, so a trailing period or an ellipsis does not add phantom sentences
yelp_cleaned_new_df['sentence_count'] = yelp_cleaned_new_df['Review'].apply(
    lambda x: sum(1 for segment in str(x).split(".") if segment.strip()))

In [None]:
def _avg_word_length(row):
    """Characters per word for one row; 0 when the row has no words."""
    if row['word_count'] > 0:
        return row['char_count'] / row['word_count']
    return 0


def _avg_sentence_length(row):
    """Words per sentence for one row; 0 when the row has no sentences."""
    if row['sentence_count'] > 0:
        return row['word_count'] / row['sentence_count']
    return 0


# Average word length: char_count divided by word_count
yelp_cleaned_new_df['avg_word_length'] = yelp_cleaned_new_df.apply(_avg_word_length, axis=1)

# Average sentence length: word_count divided by sentence_count
yelp_cleaned_new_df['avg_sentence_length'] = yelp_cleaned_new_df.apply(_avg_sentence_length, axis=1)

In [None]:
def _unique_word_ratio(row):
    """Share of distinct words among all words of one row; 0 when there are no words."""
    if row['word_count'] > 0:
        return row['unique_word_count'] / row['word_count']
    return 0


# Unique word count: number of distinct words in each cleaned review
yelp_cleaned_new_df['unique_word_count'] = yelp_cleaned_new_df['cleaned_review'].apply(count_unique_words)

# Ratio of unique words to total words
yelp_cleaned_new_df['unique_vs_words'] = yelp_cleaned_new_df.apply(_unique_word_ratio, axis=1)

In [None]:
# English stopword vocabulary
stopwords_eng = set(stopwords.words('english'))

# Stopwords count: how many distinct stopwords occur in each cleaned review
yelp_tokens = yelp_cleaned_new_df['cleaned_review'].str.split()
yelp_cleaned_new_df['stopwords_count'] = yelp_tokens.apply(
    lambda tokens: len(stopwords_eng.intersection(tokens)))

# The next two features are measured on the raw (uncleaned) review text
raw_yelp_text = yelp_cleaned_new_df['Review']

# Punctuation and special character count
yelp_cleaned_new_df['punctuation_special_count'] = raw_yelp_text.apply(count_punctuation_special)

# Uppercase character count
yelp_cleaned_new_df['uppercase_char_count'] = raw_yelp_text.apply(count_uppercase_chars)

In [None]:
# Display the first few rows of the frame that received the engineered
# features (yelp_cleaned_new_df; yelp_cleaned_df does not have those columns)
yelp_cleaned_new_df.head()

In [None]:
# Keep only the rows whose char_count is positive, i.e. drop rows with empty text.
# NOTE(review): `train_data` / `test_data` are not defined in the visible part
# of this notebook and the original comment spoke of "tweets", so this cell
# looks carried over from another project - confirm where these frames come
# from, or whether the review DataFrames should be filtered here instead.
new_train_data = train_data[train_data["char_count"] > 0]
new_test_data = test_data[test_data["char_count"] > 0]

new_train_data.head()

# Graphs between processed review and target variable

Distribution of target variable by average length of review

In [None]:
# Length (in characters) of each raw review in the fake_reviews_df dataset.
# Values are coerced to str first, exactly as for the Yelp reviews below, so a
# non-string cell (NaN, a number) is measured by its text form instead of
# making len() raise TypeError.
fake_reviews_df['review_length'] = fake_reviews_df['text_'].astype(str).apply(len)
# Calculate the average length of the review grouped by the label
average_review_length_fake = fake_reviews_df.groupby('label')['review_length'].mean().reset_index()

# Ensure the 'Review' column contains strings
yelp_df['Review_new'] = yelp_df['Review'].astype(str)
# Calculate the length of each review in the yelp_df dataset
yelp_df['review_length'] = yelp_df['Review_new'].apply(len)
# Calculate the average length of the review grouped by the Spam(1) column
average_review_length_yelp = yelp_df.groupby('Spam(1) and Not Spam(0)')['review_length'].mean().reset_index()

In [None]:
# Plotting average review length by label for both datasets


def _label_bar_heights(ax):
    """Write each bar's height (two decimals) just above it; bars without a positive height are skipped."""
    for bar in ax.patches:
        bar_height = bar.get_height()
        if not bar_height > 0:
            continue
        ax.annotate(f'{bar_height:.2f}',
                    (bar.get_x() + bar.get_width() / 2., bar_height),
                    ha='center', va='baseline',
                    fontsize=8, color='black', xytext=(0, 5),
                    textcoords='offset points')


def _legend_markers(labels, color_map):
    """Build one round legend marker per label, coloured via ``color_map``."""
    return [
        plt.Line2D([0], [0], marker='o', color='w', label=label, markerfacecolor=color_map[label])
        for label in labels
    ]


plt.figure(figsize=(12, 6))

# Left panel: Fake Reviews dataset
ax_fake_reviews = plt.subplot(1, 2, 1)
sns.barplot(data=average_review_length_fake, x='label', y='review_length',
            hue='label', palette=label_color_map_fake_reviews, dodge=False,
            ax=ax_fake_reviews)
ax_fake_reviews.set_title('Distribution of Target Variable by \n Average Review Length (Fake Reviews Dataset)')
ax_fake_reviews.set_xlabel('Label Classification')
ax_fake_reviews.set_ylabel('Average Review Length')
_label_bar_heights(ax_fake_reviews)

# Hand-built legend, placed outside the axes on the right
unique_labels_fake_reviews = fake_reviews_df['label'].unique()
handles_fake_reviews = _legend_markers(unique_labels_fake_reviews, label_color_map_fake_reviews)
ax_fake_reviews.legend(handles=handles_fake_reviews, title='Label Classification', loc='upper left', bbox_to_anchor=(1, 1))

# Right panel: Yelp dataset
ax_yelp_reviews = plt.subplot(1, 2, 2)
sns.barplot(data=average_review_length_yelp, x='Spam(1) and Not Spam(0)', y='review_length',
            hue='Spam(1) and Not Spam(0)', palette=label_color_map_yelp, dodge=False,
            ax=ax_yelp_reviews)
ax_yelp_reviews.set_title('Distribution of Target Variable by \n Average Review Length (Yelp Dataset)')
ax_yelp_reviews.set_xlabel('Spam Classification')
ax_yelp_reviews.set_ylabel('Average Review Length')
_label_bar_heights(ax_yelp_reviews)

# Hand-built legend, placed outside the axes on the right
unique_labels_yelp_reviews = yelp_df['Spam(1) and Not Spam(0)'].unique()
handles_yelp_reviews = _legend_markers(unique_labels_yelp_reviews, label_color_map_yelp)
ax_yelp_reviews.legend(handles=handles_yelp_reviews, title='Spam Classification', loc='upper left', bbox_to_anchor=(1, 1))

plt.tight_layout()
plt.show()
