In [20]:
import pandas as pd
df = pd.read_csv("Processed_df.csv")

  df = pd.read_csv("Processed_df.csv")


In [21]:
from summa import summarizer

def generate_summary(text):
    summary = summarizer.summarize(text, ratio=0.3)  # You can adjust the ratio as needed
    # Check if the summary is blank
    if not summary:
        return text  # Return the original review if the summary is blank
    else:
        return summary

# Apply the generate_summary function to create a new column with summaries
df['reviewSummary'] = df['reviewContent'].apply(generate_summary)

# Display the first few rows of the dataFrame with the summaries
print("Sampled Review Summaries:")
print(df['reviewSummary'])

Sampled Review Summaries:
0        I won't recount every course, just highlight t...
1        Having Grant Achatz prepare our dessert was am...
2        Food was not mouth watering, tasted like it it...
3        The problem with places like this, given the e...
4        I think everyone has a unique experience with ...
                               ...                        
26951    There must be some kind of contest to see who ...
26952    Yellow Rose is a favorite of mine.\nThe food i...
26953    We ate there because the odd wicker seats were...
26954    It's a non pretentious bar with video games an...
26955    A gorgeous shy young teen asked the owner if s...
Name: reviewSummary, Length: 26956, dtype: object


feature engineering: polarity and subjectivity

Polarity: numerical score that quantifies the sentiment of the text on a continuous scale. It measures how positive or negative the text is. Polarity scores typically range from -1 (extremely negative) to 1 (extremely positive), with 0 indicating neutral sentiment.

Subjectivity: Subjectivity measures the degree to which the text is subjective or opinion-based rather than objective. Subjectivity is also represented as a numerical score ranging from 0 to 1. A score closer to 0 suggests that the text is more objective, factual, or informational. A score closer to 1 suggests that the text is more subjective and opinion-based.

In [22]:
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords
from textblob import TextBlob
import string

# Tokenization
tokenizer = WordPunctTokenizer()
df['tokens'] = df['reviewContent'].apply(tokenizer.tokenize)

# Define a function to remove stopwords and punctuation
def preprocess_text(tokens):
    stop_words = set(stopwords.words('english'))
    tokens = [word.lower() for word in tokens]  # Convert to lowercase
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    tokens = [word for word in tokens if word not in string.punctuation]  # Remove punctuation
    tokens = [word for word in tokens if len(word) > 1]  # Remove single-character words
    return tokens

# Apply preprocessing to the 'tokens' column
df['clean_tokens'] = df['tokens'].apply(preprocess_text)

# Join the clean tokens back into sentences
df['clean_text'] = df['clean_tokens'].apply(lambda x: ' '.join(x))

# Sentiment Analysis using TextBlob
df['sentiment'] = df['clean_text'].apply(lambda x: TextBlob(x).sentiment)

# Extract polarity and subjectivity scores from the sentiment analysis
df['polarity'] = df['sentiment'].apply(lambda x: x.polarity)
df['subjectivity'] = df['sentiment'].apply(lambda x: x.subjectivity)

N-gram analysis (common phrases)

In [23]:
from nltk.util import ngrams
from collections import Counter
# Define the n-gram range (e.g., 2 for bigrams)
ngram_range = 2  # For bigrams (adjust as needed)

# Function to extract n-grams from a list of tokens
def extract_ngrams(tokens, n):
    return list(ngrams(tokens, n))

# Extract n-grams and store them in a new column
df['ngrams'] = df['clean_tokens'].apply(lambda x: extract_ngrams(x, ngram_range))

# Flatten the list of n-grams
all_ngrams = [ngram for ngram_list in df['ngrams'] for ngram in ngram_list]

# Count the frequency of each n-gram
ngram_freq = Counter(all_ngrams)

# Print the most common n-grams and their frequencies
most_common_ngrams = ngram_freq.most_common(10)
for ngram, freq in most_common_ngrams:
    print(f"N-gram: {' '.join(ngram)} - Frequency: {freq}")

N-gram: go back - Frequency: 2121
N-gram: really good - Frequency: 1303
N-gram: first time - Frequency: 1154
N-gram: food good - Frequency: 1143
N-gram: great food - Frequency: 1094
N-gram: deep dish - Frequency: 1061
N-gram: pretty good - Frequency: 1013
N-gram: food great - Frequency: 980
N-gram: great place - Frequency: 941
N-gram: love place - Frequency: 916


Find most common phrases of fake vs real reviews

In [24]:
flagged_Y_df = df[df['flagged'] == 1]
flagged_N_df = df[df['flagged'] == 0]

# Flatten the list of n-grams for each subset
ngrams_Y = [ngram for ngram_list in flagged_Y_df['ngrams'] for ngram in ngram_list]
ngrams_N = [ngram for ngram_list in flagged_N_df['ngrams'] for ngram in ngram_list]

# Count the frequency of each n-gram for each subset
ngram_freq_Y = Counter(ngrams_Y)
ngram_freq_N = Counter(ngrams_N)

# Print the most common n-grams and their frequencies for each category
most_common_ngrams_Y = ngram_freq_Y.most_common(10)
most_common_ngrams_N = ngram_freq_N.most_common(10)

print("Most common n-grams for flagged 'Y' reviews:")
for ngram, freq in most_common_ngrams_Y:
    print(f"N-gram: {' '.join(ngram)} - Frequency: {freq}")

print("\nMost common n-grams for flagged 'N' reviews:")
for ngram, freq in most_common_ngrams_N:
    print(f"N-gram: {' '.join(ngram)} - Frequency: {freq}")

Most common n-grams for flagged 'Y' reviews:
N-gram: go back - Frequency: 391
N-gram: great food - Frequency: 359
N-gram: food good - Frequency: 279
N-gram: great place - Frequency: 275
N-gram: food great - Frequency: 272
N-gram: love place - Frequency: 266
N-gram: first time - Frequency: 258
N-gram: good food - Frequency: 230
N-gram: salad bar - Frequency: 224
N-gram: great service - Frequency: 217

Most common n-grams for flagged 'N' reviews:
N-gram: go back - Frequency: 1730
N-gram: really good - Frequency: 1134
N-gram: deep dish - Frequency: 900
N-gram: first time - Frequency: 896
N-gram: pretty good - Frequency: 876
N-gram: food good - Frequency: 864
N-gram: next time - Frequency: 768
N-gram: great food - Frequency: 735
N-gram: come back - Frequency: 717
N-gram: food great - Frequency: 708


In [27]:
# Define the list of common n-grams for both 'Y' and 'N' reviews
common_ngrams_Y = [("go", "back"), ("great", "food"), ("food", "good"), ("great", "place"), ("food", "great"), ("love", "place"), ("first", "time"), ("good", "food"), ("salad", "bar"), ("great", "service")]
common_ngrams_N = [("go", "back"), ("really", "good"), ("deep", "dish"), ("first", "time"), ("pretty", "good"), ("food", "good"), ("next", "time"), ("great", "food"), ("come", "back"), ("food", "great")]

# Create binary presence features for the common n-grams
for ngram in common_ngrams_Y:
    df[f'has_ngram_Y_{"_".join(ngram)}'] = df['ngrams'].apply(lambda x: 1 if all(word in x for word in ngram) else 0)

for ngram in common_ngrams_N:
    df[f'has_ngram_N_{"_".join(ngram)}'] = df['ngrams'].apply(lambda x: 1 if all(word in x for word in ngram) else 0)


In [28]:
df.head()

Unnamed: 0,reviewDate,reviewID,reviewerID,reviewContent,reviewRating,reviewUsefulCount,reviewCoolCount,reviewFunnyCount,restaurantID,flagged,...,has_ngram_N_go_back,has_ngram_N_really_good,has_ngram_N_deep_dish,has_ngram_N_first_time,has_ngram_N_pretty_good,has_ngram_N_food_good,has_ngram_N_next_time,has_ngram_N_great_food,has_ngram_N_come_back,has_ngram_N_food_great
0,9/22/2012,GtwU21YOQn-wf4vWRUIx6w,bNYesZ944s6IJVowOnB0iA,"Unlike Next, which we'd eaten at the previous ...",5,0,0,0,pbEiXam9YJL3neCYHGwLUA,0,...,0,0,0,0,0,0,0,0,0,0
1,9/22/2012,0LpVTc3,TRKxLC3y-ZvP45e5iilMtw,Probably one of the best meals I've had ever. ...,5,0,0,0,pbEiXam9YJL3neCYHGwLUA,0,...,0,0,0,0,0,0,0,0,0,0
2,9/19/2012,tljtLzf68Fkwf,0EMm8umAqXZzyhxNpL4M9g,Service was impeccable. Experience and present...,3,2,0,0,pbEiXam9YJL3neCYHGwLUA,0,...,0,0,0,0,0,0,0,0,0,0
3,9/6/2012,iSN,DlwexC7z88ymAzu45skODw,"The problem with places like this, given the e...",3,8,0,3,pbEiXam9YJL3neCYHGwLUA,0,...,0,0,0,0,0,0,0,0,0,0
4,9/9/2012,Jmwrh7,kW2dk1CWihmh3g7k9N2G8A,I have no idea how to write my review - dining...,5,1,2,0,pbEiXam9YJL3neCYHGwLUA,0,...,0,0,0,0,0,0,0,0,0,0


Topic modelling

In [25]:
from gensim import corpora

# Create a dictionary from your tokenized text df
dictionary = corpora.Dictionary(df['clean_tokens'])

# Create a bag-of-words (BoW) corpus
corpus = [dictionary.doc2bow(text) for text in df['clean_tokens']]

from gensim.models import LdaModel

# Train an LDA model
lda_model = LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15)

# Print the topics and their top keywords
topics = lda_model.print_topics(num_words=10)
for topic in topics:
    print(topic)

# Assign topics to documents
topics_in_documents = [lda_model[doc] for doc in corpus]

(0, '0.012*"us" + 0.011*"food" + 0.008*"would" + 0.008*"time" + 0.008*"table" + 0.007*"one" + 0.007*"like" + 0.007*"back" + 0.007*"service" + 0.006*"place"')
(1, '0.025*"food" + 0.024*"place" + 0.024*"great" + 0.018*"good" + 0.012*"service" + 0.010*"go" + 0.010*"love" + 0.009*"always" + 0.008*"get" + 0.008*"really"')
(2, '0.009*"good" + 0.007*"like" + 0.007*"cheese" + 0.006*"salad" + 0.006*"delicious" + 0.005*"also" + 0.005*"meat" + 0.005*"dessert" + 0.005*"ordered" + 0.005*"sweet"')
(3, '0.019*"rice" + 0.013*"spicy" + 0.013*"chicken" + 0.013*"good" + 0.010*"steak" + 0.009*"mexican" + 0.009*"chinese" + 0.009*"soup" + 0.008*"sandwich" + 0.008*"tacos"')
(4, '0.050*"pizza" + 0.018*"chicago" + 0.014*".." + 0.013*"dish" + 0.013*"crust" + 0.012*"..." + 0.012*"good" + 0.012*"best" + 0.010*"like" + 0.010*"deep"')


Split topics into fake vs real reviews

In [26]:
def find_common_topics(data_subset, num_topics):
    # Create a dictionary from tokenized text data
    dictionary = corpora.Dictionary(data_subset['clean_tokens'])

    # Create a bag-of-words (BoW) corpus
    corpus = [dictionary.doc2bow(text) for text in data_subset['clean_tokens']]

    # Train an LDA model
    lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=15)

    # Print the topics and their top keywords
    topics = lda_model.print_topics(num_words=10)
    for topic in topics:
        print(topic)

    # Assign topics to documents
    topics_in_documents = [lda_model[doc] for doc in corpus]

# Separate the data into two subsets based on the "flagged" category
flagged_Y_data = df[df['flagged'] == 1]
flagged_N_data = df[df['flagged'] == 0]

# Find common topics for flagged 'Y' reviews
print("Common Topics for flagged 'Y' reviews:")
find_common_topics(flagged_Y_data, num_topics=5)

# Find common topics for flagged 'N' reviews
print("Common Topics for flagged 'N' reviews:")
find_common_topics(flagged_N_data, num_topics=5)

Common Topics for flagged 'Y' reviews:
(0, '0.033*"pizza" + 0.012*"good" + 0.012*"best" + 0.010*"chicago" + 0.010*"like" + 0.009*"tacos" + 0.009*"food" + 0.009*"mexican" + 0.009*"place" + 0.008*"steak"')
(1, '0.016*"us" + 0.010*"food" + 0.010*"table" + 0.009*"would" + 0.008*"time" + 0.008*"restaurant" + 0.007*"back" + 0.007*"service" + 0.007*"minutes" + 0.007*"one"')
(2, '0.017*"food" + 0.015*"good" + 0.015*"place" + 0.012*"..." + 0.010*"like" + 0.008*"great" + 0.007*"go" + 0.007*"really" + 0.007*"service" + 0.006*"get"')
(3, '0.032*"great" + 0.025*"food" + 0.018*"place" + 0.015*"love" + 0.013*"service" + 0.011*"good" + 0.010*"always" + 0.008*"restaurant" + 0.008*"go" + 0.008*"delicious"')
(4, '0.008*"crab" + 0.008*"cake" + 0.008*"fogo" + 0.005*"brazilian" + 0.005*"de" + 0.005*"best" + 0.005*"one" + 0.005*".." + 0.004*"top" + 0.004*"great"')
Common Topics for flagged 'N' reviews:
(0, '0.012*"us" + 0.012*"food" + 0.009*"table" + 0.009*"would" + 0.009*"time" + 0.008*"place" + 0.008*"back