## Import Library & Dataset Processing
* import necessary libraries 
* preprocess datasets - merging 

In [71]:
import pandas as pd

In [72]:
users = pd.read_csv('data/users.csv')
reviews = pd.read_csv('data/reviews.csv')
restaurants = pd.read_csv('data/restaurants.csv')

There are 4 classes of flagged reviews by Yelp recommendation system. 
* N - Reviews that are confirmed as not fake 
* Y - Reviews that are confirmed as fake (least frequent label)
* NR - Reviews that are not confirmed as fake but flagged for investigation (most frequent laabel)
* YR - Reviews that are suspected to be fake and require more review before confirmation 

In [74]:
reviews['flagged'].value_counts()

NR    402774
YR    318678
N      58716
Y       8303
Name: flagged, dtype: int64

In [75]:
## inner join 
merged = pd.merge(reviews, users, on='reviewerID')

In [76]:
merged.shape

(708268, 22)

In [77]:
# drop rows with NR and YR in flagged column since NR and YR are not confirmed reviews
merged_filtered = merged[~merged['flagged'].isin(['YR', 'NR'])]

In [78]:
merged_filtered['flagged'].value_counts()

N    20752
Y     6206
Name: flagged, dtype: int64

In [79]:
merged_filtered.rename(columns={'usefulCount_x': 'usefulCount_review', 
                   'coolCount_x': 'coolCount_review',
                   'funnyCount_x': 'funnyCount_review',
                   'usefulCount_y': 'usefulCount_user',
                   'coolCount_y': 'coolCount_user',
                   'funnyCount_y': 'funnyCount_user'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_filtered.rename(columns={'usefulCount_x': 'usefulCount_review',


In [80]:
restaurant_merged = pd.merge(merged_filtered, restaurants, on='restaurantID')

In [81]:
restaurant_merged.rename(columns={'name_x': 'name_user',
                                  'location_x': 'location_user',
                                  'reviewCount_x': 'reviewCount_user',
                                  'name_y': 'name_restaurant',
                                  'location_y': 'location_restaurant',
                                  'reviewCount_y': 'reviewCount_restaurant',
                                  'rating_x': 'rating_review',
                                  'rating_y': 'rating_restaurant'}, inplace=True)

In [82]:
restaurant_merged.columns

Index(['date', 'reviewID', 'reviewerID', 'reviewContent', 'rating_review',
       'usefulCount_review', 'coolCount_review', 'funnyCount_review',
       'flagged', 'restaurantID', 'name_user', 'location_user', 'yelpJoinDate',
       'friendCount', 'reviewCount_user', 'firstCount', 'usefulCount_user',
       'coolCount_user', 'funnyCount_user', 'complimentCount', 'tipCount',
       'fanCount', 'name_restaurant', 'location_restaurant',
       'reviewCount_restaurant', 'rating_restaurant', 'categories', 'address',
       'Hours', 'GoodforKids', 'AcceptsCreditCards', 'Parking', 'Attire',
       'GoodforGroups', 'PriceRange', 'TakesReservations', 'Delivery',
       'Takeout', 'WaiterService', 'OutdoorSeating', 'WiFi', 'GoodFor',
       'Alcohol', 'NoiseLevel', 'Ambience', 'HasTV', 'Caters',
       'WheelchairAccessible', 'webSite', 'phoneNumber', 'filReviewCount'],
      dtype='object')

In [100]:
restaurant_merged.shape

(26958, 51)

## Adding Columns using Sentiment Analysis

#### Cleaning text

In [85]:
import pandas as pd
from textblob import TextBlob
import re

# Replace NaN values in the 'reviewContent' column with an empty string
restaurant_merged['reviewContent'] = restaurant_merged['reviewContent'].fillna('')

# Convert all entries in the 'reviewContent' column to strings
restaurant_merged['reviewContent'] = restaurant_merged['reviewContent'].astype(str)


def clean_text(text):
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = text.lower()
    return text

# Apply cleaning function
restaurant_merged['reviewContent'] = restaurant_merged['reviewContent'].apply(clean_text)

print(restaurant_merged)

             date                reviewID              reviewerID  \
0       9/22/2012  GtwU21YOQn-wf4vWRUIx6w  bNYesZ944s6IJVowOnB0iA   
1       9/22/2012                 0LpVTc3  TRKxLC3y-ZvP45e5iilMtw   
2       9/19/2012           tljtLzf68Fkwf  0EMm8umAqXZzyhxNpL4M9g   
3        9/6/2012                     iSN  DlwexC7z88ymAzu45skODw   
4        9/9/2012                  Jmwrh7  kW2dk1CWihmh3g7k9N2G8A   
...           ...                     ...                     ...   
26953   3/17/2010              PZu8sDx2T2  tivh8lr6pzBDNfrJLYWh_g   
26954   4/14/2011               S-zbPPGoB  jKs4FQgkV0wSX8BG2_dgTg   
26955   9/23/2007    roKqXYooTy49OMAIJJjf  vX6aOMQ3HWCbwZVfCkCauw   
26956  11/18/2011                FefmFaWa  vX6aOMQ3HWCbwZVfCkCauw   
26957   5/18/2012    x8knvE6V8MkwT90wCV0f  OZTkqoi8_luhrL-mMj7O8A   

                                           reviewContent  rating_review  \
0      unlike next which wed eaten at the previous ni...              5   
1      probably one o

#### Sentiment Score & Labels - pos, neg, neutral

In [102]:
# Function to get sentiment polarity
def get_sentiment(review):
    blob = TextBlob(review)
    return blob.sentiment.polarity  # returns a score between -1 and 1

# Apply sentiment analysis to the cleaned reviews
restaurant_merged['sentiment_score'] = restaurant_merged['reviewContent'].apply(get_sentiment)


print(restaurant_merged)

             date                reviewID              reviewerID  \
0       9/22/2012  GtwU21YOQn-wf4vWRUIx6w  bNYesZ944s6IJVowOnB0iA   
1       9/22/2012                 0LpVTc3  TRKxLC3y-ZvP45e5iilMtw   
2       9/19/2012           tljtLzf68Fkwf  0EMm8umAqXZzyhxNpL4M9g   
3        9/6/2012                     iSN  DlwexC7z88ymAzu45skODw   
4        9/9/2012                  Jmwrh7  kW2dk1CWihmh3g7k9N2G8A   
...           ...                     ...                     ...   
26953   3/17/2010              PZu8sDx2T2  tivh8lr6pzBDNfrJLYWh_g   
26954   4/14/2011               S-zbPPGoB  jKs4FQgkV0wSX8BG2_dgTg   
26955   9/23/2007    roKqXYooTy49OMAIJJjf  vX6aOMQ3HWCbwZVfCkCauw   
26956  11/18/2011                FefmFaWa  vX6aOMQ3HWCbwZVfCkCauw   
26957   5/18/2012    x8knvE6V8MkwT90wCV0f  OZTkqoi8_luhrL-mMj7O8A   

                                           reviewContent  rating_review  \
0      unlike next which wed eaten at the previous ni...              5   
1      probably one o

In [104]:
def classify_sentiment(score):
    if score > 0:
        return 'positive'
    elif score == 0:
        return 'neutral'
    else:
        return 'negative'

# Apply the classification to the sentiment scores
restaurant_merged['sentiment_labels'] = restaurant_merged['sentiment_score'].apply(classify_sentiment)

print(restaurant_merged)

             date                reviewID              reviewerID  \
0       9/22/2012  GtwU21YOQn-wf4vWRUIx6w  bNYesZ944s6IJVowOnB0iA   
1       9/22/2012                 0LpVTc3  TRKxLC3y-ZvP45e5iilMtw   
2       9/19/2012           tljtLzf68Fkwf  0EMm8umAqXZzyhxNpL4M9g   
3        9/6/2012                     iSN  DlwexC7z88ymAzu45skODw   
4        9/9/2012                  Jmwrh7  kW2dk1CWihmh3g7k9N2G8A   
...           ...                     ...                     ...   
26953   3/17/2010              PZu8sDx2T2  tivh8lr6pzBDNfrJLYWh_g   
26954   4/14/2011               S-zbPPGoB  jKs4FQgkV0wSX8BG2_dgTg   
26955   9/23/2007    roKqXYooTy49OMAIJJjf  vX6aOMQ3HWCbwZVfCkCauw   
26956  11/18/2011                FefmFaWa  vX6aOMQ3HWCbwZVfCkCauw   
26957   5/18/2012    x8knvE6V8MkwT90wCV0f  OZTkqoi8_luhrL-mMj7O8A   

                                           reviewContent  rating_review  \
0      unlike next which wed eaten at the previous ni...              5   
1      probably one o

#### Polarity & Subjectivity Score

In [107]:
# Add columns for polarity and subjectivity using TextBlob
restaurant_merged['polarity'] = restaurant_merged['reviewContent'].apply(lambda x: TextBlob(x).sentiment.polarity)
restaurant_merged['subjectivity'] = restaurant_merged['reviewContent'].apply(lambda x: TextBlob(x).sentiment.subjectivity)

print(restaurant_merged)

             date                reviewID              reviewerID  \
0       9/22/2012  GtwU21YOQn-wf4vWRUIx6w  bNYesZ944s6IJVowOnB0iA   
1       9/22/2012                 0LpVTc3  TRKxLC3y-ZvP45e5iilMtw   
2       9/19/2012           tljtLzf68Fkwf  0EMm8umAqXZzyhxNpL4M9g   
3        9/6/2012                     iSN  DlwexC7z88ymAzu45skODw   
4        9/9/2012                  Jmwrh7  kW2dk1CWihmh3g7k9N2G8A   
...           ...                     ...                     ...   
26953   3/17/2010              PZu8sDx2T2  tivh8lr6pzBDNfrJLYWh_g   
26954   4/14/2011               S-zbPPGoB  jKs4FQgkV0wSX8BG2_dgTg   
26955   9/23/2007    roKqXYooTy49OMAIJJjf  vX6aOMQ3HWCbwZVfCkCauw   
26956  11/18/2011                FefmFaWa  vX6aOMQ3HWCbwZVfCkCauw   
26957   5/18/2012    x8knvE6V8MkwT90wCV0f  OZTkqoi8_luhrL-mMj7O8A   

                                           reviewContent  rating_review  \
0      unlike next which wed eaten at the previous ni...              5   
1      probably one o

#### Num Characters, Words & Sentences

In [110]:
# Add a column for review length (number of characters)
restaurant_merged['review_num_characters'] = restaurant_merged['reviewContent'].apply(len)

# Add a column for the number of words
restaurant_merged['review_num_words'] = restaurant_merged['reviewContent'].apply(lambda x: len(TextBlob(x).words))

# Add a column for the number of sentences
restaurant_merged['review_num_sentences'] = restaurant_merged['reviewContent'].apply(lambda x: len(TextBlob(x).sentences))

# Display the first few rows to check the new columns
print(restaurant_merged)

             date                reviewID              reviewerID  \
0       9/22/2012  GtwU21YOQn-wf4vWRUIx6w  bNYesZ944s6IJVowOnB0iA   
1       9/22/2012                 0LpVTc3  TRKxLC3y-ZvP45e5iilMtw   
2       9/19/2012           tljtLzf68Fkwf  0EMm8umAqXZzyhxNpL4M9g   
3        9/6/2012                     iSN  DlwexC7z88ymAzu45skODw   
4        9/9/2012                  Jmwrh7  kW2dk1CWihmh3g7k9N2G8A   
...           ...                     ...                     ...   
26953   3/17/2010              PZu8sDx2T2  tivh8lr6pzBDNfrJLYWh_g   
26954   4/14/2011               S-zbPPGoB  jKs4FQgkV0wSX8BG2_dgTg   
26955   9/23/2007    roKqXYooTy49OMAIJJjf  vX6aOMQ3HWCbwZVfCkCauw   
26956  11/18/2011                FefmFaWa  vX6aOMQ3HWCbwZVfCkCauw   
26957   5/18/2012    x8knvE6V8MkwT90wCV0f  OZTkqoi8_luhrL-mMj7O8A   

                                           reviewContent  rating_review  \
0      unlike next which wed eaten at the previous ni...              5   
1      probably one o

#### Part of Speech (POS) Tag Count - Noun & Verb count

In [114]:
# Function to count nouns and verbs in the review
def count_pos(text, pos_tag):
    blob = TextBlob(text)
    return sum(1 for word, pos in blob.tags if pos.startswith(pos_tag))

# Add columns for noun and verb counts for the entire dataset
restaurant_merged['noun_count'] = restaurant_merged['reviewContent'].apply(lambda x: count_pos(x, 'NN'))  # 'NN' for nouns
restaurant_merged['verb_count'] = restaurant_merged['reviewContent'].apply(lambda x: count_pos(x, 'VB'))  # 'VB' for verbs

print(restaurant_merged)

             date                reviewID              reviewerID  \
0       9/22/2012  GtwU21YOQn-wf4vWRUIx6w  bNYesZ944s6IJVowOnB0iA   
1       9/22/2012                 0LpVTc3  TRKxLC3y-ZvP45e5iilMtw   
2       9/19/2012           tljtLzf68Fkwf  0EMm8umAqXZzyhxNpL4M9g   
3        9/6/2012                     iSN  DlwexC7z88ymAzu45skODw   
4        9/9/2012                  Jmwrh7  kW2dk1CWihmh3g7k9N2G8A   
...           ...                     ...                     ...   
26953   3/17/2010              PZu8sDx2T2  tivh8lr6pzBDNfrJLYWh_g   
26954   4/14/2011               S-zbPPGoB  jKs4FQgkV0wSX8BG2_dgTg   
26955   9/23/2007    roKqXYooTy49OMAIJJjf  vX6aOMQ3HWCbwZVfCkCauw   
26956  11/18/2011                FefmFaWa  vX6aOMQ3HWCbwZVfCkCauw   
26957   5/18/2012    x8knvE6V8MkwT90wCV0f  OZTkqoi8_luhrL-mMj7O8A   

                                           reviewContent  rating_review  \
0      unlike next which wed eaten at the previous ni...              5   
1      probably one o

#### Intensifier Count - Pos & Neg

In [118]:
# List of common subjectivity modifiers (intensifiers)
positive_intensifiers = [
    'very', 'extremely', 'absolutely', 'incredibly', 'totally', 'completely', 
    'really', 'quite', 'so', 'remarkably', 'awfully', 'exceptionally', 
    'unbelievably', 'astoundingly', 'especially', 'particularly', 
    'fabulously', 'superbly', 'wonderfully', 'amazingly', 'fantastically', 
    'tremendously', 'unusually', 'significantly', 'phenomenally', 'exceedingly', 
    'immensely', 'decidedly', 'positively', 'impressively', 'beautifully', 
    'supremely', 'perfectly', 'delightfully', 'elegantly', 'gracefully'
]

negative_intensifiers = [
    'terribly', 'horribly', 'awfully', 'dreadfully', 'incredibly', 
    'exceedingly', 'ridiculously', 'shockingly', 'unbelievably', 
    'disastrously', 'painfully', 'appallingly', 'desperately', 
    'hideously', 'atrociously', 'outrageously', 'grievously', 'miserably', 
    'horrendously', 'distressingly', 'pathetically', 'hopelessly', 
    'tragically', 'disgustingly', 'foully', 'frightfully', 'insanely', 
    'disastrously', 'bitterly', 'unbearably', 'viciously', 'disastrously', 
    'hatefully', 'intolerably', 'terrifyingly'
]


# Function to count positive and negative intensifiers
def count_intensifiers(text, intensifier_list):
    words = TextBlob(text).words
    return sum(1 for word in words if word.lower() in intensifier_list)

# Add columns for positive and negative intensifier counts
restaurant_merged['positive_intensifier_count'] = restaurant_merged['reviewContent'].apply(lambda x: count_intensifiers(x, positive_intensifiers))
restaurant_merged['negative_intensifier_count'] = restaurant_merged['reviewContent'].apply(lambda x: count_intensifiers(x, negative_intensifiers))

print(restaurant_merged)

             date                reviewID              reviewerID  \
0       9/22/2012  GtwU21YOQn-wf4vWRUIx6w  bNYesZ944s6IJVowOnB0iA   
1       9/22/2012                 0LpVTc3  TRKxLC3y-ZvP45e5iilMtw   
2       9/19/2012           tljtLzf68Fkwf  0EMm8umAqXZzyhxNpL4M9g   
3        9/6/2012                     iSN  DlwexC7z88ymAzu45skODw   
4        9/9/2012                  Jmwrh7  kW2dk1CWihmh3g7k9N2G8A   
...           ...                     ...                     ...   
26953   3/17/2010              PZu8sDx2T2  tivh8lr6pzBDNfrJLYWh_g   
26954   4/14/2011               S-zbPPGoB  jKs4FQgkV0wSX8BG2_dgTg   
26955   9/23/2007    roKqXYooTy49OMAIJJjf  vX6aOMQ3HWCbwZVfCkCauw   
26956  11/18/2011                FefmFaWa  vX6aOMQ3HWCbwZVfCkCauw   
26957   5/18/2012    x8knvE6V8MkwT90wCV0f  OZTkqoi8_luhrL-mMj7O8A   

                                           reviewContent  rating_review  \
0      unlike next which wed eaten at the previous ni...              5   
1      probably one o