In [1]:
import pandas as pd
import numpy as np
import os
from afinn import Afinn
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import sys
import string
import scipy.stats as scipy
import matplotlib.pyplot as plt
import re
import random

# COVID-related vocabulary used by count_covid().
# All entries are lowercase because the review text is lowercased before it
# is compared against this list.
# Fixes relative to the earlier version of this list:
#   * a missing comma fused "social distancing" and "wash hands" into the
#     single bogus entry "social distancingwash hands";
#   * "N95" was uppercase and therefore could never equal a lowercased token;
#   * "social distancing" was listed twice.
covid = ["covid", "corona", "rona", "quarantine", "coronavirus",
        "epidemic", "virus", "social-distancing", "social distancing",
        "six-feet", "masks", "facemasks", "face masks", "mask", "handwash",
        "covid-19", "pandemic", "quarantining", "covid19", "lockdown", "washing hands",
        "quarintine", "social distance", "wash hands", "the rona", "fever", "looked sick",
        "shelter-in-place", "shelter in place", "kung flu", "flatten the curve",
        "outbreak", "surgical", "n95", "herd immunity", "self-quarantine", "ventilator",
        "stimulus", "unemployment", "face shield"]

def get_datasets():
    """Load the three Yelp price-range review CSV files.

    The files are read from ``../data/reviews/csv`` relative to the current
    working directory. ``os.path.join`` inserts the platform's separator, so
    a single code path replaces the duplicated win32 / POSIX branches.

    Returns:
        tuple: ``(pr1, pr2, pr3)`` DataFrames for price ranges 1, 2 and 3.
    """
    csv_dir = os.path.join('..', 'data', 'reviews', 'csv')
    return tuple(
        pd.read_csv(os.path.join(csv_dir, 'yelp_pricerange{}_reviews.csv'.format(tier)))
        for tier in (1, 2, 3)
    )

def clean_string(txt):
    """Lowercase a token and strip surrounding punctuation and whitespace.

    The previous version stripped punctuation first and whitespace second, so
    punctuation hidden behind trailing whitespace survived ("Good.\\n" became
    "good."). Both kinds of characters are now stripped repeatedly until the
    token stops changing, whatever order they appear in at the edges.

    Args:
        txt (str): A single token (may carry edge whitespace or punctuation).

    Returns:
        str: The lowercased token with edge punctuation/whitespace removed.
        Characters inside the token (e.g. the apostrophe in "don't") are kept.
    """
    p = string.punctuation
    stripped = txt.strip()
    while True:
        candidate = stripped.strip(p).strip()
        if candidate == stripped:
            break
        stripped = candidate
    return stripped.lower()

def clean_review(review):
    """Apply clean_string to each space-separated token and rejoin the result.

    The text is split on the single space character, so runs of spaces yield
    empty tokens that are kept as extra spaces in the output.
    """
    cleaned_tokens = [clean_string(token) for token in review.split(' ')]
    return ' '.join(cleaned_tokens)

def clean_end(txt, span=False):
    """Drop everything from the first 'Useful' / 'See' marker to the end.

    Presumably removes scraped footer text that got glued onto the review
    body (TODO confirm against the scraper output).

    Args:
        txt (str): Raw review text.
        span (bool): When truthy, print the index at which the text is cut.

    Returns:
        str: ``txt`` up to (not including) the first marker, or ``txt``
        unchanged when no marker is found.
    """
    # Raw string: the pattern is unchanged, but \B, \S, \d and \s are no longer
    # invalid string escapes (newer Python versions warn about those).
    ex = re.compile(r'Useful\B|(?<=\S)Useful[\d\s\S]|(?<=\S)See[\d\s\S]')

    # Search once and reuse the match instead of scanning the text twice.
    match = ex.search(txt)
    if match is None:
        return txt

    a = match.start()
    if span:
        print (a)
    return txt[:a]
    
def clean_beg(txt,span=False):
    """Drop everything up to and including the first header marker.

    Markers are "<digit> check-in(s)", "<digit> photo(s)" (each only when at
    least one more character follows) and "Updated review".

    NOTE(review): the search is not anchored to the start of the text, so a
    marker appearing in the middle of a review also removes all text before it.

    Args:
        txt (str): Review text.
        span (bool): When truthy, print the index at which the text is cut.

    Returns:
        str: The text following the first marker, or ``txt`` unchanged when
        no marker is found.
    """
    # Raw string: the pattern is unchanged, but \d and \s are no longer invalid
    # string escapes (newer Python versions warn about those).
    ex = re.compile(r'\d\scheck-ins?(?=[\s\S])|\d\sphotos?(?=[\s\S])|Updated review')

    # Search once and reuse the match instead of scanning the text twice.
    match = ex.search(txt)
    if match is None:
        return txt

    a = match.end()
    if span:
        print (a)
    return txt[a:]
    
def regex_clean(data):
    """Strip the trailing marker text once, then up to three leading markers.

    clean_beg removes one marker per call; it is applied three times,
    presumably because a review can start with several markers in a row.
    """
    body = clean_end(data)
    return clean_beg(clean_beg(clean_beg(body)))

def text_to_words(text):
    """Return the cleaned review as a list of whitespace-separated words."""
    normalised = clean_review(text)
    return normalised.split()

def vaderize(df, column):
    """Append VADER polarity scores for ``df[column]`` as ``vader_*`` columns.

    Args:
        df (pandas.DataFrame): Frame holding the text to score.
        column (str): Name of the text column passed to VADER.

    Returns:
        pandas.DataFrame: A new frame with the columns of ``df`` followed by
        one ``vader_``-prefixed column per key of the polarity-score dicts.
        ``df`` itself is not modified.
    """
    analyzer = SentimentIntensityAnalyzer()

    print ('Estimating pop scores for {} cases'.format(len(df)))
    sentiment = df[column].apply(analyzer.polarity_scores)

    # Reuse df's index for the score frame: concat(axis=1) aligns on index
    # labels, so the default RangeIndex would misplace scores (and add NaN
    # rows) whenever df has been filtered, sorted or otherwise re-indexed.
    sdf = pd.DataFrame(sentiment.tolist(), index=df.index).add_prefix('vader_')

    return pd.concat([df, sdf], axis=1)

def word_count(sentence):
    """Return the number of whitespace-separated tokens in ``sentence``."""
    tokens = sentence.split()
    return len(tokens)

def update_df(dfs):
    """Clean and score every review frame, then stack them into one frame.

    Each frame in ``dfs`` is modified in place: ``rlength`` (length of the
    raw review), ``reviews`` (overwritten with the regex-cleaned text),
    ``reviews_cleaned``, ``word_count``, ``afinn_raw`` and ``afinn_adjusted``
    (AFINN score per 100 words) are added before the VADER columns are
    appended by vaderize.

    Args:
        dfs (iterable of pandas.DataFrame): Frames with a ``reviews`` column.

    Returns:
        pandas.DataFrame: All scored frames concatenated with a fresh index.
    """
    scorer = Afinn()
    scored_frames = []

    for position, frame in enumerate(dfs):
        # rlength is measured on the raw text, before any cleaning.
        frame['rlength'] = frame.reviews.apply(len)
        frame['reviews'] = frame.reviews.apply(regex_clean)
        frame['reviews_cleaned'] = frame.reviews.apply(clean_review)
        frame["word_count"] = frame['reviews_cleaned'].apply(word_count)
        print("Calculating afinn for df #:", position)

        frame['afinn_raw'] = frame.reviews_cleaned.apply(scorer.score)
        frame["afinn_adjusted"] = frame['afinn_raw'] / frame['word_count'] * 100

        # VADER is run on the regex-cleaned (not lowercased) review text.
        scored_frames.append(vaderize(frame, "reviews"))

    return pd.concat(scored_frames, ignore_index=True)

def count_covid(text, word_list):
    """Count occurrences of the terms in ``word_list`` within ``text``.

    Both the text and the terms are normalised with text_to_words, so
    mixed-case terms and multi-word phrases such as "shelter in place" can
    match. Previously the text was split into single lowercased words and
    compared to the raw terms, which made every multi-word or non-lowercase
    term unmatchable.

    Matching is greedy and non-overlapping: at each position the longest
    matching term wins and its words are consumed, so "face masks" counts
    once rather than once for the phrase and again for "masks".

    Args:
        text (str): Review text.
        word_list (iterable of str): Terms (single words or phrases).

    Returns:
        int: Number of term occurrences found.
    """
    tokens = text_to_words(text)

    # Each term becomes a tuple of normalised words; a set gives O(1) lookups.
    phrases = {tuple(text_to_words(term)) for term in word_list}
    phrases.discard(())  # terms that normalise to nothing can never match
    if not tokens or not phrases:
        return 0

    longest = max(len(phrase) for phrase in phrases)
    hits = 0
    pos = 0
    while pos < len(tokens):
        # Try the longest candidate first so phrases beat their sub-words.
        for size in range(min(longest, len(tokens) - pos), 0, -1):
            if tuple(tokens[pos:pos + size]) in phrases:
                hits += 1
                pos += size
                break
        else:
            pos += 1

    return hits

def test(df):
    strings = ['Updated review', 'photos', 'photo', 'check-ins', 'check-in', 'Useful', 'Funny', 'Cool','See all']
    i = 0
    for word in strings:
        for i, sentence in enumerate(df.reviews_cleaned):
            if word in sentence:
                i += 1
                print (sentence)
                print (word, 'found.')
                print (f"In item [{i}]")
    print (i)        
# with open("..\\data\\sentiment\\negative-words.txt", encoding="ISO-8859-1") as f:
#     negatives = list(map(lambda x: x.strip('\n'), f.readlines()))
# with open("..\\data\\sentiment\\positive-words.txt", encoding="ISO-8859-1") as f:
#     positives = list(map(lambda x: x.strip('\n'), f.readlines()))

In [2]:
# Load the three price-range frames and work on copies so the loaded
# originals in `datasets` stay untouched by later in-place edits.
datasets = get_datasets()
d1, d2, d3 = (frame.copy() for frame in datasets[:3])

data = [d1, d2, d3]

In [3]:
# Clean and score all frames, then count COVID terms per cleaned review.
all_data = update_df(data)
all_data['COVID_freq'] = all_data['reviews_cleaned'].apply(count_covid, args=(covid,))

Calculating afinn for df #: 0
Estimating pop scores for 14335 cases
Calculating afinn for df #: 1
Estimating pop scores for 36747 cases
Calculating afinn for df #: 2
Estimating pop scores for 2216 cases


In [587]:
# Patterns for the trailing and leading marker text.
# Raw strings: the patterns are unchanged, but \B, \S, \d and \s are no longer
# invalid string escapes (newer Python versions warn about those).
end = re.compile(r'Useful\B|(?<=\S)Useful[\d\s\S]|(?<=\S)See[\d\s\S]')
beg = re.compile(r'(\scheck-ins?(?=\S)|\d\sphotos?(?=\S\S)|Updated review(?=\S)){1}')
regex = [end, beg]

In [55]:
def clean_end(txt, span=False):
    """Drop everything from the first 'Useful' / 'See' marker to the end.

    NOTE(review): this cell redefines the clean_end already declared in the
    first cell; keep a single definition so re-runs cannot pick a stale one.

    Args:
        txt (str): Raw review text.
        span (bool): When truthy, print the index at which the text is cut.

    Returns:
        str: ``txt`` up to (not including) the first marker, or ``txt``
        unchanged when no marker is found.
    """
    # Raw string: the pattern is unchanged, but \B, \S, \d and \s are no longer
    # invalid string escapes (newer Python versions warn about those).
    ex = re.compile(r'Useful\B|(?<=\S)Useful[\d\s\S]|(?<=\S)See[\d\s\S]')

    # Search once and reuse the match instead of scanning the text twice.
    match = ex.search(txt)
    if match is None:
        return txt

    a = match.start()
    if span:
        print (a)
    return txt[:a]
    
def clean_beg(txt,span=False):
    """Drop everything up to and including the first header marker.

    Markers are "<digit> check-in(s)", "<digit> photo(s)" (each only when at
    least one more character follows) and "Updated review".

    NOTE(review): this cell redefines the clean_beg already declared in the
    first cell; keep a single definition. The search is also not anchored to
    the start of the text, so a marker in the middle of a review removes all
    text before it.

    Args:
        txt (str): Review text.
        span (bool): When truthy, print the index at which the text is cut.

    Returns:
        str: The text following the first marker, or ``txt`` unchanged when
        no marker is found.
    """
    # Raw string: the pattern is unchanged, but \d and \s are no longer invalid
    # string escapes (newer Python versions warn about those).
    ex = re.compile(r'\d\scheck-ins?(?=[\s\S])|\d\sphotos?(?=[\s\S])|Updated review')

    # Search once and reuse the match instead of scanning the text twice.
    match = ex.search(txt)
    if match is None:
        return txt

    a = match.end()
    if span:
        print (a)
    return txt[a:]
    
def regex_clean(data):
    """Strip the trailing marker text once, then up to three leading markers.

    NOTE(review): this cell redefines the regex_clean already declared in the
    first cell; keep a single definition.
    """
    body = clean_end(data)
    return clean_beg(clean_beg(clean_beg(body)))


In [4]:
# NOTE(review): this replaces the lowercased, punctuation-stripped
# reviews_cleaned column built in update_df with text that is only
# regex-cleaned; word_count, afinn_* and COVID_freq were computed from the
# earlier version of the column — confirm the overwrite is intended.
all_data['reviews_cleaned'] = all_data['reviews'].apply(regex_clean)

In [5]:
# Display the combined, scored review frame (pandas truncates the rendering).
all_data

Unnamed: 0,dates,stars,reviews,year,month,day,pricerange,rlength,reviews_cleaned,word_count,afinn_raw,afinn_adjusted,vader_neg,vader_neu,vader_pos,vader_compound,COVID_freq
0,2020-07-30,1,We ordered the pizza and wing special through ...,2020,7,30,1,560,We ordered the pizza and wing special through ...,105,-4.0,-3.809524,0.078,0.905,0.017,-0.7814,0
1,2020-07-25,1,I ordered from this place. I spent 45.00. I or...,2020,7,25,1,404,I ordered from this place. I spent 45.00. I or...,77,-7.0,-9.090909,0.163,0.791,0.047,-0.8827,0
2,2020-12-06,4,I got the meat lovers deep dish pizza (Sicilia...,2020,6,12,1,228,I got the meat lovers deep dish pizza (Sicilia...,34,10.0,29.411765,0.000,0.638,0.362,0.9516,0
3,2020-06-07,4,"Friendly staff, excellent food, and great pric...",2020,7,6,1,376,"Friendly staff, excellent food, and great pric...",62,16.0,25.806452,0.052,0.729,0.219,0.8779,0
4,2020-08-21,5,"The shop is amazing, it has donuts, cupcakes, ...",2020,8,21,1,381,"The shop is amazing, it has donuts, cupcakes, ...",63,20.0,31.746032,0.024,0.663,0.312,0.9617,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53293,2020-09-08,4,This was the first yakitori place we ever went...,2020,8,9,3,612,This was the first yakitori place we ever went...,113,24.0,21.238938,0.000,0.700,0.300,0.9930,0
53294,2020-08-24,5,Dinner was great at this spot - my first time ...,2020,8,24,3,311,Dinner was great at this spot - my first time ...,55,7.0,12.727273,0.030,0.740,0.230,0.9425,0
53295,2020-07-22,5,Hands down the best Italian food and service i...,2020,7,22,3,400,Hands down the best Italian food and service i...,69,3.0,4.347826,0.028,0.833,0.139,0.8649,0
53296,2020-07-21,5,Had a birthday dinner with friends at Zizi. T...,2020,7,21,3,343,Had a birthday dinner with friends at Zizi. T...,57,14.0,24.561404,0.000,0.640,0.360,0.9775,0


In [74]:
# Spot-check 50 randomly chosen entries.
# NOTE(review): d1_test is not defined in any visible cell — confirm where it
# comes from. The sample also changes on every run because no seed is set.
s = random.sample([*d1_test], 50)
s

["We were at the Trader Joe's and felt a little hungry, needed a takeout place that was different.... we pulled up, saw the sign with online ordering, placed our order and got a gyro, a schwarma gyro , some tabouleh and a vegetarian falafel gyro. Holy crap was it good. When I went inside to grab it, everything looked SO CLEAN, so fresh and healthy! We all loved it, two we got spicy, one no, I highly recommend, it's not overwhelming. Picky kid got an order of fries and even they were fresh and delicious. Now we will be going here EVERY TIME we go to Trader Joe's. This was easily the best gyro I have ever had in my life.",
 "Just had the most sour Cuban Espressos ever. Wish I had tried it inside instead of to go so I could've let her know to switch it. Usually this place is perfect with drinks I get so it was just a one time mistake, but the the espresso today was disgusting. 3 stars for usually being on point here though and should be 5 other than today.",
 "The Mahi Reuben is SPECIAL. 

In [588]:
# Apply only the trailing-marker cleanup to the first frame's reviews.
t = list(d1['reviews'].apply(clean_end))

["We ordered the pizza and wing special through Postmates. \xa0For some reason we didn't get the wings we ordered. \xa0Our driver told us he got in an argument over the wings. \xa0We called to find out what happened with our wings and the older lady that works there starts yelling at my wife to shut up. \xa0I usually go here to eat pizza but I will no longer eat here. \xa0Their customer service is terrible, we just wanted an explanation as to what happened. \xa0We were even just going to order the wings and pick it up. \xa0Stay away from eating here at all cost.",
 'I ordered from this place. I spent 45.00. I ordered baked ziti chicken wings and pizza. It was the worst food I have ever had. \xa0The baked ziti was old boiled noodles with tomato sauce the wings were all gross no flavor no sauce. The pizza had no flavor hardly any pepperoni like I ordered. The crust was burnt. It was a waste of money if there was less than 1 star this place would get it.',
 '1 check-inI got the meat lover

In [11]:
def test(df):
    strings = ['Updated review', 'photos', 'photo', 'check-ins', 'check-in', 'Useful', 'Funny', 'Cool','See all']
    i = 0
    for word in strings:
        for i, sentence in enumerate(df.reviews_cleaned):
            if word in sentence:
                i += 1
                print (sentence)
                print (word, 'found.')
                print (f"In item [{i}]")
    print (i)           