### Import Necessary Packages and Clients

In [298]:
from tqdm.notebook import trange, tqdm
import pandas as pd
from google.cloud import translate_v2 as translate
from google.cloud import language_v1

translate_client = translate.Client.from_service_account_json("/Users/ziwon/Downloads/service-account-file.json")
client = language_v1.LanguageServiceClient.from_service_account_json("/Users/ziwon/Downloads/service-account-file.json")

## Import data file and do preliminary processing & analysis

In [3]:
df_reviews = pd.read_csv('../data/reviews.tsv', delimiter='\t', index_col=0)
df_review_details = pd.read_csv('../data/review_details.tsv', delimiter='\t', index_col=0)

df_reviews.rename(columns={'Happy_Ct': 'Overall_Happy_Count', 'OK_Ct': 'Overall_OK_Ct', 'Sad_Ct': 'Overall_Sad_Ct',
                          'Taste': 'Overall_Taste', 'Decor': 'Overall_Decor', 'Service': 'Overall_Service', 'Hygiene': 'Overall_Hygiene',
                          'Value': 'Overall_Value'}, inplace=True)
df_reviews.drop(columns=['Review_Content'], inplace=True)
df = pd.merge(df_reviews, df_review_details, how='left', on=['URL'])
df['Review_Score'] = (df['taste'] + df['decor'] + df['service'] + df['hygiene'] + df['value']) / 5

In [239]:
# Total Review Text Characters (For API Pricing Reference)
print("Avg: " + str(df.full_review.str.len().mean().round(2)) + " characters per review.")
print("Sum: " + str(df.full_review.str.len().sum()) + " characters in total.")

Avg: 380.0 characters per review.
Sum: 2066042 characters in total.


## Detect Review Language

In [96]:
def batch_detect_language(txt: list):
    batch = 100
    final_output = []
    
    for i in range(len(txt)//batch + 1):
        output = translate_client.detect_language(txt[i*batch: (i+1)*batch])
        final_output.extend([result['language'] for result in output])
    
    return final_output

In [97]:
df['language'] = batch_detect_language((df['title'] + df['full_review'].str[:20]).tolist())

### Language Usage Analysis

In [144]:
df['language'].unique()

array(['zh-TW', 'en', 'vi', 'zh-CN', 'ja', 'jw', 'ceb', 'nl', 'tl', 'fr',
       'sv', 'ro'], dtype=object)

In [111]:
df.language.value_counts()

zh-TW    4998
en        378
zh-CN      45
ja          7
nl          2
vi          1
fr          1
ceb         1
ro          1
sv          1
tl          1
jw          1
Name: language, dtype: int64

In [276]:
# Remove Insignificant languages
df = df[~df.language.isin(['nl', 'vi', 'fr', 'ceb', 'ro', 'sv', 'tl', 'jw'])]

In [277]:
df.language.value_counts()

zh-TW    4998
en        378
zh-CN      45
ja          7
Name: language, dtype: int64

In [278]:
df.to_csv('../data/df_language_detected.tsv', sep='\t')

## Sentiment Analysis on Title & Reviews

In [279]:
df.full_review.str.len().sum()

2060905

In [281]:
def analyse_sentiment(text: list, source_language: list):
    scores = []
    magnitudes = []
    for i in trange(len(text)):
        document = language_v1.Document(content=text[i], type_=language_v1.Document.Type.PLAIN_TEXT, language=source_language[i])
        response = client.analyze_sentiment(request = {'document': document, 'encoding_type': language_v1.Document.Type.PLAIN_TEXT})
        sentiment_score = response.document_sentiment.score
        sentiment_magnitude = response.document_sentiment.magnitude
        scores.append(sentiment_score)
        magnitudes.append(sentiment_magnitude)
    return scores, magnitudes
    

In [282]:
title_scores, title_magnitudes = analyse_sentiment(df['title'].tolist(), df['language'].tolist())

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5428.0), HTML(value='')))




In [283]:
df['title_score'] = title_scores
df['title_magnitude'] = title_magnitudes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['title_score'] = title_scores
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['title_magnitude'] = title_magnitudes


In [286]:
df.title_magnitude.mean()

0.5438651421801212

In [287]:
df.title_score.mean()

0.3775239475194966

In [289]:
df.to_csv('../data/df_language_detected_title_scored.tsv', sep='\t')

In [290]:
review_scores, review_magnitudes = analyse_sentiment(df['full_review'].tolist(), df['language'].tolist())

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5428.0), HTML(value='')))




In [291]:
df['review_score'] = review_scores
df['review_magnitude'] = review_magnitudes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['review_score'] = review_scores
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['review_magnitude'] = review_magnitudes


In [293]:
df.to_csv('../data/df_language_detected_title_review_scored.tsv', sep='\t')

In [294]:
review_scores[:5], review_magnitudes[:5]

([0.6000000238418579, 0.5, 0.800000011920929, 0.800000011920929, 0.5],
 [2.0,
  14.800000190734863,
  17.700000762939453,
  12.899999618530273,
  5.400000095367432])

## Translate Title and Reviews to English (For Evaluation)

In [257]:
def translate_to_english(source_language: list, title: list, review:list):
    titles_en = []
    reviews_en = []
    for i in trange(len(title)):
        if (source_language[i] != 'en'): # Translate only if source is not English
            title_en = translate_client.translate(title[i], source_language=source_language[i], target_language='en')['translatedText']
            review_en = translate_client.translate(review[i], source_language=source_language[i], target_language='en')['translatedText']
            titles_en.append(title_en)
            reviews_en.append(review_en)
        else:
            titles_en.append(title[i])
            reviews_en.append(review[i])
    return titles_en, reviews_en

In [300]:
%%time
title_en_list, review_en_list = translate_to_english(df['language'].tolist(), df['title'].tolist(), df['full_review'].tolist())


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5428.0), HTML(value='')))


CPU times: user 37.2 s, sys: 4.82 s, total: 42 s
Wall time: 20min 45s


In [301]:
df['title_en'] = title_en_list
df['review_en'] = review_en_list

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['title_en'] = title_en_list
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['review_en'] = review_en_list


In [302]:
df.to_csv('../data/df_final.tsv', sep='\t')

In [303]:
df_temp = pd.read_csv('../data/df_final.tsv', delimiter='\t')

In [309]:
df_temp.head()

Unnamed: 0.1,Unnamed: 0,URL,Overall_Score,Overall_Happy_Count,Overall_OK_Ct,Overall_Sad_Ct,Overall_Taste,Overall_Decor,Overall_Service,Overall_Hygiene,...,date_of_visit,full_review,language,Review_Score,title_score,title_magnitude,review_score,review_magnitude,title_en,review_en
0,0,https://www.openrice.com/en/hongkong/r-wa-thea...,4.5,184,8,2,4,4,4,4,...,2020-09-07,餐廳擁有廣闊無敵大海景，寧靜宜人環境，位於白沙灣碼頭酒店，位置雖然隔涉，但一切都值得，客人可...,zh-TW,4.8,0.8,1.6,0.6,2.0,Wonderful food theater. It&#39;s memorable,The restaurant has a vast and invincible sea v...
1,1,https://www.openrice.com/en/hongkong/r-wa-thea...,4.5,184,8,2,4,4,4,4,...,,今日得閒去西貢親親大自然🏝\n\n順道去「食之劇場」嘆個午餐🥣\n\n\n首先 店員為我哋斟...,zh-TW,5.0,0.4,0.4,0.5,14.8,Endless aftertaste 🤤 Sushi SET 🍣,"Today, I have to go to Saigon and kiss the nat..."
2,2,https://www.openrice.com/en/hongkong/r-wa-thea...,4.5,184,8,2,4,4,4,4,...,,白沙灣碼頭酒店中的食之劇場餐廳已不是第一次來到，但吸引到我的，是那寧靜宜人的環境，客人可訂坐...,zh-TW,5.0,0.0,0.0,0.8,17.700001,Omakase of Food,It’s not the first time I have come to the res...
3,3,https://www.openrice.com/en/hongkong/r-wa-thea...,4.5,184,8,2,4,4,4,4,...,2020-10-07,位於白沙灣碼頭旁的「食之劇場」，環境幽靜隱世\n餐廳正正對著湛藍廣闊無敵大海景，是香港難得一...,zh-TW,4.8,0.9,0.9,0.8,12.9,Secluded and invincible sea view,Located at the &quot;Theater of Food&quot; nex...
4,4,https://www.openrice.com/en/hongkong/r-wa-thea...,4.5,184,8,2,4,4,4,4,...,2020-09-30,餐廳係碼頭旁邊，天空與海–就算就黎天黑都好靚﹗餐廳環境好浪漫，啱晒拍拖去撐枱腳，而且食物一流...,zh-TW,5.0,0.3,0.3,0.5,5.4,Saigon High Quality Omakase,"The restaurant is next to the pier, the sky an..."


In [None]:
title_en_list = translate_to_english(df['language'].tolist(), df['title'].tolist(), df['full_review'].tolist())

In [150]:
df.language

0       zh-TW
1       zh-TW
2       zh-TW
3       zh-TW
4       zh-TW
        ...  
5432    zh-TW
5433    zh-TW
5434       en
5435    zh-TW
5436       en
Name: language, Length: 5437, dtype: object

## Effectiveness Analysis

### Reviews with Negative Scores and Their Sentiment Score

In [243]:
# Check if sentiment analysis works for bad reviews
bad_reviews = df.loc[df['Review_Score'] < 1.5][:30]

In [247]:
title_scores, title_magnitudes = analyse_sentiment(bad_reviews['title'].tolist(), bad_reviews['language'].tolist())
review_scores, review_magnitudes = analyse_sentiment(bad_reviews['full_review'].tolist(), bad_reviews['language'].tolist())

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=30.0), HTML(value='')))




In [248]:
df_bad_review_sa = pd.DataFrame({'Title Score': title_scores, 'Title Magnitude': title_magnitudes, 'Review Score': review_scores, 'Review Magnitude': review_magnitudes})

In [252]:
df_bad_review_sa

Unnamed: 0,Title Score,Title Magnitude,Review Score,Review Magnitude
0,-0.9,0.9,-0.4,1.7
1,-0.8,0.8,-0.3,5.1
2,-0.4,0.4,-0.6,1.3
3,-0.8,0.8,-0.3,4.2
4,-0.8,0.8,0.0,1.1
5,0.3,0.3,-0.3,1.1
6,-0.9,0.9,0.0,7.1
7,0.3,0.3,-0.3,1.1
8,-0.9,0.9,0.0,7.8
9,-0.1,0.1,-0.5,2.2


In [258]:
title_en_list, review_en_list = translate_to_english(bad_reviews['language'].tolist(), bad_reviews['title'].tolist(), bad_reviews['full_review'].tolist())

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=30.0), HTML(value='')))




In [260]:
title_en_list[:5], review_en_list[:5]

(['Rubbish',
  'Very bad attitude',
  '(Taedao) Attitude',
  'Very poor staff attitude',
  'Male staff&#39;s service attitude is very poor'],
 ['Nothing to go to South Korea, only Ma Pu Food Fan Shu is the first to get addicted, know that there is no service and no quality! Waiting at the door and so on... Until someone buries them, they don’t even reach out to bury their hands to bury the hot pot + rotten rice, even if the soup base is not good, then go on stage, an old chili sauce and bury rotten rice 🤮 another Bibimbap and rice are rotten 🤮🤮 Force pot bibimbap',
  'It’s good to have a view introduced by a friend, and I will try it with someone in the house tonight. When I went in and asked if the three people had a station, it seemed that they didn&#39;t want to talk to me. There was a woman holding a heat detector in isolation, and she buried Li to Zhiye without saying anything? You are exploring thermal rocks, but will the problem be your attitude? Sitting on the bottom left, rais