In [1]:
import pandas as pd
import numpy as np
import torch

In [2]:
# Load the raw reviews CSV; its first column is used as the row index.
reviews = pd.read_csv("../data/raw_reviews.csv", index_col=0)

In [3]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7365 entries, 0 to 7364
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   hotel_id  7365 non-null   object 
 1   date      7365 non-null   object 
 2   rating    7360 non-null   float64
 3   comment   3683 non-null   object 
dtypes: float64(1), object(3)
memory usage: 287.7+ KB


In [4]:
# Per-hotel summary: number of reviews without comment text, and number of
# reviews overall (counted through the non-null `date` values).
comment_count = (
    reviews
    .groupby(["hotel_id"])
    .agg(
        number_of_nulls=("comment", lambda col: col.isna().sum()),
        review_count=("date", "count"),
    )
)

In [5]:
comment_count.describe()

Unnamed: 0,number_of_nulls,review_count
count,192.0,192.0
mean,19.177083,38.359375
std,10.544838,17.695831
min,0.0,1.0
25%,11.0,25.0
50%,22.0,50.0
75%,27.0,50.0
max,40.0,50.0


In [6]:
comment_count

Unnamed: 0_level_0,number_of_nulls,review_count
hotel_id,Unnamed: 1_level_1,Unnamed: 2_level_1
ChIJ-WBgspUBIRAR2_17FLZyZ4k,27,50
ChIJ-WdgbxP_3Q8R00ss54EeV-w,28,50
ChIJ-cG9nIr-3Q8RgQqZfdsLsps,27,50
ChIJ012sRE743Q8R7ORbiDThgbc,25,50
ChIJ0TA372Sb3w8RANBG1xTBtD8,33,50
...,...,...
ChIJz0jFu-YlJxARSyG1f8NxJsw,21,43
ChIJz80BtGma3w8RZ5foAQfTeXI,1,3
ChIJzRXvLIj_3Q8RwsQUbpVDPq0,1,8
ChIJzXBPWliF3w8R5C36QUOi5mg,31,50


In [7]:
reviews.head()

Unnamed: 0,hotel_id,date,rating,comment
0,ChIJhd6ATRj_3Q8R7nqmtm6INVI,2023-05-01T08:53:17.543Z,4.0,
1,ChIJhd6ATRj_3Q8R7nqmtm6INVI,2023-02-10T09:25:21.640Z,5.0,I had a wonderful experience at Jonat Hotel. T...
2,ChIJhd6ATRj_3Q8R7nqmtm6INVI,2023-01-23T16:02:18.862Z,5.0,I enjoyed my stay here. For a serene environme...
3,ChIJhd6ATRj_3Q8R7nqmtm6INVI,2023-01-23T07:27:23.598Z,5.0,"The couple of times I have been there, were ve..."
4,ChIJhd6ATRj_3Q8R7nqmtm6INVI,2023-01-23T06:19:51.128Z,5.0,Everything there was nice and super clean 👍👍👍


In [8]:
# Drop every row with a missing value in ANY column. Per the .info() output
# above, that removes the rows without a comment and the few without a rating.
# NOTE(review): this rebinds `reviews`, so the raw frame is gone after this cell
# and re-running the earlier summary cells would give different numbers.
reviews = reviews.dropna()

In [9]:
# Cast the remaining comments to str so later text processing always gets strings.
reviews['comment'] = reviews['comment'].astype(str)

In [10]:
reviews.head()

Unnamed: 0,hotel_id,date,rating,comment
1,ChIJhd6ATRj_3Q8R7nqmtm6INVI,2023-02-10T09:25:21.640Z,5.0,I had a wonderful experience at Jonat Hotel. T...
2,ChIJhd6ATRj_3Q8R7nqmtm6INVI,2023-01-23T16:02:18.862Z,5.0,I enjoyed my stay here. For a serene environme...
3,ChIJhd6ATRj_3Q8R7nqmtm6INVI,2023-01-23T07:27:23.598Z,5.0,"The couple of times I have been there, were ve..."
4,ChIJhd6ATRj_3Q8R7nqmtm6INVI,2023-01-23T06:19:51.128Z,5.0,Everything there was nice and super clean 👍👍👍
5,ChIJhd6ATRj_3Q8R7nqmtm6INVI,2023-01-22T16:07:27.958Z,4.0,I had a splendid stay at Jonat Hotel. It might...


In [11]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3678 entries, 1 to 7364
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   hotel_id  3678 non-null   object 
 1   date      3678 non-null   object 
 2   rating    3678 non-null   float64
 3   comment   3678 non-null   object 
dtypes: float64(1), object(3)
memory usage: 143.7+ KB


In [12]:
# Pull the comment column out as a NumPy object array and display it.
comments = reviews['comment'].values
comments

array(['I had a wonderful experience at Jonat Hotel. Their rooms are new and their services are top-notch . I definitely recommend it to anyone visiting Cape Coast 🙏🏿',
       'I enjoyed my stay here. For a serene environment, choose Jonat Hotel.',
       'The couple of times I have been there, were very peaceful times and I really enjoyed the serenity and the  calm ambiance of the place.\nReally nice rooms and affordable as well.',
       ..., 'Nice place with good view', 'Nice place to be',
       'They have good facilities and a wonderful view of the Atlantic ocean'],
      dtype=object)

Visualizing comments to find frequently occurring words

In [13]:
# Plain Python list of the comments. Despite the name, each element is a whole
# review comment, not a single word.
words = comments.tolist()

In [14]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

In [15]:
stops = set(stopwords.words('english'))

In [16]:
words

['I had a wonderful experience at Jonat Hotel. Their rooms are new and their services are top-notch . I definitely recommend it to anyone visiting Cape Coast 🙏🏿',
 'I enjoyed my stay here. For a serene environment, choose Jonat Hotel.',
 'The couple of times I have been there, were very peaceful times and I really enjoyed the serenity and the  calm ambiance of the place.\nReally nice rooms and affordable as well.',
 'Everything there was nice and super clean 👍👍👍',
 'I had a splendid stay at Jonat Hotel. It might be new but the service there was great and the food was delish',
 'Staff provide warm reception, neat rooms, and room service available.',
 'It’s a nice place to stay at. Pretty clean and the prices of the rooms are great. They also serve everyone breakfast which is a plus.',
 "The Drowaa Plus Hotel is a the go-to place for stop by for a night's sleep. Great designs in room with great ventilation (AC & Fan) rooms etc.\n\nI recommend this place to all organizing events and looki

In [17]:
# # converting all sentences to english
# from googletrans import Translator
# from requests.exceptions import ReadTimeout , ConnectTimeout

# def translate_to_english(sentences):
#     # translator = Translator()
#     english_comments = []

#     for sent in sentences:
#         try:
#             translator = Translator()
#             # detected_language = translator.detect(sent)
#             if sent is None:
#                 english_comments.append(None)  # Handle None translation result
#             # elif detected_language == 'en':
#             #     english_comments.append(sent)
#             else:
#                 translated = translator.translate(sent)
#                 english_comments.append(translated.text)
#         except (ReadTimeout, ConnectTimeout) as e:
#             # Handle timeout exceptions
#             print(f"Translation timeout for sentence: {sent}")
#             english_comments.append(None)  # Or handle as desired

#     return english_comments



In [18]:
# words[25]

'This is the worse place, I have ever been too!! They are very rude, the service is very bad. The owner and his wife wanted to fight with costumers, never go there if you want to have a good time.'

In [19]:
# test_list = words[:30]

In [20]:
# from textblob import TextBlob

In [21]:
# text = 'Es bleibt kein Wunsch unerfüllt!!! Ein kleines Paradies auf Erden!!!!'

# blob = TextBlob(text)

# translated = blob.translate(to='en')  # Translate to English


In [None]:
# translated_test_list = translate_to_english(test_list)

In [None]:
# new_list = translate_to_english(test_list)

In [None]:
# new_list

['Its a nice place. We went to swim in the pool with my students. We enjoyed the clean water and had some fun',
 'Fun',
 'Cool place to be',
 'I met one of the managers, Richard, he was super good. Excellent customer service. They have clean room just that they get a little heated up in the afternoons.',
 "I think the management need to step up their game..Room service was not good..and I haven't yet know a guest house that charges Gh¢100 for a night without serving  Breakfast, and even tying to get food around the area is quite a challenge."]

In [None]:
# def translate(sentences, timeout_duration, resume_interval):
#     translator = Translator()
#     english_comments = []
#     start_time = time.time()
#     for index, sent in enumerate(sentences):
#         translation = translator.translate(sent)
#         if translation is not None and translation.text is not None:
#             english_comments.append(translation.text)
#         else:
#             english_comments.append(None)
#     return english_comments

In [None]:
# test_sent = words[:30]

In [None]:
# translated_words = translate(words[:1000], 60, 2)

In [None]:
# translated_words2 = translate(words[1000:1600], 60, 2)

In [None]:
# # translating the next batch of comments (1600-2000)
# translated_words3 = translate(words[1600:2000], 60, 2)

In [None]:
# translated_words4 = translate(words[2000:2500], 60, 2)

In [None]:
# translated_words5 = translate(words[2500:3000], 60, 2)

In [None]:
# translated_words6 = translate(words[3000:3678], 60, 2)

In [None]:
# english_reviews = translated_words + translated_words2 + translated_words3 + translated_words4 + translated_words5 + translated_words6

Create a dict holding, for each comment, its id, the review text, and the list of nouns found in it

In [None]:
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from textblob import TextBlob

In [None]:
# The cells that build `english_reviews` (the translation batches and their
# concatenation) are commented out, so on a fresh kernel the name does not
# exist and this cell would raise NameError. Fall back to the untranslated
# comments so the notebook survives Restart & Run All; a value already present
# in the kernel is left untouched.
if "english_reviews" not in globals():
    english_reviews = words

# Column-oriented record of every review: a running id, the comment text, and
# the tokens TextBlob's part-of-speech tagger labels 'NN'.
reviews_dict = {
    'id': [],
    "comment": [],
    "nouns": []
}
for index, review in enumerate(english_reviews):
    reviews_dict['id'].append(index)
    reviews_dict['comment'].append(review)
    # Only the exact tag 'NN' is kept, so tokens tagged 'NNS' or 'NNP'
    # (plural / proper nouns in the Penn Treebank scheme) are not collected.
    nouns = [word for word, tag in TextBlob(review).tags if tag == 'NN']
    reviews_dict['nouns'].append(nouns)

In [None]:
reviews_dict

{'id': [0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37,
  38,
  39,
  40,
  41,
  42,
  43,
  44,
  45,
  46,
  47,
  48,
  49,
  50,
  51,
  52,
  53,
  54,
  55,
  56,
  57,
  58,
  59,
  60,
  61,
  62,
  63,
  64,
  65,
  66,
  67,
  68,
  69,
  70,
  71,
  72,
  73,
  74,
  75,
  76,
  77,
  78,
  79,
  80,
  81,
  82,
  83,
  84,
  85,
  86,
  87,
  88,
  89,
  90,
  91,
  92,
  93,
  94,
  95,
  96,
  97,
  98,
  99,
  100,
  101,
  102,
  103,
  104,
  105,
  106,
  107,
  108,
  109,
  110,
  111,
  112,
  113,
  114,
  115,
  116,
  117,
  118,
  119,
  120,
  121,
  122,
  123,
  124,
  125,
  126,
  127,
  128,
  129,
  130,
  131,
  132,
  133,
  134,
  135,
  136,
  137,
  138,
  139,
  140,
  141,
  142,
  143,
  144,
  145,
  146,
  147,
  148,
  149,
  150,
  151,
  152,
  153,
  154,
  155,
  156,
  157

In [None]:
# Flatten the per-review noun lists into a single list, keeping review order.
noun_list = [noun for review_nouns in reviews_dict['nouns'] for noun in review_nouns]

In [None]:
noun_list

['experience',
 'anyone',
 '🙏🏿',
 'stay',
 'environment',
 'couple',
 'serenity',
 'ambiance',
 'place',
 'Everything',
 'clean',
 '👍👍👍',
 'stay',
 'service',
 'food',
 'reception',
 'neat',
 'room',
 'service',
 'place',
 'everyone',
 'breakfast',
 'plus',
 'place',
 'stop',
 'night',
 'sleep',
 'room',
 'ventilation',
 'place',
 'place',
 'location',
 'town',
 'condition',
 'customer',
 'service',
 'pub',
 'place',
 'value',
 'high',
 'staff',
 'beach',
 'condition',
 'moment',
 'form',
 'maintenance',
 'place',
 'establishment',
 'part',
 'community',
 'place',
 'everyone',
 'community',
 'saturday',
 'place',
 'conjunction',
 'beach',
 'food',
 'nothing',
 'paradise',
 'place.Very',
 'landscape',
 'place',
 '👍😁',
 'place',
 'accommodation',
 'taste',
 'kitchen',
 'welcome',
 'place',
 'relaxing',
 'time',
 'time',
 'food',
 'value',
 'money',
 'location',
 'piece',
 'lot',
 'time',
 'paradise',
 'earth',
 'service',
 'food',
 'menu',
 'waitress',
 'appearance',
 'journey',
 'servic

In [None]:
# Count how many times each word occurs in the list
def counter(list):
    """Count how many times each distinct item occurs in ``list``.

    Args:
        list: iterable of hashable items (here: words). The parameter name
            shadows the builtin but is kept so existing calls keep working.

    Returns:
        dict mapping each distinct item to its number of occurrences, with
        keys in the order the items first appear. An empty input gives ``{}``.

    The input is never modified. The earlier version called
    ``list.remove(word)`` after its loop, which silently deleted one element
    from the caller's list and raised on an empty list.
    """
    # Local import keeps this cell self-contained (stdlib only).
    from collections import Counter

    # Counter tallies in one pass (the nested loops were O(n^2)) and, being a
    # dict subclass, preserves first-seen key order.
    return dict(Counter(list))
        
        

In [None]:
noun_dict = counter(noun_list)

In [None]:
noun_dict

{'experience': 147,
 'anyone': 29,
 '🙏🏿': 1,
 'stay': 187,
 'environment': 389,
 'couple': 15,
 'serenity': 11,
 'ambiance': 20,
 'place': 1393,
 'Everything': 18,
 'clean': 13,
 '👍👍👍': 1,
 'service': 380,
 'food': 473,
 'reception': 92,
 'neat': 10,
 'room': 251,
 'everyone': 40,
 'breakfast': 126,
 'plus': 3,
 'stop': 11,
 'night': 118,
 'sleep': 4,
 'ventilation': 3,
 'location': 152,
 'town': 41,
 'condition': 15,
 'customer': 121,
 'pub': 1,
 'value': 21,
 'high': 1,
 'staff': 293,
 'beach': 262,
 'moment': 16,
 'form': 1,
 'maintenance': 20,
 'establishment': 6,
 'part': 13,
 'community': 13,
 'saturday': 1,
 'conjunction': 1,
 'nothing': 28,
 'paradise': 13,
 'place.Very': 2,
 'landscape': 5,
 '👍😁': 1,
 'accommodation': 36,
 'taste': 16,
 'kitchen': 22,
 'welcome': 8,
 'relaxing': 9,
 'time': 172,
 'money': 46,
 'piece': 2,
 'lot': 42,
 'earth': 5,
 'menu': 29,
 'waitress': 6,
 'appearance': 1,
 'journey': 8,
 'heart': 12,
 'breaking': 1,
 'owner': 41,
 'wife': 10,
 'cuisine': 1

In [None]:
noun_df = pd.DataFrame(noun_dict.items(), columns=['noun', 'count'])

In [None]:
ordered_df = noun_df.sort_values('count', ascending= False)

In [None]:
ordered_df.head(20)

Unnamed: 0,noun,count
8,place,1393
13,food,473
4,environment,389
12,service,380
154,hotel,374
31,staff,293
32,beach,262
16,room,251
3,stay,187
51,time,172


Visualize word cloud of all nouns

In [None]:
# from wordcloud import WordCloud
# import matplotlib.pyplot as plt
# wordcloud = WordCloud().generate(noun_list)

# # Display the generated image:
# plt.imshow(wordcloud, interpolation='bilinear')
# plt.axis("off")
# plt.show()


Create ABSA format of data

In [None]:
import pyabsa
from pyabsa import ATEPCCheckpointManager

In [None]:
from pyabsa import available_checkpoints
checkpoint_map = available_checkpoints()

[2023-06-11 06:11:41] (2.3.1) Please specify the task code, e.g. from pyabsa import TaskCodeOption


In [None]:
# Load pyabsa's pretrained 'english' ATEPC checkpoint as an aspect extractor.
# NOTE(review): auto_device=False presumably disables automatic device
# selection — confirm against the pyabsa documentation.
aspect_extractor = ATEPCCheckpointManager.get_aspect_extractor(checkpoint='english',
                                   auto_device=False )


[2023-06-11 06:12:25] (2.3.1) [32mDownloading checkpoint:english [0m
[2023-06-11 06:12:25] (2.3.1) [31mNotice: The pretrained model are used for testing, it is recommended to train the model on your own custom datasets[0m
[2023-06-11 06:12:25] (2.3.1) Checkpoint already downloaded, skip
[2023-06-11 06:12:25] (2.3.1) Load aspect extractor from checkpoints/ATEPC_ENGLISH_CHECKPOINT/fast_lcf_atepc_English_cdw_apcacc_82.36_apcf1_81.89_atef1_75.43
[2023-06-11 06:12:25] (2.3.1) config: checkpoints/ATEPC_ENGLISH_CHECKPOINT/fast_lcf_atepc_English_cdw_apcacc_82.36_apcf1_81.89_atef1_75.43/fast_lcf_atepc.config
[2023-06-11 06:12:25] (2.3.1) state_dict: checkpoints/ATEPC_ENGLISH_CHECKPOINT/fast_lcf_atepc_English_cdw_apcacc_82.36_apcf1_81.89_atef1_75.43/fast_lcf_atepc.state_dict
[2023-06-11 06:12:25] (2.3.1) model: None
[2023-06-11 06:12:25] (2.3.1) tokenizer: checkpoints/ATEPC_ENGLISH_CHECKPOINT/fast_lcf_atepc_English_cdw_apcacc_82.36_apcf1_81.89_atef1_75.43/fast_lcf_atepc.tokenizer
[2023-06-11

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.weight', 'mask_predictions.classifier.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Special tokens have b

In [None]:
# Run aspect-term extraction plus sentiment classification on every review.
# NOTE(review): `english_reviews` must already exist in the kernel when this
# cell runs; the cell that concatenates the translated batches is commented
# out above. Per the progress log below, this cell took roughly half an hour.
inference_source = english_reviews
atepc_result = aspect_extractor.extract_aspect(inference_source=inference_source,  #
                          pred_sentiment=True,  # Predict the sentiment of extracted aspect terms
                          )

preparing ate inference dataloader: 100%|██████████| 3678/3678 [00:02<00:00, 1456.52it/s]
extracting aspect terms: 100%|██████████| 115/115 [11:23<00:00,  5.94s/it]
preparing apc inference dataloader: 100%|██████████| 6026/6026 [00:07<00:00, 827.26it/s] 
  float(x) for x in F.softmax(i_apc_logits).cpu().numpy().tolist()
classifying aspect sentiments: 100%|██████████| 189/189 [19:27<00:00,  6.17s/it]


[2023-06-11 07:20:49] (2.3.1) The results of aspect term extraction have been saved in /Users/josephineamponsah/Documents/projects/review-senti-analysis/notebooks/Aspect Term Extraction and Polarity Classification.FAST_LCF_ATEPC.result.json
[2023-06-11 07:20:49] (2.3.1) Example 0: I had a wonderful experience at Jonat Hotel . Their [32m<rooms:Positive Confidence:0.9995>[0m are new and their [32m<services:Positive Confidence:0.9995>[0m are top - notch . I definitely recommend it to anyone visiting Cape Coast 🙏🏿
[2023-06-11 07:20:49] (2.3.1) Example 1: I enjoyed my [32m<stay:Positive Confidence:0.9995>[0m here . For a serene [32m<environment:Positive Confidence:0.9995>[0m , choose Jonat Hotel .
[2023-06-11 07:20:49] (2.3.1) Example 2: The couple of [32m<times:Positive Confidence:0.9989>[0m I have been there , were very peaceful times and I really enjoyed the serenity and the calm [32m<ambiance:Positive Confidence:0.999>[0m of the place . Really nice [32m<rooms:Positive Confi

In [None]:
# Tabular view of the extraction output. `atepc_result` is a list of dicts
# (keys such as 'sentence', 'aspect', 'sentiment'; see the dump below).
result = pd.DataFrame(atepc_result)
    

In [None]:
atepc_result

[{'sentence': 'I had a wonderful experience at Jonat Hotel . Their rooms are new and their services are top - notch . I definitely recommend it to anyone visiting Cape Coast 🙏🏿',
  'IOB': ['O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'B-ASP',
   'O',
   'O',
   'O',
   'O',
   'B-ASP',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O'],
  'tokens': ['I',
   'had',
   'a',
   'wonderful',
   'experience',
   'at',
   'Jonat',
   'Hotel',
   '.',
   'Their',
   'rooms',
   'are',
   'new',
   'and',
   'their',
   'services',
   'are',
   'top',
   '-',
   'notch',
   '.',
   'I',
   'definitely',
   'recommend',
   'it',
   'to',
   'anyone',
   'visiting',
   'Cape',
   'Coast',
   '🙏🏿'],
  'aspect': ['rooms', 'services'],
  'position': [[10], [15]],
  'sentiment': ['Positive', 'Positive'],
  'probs': [[0.00013757219130638987,
    0.00039960857247933745,
    0.9994627833366394],
   [

In [None]:
# Gather every extracted aspect term from all sentences into one flat list.
aspects = [term for entry in atepc_result for term in entry['aspect']]

In [None]:
aspects

['rooms',
 'services',
 'stay',
 'environment',
 'times',
 'ambiance',
 'rooms',
 'stay',
 'service',
 'food',
 'Staff',
 'reception',
 'rooms',
 'service',
 'place',
 'prices',
 'breakfast',
 'designs',
 'events',
 'location',
 'rooms',
 'designs',
 'washrooms',
 'service',
 'staff',
 'beach',
 'maintenance',
 'place',
 'events',
 'place',
 'beach',
 'food',
 'place',
 'landscape',
 'place',
 'place',
 'accommodation',
 'kitchen',
 'Philip',
 'food',
 'rooms',
 'location',
 'wish',
 'service',
 'waiters',
 'food',
 'service',
 'service',
 'owner',
 'place',
 'cuisine',
 'place',
 'views',
 'water',
 'tap',
 'traffic',
 'environment',
 'beach',
 'stay',
 'food',
 'food',
 'beach',
 'place',
 'road',
 'service',
 'manager',
 'beaches',
 'personal',
 'ocean',
 'Philippe',
 'Chef',
 'meal',
 'atmosphere',
 'earth',
 'road',
 'reception',
 'meals',
 'Beaches food',
 'Ambiance',
 'pricing',
 'beach',
 'place',
 'place',
 'atmosphere',
 'atmosphere',
 'the internet',
 '\n',
 'the',
 'in',
 '

In [None]:
aspect_dict = counter(aspects)

In [None]:
aspect_dict

{'rooms': 255,
 'services': 81,
 'stay': 56,
 'environment': 349,
 'times': 1,
 'ambiance': 17,
 'service': 282,
 'food': 339,
 'Staff': 37,
 'reception': 54,
 'place': 1018,
 'prices': 29,
 'breakfast': 67,
 'designs': 2,
 'events': 7,
 'location': 107,
 'washrooms': 5,
 'staff': 231,
 'beach': 60,
 'maintenance': 5,
 'landscape': 5,
 'accommodation': 23,
 'kitchen': 6,
 'Philip': 1,
 'wish': 1,
 'waiters': 11,
 'owner': 22,
 'cuisine': 5,
 'views': 18,
 'water': 17,
 'tap': 2,
 'traffic': 2,
 'road': 14,
 'manager': 12,
 'beaches': 2,
 'personal': 1,
 'ocean': 6,
 'Philippe': 1,
 'Chef': 2,
 'meal': 7,
 'atmosphere': 88,
 'earth': 1,
 'meals': 30,
 'Beaches food': 1,
 'Ambiance': 3,
 'pricing': 3,
 'the internet': 1,
 '\n': 13,
 'the': 4,
 'in': 1,
 'had': 1,
 'and': 3,
 'area': 24,
 'time': 30,
 'drive': 1,
 'Food': 57,
 'Rooms': 35,
 'people': 14,
 'Atmosphere': 4,
 'sunsets': 1,
 'breeze': 11,
 'value': 6,
 'guys': 4,
 'view': 83,
 'facilities': 36,
 'workers': 20,
 'dinner': 12,


# Aspect Clustering with Word2Vec

Train Word2vec model on review data

In [None]:
import gensim
from gensim.models import Word2Vec

# model = Word2Vec(input, min_count = 1)

In [None]:
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))

In [None]:
#tokenize each review comment and remove stopwords
def remove_stopwords(tokens):
    """Return the tokens that are not stop words, in their original order.

    Membership is tested against the notebook-level ``stop`` set. NLTK's
    English stop-word list is all lowercase, so each token is lowercased for
    the comparison only. Capitalised stop words such as "The" or "I" are
    therefore removed too, while the tokens that are kept retain their
    original casing.

    Args:
        tokens: iterable of string tokens.

    Returns:
        list of the tokens whose lowercase form is not in ``stop``.
    """
    return [word for word in tokens if word.lower() not in stop]

def preprocess(sent_list):
    """Tokenize each sentence and strip its stop words.

    Args:
        sent_list: iterable of sentence strings.

    Returns:
        list with one token list per input sentence, in input order. Each
        sentence is split with ``word_tokenize`` and then filtered through
        ``remove_stopwords``.
    """
    return [remove_stopwords(word_tokenize(sentence)) for sentence in sent_list]

In [None]:
processed_reviews = preprocess(english_reviews)

In [None]:
processed_reviews

In [None]:
# Train a Word2Vec model on the tokenized reviews; min_count=1 keeps tokens
# that occur only once.
# NOTE(review): no seed is passed, so the learned vectors may differ between runs.
model = Word2Vec(processed_reviews, min_count = 1)

In [None]:
# Write the raw extraction results (a list of dicts) to disk as JSON.
import json
with open("../data/atepc-result.json", 'w') as file:
    json.dump(atepc_result, file)

Find the cosine similarity between each aspect and each selected label

In [None]:
from gensim import matutils

def similarity_cosine(vec1, vec2):
    """Cosine similarity of two vectors.

    Each input is scaled to unit length with gensim's ``matutils.unitvec``;
    the dot product of the two scaled vectors is returned.
    """
    unit_first = matutils.unitvec(vec1)
    unit_second = matutils.unitvec(vec2)
    return np.dot(unit_first, unit_second)


In [None]:
labels = ['food', 'service', 'ambiance', 'place']

def label_similarity(labels, aspects, model):
    """Score every aspect against every label by cosine similarity.

    Args:
        labels: label words. Each must be in the model's vocabulary; a missing
            label raises KeyError.
        aspects: aspect terms. A term found in the vocabulary uses its own
            vector. Otherwise (for example a multi-word term, when the model
            was trained on single tokens) the vectors of its
            whitespace-separated words that are in the vocabulary are averaged.
        model: object exposing word vectors as ``model.wv``, supporting
            ``key in model.wv`` and ``model.wv[key]``.

    Returns:
        dict mapping aspect -> {label: cosine similarity}. Aspects for which
        no vector can be built (no known word at all) are left out instead of
        raising KeyError.
    """
    vectors = model.wv
    # Label vectors do not depend on the aspect, so look them up only once.
    label_vecs = {label: vectors[label] for label in labels}

    distance_dict = {}
    for aspect in aspects:
        if aspect in vectors:
            aspect_vec = vectors[aspect]
        else:
            # Out-of-vocabulary term: fall back to the mean of its known words.
            known = [vectors[token] for token in aspect.split() if token in vectors]
            if not known:
                # Nothing to compare (e.g. '\n' or an unseen word); skip it.
                continue
            aspect_vec = np.mean(known, axis=0)
        distance_dict[aspect] = {
            label: similarity_cosine(label_vec, aspect_vec)
            for label, label_vec in label_vecs.items()
        }
    return distance_dict
        

In [None]:
unique_aspects = list(set(aspects))

In [None]:
len(unique_aspects)

1098

In [None]:
unique_aspects

['hour',
 'architecture',
 'rooms',
 'territory',
 'Mama',
 'size',
 'Accomodations',
 'outdoor area',
 'around',
 'Relaxing',
 'Price',
 'Joseph',
 'Looks',
 'location',
 'conferencing facilities',
 'sun',
 'residing',
 'Parking',
 'housing',
 'mosquitoes food',
 'saty',
 'drumming',
 'siren',
 'price',
 'map',
 'evironment',
 'dinner',
 'situated',
 'owner kobi',
 'sheets',
 'My',
 'soup',
 'looks',
 'milk',
 'pricing',
 'breakfast options',
 'rooftop',
 'Service personnel',
 'masterpiece \n',
 'Breakfast buffet',
 'VIEW',
 'Ambience',
 'views',
 'buffet service',
 'staff reception',
 'water heater',
 'time',
 'fumigation',
 'Atmosphere',
 'She',
 'Room',
 'water sports',
 'Cost',
 'dishes',
 'Pillows',
 'castle',
 'Accommodation',
 'stuff',
 'toilet',
 'Guest House',
 'pita bread',
 'staff Owner',
 'outdoor shower',
 'Staff',
 'priced food',
 'ostrich',
 'pay',
 'shower',
 'access to',
 'room rates',
 'generally',
 'cleaning',
 'ocean views',
 'Ever',
 'They',
 'safety',
 'comfortab

In [None]:
distance_dict = label_similarity(labels, unique_aspects, model)

Group each review aspect under the frequently occurring noun (label) it is closest to according to Word2Vec

Combine the grouped aspects with the original dataframe as new columns

Push to Database

In [None]:
# import re
# word_tokens =[]
# for i in words:
#     word_tokens.append(word_tokenize(i))

Aspect-Based Sentiment Analyzer