# Import data and library

In [8]:
import pandas as pd
import numpy as np
import operator
import matplotlib.pyplot as plt

import nltk
from nltk import sent_tokenize
from nltk import word_tokenize

from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer()
from textblob import TextBlob, Word

import spacy
import ast

#nltk.download('omw-1.4')
#nltk.download('wordnet')
#nltk.download('stopwords')
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')

In [9]:
df=pd.read_excel('Australia_Cleaned_API.xlsx')

In [10]:
df.isnull().sum()

Unnamed: 0              0
Restaurant Name         0
Location Restaurant     0
Number Of Reviews       0
Rate                    0
User Name              13
User Location API       0
Date Review             0
Date Visited            0
Score                   0
Review Text             0
Cuisine                 0
dtype: int64

In [11]:
df.shape

(37410, 12)

# Punctuation Removal 

In [13]:
#library that contains punctuation
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [14]:
def remove_punctuation(text):
    punctuationfree="".join([i for i in text if i not in string.punctuation])
    return punctuationfree

In [15]:
df['clean_review']= df['Review Text'].apply(lambda x:remove_punctuation(x))


# Lowering the text

In [17]:
df['review_lower']= df['clean_review'].apply(lambda x: x.lower())

# Tokenization

In [19]:
#applying function to the column
df['review_tokenied']= df['review_lower'].apply(lambda x: nltk.word_tokenize(x))

# Stop word removal

In [21]:
#Stop words present in the library
stopwords = nltk.corpus.stopwords.words('english')
stopwords[0:10]
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

#defining the function to remove stopwords from tokenized text
def remove_stopwords(text):
    output= [i for i in text if i not in stopwords]
    return output

In [22]:
#applying the function
df['review_stopwords']= df['review_tokenied'].apply(lambda x:remove_stopwords(x))

# Stemming

In [24]:
#defining the object for stemming
porter_stemmer = PorterStemmer()

#defining a function for stemming
def stemming(text):
    stem_text = [porter_stemmer.stem(word) for word in text]
    return stem_text

In [25]:
df['review_stemmed']=df['review_stopwords'].apply(lambda x: stemming(x))

# Lemmatization

In [27]:
#defining the object for Lemmatization
wordnet_lemmatizer = WordNetLemmatizer()

#defining the function for lemmatization
def lemmatizer(text):
    lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
    return lemm_text

In [28]:
df['review_lemmatized']=df['review_stopwords'].apply(lambda x:lemmatizer(x))

# Part of Speech Tagging (POST)

## Spacy 

In [33]:
# Khởi tạo mô hình tiếng Anh của Spacy
nlp = spacy.load("en_core_web_sm")

In [34]:
# Một danh sách các từ đã được xử lý bằng phương pháp Lemmatization
lemmatized_list = df['review_lemmatized']

# Chuyển danh sách thành một chuỗi văn bản
#doc = nlp(" ".join(lemmatized_list))
#my_string = ''.join(str(x) for x in my_list if isinstance(x, str))
#my_string = ''.join(str(x) for x in lemmatized_list if isinstance(x, str))
for i in range(len(lemmatized_list)):
    if isinstance(lemmatized_list[i], list):
        lemmatized_list[i] = ' '.join(lemmatized_list[i])

my_string = ' '.join(lemmatized_list)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lemmatized_list[i] = ' '.join(lemmatized_list[i])


In [36]:
# Tạo một danh sách chứa kết quả phân loại từng dòng
nn_list = []

# Duyệt qua từng dòng trong dataframe
for index, row in df.iterrows():
    # Chuyển đổi nội dung dòng thành đối tượng Doc của Spacy
    doc = nlp(row['review_lemmatized'])
    # Tạo một danh sách chứa các từ danh từ
    word_nn = [token.text for token in doc if token.pos_ in ['NOUN', 'PROPN']]
    # Lưu danh sách danh từ của dòng này vào danh sách chung
    nn_list.append(word_nn)

# Tạo một cột mới trong dataframe chứa danh sách các từ danh từ
df['NN'] = nn_list

# Create Dish from NN 

In [38]:
# Tính số lượng bài đánh giá chứa mỗi danh từ
nouns_count = {}
for nn in nn_list:
    for noun in nn:
        if noun in nouns_count:
            nouns_count[noun] += 1
        else:
            nouns_count[noun] = 1

# Tính giá trị hỗ trợ của mỗi danh từ
total_reviews = len(df)
threshold = 0.03
frequent_nouns = []
for noun, count in nouns_count.items():
    support = count / total_reviews
    if support >= threshold:
        frequent_nouns.append(noun)


In [39]:
# nouns_count

In [40]:
len(nouns_count)

42948

In [41]:
len(frequent_nouns)

132

In [42]:
# frequent_nouns

In [43]:
list_dish=['chicken', 'pork', 'wine', 'prawn', 'beef', 'sauce', 'dessert', 'beer', 'meat', 'egg', 
           'fish', 'coffee', 'bread', 'seafood', 'lamb', 'steak', 'pizza', 'curry', 'duck', 'cocktail', 
           'salad', 'tea', 'cheese', 'pasta', 'salmon', 'burger', 'rice', 'oyster', 'chip', 'sushi']

In [44]:
def find_keywords(text):
    keywords = []
    words = text
    for w in words:
        if w in list_dish:
            keywords.append(w)
    return ', '.join(keywords)

df['Key_Dish'] = df['NN'].apply(find_keywords)

In [45]:
def remove_duplicates(s):
    words = s.split(', ')
    unique_words = set(words)
    result = ' '.join(unique_words)
    return result

# Áp dụng hàm cho cột 'Cuisine'
df['Key_Dish'] = df['Key_Dish'].apply(remove_duplicates)

In [46]:
df['Key_Dish']= df['Key_Dish'].apply(lambda x: nltk.word_tokenize(x))

In [47]:
len(df['Restaurant Name'].unique())

1678

In [48]:
# df = df.drop({'Unnamed: 0','clean_review','review_lower','review_tokenied','review_stopwords','review_stemmed','review_lemmatized'}, axis=1)

In [49]:
df['Key_Dish'][1]

['pizza']

# Create Meal

#### <font color='blue violet'>Breakfast</font>

In [51]:
breakfast=['breakfast']
lunch=['lunch']
dinner=[ 'dinner', 'night', 'dinning']

In [52]:
def find_keywords(text):
    keywords = []
    words = text
    for w in words:
        if w in breakfast:
            keywords.append('Breakfast')
        if w in lunch:
            keywords.append('Lunch')
        if w in dinner:
            keywords.append('Dinner')
    return ', '.join(keywords)

df['Meal'] = df['NN'].apply(find_keywords)

In [53]:
df['Meal'] = df['Meal'].apply(remove_duplicates)

In [54]:
df['Meal']= df['Meal'].apply(lambda x: nltk.word_tokenize(x))

In [55]:
# df['Meal'] = df['Meal'].apply(lambda x: '|'.join(x))
# df = df.join(df['Meal'].str.get_dummies())

# Create Feature

In [56]:
service=['service', 'staff', 'waiter', 'waitress', 'chef', 'view']
atmosphere=['atmosphere', 'ambience']
price=['price', 'value']
location=['place', 'location', 'spot']

In [57]:
def find_keywords(text):
    keywords = []
    words = text
    for w in words:
        if w in service:
            keywords.append('Service')
        if w in atmosphere:
            keywords.append('Atmosphere')
        if w in price:
            keywords.append('Price')
        if w in location:
            keywords.append('Location')
    return ', '.join(keywords)

df['Feature'] = df['NN'].apply(find_keywords)

In [58]:
df['Feature'] = df['Feature'].apply(remove_duplicates)

In [59]:
df['Feature']= df['Feature'].apply(lambda x: nltk.word_tokenize(x))

In [60]:
# df['Feature'] = df['Feature'].apply(lambda x: '|'.join(x))
# df = df.join(df['Feature'].str.get_dummies())

In [75]:
df.rename(columns = {'User Location API ':'User Location', 'Key_Dish':'Dish'}, inplace = True)

In [76]:
df

Unnamed: 0,Restaurant Name,Location Restaurant,Number Of Reviews,Rate,User Name,User Location,Date Review,Date Visited,Score,Review Text,Cuisine,NN,Dish,Meal,Feature,Region
0,Bella Tiarnie Italian Restaurant & Gourmet Pizzas,Brisbane,425,36,Mike_Skidmore,United Kingdom,2022-10-07,2022-10,30,Service a little lack-lustre. One very new mem...,"Italian, Pizza","[service, member, staff, training, problem, is...","[pasta, seafood]",[],[Service],Europe
1,Bella Tiarnie Italian Restaurant & Gourmet Pizzas,Brisbane,425,36,AllanJGJ,New Zealand,2022-07-28,2022-07,20,We were only after a pizza and some chianti bu...,"Italian, Pizza","[pizza, chianti, hour, pizza, arrive, place, h...",[pizza],[Dinner],"[Location, Service]",Oceania
2,La Luna Bistro,Melbourne,380,39,Jane C,United Kingdom,2023-01-19,2022-11,50,We came here after happening upon their sister...,"Australian, European, Steakhouse","[sister, bar, le, bouvier, fantastic, service,...",[],[],"[Atmosphere, Service]",Europe
3,La Luna Bistro,Melbourne,380,39,sammytaylor458,United Kingdom,2021-12-23,2021-12,40,"Delicious food. If you are a meat eater, then ...","Australian, European, Steakhouse","[food, meat, eater, place, suit, ground, corka...",[meat],[],[Location],Europe
4,Red Lantern Restaurant & Private Dining Room,Sydney,615,33,faridap2013,United Kingdom,2020-04-07,2020-02,50,Meeting up with our son and partner after over...,"Asian, Vietnamese, Contemporary","[son, partner, year, celebrate, disappoint, me...",[],[Dinner],[Service],Europe
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37405,Black Bar & Grill,Sydney,769,23,rmgmainridge,Thailand,2013-03-14,2013-01,50,Have been to Ezards restaurants in Melbourne m...,"Australian, Steakhouse, Grill, Contemporary","[restaurant, melbourne, time, format, trip, sy...",[],[],[Service],Asia
37406,Black Bar & Grill,Sydney,769,23,Adam S,New Zealand,2012-11-18,2012-11,50,"If you are after something special, and you ha...","Australian, Steakhouse, Grill, Contemporary","[fund, dining, ezard, consideringthe, service,...","[cheese, tea, steak, dessert]",[],[Service],Oceania
37407,Black Bar & Grill,Sydney,769,23,BryonyFrancesca,New Zealand,2012-09-08,2012-08,50,I had the pleasure of dining at this particula...,"Australian, Steakhouse, Grill, Contemporary","[pleasure, place, work, function, table, peopl...","[meat, egg, prawn, steak]",[Dinner],"[Location, Service]",Oceania
37408,Black Bar & Grill,Sydney,769,23,happy12054,United States,2012-07-08,2012-07,50,We happened on this restaurant after seeing a ...,"Australian, Steakhouse, Grill, Contemporary","[restaurant, theater, reservation, staff, othe...","[bread, steak]",[Dinner],[Service],North America


In [77]:
# df.to_excel('Australia_finallllll.xlsx')