# Load the dataset

In [None]:
# Mount Google Drive inside the Colab filesystem; the dataset archive is read from
# a path under /content/drive further down.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import zipfile
import pandas as pd
import xml.etree.ElementTree as ET
!pip install tensorflow_text

dataset_path = "/content/drive/MyDrive/HLT/RestaurantTR.zip"
# Extract to an absolute directory: the XML file is later opened from
# /content/dataset/..., so the target must not depend on the current working directory.
with zipfile.ZipFile(dataset_path, "r") as zip_ref:
    zip_ref.extractall("/content/dataset")



In [None]:
import numpy as np
# Parse the review XML into one DataFrame row per sentence.
# Each row carries the attributes of the sentence's FIRST opinion (target, category,
# polarity, from, to) plus the review id, the sentence text and an OOTS flag.
# Sentences without any opinion are kept with NaN opinion fields and OOTS=True.
xml_path = "/content/dataset/ABSA15_RestaurantsTrain/ABSA-15_Restaurants_Train_Final.xml"
mytree = ET.parse(xml_path)
reviews = mytree.getroot()
rows_list = []
for rew in reviews:
  rid = rew.attrib['rid']
  for sent in rew[0]:
    txt = sent[0].text
    # The second child (when present) holds the opinions; it may also be empty.
    opinions = sent[1] if len(sent) >= 2 else []
    if len(opinions) > 0:
      # Copy the attributes so the parsed XML tree is not modified in place.
      row_dict = dict(opinions[0].attrib)
      row_dict['rid'] = rid
      row_dict['txt'] = txt
      row_dict['OOTS'] = False
      rows_list.append(row_dict)
    else:
      # The review id is known even when the sentence has no opinion, so keep it.
      rows_list.append({'target':np.nan,'category':np.nan,'polarity':np.nan,'from':np.nan,'to':np.nan,'rid':rid,'txt':txt,'OOTS':True})
df = pd.DataFrame(rows_list)
#target	category	polarity	from	to	rid	txt	entity	aspect

## Data cleaning

In [None]:
# Frequency of every distinct opinion target string (value_counts skips NaN by default).
df['target'].value_counts()

NULL                303
food                 94
place                65
service              44
restaurant           23
                   ... 
characters            1
Prune                 1
grilled branzino      1
chicken vindaloo      1
BBQ Salmon            1
Name: target, Length: 376, dtype: int64

In [None]:
# Split "ENTITY#ASPECT" categories into two columns at the first '#'.
# `n` is passed by keyword: pandas 2.x no longer accepts it positionally.
df[['entity', 'aspect']] = df['category'].str.split('#', n=1, expand=True)
# The combined column is redundant once it has been split.
df = df.drop(columns=['category'])

In [None]:
# Encode polarity as integers: "negative" -> 0, "positive" -> 2, and every other
# value (including a missing polarity) -> 1.
polarity_codes = {"negative": 0, "positive": 2}
df['polarity'] = df['polarity'].map(polarity_codes).fillna(1).astype("int64")

In [None]:
# Frequency of each value in the 'to' column
# (presumably the end character offset of the target in the sentence -- confirm against the dataset schema).
df['to'].value_counts()

0      303
8       70
9       38
10      38
11      36
      ... 
67       1
63       1
86       1
138      1
118      1
Name: to, Length: 106, dtype: int64

In [None]:
# Optional numeric cast of the offset columns, left disabled: 'from' and 'to' are
# read from XML attributes and therefore stay as strings (object dtype).
# df['from'] = pd.to_numeric(df['from'])
# df['to'] = pd.to_numeric(df['to'])
# Summarise column dtypes and non-null counts after cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1315 entries, 0 to 1314
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   target    1120 non-null   object
 1   polarity  1315 non-null   int64 
 2   from      1120 non-null   object
 3   to        1120 non-null   object
 4   rid       1120 non-null   object
 5   txt       1315 non-null   object
 6   OOTS      1315 non-null   bool  
 7   entity    1120 non-null   object
 8   aspect    1120 non-null   object
dtypes: bool(1), int64(1), object(7)
memory usage: 83.6+ KB


# Sentiment Analysis with BERT

## Dataset Preparation

In [None]:
# Keep only the model input (sentence text) and the label (polarity).
# .copy() makes this an independent frame, so later column assignments do not
# trigger pandas' SettingWithCopyWarning.
df = df[["txt", "polarity"]].copy()

In [None]:
# Label encoding: {0:"neg",1:"neutral",2:"pos"}
# Left disabled: this would shift every label up by one.
# df['polarity'] = df['polarity']+1
# Class balance of the labels.
df.polarity.value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


2    801
0    276
1    238
Name: polarity, dtype: int64

In [None]:
from sklearn.model_selection import train_test_split
# `keras.utils.np_utils` no longer exists in current Keras releases;
# `to_categorical` is exposed directly from `keras.utils`.
from keras.utils import to_categorical
import tensorflow_text as text  # Registers the ops.

# Stratified split so both partitions keep the class proportions; the fixed
# random_state makes the split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    df['txt'], df['polarity'], stratify=df['polarity'], random_state=42)
# One-hot encode the polarity labels; num_classes is explicit so both arrays
# always have three columns (labels are 0, 1 and 2).
y_train = to_categorical(y_train, num_classes=3)
y_test = to_categorical(y_test, num_classes=3)
y_train.shape

## Model Training

In [None]:
import tensorflow_hub as hub
# TF Hub handles of the text preprocessing model and of the BERT encoder.
BERT_PREPROCESS_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
BERT_ENCODER_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"

bert_preprocess = hub.KerasLayer(BERT_PREPROCESS_URL)  # raw strings -> BERT input tensors
bert_encoder = hub.KerasLayer(BERT_ENCODER_URL)        # BERT input tensors -> embeddings

FileNotFoundError: ignored

In [None]:
# Usage example: preprocessing followed by the BERT encoder.

def get_sentence_embedding(sentences):
  """Return BERT's pooled output for a batch of raw sentences.

  Args:
    sentences: batch of strings, handed unchanged to `bert_preprocess`.

  Returns:
    The "pooled_output" entry of the encoder output (one vector per sentence;
    presumably 768-dimensional given the H-768 encoder -- TODO confirm).
  """
  encoder_inputs = bert_preprocess(sentences)
  encoder_outputs = bert_encoder(encoder_inputs)
  return encoder_outputs["pooled_output"]

get_sentence_embedding(["hello my name is"])

In [None]:
#Creation of the model
from tensorflow.keras import layers
import tensorflow as tf
# Pipeline: raw text -> BERT preprocessing -> frozen BERT encoder -> small dense head.
bert_encoder.trainable = False  # keep the pre-trained BERT weights fixed
input_txt = layers.Input(name="text", shape=(), dtype=tf.string)
preprocess_text = bert_preprocess(input_txt)
encoded_text = bert_encoder(preprocess_text)

# Classification head, applied in order to BERT's pooled output.
head_layers = [
    layers.Dropout(0.1, name="dropout"),
    layers.Dense(64, activation="relu"),
    layers.Dense(3, activation='softmax', name="output"),
]
l = encoded_text["pooled_output"]
for head_layer in head_layers:
  l = head_layer(l)

# Assemble the model and print its architecture.
model = tf.keras.Model(inputs=[input_txt], outputs=[l])
model.summary()

Model: "model_6"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_type_ids':   0           ['text[0][0]']                   
                                (None, 128),                                                      
                                 'input_word_ids':                                                
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128)}                                                    

In [None]:
# Training hyper-parameters.
batch_size, epochs = 8, 10

# Stop training once the validation loss has not improved for 6 consecutive epochs.
callback = tf.keras.callbacks.EarlyStopping(patience=6, monitor='val_loss')

# Metrics reported during training and validation.
METRICS = [
    metric_cls()
    for metric_cls in (
        tf.keras.metrics.CategoricalAccuracy,
        tf.keras.metrics.Precision,
        tf.keras.metrics.Recall,
    )
]
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=METRICS)

In [None]:
# Train the classifier. validation_data is passed as the (inputs, targets) tuple
# documented by Keras rather than as a list.
# NOTE(review): the held-out test split doubles as the validation set here, so it
# also drives early stopping.
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(x_test, y_test), callbacks=[callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f0bd5ce5790>

## Examples

In [None]:
#Example
# Class names indexed by label id (0, 1, 2).
polarities = ["negative", "neutral", "positive"]
exml_sentence = "The service is bad."
# predict() takes a batch, hence the one-element list.
out = model.predict([exml_sentence])
# Name of the highest-scoring class.
polarities[out.argmax()]

'negative'

In [None]:
# Short example sentence; show the predicted class name.
exml_sentence = "This food is very good."
out = model.predict([exml_sentence])
polarities[out.argmax()]

'positive'

In [None]:
# Longer sentence mentioning several dishes; show the predicted class name.
exml_sentence = "The pastas are incredible, the risottos (particularly the sepia) are fantastic and the braised rabbit is amazing."
out = model.predict([exml_sentence])
polarities[out.argmax()]

'positive'

In [None]:
# Sentence about the delivery service; show the predicted class name.
exml_sentence = "Delivery guy sometimes get upset if you don't tip more than 10%."
out = model.predict([exml_sentence])
polarities[out.argmax()]

'negative'

In [None]:
# Sentence with two opposite opinions: display the raw class scores
# instead of only the top label.
exml_sentence = "the pasta was good but the stake was bad."
sentence_batch = [exml_sentence]
out = model.predict(sentence_batch)
out

AttributeError: ignored

# Sentiment Analysis with Sentnet

This is a bert-base-multilingual-uncased model fine-tuned for sentiment analysis on product reviews in six languages: English, Dutch, German, French, Spanish and Italian. It predicts the sentiment of a review as a number of stars (between 1 and 5).

This model is intended for direct use as a sentiment analysis model for product reviews in any of the six languages above, or for further finetuning on related sentiment analysis tasks.

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Hugging Face checkpoint shared by the tokenizer and the classifier.
SENTNET_CHECKPOINT = "nlptown/bert-base-multilingual-uncased-sentiment"

tokenizer = AutoTokenizer.from_pretrained(SENTNET_CHECKPOINT)

# NOTE(review): this rebinds `model`, which until here referred to the Keras
# classifier; cells that call model.predict(...) must run before this one.
model = AutoModelForSequenceClassification.from_pretrained(SENTNET_CHECKPOINT)

Downloading:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/953 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/851k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/638M [00:00<?, ?B/s]

In [None]:
txt = "the pasta was good but the stake was bad."
# bert_preprocess works on a batch of strings, so the sentence is wrapped in a
# one-element list (a bare scalar string is rejected with a ValueError).
out = bert_preprocess([txt])

ValueError: ignored

In [None]:
# Tokenize the example sentence with the Hugging Face tokenizer (PyTorch tensors)
# and pass the resulting fields to the classifier as keyword arguments; handing
# the encoding over positionally makes the model treat it as `input_ids`.
tkn_sentence = tokenizer(txt, return_tensors="pt")
out = model(**tkn_sentence)

AttributeError: ignored