# Sentiment Analysis with Aspect and Text

a. Text [SEP] Aspect 

b. Text + Aspect

## importing libraries

In [None]:
import json
from tqdm import tqdm
import pandas as pd
import numpy as np
import token

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.tokenize import word_tokenize  

from collections import Counter
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import  confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

In [None]:
# Mount Google Drive at /content/drive; the dataset paths used below
# live under that mount point.
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

Mounted at /content/drive


## Processing with training data 

In [None]:
# Load the SentiHood training split. JSON is UTF-8 by specification, so the
# encoding is stated explicitly instead of relying on the platform default.
with open('/content/drive/My Drive/SentiHood/SentiHood Dataset/sentihood-train.json', 'r', encoding='utf-8') as fp:
    training_set = json.load(fp)

In [None]:
locations = ['LOCATION1']
aspects = ['dining', 'general', 'green-nature', 'live', 'multicultural', 'nightlife', 'price', 'quiet', 'safety', 'shopping', 'touristy', 'transit-location']

# Aspect used for a 'None' row when an example carries no opinions at all.
# NOTE(review): 'general' is an assumed placeholder -- confirm the intended aspect.
DEFAULT_ASPECT = 'general'

# Build one (id, text, aspect, sentiment) row per training example whose text
# mentions the location. NOTE: `df` is rebuilt on every pass, so with more
# than one entry in `locations` only the last location's frame survives.
for location in locations:
    rows = []

    for each_example in training_set:
        example_id = str(int(each_example['id']))
        text = each_example['text'].strip()

        # Only examples whose text mentions `location` produce a datapoint.
        if location not in text:
            continue

        opinions = each_example['opinions']

        # Keep only the first opinion that targets `location`.
        matched = next(
            (opinion for opinion in opinions if opinion['target_entity'] == location),
            None,
        )

        if matched is not None:
            rows.append([example_id, text, matched['aspect'], matched['sentiment']])
        else:
            # No sentiment for `location` in this example: emit a 'None' row.
            # The aspect comes from this example's last opinion (as before);
            # when the example has no opinions, use DEFAULT_ASPECT rather than
            # a stale value left over from a previous example.
            fallback_aspect = opinions[-1]['aspect'] if opinions else DEFAULT_ASPECT
            rows.append([example_id, text, fallback_aspect, 'None'])

    # Build the frame once instead of growing it row by row with `.loc`.
    df = pd.DataFrame(rows, columns=['id', 'text', 'aspect', 'sentiment'])
        

## Processing with validation data

In [None]:
# Load the SentiHood validation (dev) split. JSON is UTF-8 by specification,
# so the encoding is stated explicitly instead of relying on the platform default.
with open('/content/drive/My Drive/SentiHood/SentiHood Dataset/sentihood-dev.json', 'r', encoding='utf-8') as fp:
    validation_set = json.load(fp)

In [None]:
locations = ['LOCATION1']
aspects = ['dining', 'general', 'green-nature', 'live', 'multicultural', 'nightlife', 'price', 'quiet', 'safety', 'shopping', 'touristy', 'transit-location']

# Aspect used for a 'None' row when an example carries no opinions at all.
# NOTE(review): 'general' is an assumed placeholder -- confirm the intended aspect.
DEFAULT_ASPECT = 'general'

# Build one (id, text, aspect, sentiment) row per validation example whose
# text mentions the location. NOTE: `df_valid` is rebuilt on every pass, so
# with more than one entry in `locations` only the last frame survives.
for location in locations:
    rows = []

    for each_example in validation_set:
        example_id = str(int(each_example['id']))
        text = each_example['text'].strip()

        # Only examples whose text mentions `location` produce a datapoint.
        if location not in text:
            continue

        opinions = each_example['opinions']

        # Keep only the first opinion that targets `location`.
        matched = next(
            (opinion for opinion in opinions if opinion['target_entity'] == location),
            None,
        )

        if matched is not None:
            rows.append([example_id, text, matched['aspect'], matched['sentiment']])
        else:
            # No sentiment for `location` in this example: emit a 'None' row.
            # The aspect comes from this example's last opinion (as before);
            # when the example has no opinions, use DEFAULT_ASPECT rather than
            # a stale value left over from a previous example.
            fallback_aspect = opinions[-1]['aspect'] if opinions else DEFAULT_ASPECT
            rows.append([example_id, text, fallback_aspect, 'None'])

    # Build the frame once instead of growing it row by row with `.loc`.
    df_valid = pd.DataFrame(rows, columns=['id', 'text', 'aspect', 'sentiment'])
        

        # df.to_csv('/content/drive/My Drive/SentiHood/Bert-single/TrainingData/' + str(location) + str(aspect) + '.csv', index=False)
        # print(f"{location}{aspect} DONE!\tLength = {ii}")

## Processing with Testing data

In [None]:
# Load the SentiHood test split. JSON is UTF-8 by specification, so the
# encoding is stated explicitly instead of relying on the platform default.
with open('/content/drive/My Drive/SentiHood/SentiHood Dataset/sentihood-test.json', 'r', encoding='utf-8') as fp:
    testing_set = json.load(fp)

In [None]:
locations = ['LOCATION1']
aspects = ['dining', 'general', 'green-nature', 'live', 'multicultural', 'nightlife', 'price', 'quiet', 'safety', 'shopping', 'touristy', 'transit-location']

# Aspect used for a 'None' row when an example carries no opinions at all.
# NOTE(review): 'general' is an assumed placeholder -- confirm the intended aspect.
DEFAULT_ASPECT = 'general'

# Build one (id, text, aspect, sentiment) row per test example whose text
# mentions the location. NOTE: `df_test` is rebuilt on every pass, so with
# more than one entry in `locations` only the last frame survives.
for location in locations:
    rows = []

    for each_example in testing_set:
        example_id = str(int(each_example['id']))
        text = each_example['text'].strip()

        # Only examples whose text mentions `location` produce a datapoint.
        if location not in text:
            continue

        opinions = each_example['opinions']

        # Keep only the first opinion that targets `location`.
        matched = next(
            (opinion for opinion in opinions if opinion['target_entity'] == location),
            None,
        )

        if matched is not None:
            rows.append([example_id, text, matched['aspect'], matched['sentiment']])
        else:
            # No sentiment for `location` in this example: emit a 'None' row.
            # The aspect comes from this example's last opinion (as before);
            # when the example has no opinions, use DEFAULT_ASPECT rather than
            # a stale value left over from a previous example.
            fallback_aspect = opinions[-1]['aspect'] if opinions else DEFAULT_ASPECT
            rows.append([example_id, text, fallback_aspect, 'None'])

    # Build the frame once instead of growing it row by row with `.loc`.
    df_test = pd.DataFrame(rows, columns=['id', 'text', 'aspect', 'sentiment'])

## Mapping labels to aspects and sentiments

In [None]:
# Encode the aspect names of the training split as integer ids.
# The encoder is fitted on the fixed `aspects` list rather than on the aspects
# that happen to occur in this split, so the same aspect always receives the
# same id in every split.
label_encoder = LabelEncoder()
label_encoder.fit(aspects)
integer_category = label_encoder.transform(df.aspect)
df['label_aspect'] = integer_category

# Replace the sentiment strings with numeric class ids (None=0, Negative=1,
# Positive=2). Values outside this mapping become NaN.
df['sentiment'] = df['sentiment'].map({'None': 0, 'Negative': 1, 'Positive': 2})

In [None]:
# Encode the aspect names of the validation split as integer ids.
# The encoder is fitted on the fixed `aspects` list rather than on the aspects
# that happen to occur in this split, so the same aspect always receives the
# same id in every split.
label_encoder = LabelEncoder()
label_encoder.fit(aspects)
integer_category = label_encoder.transform(df_valid.aspect)
df_valid['label_aspect'] = integer_category

# Replace the sentiment strings with numeric class ids (None=0, Negative=1,
# Positive=2). Values outside this mapping become NaN.
df_valid['sentiment'] = df_valid['sentiment'].map({'None': 0, 'Negative': 1, 'Positive': 2})

In [None]:
# Encode the aspect names of the test split as integer ids.
# The encoder is fitted on the fixed `aspects` list rather than on the aspects
# that happen to occur in this split, so the same aspect always receives the
# same id in every split.
label_encoder = LabelEncoder()
label_encoder.fit(aspects)
integer_category = label_encoder.transform(df_test.aspect)
df_test['label_aspect'] = integer_category

# Replace the sentiment strings with numeric class ids (None=0, Negative=1,
# Positive=2). Values outside this mapping become NaN.
df_test['sentiment'] = df_test['sentiment'].map({'None': 0, 'Negative': 1, 'Positive': 2})

In [None]:
# Inspect the first rows of the processed test split.
df_test.head()

Unnamed: 0,id,text,aspect,sentiment,label_aspect
0,153,LOCATION1 is in Greater London and is a very ...,safety,2,8
1,1130,All the neighborhoods around LOCATION1 are ver...,general,2,1
2,1271,"Cheap is LOCATION2, LOCATION1, but not really ...",general,1,1
3,1089,Dont Try LOCATION1,general,1,1
4,731,Find the website for 'Museum in LOCATION1' (pa...,general,0,1


In [None]:
# Lift pandas' display limits so full rows, columns and cell contents are shown.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
# `None` means "no truncation"; the older `-1` spelling is deprecated and
# emits a FutureWarning (newer pandas releases reject it).
pd.set_option('display.max_colwidth', None)

  after removing the cwd from sys.path.


In [None]:
# Model input for variant (a): the sentence, a literal " [SEP] " marker, then the aspect.
df['combined'] = df['text'].add(" [SEP] ").add(df['aspect'])

In [None]:
# Model input for variant (a): the sentence, a literal " [SEP] " marker, then the aspect.
df_valid['combined'] = df_valid['text'].add(" [SEP] ").add(df_valid['aspect'])

In [None]:
# Model input for variant (a): the sentence, a literal " [SEP] " marker, then the aspect.
df_test['combined'] = df_test['text'].add(" [SEP] ").add(df_test['aspect'])

In [None]:
# Preview the first five combined "text [SEP] aspect" training inputs.
print(df['combined'].head(5))

0    LOCATION1 is transforming and the prices will go up and up [SEP] price                                                            
1    Along LOCATION1 there are lots of Electronics shops (independent ones) [SEP] shopping                                             
2    And LOCATION1 is ten mins direct on the tube to LOCATION2: [SEP] transit-location                                                 
3    Another option is LOCATION1 which is very central and has tons of clubs/bars within walking distance of each other [SEP] nightlife
4    Best bet is around LOCATION2 and LOCATION1 area in the northwest corner really nice [SEP] general                                 
Name: combined, dtype: object


## Model Building

### i. Text [SEP] Aspect

In [None]:
 # Install the Hugging Face `transformers` package (IPython shell escape; works only inside a notebook)
 !pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/fd/1a/41c644c963249fd7f3836d926afa1e3f1cc234a1c40d80c5f03ad8f6f1b2/transformers-4.8.2-py3-none-any.whl (2.5MB)
[K     |▏                               | 10kB 20.1MB/s eta 0:00:01[K     |▎                               | 20kB 25.3MB/s eta 0:00:01[K     |▍                               | 30kB 27.9MB/s eta 0:00:01[K     |▌                               | 40kB 26.6MB/s eta 0:00:01[K     |▋                               | 51kB 26.9MB/s eta 0:00:01[K     |▉                               | 61kB 28.1MB/s eta 0:00:01[K     |█                               | 71kB 28.3MB/s eta 0:00:01[K     |█                               | 81kB 27.6MB/s eta 0:00:01[K     |█▏                              | 92kB 27.5MB/s eta 0:00:01[K     |█▎                              | 102kB 27.7MB/s eta 0:00:01[K     |█▍                              | 112kB 27.7MB/s eta 0:00:01[K     |█▋                              | 

In [None]:
## Load the fast tokenizer that belongs to the pretrained 'distilbert-base-uncased' checkpoint
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




In [None]:
# Tokenize the "text [SEP] aspect" strings of every split with identical
# settings: pad or truncate each sequence to exactly 128 tokens.
tokenizer_kwargs = dict(padding="max_length", max_length=128, truncation=True)

train_encoding = tokenizer(df['combined'].tolist(), **tokenizer_kwargs)
valid_encoding = tokenizer(df_valid['combined'].tolist(), **tokenizer_kwargs)
test_encoding = tokenizer(df_test['combined'].tolist(), **tokenizer_kwargs)

In [None]:
# Show the first combined training input.
df['combined'][0]

'LOCATION1 is transforming and the prices will go up and up [SEP] price'

In [None]:
## TensorFlow
import tensorflow as tf


def _as_tf_dataset(encoding, labels):
    """Pair the tokenizer output (as a plain dict) with its labels in a tf.data.Dataset."""
    return tf.data.Dataset.from_tensor_slices((dict(encoding), labels))


# One dataset per split: tokenized inputs paired with the numeric sentiment column.
train_dataset = _as_tf_dataset(train_encoding, df['sentiment'])
val_dataset = _as_tf_dataset(valid_encoding, df_valid['sentiment'])
test_dataset = _as_tf_dataset(test_encoding, df_test['sentiment'])

In [None]:
# Load the pretrained DistilBERT checkpoint with a sequence-classification head
# for 3 labels (no pre-training happens here; the head is trained in the cells below).
# NOTE(review): TFTrainer and TFTrainingArguments are imported but not used in the visible cells.
from transformers import TFDistilBertForSequenceClassification, TFTrainer, TFTrainingArguments
model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased",num_labels=3)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=363423424.0, style=ProgressStyle(descri…




Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_projector', 'vocab_layer_norm', 'vocab_transform', 'activation_13']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier', 'dropout_19', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use i

In [None]:
# Fine-tuning the model on the "Text [SEP] Aspect" inputs.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=['accuracy'])
# The datasets are batched explicitly, so `batch_size` is not passed to fit()
# (Keras expects it to be omitted for tf.data inputs).
# The shuffle buffer covers the whole training set so examples are mixed
# globally; validation data is not shuffled because order does not affect metrics.
model.fit(train_dataset.shuffle(len(df)).batch(16),
          epochs=3,
          validation_data=val_dataset.batch(16))

Epoch 1/3
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Cause: while/else statement not yet supported
Cause: while/else statement not yet supported
Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f704cfa5fd0>

In [None]:
# Second fine-tuning stage: continue training the same model with a lower
# learning rate (2e-5) and a larger training batch (32).
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=['accuracy'])
# The datasets are batched explicitly, so the contradictory `batch_size=16`
# argument is dropped (Keras expects it to be omitted for tf.data inputs).
# The shuffle buffer covers the whole training set; validation data is not shuffled.
model.fit(train_dataset.shuffle(len(df)).batch(32),
          epochs=3,
          validation_data=val_dataset.batch(16))

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f704c4f8a50>

In [None]:
# Final accuracy of the "Text [SEP] Aspect" model on the test split.
# Predictions are computed here so the cell does not depend on a `label`
# variable produced by a later cell (which belongs to the other model variant).
tf_batch = tokenizer(df_test['combined'].tolist(), max_length=128, padding=True, truncation=True, return_tensors='tf')
tf_outputs = model(tf_batch)
# argmax over the class scores gives the predicted sentiment id per example.
label = tf.argmax(tf_outputs[0], axis=-1).numpy()
accuracy_score(df_test['sentiment'], label)

0.8262910798122066

### ii. Text + Aspect

In [None]:
# Combining Text + Aspect (no [SEP] marker).
# A single space separates the two parts so the last word of the sentence is
# not fused with the aspect name (e.g. "...up price" instead of "...upprice").
df['combined_text'] = df['text'] + " " + df['aspect']
df_valid['combined_text'] = df_valid['text'] + " " + df_valid['aspect']
df_test['combined_text'] = df_test['text'] + " " + df_test['aspect']

In [None]:
# Tokenize the "text + aspect" strings of every split with identical
# settings: pad or truncate each sequence to exactly 128 tokens.
tokenizer_kwargs = dict(padding="max_length", max_length=128, truncation=True)

train_encoding = tokenizer(df['combined_text'].tolist(), **tokenizer_kwargs)
valid_encoding = tokenizer(df_valid['combined_text'].tolist(), **tokenizer_kwargs)
test_encoding = tokenizer(df_test['combined_text'].tolist(), **tokenizer_kwargs)

In [None]:
## TensorFlow
import tensorflow as tf


def _as_tf_dataset(encoding, labels):
    """Pair the tokenizer output (as a plain dict) with its labels in a tf.data.Dataset."""
    return tf.data.Dataset.from_tensor_slices((dict(encoding), labels))


# One dataset per split: tokenized inputs paired with the numeric sentiment column.
train_dataset = _as_tf_dataset(train_encoding, df['sentiment'])
val_dataset = _as_tf_dataset(valid_encoding, df_valid['sentiment'])
test_dataset = _as_tf_dataset(test_encoding, df_test['sentiment'])

In [None]:
# Fine-tuning the model on the "Text + Aspect" inputs.
# Start again from the pretrained checkpoint: the previous `model` object has
# already been fine-tuned on the "[SEP]" inputs, and reusing it would make the
# two variants' accuracies incomparable.
model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=['accuracy'])
# The datasets are batched explicitly, so `batch_size` is not passed to fit()
# (Keras expects it to be omitted for tf.data inputs).
# The shuffle buffer covers the whole training set; validation data is not shuffled.
model.fit(train_dataset.shuffle(len(df)).batch(16),
          epochs=3,
          validation_data=val_dataset.batch(16))

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f700003e390>

In [None]:
# Predict sentiment ids for the test split ("text + aspect" inputs).
# The whole test split is tokenized and passed through the model in a single call.
tf_batch = tokenizer(df_test['combined_text'].tolist(), max_length=128, padding=True, truncation=True, return_tensors='tf')
tf_outputs = model(tf_batch)
# tf_outputs[0] is presumably the per-class logits; softmax turns them into probabilities.
tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)
# Highest-probability class per example, converted to a NumPy array for scikit-learn.
label = tf.argmax(tf_predictions, axis=1)
label = label.numpy()

In [None]:
# Test-set accuracy of the "Text + Aspect" variant: true sentiment ids first,
# predicted ids (`label`) second. Accuracy is symmetric in its two arguments,
# so the value is unchanged.
accuracy_score(df_test['sentiment'], label)

0.795439302481556