# Sentiment Analysis with Hugging Face (DistilBERT)

## Importing libraries

In [None]:
import json
from tqdm import tqdm
import pandas as pd
import numpy as np

In [None]:

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.tokenize import word_tokenize  

from collections import Counter
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import  confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score


In [None]:
 from google.colab import drive
drive.mount('/content/drive',force_remount=True)

Mounted at /content/drive


## Preprocessing the training dataset

In [None]:
# Load the SentiHood training split from the mounted Drive.
# JSON text is UTF-8, so the encoding is stated explicitly instead of relying
# on the platform default.
with open('/content/drive/My Drive/SentiHood/SentiHood Dataset/sentihood-train.json', 'r', encoding='utf-8') as fp:
    training_set = json.load(fp)

In [None]:
# Flatten the training examples into one row per sentence that mentions the
# location placeholder: (id, text, aspect, sentiment).
locations = ['LOCATION1']
aspects = ['dining', 'general', 'green-nature', 'live', 'multicultural', 'nightlife', 'price', 'quiet', 'safety', 'shopping', 'touristy', 'transit-location']

for location in locations:
    # Rows are gathered in a list and converted once at the end; enlarging a
    # DataFrame one `df.loc[i] = ...` at a time is needlessly slow.
    rows = []

    for each_example in training_set:
        example_id = str(int(each_example['id']))
        text = each_example['text'].strip()

        # Sentences that never mention this location contribute no row.
        if location not in text:
            continue

        # Keep only the first opinion whose target is this location.
        matched = next(
            (opinion for opinion in each_example['opinions']
             if opinion['target_entity'] == location),
            None,
        )

        if matched is None:
            # No opinion targets this location, so there is no aspect to
            # report either: both fields get the 'None' sentinel.  (Reading
            # the leftover loop variable here would attach an aspect that
            # belongs to another location or to a previous sentence.)
            rows.append([example_id, text, 'None', 'None'])
        else:
            rows.append([example_id, text, matched['aspect'], matched['sentiment']])

    # NOTE: with more than one entry in `locations`, only the frame built for
    # the last location survives this loop.
    df = pd.DataFrame(rows, columns=['id', 'text', 'aspect', 'sentiment'])
        

In [None]:
# Preview the first five rows of the training frame.
df.head(5)

Unnamed: 0,id,text,aspect,sentiment
0,1430,LOCATION1 is transforming and the prices will ...,price,Negative
1,2013,Along LOCATION1 there are lots of Electronics ...,shopping,Positive
2,1244,And LOCATION1 is ten mins direct on the tube t...,transit-location,Positive
3,209,Another option is LOCATION1 which is very cent...,nightlife,Positive
4,2824,Best bet is around LOCATION2 and LOCATION1 are...,general,Positive


In [None]:
# (rows, columns) of the training frame.
df.shape

(2977, 4)

## Preprocessing the validation dataset

In [None]:
# Load the SentiHood validation (dev) split from the mounted Drive.
# JSON text is UTF-8, so the encoding is stated explicitly instead of relying
# on the platform default.
with open('/content/drive/My Drive/SentiHood/SentiHood Dataset/sentihood-dev.json', 'r', encoding='utf-8') as fp:
    validation_set = json.load(fp)

In [None]:
# Flatten the validation examples into one row per sentence that mentions the
# location placeholder: (id, text, aspect, sentiment).
locations = ['LOCATION1']
aspects = ['dining', 'general', 'green-nature', 'live', 'multicultural', 'nightlife', 'price', 'quiet', 'safety', 'shopping', 'touristy', 'transit-location']

for location in locations:
    # Rows are gathered in a list and converted once at the end; enlarging a
    # DataFrame one `df.loc[i] = ...` at a time is needlessly slow.
    rows = []

    for each_example in validation_set:
        example_id = str(int(each_example['id']))
        text = each_example['text'].strip()

        # Sentences that never mention this location contribute no row.
        if location not in text:
            continue

        # Keep only the first opinion whose target is this location.
        matched = next(
            (opinion for opinion in each_example['opinions']
             if opinion['target_entity'] == location),
            None,
        )

        if matched is None:
            # No opinion targets this location, so there is no aspect to
            # report either: both fields get the 'None' sentinel.  (Reading
            # the leftover loop variable here would attach an aspect that
            # belongs to another location or to a previous sentence.)
            rows.append([example_id, text, 'None', 'None'])
        else:
            rows.append([example_id, text, matched['aspect'], matched['sentiment']])

    # NOTE: with more than one entry in `locations`, only the frame built for
    # the last location survives this loop.
    df_valid = pd.DataFrame(rows, columns=['id', 'text', 'aspect', 'sentiment'])

In [None]:
# Preview the first rows of the validation frame.
df_valid.head()

Unnamed: 0,id,text,aspect,sentiment
0,302,LOCATION1 is just a normal area that happens t...,shopping,Positive
1,460,""" My mate then went on to ask: ""Well, isn't LO...",shopping,
2,582,"""I'm from LOCATION1 so I'm hard""",shopping,
3,465,'Bo-bos' - bourgeois bohemians - are particula...,shopping,
4,270,( I was born n maternity hospital in Clapton )...,shopping,


In [None]:
# (rows, columns) of the validation frame.
df_valid.shape

(747, 4)

## Preprocessing the test dataset

In [None]:
# Load the SentiHood test split from the mounted Drive.
# JSON text is UTF-8, so the encoding is stated explicitly instead of relying
# on the platform default.
with open('/content/drive/My Drive/SentiHood/SentiHood Dataset/sentihood-test.json', 'r', encoding='utf-8') as fp:
    testing_set = json.load(fp)

In [None]:
# Flatten the test examples into one row per sentence that mentions the
# location placeholder: (id, text, aspect, sentiment).
locations = ['LOCATION1']
aspects = ['dining', 'general', 'green-nature', 'live', 'multicultural', 'nightlife', 'price', 'quiet', 'safety', 'shopping', 'touristy', 'transit-location']

for location in locations:
    # Rows are gathered in a list and converted once at the end; enlarging a
    # DataFrame one `df.loc[i] = ...` at a time is needlessly slow.
    rows = []

    for each_example in testing_set:
        example_id = str(int(each_example['id']))
        text = each_example['text'].strip()

        # Sentences that never mention this location contribute no row.
        if location not in text:
            continue

        # Keep only the first opinion whose target is this location.
        matched = next(
            (opinion for opinion in each_example['opinions']
             if opinion['target_entity'] == location),
            None,
        )

        if matched is None:
            # No opinion targets this location, so there is no aspect to
            # report either: both fields get the 'None' sentinel.  (Reading
            # the leftover loop variable here would attach an aspect that
            # belongs to another location or to a previous sentence.)
            rows.append([example_id, text, 'None', 'None'])
        else:
            rows.append([example_id, text, matched['aspect'], matched['sentiment']])

    # NOTE: with more than one entry in `locations`, only the frame built for
    # the last location survives this loop.
    df_test = pd.DataFrame(rows, columns=['id', 'text', 'aspect', 'sentiment'])

In [None]:
# Preview the first five rows of the test frame.
df_test.head(5)

Unnamed: 0,id,text,aspect,sentiment
0,153,LOCATION1 is in Greater London and is a very ...,safety,Positive
1,1130,All the neighborhoods around LOCATION1 are ver...,general,Positive
2,1271,"Cheap is LOCATION2, LOCATION1, but not really ...",general,Negative
3,1089,Dont Try LOCATION1,general,Negative
4,731,Find the website for 'Museum in LOCATION1' (pa...,general,


In [None]:
# (rows, columns) of the test frame.
df_test.shape

(1491, 4)

## Mapping aspects and sentiments to numeric labels

In [None]:
# Encode aspect names as integer codes.  The encoder is fitted on the shared
# `aspects` vocabulary in addition to the labels present in this frame, so the
# codes do not shift when this split happens to lack one of the aspects.
label_encoder = LabelEncoder()
label_encoder.fit(list(aspects) + df['aspect'].tolist())
integer_category = label_encoder.transform(df['aspect'])
# Assign the array positionally; wrapping it in a DataFrame would align on the
# index and yield NaN for any row whose index is not 0..n-1.
df['label_aspect'] = integer_category

# Map sentiment strings to class ids (None=0, Negative=1, Positive=2).
# NOTE: `map` turns values missing from the dict into NaN, so running this
# cell a second time on the already numeric column would wipe it.
df['sentiment'] = df['sentiment'].map({'None': 0, 'Negative': 1, 'Positive': 2})

In [None]:
# Encode aspect names as integer codes.  The encoder is fitted on the shared
# `aspects` vocabulary in addition to the labels present in this frame, so the
# codes do not shift when this split happens to lack one of the aspects.
label_encoder = LabelEncoder()
label_encoder.fit(list(aspects) + df_valid['aspect'].tolist())
integer_category = label_encoder.transform(df_valid['aspect'])
# Assign the array positionally; wrapping it in a DataFrame would align on the
# index and yield NaN for any row whose index is not 0..n-1.
df_valid['label_aspect'] = integer_category

# Map sentiment strings to class ids (None=0, Negative=1, Positive=2).
# NOTE: `map` turns values missing from the dict into NaN, so running this
# cell a second time on the already numeric column would wipe it.
df_valid['sentiment'] = df_valid['sentiment'].map({'None': 0, 'Negative': 1, 'Positive': 2})

In [None]:
# Encode aspect names as integer codes.  The encoder is fitted on the shared
# `aspects` vocabulary in addition to the labels present in this frame, so the
# codes do not shift when this split happens to lack one of the aspects.
label_encoder = LabelEncoder()
label_encoder.fit(list(aspects) + df_test['aspect'].tolist())
integer_category = label_encoder.transform(df_test['aspect'])
# Assign the array positionally; wrapping it in a DataFrame would align on the
# index and yield NaN for any row whose index is not 0..n-1.
df_test['label_aspect'] = integer_category

# Map sentiment strings to class ids (None=0, Negative=1, Positive=2).
# NOTE: `map` turns values missing from the dict into NaN, so running this
# cell a second time on the already numeric column would wipe it.
df_test['sentiment'] = df_test['sentiment'].map({'None': 0, 'Negative': 1, 'Positive': 2})

In [None]:
# Check the training frame after label encoding (new `label_aspect` column, numeric `sentiment`).
df.head(5)

Unnamed: 0,id,text,aspect,sentiment,label_aspect
0,1430,LOCATION1 is transforming and the prices will ...,price,1,6
1,2013,Along LOCATION1 there are lots of Electronics ...,shopping,2,9
2,1244,And LOCATION1 is ten mins direct on the tube t...,transit-location,2,11
3,209,Another option is LOCATION1 which is very cent...,nightlife,2,5
4,2824,Best bet is around LOCATION2 and LOCATION1 are...,general,2,1


In [None]:
# Check the validation frame after label encoding.
df_valid.head()

Unnamed: 0,id,text,aspect,sentiment,label_aspect
0,302,LOCATION1 is just a normal area that happens t...,shopping,2,9
1,460,""" My mate then went on to ask: ""Well, isn't LO...",shopping,0,9
2,582,"""I'm from LOCATION1 so I'm hard""",shopping,0,9
3,465,'Bo-bos' - bourgeois bohemians - are particula...,shopping,0,9
4,270,( I was born n maternity hospital in Clapton )...,shopping,0,9


In [None]:
# Check the test frame after label encoding.
df_test.head()

Unnamed: 0,id,text,aspect,sentiment,label_aspect
0,153,LOCATION1 is in Greater London and is a very ...,safety,2,8
1,1130,All the neighborhoods around LOCATION1 are ver...,general,2,1
2,1271,"Cheap is LOCATION2, LOCATION1, but not really ...",general,1,1
3,1089,Dont Try LOCATION1,general,1,1
4,731,Find the website for 'Museum in LOCATION1' (pa...,general,0,1


## Model Building

In [None]:
# Install the Hugging Face `transformers` package into the notebook runtime
# (IPython shell escape, not plain Python).
# NOTE(review): the version is not pinned; the run recorded in this notebook
# installed 4.8.2 — consider pinning for reproducibility.
!pip install transformers


Collecting transformers
  Downloading transformers-4.8.2-py3-none-any.whl (2.5 MB)
[K     |████████████████████████████████| 2.5 MB 34.8 MB/s 
Collecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 49.5 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 48.9 MB/s 
Installing collected packages: tokenizers, sacremoses, huggingface-hub, transformers
Successfully installed huggingface-hub-0.0.12 sacremoses-0.0.45 tokenizers-0.10.3 transformers-4.8.2


In [None]:
# Load the tokenizer that belongs to the pretrained `distilbert-base-uncased`
# checkpoint (this cell loads only the tokenizer, not the model itself).
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




In [None]:
# Tokenize all three splits with one shared set of options: every text is
# padded or truncated to exactly 128 token ids.
_tokenizer_options = {"padding": "max_length", "max_length": 128, "truncation": True}

train_encoding = tokenizer(df['text'].tolist(), **_tokenizer_options)
valid_encoding = tokenizer(df_valid['text'].tolist(), **_tokenizer_options)
test_encoding = tokenizer(df_test['text'].tolist(), **_tokenizer_options)

In [None]:
# Number of training texts that were tokenized.
df['text'].shape

(2977,)

In [None]:
# Inspect which fields the tokenizer returned.
train_encoding.keys()

dict_keys(['input_ids', 'attention_mask'])

In [None]:
# Length of the first encoded training example (tokenized with max_length=128).
len(train_encoding["input_ids"][0])


128

In [None]:
## TensorFlow
import tensorflow as tf


def _to_tf_dataset(encoding, labels):
    """Slice tokenizer output and labels into a dataset of (features, label) pairs."""
    return tf.data.Dataset.from_tensor_slices((dict(encoding), labels))


# One unbatched dataset per split; the label is the numeric sentiment column.
train_dataset = _to_tf_dataset(train_encoding, df['sentiment'])
val_dataset = _to_tf_dataset(valid_encoding, df_valid['sentiment'])
test_dataset = _to_tf_dataset(test_encoding, df_test['sentiment'])

In [None]:
# Load pretrained DistilBERT with a sequence-classification head for three
# classes (num_labels=3 matches the sentiment ids 0/1/2 used as labels).
# NOTE(review): TFTrainer and TFTrainingArguments are imported but not used in
# the cells visible here — confirm whether they are still needed.
from transformers import TFDistilBertForSequenceClassification, TFTrainer, TFTrainingArguments
model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased",num_labels=3)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=363423424.0, style=ProgressStyle(descri…




Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_layer_norm', 'vocab_projector', 'vocab_transform', 'activation_13']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['dropout_19', 'pre_classifier', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use i

In [None]:
# Fine-tune the model on the training split, validating after each epoch.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=['accuracy'])

# The batch size is set on the tf.data pipeline (`.batch(16)`).  Keras must not
# be given `batch_size` when the input is a Dataset, so it is not passed to
# fit().
# NOTE(review): shuffle(100) only mixes examples within a 100-element window;
# a buffer as large as the dataset would shuffle it fully.
model.fit(
    train_dataset.shuffle(100).batch(16),
    epochs=3,
    validation_data=val_dataset.shuffle(100).batch(16),
)

Epoch 1/3
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Cause: while/else statement not yet supported
Cause: while/else statement not yet supported
Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7ff8b007e190>

In [None]:
# Predict a sentiment class for every test sentence.
# NOTE(review): the whole test set goes through the model in a single forward
# pass, which may need a lot of memory.
tf_batch = tokenizer(
    df_test['text'].tolist(),
    max_length=128,
    padding=True,
    truncation=True,
    return_tensors='tf',
)
tf_outputs = model(tf_batch)

# Softmax over the first element of the model output gives per-class scores
# that sum to one for each sentence.
tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)

# Highest-scoring class per sentence, converted to a NumPy array.
label = tf.argmax(tf_predictions, axis=1).numpy()

In [None]:
# Final accuracy on the test set; arguments follow the (y_true, y_pred) order
# of the scikit-learn signature.
accuracy_score(df_test['sentiment'], label)

0.806841046277666

In [None]:
# Keep the test-set accuracy in `val`; arguments follow the (y_true, y_pred)
# order of the scikit-learn signature.
val = accuracy_score(df_test['sentiment'], label)

In [None]:
# Confusion matrix on the test set.  With (y_true, y_pred) in this order, rows
# are the true sentiment ids and columns the predicted ones; the reversed
# order would give the transposed matrix.
confusion_matrix(df_test['sentiment'], label)

array([[388,  23,  56],
       [ 28, 213,  61],
       [ 75,  45, 602]])

In [None]:
# Save the fine-tuned model to a local directory.
# NOTE(review): /tmp is presumably lost when the Colab runtime is recycled —
# confirm, or save under the mounted Drive instead.
model.save_pretrained("/tmp/sentiment_analysis")

#### Load saved model and run predict function

# loaded_model = TFDistilBertForSequenceClassification.from_pretrained("/tmp/sentiment_analysis")

## Hyperparameter tuning

In [None]:
# Second training run with different hyperparameters: 4 epochs, batch size 32.
# NOTE(review): `model` is not re-created here, so this run continues from the
# weights left by any earlier fit() — reload the pretrained checkpoint first
# if the runs are meant to be compared independently.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=['accuracy'])

# The batch size has to be set on the tf.data pipeline; Keras must not be
# given `batch_size` when the input is a Dataset, so the training data is
# batched at 32 here instead of passing the argument to fit().
model.fit(
    train_dataset.shuffle(100).batch(32),
    epochs=4,
    validation_data=val_dataset.shuffle(100).batch(16),
)