# Aspect-Based Sentiment Analysis with DistilBERT for 12 Aspects

## Importing libraries

In [None]:
import json
from tqdm import tqdm
import pandas as pd
import numpy as np
import tensorflow as tf


In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.tokenize import word_tokenize  

from collections import Counter
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import  confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score


In [None]:
from google.colab import drive

# All datasets and result files used below live under this Google Drive mount.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point, force_remount=True)

Mounted at /content/drive


## Model Building

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.8.2-py3-none-any.whl (2.5 MB)
[K     |████████████████████████████████| 2.5 MB 8.5 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 53.5 MB/s 
Collecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 57.9 MB/s 
Installing collected packages: tokenizers, sacremoses, huggingface-hub, transformers
Successfully installed huggingface-hub-0.0.12 sacremoses-0.0.45 tokenizers-0.10.3 transformers-4.8.2


In [None]:

# tokenizer
from transformers import DistilBertTokenizerFast

# Fast tokenizer loaded from the same checkpoint name as the classifier.
tokenizer_checkpoint = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizerFast.from_pretrained(tokenizer_checkpoint)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




In [None]:
# pre-trained model
from transformers import TFDistilBertForSequenceClassification, TFTrainer, TFTrainingArguments

# DistilBERT encoder with a freshly initialised 3-way classification head
# (the training cell maps sentiments to the labels 0, 1 and 2).
model_checkpoint = "distilbert-base-uncased"
model = TFDistilBertForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=3,
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=363423424.0, style=ProgressStyle(descri…




Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['activation_13', 'vocab_layer_norm', 'vocab_projector', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['dropout_19', 'classifier', 'pre_classifier']
You should probably TRAIN this model on a down-stream task to be able to use i

In [None]:
# Training for locations and aspects
#
# For every (location, aspect) pair this cell:
#   1. loads that pair's train / validation / test CSVs,
#   2. fine-tunes a fresh 3-class DistilBERT classifier for 3 epochs,
#   3. scores it on the test split and records the accuracy in `test_score`,
#   4. writes the per-sentence predictions to PredictedResults/ so that the
#      evaluation cells further down can read them.
locations = ['LOCATION1','LOCATION2']
aspects = ['dining', 'general', 'green-nature', 'live', 'multicultural', 'nightlife', 'price', 'quiet', 'safety','shopping', 'touristy', 'transit-location']
sentiment_mapping = {
    'Positive': 2,
    'Negative': 1,
    'None': 0
}
data_root = '/content/drive/My Drive/SentiHood/Bert-single/'
MAX_SEQ_LEN = 128
BATCH_SIZE = 16


def load_labelled_split(path):
    """Read one split CSV and return it with `sentiment` encoded as integers.

    Parameters
    ----------
    path : str
        CSV file containing at least a `text` and a `sentiment` column.

    Returns
    -------
    pandas.DataFrame
        The file contents with a fresh 0..n-1 index and `sentiment` as int64.

    Raises
    ------
    ValueError
        If any sentiment value is missing or not a key of `sentiment_mapping`.
    """
    # 'None' is a real class label here. Recent pandas releases (2.0+) parse the
    # literal string "None" as a missing value by default, so NA parsing is
    # restricted to empty fields only.
    df = pd.read_csv(path, keep_default_na=False, na_values=[''])
    encoded = df['sentiment'].map(sentiment_mapping)
    if encoded.isna().any():
        # Fail fast: NaN labels would otherwise be fed to the loss silently.
        bad = sorted(df.loc[encoded.isna(), 'sentiment'].astype(str).unique())
        raise ValueError(f"Unmapped sentiment label(s) {bad} in {path}")
    df['sentiment'] = encoded.astype('int64')
    return df.reset_index(drop=True)


# Accuracy rows are collected in a list; `test_score` is rebuilt from it after
# every task so partial results survive an interrupted run.
score_rows = []
test_score = pd.DataFrame(columns=['Location_Aspect', 'Test_Accuracy'])

for location in locations:
    for aspect in aspects:
        task = str(location) + str(aspect)
        print(f"Starting {location} {aspect}...")

        df_train = load_labelled_split(data_root + 'TrainingData/' + task + '.csv')
        df_valid = load_labelled_split(data_root + 'ValidationData/' + task + '.csv')
        df_test = load_labelled_split(data_root + 'TestingData/' + task + '.csv')

        # Start every task from the pre-trained checkpoint. Re-using one model
        # object would carry the weights fine-tuned on earlier aspects into
        # later ones, so the classifiers would not be independent.
        model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

        # converting into tensor objects
        train_encoding = tokenizer(df_train['text'].tolist(), padding="max_length", max_length=MAX_SEQ_LEN, truncation=True)
        valid_encoding = tokenizer(df_valid['text'].tolist(), padding="max_length", max_length=MAX_SEQ_LEN, truncation=True)

        train_dataset = tf.data.Dataset.from_tensor_slices((
            dict(train_encoding),
            df_train['sentiment'].to_numpy()
        ))
        val_dataset = tf.data.Dataset.from_tensor_slices((
            dict(valid_encoding),
            df_valid['sentiment'].to_numpy()
        ))

        optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
        model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=['accuracy'])
        # The datasets are already batched, so `batch_size` is not passed to fit().
        # The shuffle buffer covers the whole training set; a smaller buffer only
        # shuffles locally. Validation order does not affect metrics.
        model.fit(train_dataset.shuffle(len(df_train)).batch(BATCH_SIZE),
                  epochs=3,
                  validation_data=val_dataset.batch(BATCH_SIZE))

        print("Testset Accuracy score")
        tf_batch = tokenizer(df_test['text'].tolist(), max_length=MAX_SEQ_LEN, padding=True, truncation=True, return_tensors='tf')
        tf_outputs = model(tf_batch)
        # argmax over the logits picks the same class as argmax over softmax.
        label = tf.argmax(tf_outputs[0], axis=1).numpy()

        # scikit-learn's convention is (y_true, y_pred).
        val = accuracy_score(df_test['sentiment'], label)
        print(val)
        score_rows.append({'Location_Aspect': task, 'Test_Accuracy': val})
        test_score = pd.DataFrame(score_rows, columns=['Location_Aspect', 'Test_Accuracy'])

        predicted_result = pd.DataFrame({'id': df_test['id'], 'predicted_values': label, 'Actual_values': df_test['sentiment']})
        # The evaluation section reads these files back, so they must be written.
        predicted_result.to_csv(data_root + 'PredictedResults/' + task + '.csv', index=False)

Starting LOCATION1 dining...
Epoch 1/3
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Cause: while/else statement not yet supported
Cause: while/else statement not yet supported
Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.
Epoch 2/3
Epoch 3/3
Testset Accuracy score
0.994634473507713
Starting LOCATION1 general...
Epoch 1/3
Epoch 2/3
Epoch 3/3
Testset Accuracy score
0.8303152246814218
Starting LOCAT

KeyboardInterrupt: ignored

In [None]:
# store the per-(location, aspect) test accuracy table in a csv
# NOTE(review): the file name is misspelled and has no .csv extension; it is
# kept unchanged here because other tooling may already read this exact path.
test_score_path = '/content/drive/My Drive/SentiHood/Bert-single/Predicted_resutls'
test_score.to_csv(test_score_path, index=False)

## Evaluation

In [None]:
# Ground-truth table for LOCATION1: one row per test sentence, one integer
# sentiment column per aspect, plus `id` and `location`.
aspects = ['dining', 'general', 'green-nature', 'live', 'multicultural', 'nightlife', 'price', 'quiet', 'safety','shopping', 'touristy', 'transit-location']
sentiment_mapping = {
    'Positive': 2,
    'Negative': 1,
    'None': 0
}

true_ids = None      # ids of the first aspect file; every other file must match
true_columns = {}    # aspect name -> encoded sentiment column

for aspect in aspects:
  testing_set_path =  '/content/drive/My Drive/SentiHood/Bert-single/TestingData/LOCATION1' + str(aspect) + '.csv'

  # 'None' is a real class label; recent pandas releases (2.0+) would parse it
  # as a missing value by default, so only empty fields are treated as NA.
  df_test = pd.read_csv(testing_set_path, keep_default_na=False, na_values=[''])
  df_test['sentiment'] = df_test['sentiment'].map(sentiment_mapping)
  df_test = df_test.reset_index(drop=True)

  if df_test['sentiment'].isna().any():
    raise ValueError(f"Unmapped sentiment label(s) in {testing_set_path}")

  # Columns are combined by position, so every aspect file must list the same
  # sentences in the same order.
  if true_ids is None:
    true_ids = df_test['id']
  elif not df_test['id'].equals(true_ids):
    raise ValueError(f"Sentence ids in {testing_set_path} do not line up with the other aspect files")

  true_columns[aspect] = df_test['sentiment'].astype('int64')

df_true_location1 = pd.DataFrame({'id': true_ids, 'location': 'LOCATION1', **true_columns})

In [None]:
# Ground-truth table for LOCATION2: one row per test sentence, one integer
# sentiment column per aspect, plus `id` and `location`.
aspects = ['dining', 'general', 'green-nature', 'live', 'multicultural', 'nightlife', 'price', 'quiet', 'safety','shopping', 'touristy', 'transit-location']
sentiment_mapping = {
    'Positive': 2,
    'Negative': 1,
    'None': 0
}

true_ids = None      # ids of the first aspect file; every other file must match
true_columns = {}    # aspect name -> encoded sentiment column

for aspect in aspects:
  testing_set_path = '/content/drive/My Drive/SentiHood/Bert-single/TestingData/LOCATION2' + str(aspect) + '.csv'

  # 'None' is a real class label; recent pandas releases (2.0+) would parse it
  # as a missing value by default, so only empty fields are treated as NA.
  df_test = pd.read_csv(testing_set_path, keep_default_na=False, na_values=[''])
  df_test['sentiment'] = df_test['sentiment'].map(sentiment_mapping)
  df_test = df_test.reset_index(drop=True)

  if df_test['sentiment'].isna().any():
    raise ValueError(f"Unmapped sentiment label(s) in {testing_set_path}")

  # Columns are combined by position, so every aspect file must list the same
  # sentences in the same order.
  if true_ids is None:
    true_ids = df_test['id']
  elif not df_test['id'].equals(true_ids):
    raise ValueError(f"Sentence ids in {testing_set_path} do not line up with the other aspect files")

  true_columns[aspect] = df_test['sentiment'].astype('int64')

df_true_location2 = pd.DataFrame({'id': true_ids, 'location': 'LOCATION2', **true_columns})

In [None]:
# Stack the two per-location ground-truth tables row-wise. The index is not
# reset, so each location keeps its own row labels.
df_true = pd.concat(
    [df_true_location1, df_true_location2],
    axis=0,
)
df_true

Unnamed: 0,id,location,dining,general,green-nature,live,multicultural,nightlife,price,quiet,safety,shopping,touristy,transit-location
0,153,LOCATION1,0,0,0,0,0,0,0,0,2,0,0,0
1,1130,LOCATION1,0,2,0,0,0,0,0,0,2,0,0,0
2,1271,LOCATION1,0,1,0,0,0,0,2,0,0,0,0,0
3,1089,LOCATION1,0,1,0,0,0,0,0,0,0,0,0,0
4,731,LOCATION1,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
383,1431,LOCATION2,0,2,0,0,0,0,0,0,0,0,0,0
384,1290,LOCATION2,0,2,0,0,0,0,0,0,0,0,0,0
385,363,LOCATION2,0,0,0,0,0,0,0,0,0,0,0,0
386,1304,LOCATION2,0,2,0,0,0,0,0,0,0,0,0,0


In [None]:
# Prediction table for LOCATION1: one row per test sentence, one predicted-label
# column per aspect, plus `id` and `location`. Built from the per-aspect CSVs
# in PredictedResults/.
aspects = ['dining', 'general', 'green-nature', 'live', 'multicultural', 'nightlife', 'price', 'quiet', 'safety','shopping', 'touristy', 'transit-location']

pred_ids = None      # ids of the first aspect file; every other file must match
pred_columns = {}    # aspect name -> predicted label column

for aspect in aspects:
  # The variable names are kept from the ground-truth cells, but here they hold
  # a predictions file, not a test split.
  testing_set_path = '/content/drive/My Drive/SentiHood/Bert-single/PredictedResults/LOCATION1' + str(aspect) + '.csv'

  df_test = pd.read_csv(testing_set_path)
  df_test = df_test.reset_index(drop=True)

  # Columns are combined by position, so every aspect file must list the same
  # sentences in the same order.
  if pred_ids is None:
    pred_ids = df_test['id']
  elif not df_test['id'].equals(pred_ids):
    raise ValueError(f"Sentence ids in {testing_set_path} do not line up with the other aspect files")

  pred_columns[aspect] = df_test['predicted_values']

df_predicted_location1 = pd.DataFrame({'id': pred_ids, 'location': 'LOCATION1', **pred_columns})

In [None]:
# Prediction table for LOCATION2: one row per test sentence, one predicted-label
# column per aspect, plus `id` and `location`. Built from the per-aspect CSVs
# in PredictedResults/.
aspects = ['dining', 'general', 'green-nature', 'live', 'multicultural', 'nightlife', 'price', 'quiet', 'safety','shopping', 'touristy', 'transit-location']

pred_ids = None      # ids of the first aspect file; every other file must match
pred_columns = {}    # aspect name -> predicted label column

for aspect in aspects:
  # The variable names are kept from the ground-truth cells, but here they hold
  # a predictions file, not a test split.
  testing_set_path = '/content/drive/My Drive/SentiHood/Bert-single/PredictedResults/LOCATION2' + str(aspect) + '.csv'

  df_test = pd.read_csv(testing_set_path)
  df_test = df_test.reset_index(drop=True)

  # Columns are combined by position, so every aspect file must list the same
  # sentences in the same order.
  if pred_ids is None:
    pred_ids = df_test['id']
  elif not df_test['id'].equals(pred_ids):
    raise ValueError(f"Sentence ids in {testing_set_path} do not line up with the other aspect files")

  pred_columns[aspect] = df_test['predicted_values']

df_predicted_location2 = pd.DataFrame({'id': pred_ids, 'location': 'LOCATION2', **pred_columns})

In [None]:
# Stack the two per-location prediction tables row-wise. The index is not
# reset, so each location keeps its own row labels.
df_predicted = pd.concat(
    [df_predicted_location1, df_predicted_location2],
    axis=0,
)
df_predicted

Unnamed: 0,id,location,dining,general,green-nature,live,multicultural,nightlife,price,quiet,safety,shopping,touristy,transit-location
0,153,LOCATION1,0,0,0,0,0,0,0,0,0,0,0,0
1,1130,LOCATION1,0,0,0,0,0,0,0,0,0,0,0,0
2,1271,LOCATION1,0,2,0,0,0,0,1,0,0,0,0,0
3,1089,LOCATION1,0,1,0,0,0,0,0,0,0,0,0,0
4,731,LOCATION1,0,0,0,0,0,0,0,0,0,0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
383,1431,LOCATION2,0,2,0,0,0,0,0,0,0,0,0,0
384,1290,LOCATION2,0,2,0,0,0,0,0,0,0,0,0,0
385,363,LOCATION2,0,0,0,0,0,0,0,0,0,0,0,0
386,1304,LOCATION2,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Find the accuracy
def compute_sentiment_accuracy(df_true, df_predicted, aspects=None):
  """Compute the sentiment classification accuracy pooled over aspects.

  Every (row, aspect) cell counts as one prediction. Rows are compared by
  position, not by index label.

  Parameters
  ----------
  df_true : pandas.DataFrame
      Ground-truth labels, one column per aspect.
  df_predicted : pandas.DataFrame
      Predicted labels with the same aspect columns and row order as `df_true`.
  aspects : iterable of str, optional
      Aspect columns to score. Defaults to the 12 aspects used in this notebook.

  Returns
  -------
  float
      Accuracy as a percentage rounded to 2 decimals; 0.0 when there is
      nothing to score (no rows or no aspects).

  Raises
  ------
  ValueError
      If the two frames have a different number of rows.
  KeyError
      If an aspect column is missing from either frame.
  """
  if aspects is None:
    aspects = ['dining', 'general', 'green-nature', 'live', 'multicultural', 'nightlife', 'price', 'quiet', 'safety','shopping', 'touristy', 'transit-location']
  else:
    aspects = list(aspects)

  # With unequal lengths the element-wise comparison below would not be
  # meaningful, so reject the input instead of reporting a wrong number.
  if df_true.shape[0] != df_predicted.shape[0]:
    raise ValueError(
        f"Row count mismatch: {df_true.shape[0]} true rows vs {df_predicted.shape[0]} predicted rows"
    )

  total = df_true.shape[0] * len(aspects)
  if total == 0:
    # Avoid dividing by zero on empty input.
    return 0.0

  count = sum(
      int(np.sum(df_true[aspect].values == df_predicted[aspect].values))
      for aspect in aspects
  )

  accuracy = float(count)/float(total) * 100

  return round(accuracy, 2)

In [None]:
# Overall accuracy pooled over both locations and every aspect column.
sentiment_accuracy = compute_sentiment_accuracy(df_true, df_predicted)
print(f"Sentiment Accuracy: {sentiment_accuracy}")

Sentiment Accuracy: 94.71


In [None]:
from collections import Counter

In [None]:
# Distribution of predicted 'dining' labels over both locations
# (0 = None, 1 = Negative, 2 = Positive, per the sentiment mapping used above).
Counter(df_predicted['dining'])

Counter({0: 1839, 2: 40})

In [None]:
# Distribution of ground-truth 'dining' labels over both locations, for
# comparison with the predicted distribution.
Counter(df_true['dining'])

Counter({0: 1842, 1: 2, 2: 35})

In [None]:
# Distribution of predicted 'general' labels over both locations
# (0 = None, 1 = Negative, 2 = Positive, per the sentiment mapping used above).
Counter(df_predicted['general'])

Counter({0: 1416, 1: 80, 2: 383})

In [None]:
# Distribution of ground-truth 'general' labels over both locations, for
# comparison with the predicted distribution.
Counter(df_true['general'])

Counter({0: 1293, 1: 139, 2: 447})

In [None]:
from sklearn.metrics import  confusion_matrix

# Confusion matrix for the 'general' aspect. Ground truth is passed first, so
# under scikit-learn's (y_true, y_pred) convention rows are true classes and
# columns are predicted classes.
confusion_matrix(df_true['general'],df_predicted['general'])

array([[1205,   12,   76],
       [  58,   57,   24],
       [ 153,   11,  283]])