In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Using BERT for Aspect Based Sentiment Analysis (ABSA)

## Installing Dependencies

In [2]:
!pip install tqdm boto3 requests regex sentencepiece sacremoses --quiet
!pip install transformers --quiet

## Imports

In [3]:
from tqdm.auto import tqdm
from time import perf_counter

import xml.etree.ElementTree as ET
from xml.dom import minidom

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from dataclasses import dataclass

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split

import transformers
from transformers import BertModel, BertTokenizer, get_scheduler, set_seed

## Configuration Class for our model

In [4]:
# CONFIGURATION CLASS
@dataclass
class Config:
    batch_size: int = 8  # Reduce batch size for a smaller dataset
    val_size: int = 50   # Use a smaller validation set size
    train_size: int = 204  # The remaining samples for training

    pin_memory: bool = True
    num_workers: int = 2
    seed: int = 42
    lr: float = 5e-5  # Adjust learning rate for a smaller dataset

In [5]:
config = Config()

In [6]:
set_seed(config.seed)

## Preprocessing Data

In [7]:
import pandas as pd
df = pd.read_csv("/content/restaurant_reviews.csv")

In [8]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
df.head()

Unnamed: 0,Review_ID,Review_ID#,Review_IDtoID#,Review_OutofScopeBool,Review_SentenceID,Review_Opinion_Category,Review_Opinion_From,Review_Opinion_Polarity,Review_Opinion_Target,Review_Opinion_To,Review_Text
0,1004293,1,1004293:0,,1,RESTAURANT#GENERAL,51.0,negative,place,56.0,Judging from previous posts this used to be a ...
1,1004293,1,1004293:1,,2,SERVICE#GENERAL,75.0,negative,staff,80.0,"We, there were four of us, arrived at noon - t..."
2,1004293,1,1004293:2,,3,SERVICE#GENERAL,0.0,negative,,0.0,"They never brought us complimentary noodles, i..."
3,1004293,1,1004293:3,,4,FOOD#QUALITY,4.0,negative,food,8.0,The food was lousy - too sweet or too salty an...
4,1004293,1,1004293:3,,4,FOOD#STYLE_OPTIONS,52.0,negative,portions,60.0,The food was lousy - too sweet or too salty an...


In [10]:
df = df[df['Review_OutofScopeBool'] != True]
columns_to_delete = ['Review_OutofScopeBool', 'Review_IDtoID#', 'Review_ID', 'Review_Opinion_From', 'Review_Opinion_To', 'Review_Opinion_Target']
df = df.drop(columns = columns_to_delete)

In [11]:
distinct_values = df['Review_Opinion_Category'].unique()
print("Distinct Values:", distinct_values)

Distinct Values: ['RESTAURANT#GENERAL' 'SERVICE#GENERAL' 'FOOD#QUALITY'
 'FOOD#STYLE_OPTIONS' 'DRINKS#STYLE_OPTIONS' 'DRINKS#PRICES'
 'RESTAURANT#PRICES' 'AMBIENCE#GENERAL' nan 'RESTAURANT#MISCELLANEOUS'
 'FOOD#PRICES' 'LOCATION#GENERAL' 'DRINKS#QUALITY' 'FOOD#GENERAL']


In [12]:
# making distinct categories
def clean_category(input_string):
    if pd.notna(input_string):
        input_lower = input_string.lower()
        if 'quality' in input_lower:
            return 'quality'
        elif 'price' in input_lower:
            return 'price'
        elif 'service' in input_lower:
            return 'service'
        elif 'style_options' in input_lower:
            return 'options'
        else:
            return 'general'
    else:
        return input_string

df['Review_Opinion_Category'] = df['Review_Opinion_Category'].apply(clean_category)

In [13]:
df.head()

Unnamed: 0,Review_ID#,Review_SentenceID,Review_Opinion_Category,Review_Opinion_Polarity,Review_Text
0,1,1,general,negative,Judging from previous posts this used to be a ...
1,1,2,service,negative,"We, there were four of us, arrived at noon - t..."
2,1,3,service,negative,"They never brought us complimentary noodles, i..."
3,1,4,quality,negative,The food was lousy - too sweet or too salty an...
4,1,4,options,negative,The food was lousy - too sweet or too salty an...


In [14]:
df = df.dropna(subset=['Review_Opinion_Category', 'Review_Opinion_Polarity'])

In [15]:
df.head()

Unnamed: 0,Review_ID#,Review_SentenceID,Review_Opinion_Category,Review_Opinion_Polarity,Review_Text
0,1,1,general,negative,Judging from previous posts this used to be a ...
1,1,2,service,negative,"We, there were four of us, arrived at noon - t..."
2,1,3,service,negative,"They never brought us complimentary noodles, i..."
3,1,4,quality,negative,The food was lousy - too sweet or too salty an...
4,1,4,options,negative,The food was lousy - too sweet or too salty an...


In [16]:
# handling cases with multiple of same category in a single review
sentiment_conversion_dict = {'positive': 2, 'neutral': 1, 'negative': 0}
def polarity_number_conversion(input_string):
    if pd.notna(input_string):
        if 'positive' in input_string:
            return '2'
        elif 'negative' in input_string:
            return '0'
        elif 'neutral' in input_string:
            return '1'
    else:
        return '-1'

df['Review_Opinion_Polarity'] = df['Review_Opinion_Polarity'].apply(polarity_number_conversion)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Review_Opinion_Polarity'] = df['Review_Opinion_Polarity'].apply(polarity_number_conversion)


In [17]:
df.head(12)

Unnamed: 0,Review_ID#,Review_SentenceID,Review_Opinion_Category,Review_Opinion_Polarity,Review_Text
0,1,1,general,0,Judging from previous posts this used to be a ...
1,1,2,service,0,"We, there were four of us, arrived at noon - t..."
2,1,3,service,0,"They never brought us complimentary noodles, i..."
3,1,4,quality,0,The food was lousy - too sweet or too salty an...
4,1,4,options,0,The food was lousy - too sweet or too salty an...
5,1,5,service,0,"After all that, they complained to me about th..."
6,1,6,general,0,Avoid this place!
7,2,7,quality,2,"I have eaten at Saul, many times, the food is ..."
8,2,8,general,2,Saul is the best restaurant on Smith Street an...
9,2,9,quality,2,The duck confit is always amazing and the foie...


In [18]:
# Convert 'Review_Opinion_Polarity' to a string
df['Review_Opinion_Polarity'] = df['Review_Opinion_Polarity'].astype(str)

# Group by 'Review_SentenceID' and aggregate data for each review
df_grouped = df.groupby('Review_SentenceID').agg({
    'Review_ID#': 'first',
    'Review_Opinion_Category': lambda x: ', '.join(x),
    'Review_Opinion_Polarity': lambda x: [int(p) for sublist in [p.split(', ') for p in x] for p in sublist if p],
    'Review_Text': 'first'
}).reset_index()

# Group by 'Review_ID#' and aggregate data for each review
grouped_df = df_grouped.groupby('Review_ID#').agg({
    'Review_Opinion_Category': lambda x: list(x),
    'Review_Opinion_Polarity': lambda x: list(x),
    'Review_Text': lambda x: ' '.join(x)  # Concatenate review texts
}).reset_index()

grouped_df.head()

Unnamed: 0,Review_ID#,Review_Opinion_Category,Review_Opinion_Polarity,Review_Text
0,1,"[general, service, service, quality, options, ...","[[0], [0], [0], [0, 0], [0], [0]]",Judging from previous posts this used to be a ...
1,2,"[quality, general, quality, quality, options, ...","[[2], [2], [2, 2], [2, 2], [2, 2]]","I have eaten at Saul, many times, the food is ..."
2,3,"[general, service, service, quality, quality, ...","[[0], [0], [0], [1], [0, 0, 0], [0]]",I was very disappointed with this restaurant. ...
3,4,"[general, service, quality, price, general]","[[2], [2, 2, 2], [2]]","Went on a 3 day oyster binge, with Fish bringi..."
4,5,"[general, quality, service, general, quality, ...","[[2], [2, 2, 2], [2], [2]]",Every time in New York I make it a point to vi...


In [19]:
# flatten the 'Review_Opinion_Polarity' list
grouped_df['Review_Opinion_Polarity'] = grouped_df['Review_Opinion_Polarity'].apply(lambda x: [item for sublist in x for item in sublist])


In [20]:
grouped_df.head()

Unnamed: 0,Review_ID#,Review_Opinion_Category,Review_Opinion_Polarity,Review_Text
0,1,"[general, service, service, quality, options, ...","[0, 0, 0, 0, 0, 0, 0]",Judging from previous posts this used to be a ...
1,2,"[quality, general, quality, quality, options, ...","[2, 2, 2, 2, 2, 2, 2, 2]","I have eaten at Saul, many times, the food is ..."
2,3,"[general, service, service, quality, quality, ...","[0, 0, 0, 1, 0, 0, 0, 0]",I was very disappointed with this restaurant. ...
3,4,"[general, service, quality, price, general]","[2, 2, 2, 2, 2]","Went on a 3 day oyster binge, with Fish bringi..."
4,5,"[general, quality, service, general, quality, ...","[2, 2, 2, 2, 2, 2]",Every time in New York I make it a point to vi...


In [21]:
import re

def generate_category_polarity_mapping(row):
    category_polarity_dict = {}
    category_count = {}

    # ensure each entry is a list
    categories = row['Review_Opinion_Category']
    if not isinstance(categories, list):
        categories = [categories]

    # convert polarity values to integers
    polarities = [int(p) for p in row['Review_Opinion_Polarity']]

    # split categories using a regex pattern that considers commas and spaces
    split_pattern = re.compile(r',\s*|\s+')
    categories = [category for sublist in [split_pattern.split(cat) for cat in categories] for category in sublist]

    for category, polarity in zip(categories, polarities):
        # if the category is not in the dictionary, initialize with the current polarity and count
        if category not in category_count:
            category_count[category] = {'sum': polarity, 'count': 1}
        else:
            # Update the sum of polarity and count for the category
            category_count[category]['sum'] += polarity
            category_count[category]['count'] += 1

    # calculate the average polarity for each category and round to the nearest integer
    for category, values in category_count.items():
        average_polarity = round(values['sum'] / values['count'])
        category_polarity_dict[category] = int(average_polarity)

    return category_polarity_dict

# Apply the function to the DataFrame
grouped_df['Category_Polarity_Mapping'] = grouped_df.apply(generate_category_polarity_mapping, axis=1)

In [22]:
grouped_df.head()

Unnamed: 0,Review_ID#,Review_Opinion_Category,Review_Opinion_Polarity,Review_Text,Category_Polarity_Mapping
0,1,"[general, service, service, quality, options, ...","[0, 0, 0, 0, 0, 0, 0]",Judging from previous posts this used to be a ...,"{'general': 0, 'service': 0, 'quality': 0, 'op..."
1,2,"[quality, general, quality, quality, options, ...","[2, 2, 2, 2, 2, 2, 2, 2]","I have eaten at Saul, many times, the food is ...","{'quality': 2, 'general': 2, 'options': 2, 'pr..."
2,3,"[general, service, service, quality, quality, ...","[0, 0, 0, 1, 0, 0, 0, 0]",I was very disappointed with this restaurant. ...,"{'general': 0, 'service': 0, 'quality': 0}"
3,4,"[general, service, quality, price, general]","[2, 2, 2, 2, 2]","Went on a 3 day oyster binge, with Fish bringi...","{'general': 2, 'service': 2, 'quality': 2, 'pr..."
4,5,"[general, quality, service, general, quality, ...","[2, 2, 2, 2, 2, 2]",Every time in New York I make it a point to vi...,"{'general': 2, 'quality': 2, 'service': 2}"


In [23]:
columns_to_drop = ['Review_Opinion_Category', 'Review_Opinion_Polarity']
grouped_df = grouped_df.drop(columns = columns_to_drop)

THE FINAL DF

In [24]:
final_df = pd.DataFrame(columns=['Review_Text', 'aspect1', 'sentiment1', 'aspect2', 'sentiment2', 'aspect3', 'sentiment3', 'aspect4', 'sentiment4', 'aspect5', 'sentiment5'])

# Iterate over each row in the original DataFrame
for index, row in grouped_df.iterrows():
    # Initialize lists to store aspects and sentiments
    aspects = []
    sentiments = []

    # Iterate over the keys and values in the Category_Polarity_Mapping dictionary
    for aspect, sentiment in row['Category_Polarity_Mapping'].items():
        aspects.append(aspect)
        sentiments.append(sentiment)

    # Pad the lists with None values to ensure a consistent length
    aspects += [None] * (5 - len(aspects))
    sentiments += [None] * (5 - len(sentiments))

    final_df.loc[index] = [row['Review_Text'], aspects[0], sentiments[0], aspects[1], sentiments[1], aspects[2], sentiments[2], aspects[3], sentiments[3], aspects[4], sentiments[4]]

In [25]:
final_df.shape

(254, 11)

In [26]:
final_df.head()

Unnamed: 0,Review_Text,aspect1,sentiment1,aspect2,sentiment2,aspect3,sentiment3,aspect4,sentiment4,aspect5,sentiment5
0,Judging from previous posts this used to be a ...,general,0,service,0,quality,0,options,0.0,,
1,"I have eaten at Saul, many times, the food is ...",quality,2,general,2,options,2,price,2.0,,
2,I was very disappointed with this restaurant. ...,general,0,service,0,quality,0,,,,
3,"Went on a 3 day oyster binge, with Fish bringi...",general,2,service,2,quality,2,price,2.0,,
4,Every time in New York I make it a point to vi...,general,2,quality,2,service,2,,,,


In [27]:
aspect_conversion_dict = {'quality': 0, 'price': 1, 'service': 2, 'options': 3, 'general': 4}

# Convert aspect columns using replace and fill missing values with 0
aspect_columns = ['aspect1', 'aspect2', 'aspect3', 'aspect4', 'aspect5']
for column in aspect_columns:
    # Ensure the column is of type object to handle None properly
    final_df[column] = final_df[column].replace(aspect_conversion_dict).fillna(-1).astype(int)


In [28]:
final_df.head()

Unnamed: 0,Review_Text,aspect1,sentiment1,aspect2,sentiment2,aspect3,sentiment3,aspect4,sentiment4,aspect5,sentiment5
0,Judging from previous posts this used to be a ...,4,0,2,0,0,0,3,0.0,-1,
1,"I have eaten at Saul, many times, the food is ...",0,2,4,2,3,2,1,2.0,-1,
2,I was very disappointed with this restaurant. ...,4,0,2,0,0,0,-1,,-1,
3,"Went on a 3 day oyster binge, with Fish bringi...",4,2,2,2,0,2,1,2.0,-1,
4,Every time in New York I make it a point to vi...,4,2,0,2,2,2,-1,,-1,


In [29]:
# creating unique labels by combining aspect and sentiment values
aspect_conversion_dict = {'quality': 0, 'price': 1, 'service': 2, 'options': 3, 'general': 4}
polarity_conversion_dict = {'negative': 0, 'neutral': 1, 'positive': 2}
num_p = len(polarity_conversion_dict)

a1 = final_df['aspect1']
s1 = final_df['sentiment1']
final_df['labels1'] = a1*num_p + s1 + 1

a2 = final_df['aspect2']
s2 = final_df['sentiment2']

final_df['labels2'] = a2*num_p + s2 + 1

a3 = final_df['aspect3']
s3 = final_df['sentiment3']

final_df['labels3'] = a3*num_p + s3 + 1

a4 = final_df['aspect4']
s4 = final_df['sentiment4']

final_df['labels4'] = a4*num_p + s4 + 1

a5 = final_df['aspect5']
s5 = final_df['sentiment5']

final_df['labels5'] = a5*num_p + s5 + 1

final_df = final_df.where(pd.notna(final_df),0)

final_df.head()

Unnamed: 0,Review_Text,aspect1,sentiment1,aspect2,sentiment2,aspect3,sentiment3,aspect4,sentiment4,aspect5,sentiment5,labels1,labels2,labels3,labels4,labels5
0,Judging from previous posts this used to be a ...,4,0,2,0,0,0,3,0,-1,0,13,7,1,10,0
1,"I have eaten at Saul, many times, the food is ...",0,2,4,2,3,2,1,2,-1,0,3,15,12,6,0
2,I was very disappointed with this restaurant. ...,4,0,2,0,0,0,-1,0,-1,0,13,7,1,0,0
3,"Went on a 3 day oyster binge, with Fish bringi...",4,2,2,2,0,2,1,2,-1,0,15,9,3,6,0
4,Every time in New York I make it a point to vi...,4,2,0,2,2,2,-1,0,-1,0,15,3,9,0,0


In [30]:
final_df.columns

Index(['Review_Text', 'aspect1', 'sentiment1', 'aspect2', 'sentiment2',
       'aspect3', 'sentiment3', 'aspect4', 'sentiment4', 'aspect5',
       'sentiment5', 'labels1', 'labels2', 'labels3', 'labels4', 'labels5'],
      dtype='object')

## Model Preparation

In [31]:
classifier = nn.Sequential(nn.Dropout(), nn.Linear(768, 16), nn.ReLU())
classifier2 = nn.Sequential(nn.Dropout(), nn.Linear(768, 16), nn.ReLU())

In [36]:
class BertForABSA(nn.Module):
    def __init__(self, bert, num_labels=15):
        super(BertForABSA, self).__init__()
        self.bert = bert
        self.classifier = classifier
        # self.dropout = nn.Dropout()
        # self.classifier = nn.Linear(768, num_labels)
        # self.relu = nn.ReLU()

    def forward(self, input_ids, token_type_ids, attention_mask):
        _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask,
                                     return_dict=False)

        logits = self.classifier(pooled_output)
#         pooled_output = self.dropout(pooled_output)
#         logits = self.classifier(pooled_output)
#         logits = self.relu(logits)

        return logits

## Dataset and DataLoaders

In [37]:
class ReviewsDataset(Dataset):
    def __init__(self, train_data, tokenizer, label_col, max_sequence_len=96, as_float=False):
        self.as_float = as_float
        print("Starting Process ...")
        labels = list(train_data[label_col].values)
        # Number of exmaples.
        self.n_examples = len(labels)
        # Use tokenizer on texts. This can take a while.
        print('Using tokenizer on all texts ...')

        texts = list(train_data['Review_Text'].values)
        self.inputs = tokenizer(texts, add_special_tokens=True, \
                                truncation=True, padding=True, \
                                return_tensors='pt')

        # Get maximum sequence length.
        self.sequence_len = self.inputs['input_ids'].shape[-1]
        print('Texts padded or truncated to %d length!' % self.sequence_len)
        # Add labels.
        # Handle None values in labels
        self.labels = torch.tensor(labels)
        #self.labels = torch.tensor([label if label is not None else 0 for label in labels], dtype=torch.float)
        #self.labels = torch.tensor([1.0 if label is not None and label > 0 else 0.0 for label in labels], dtype=torch.float)
        print('Finished!\n')

    def __len__(self):
        return self.n_examples

    def __getitem__(self, i):
        if self.as_float:
            return {key: self.inputs[key][i] for key in self.inputs.keys()}, self.labels[i].to(torch.float)
        else:
            return {key: self.inputs[key][i] for key in self.inputs.keys()}, self.labels[i]

In [34]:
final_df.head()

Unnamed: 0,Review_Text,aspect1,sentiment1,aspect2,sentiment2,aspect3,sentiment3,aspect4,sentiment4,aspect5,sentiment5,labels1,labels2,labels3,labels4,labels5
0,Judging from previous posts this used to be a ...,4,0,2,0,0,0,3,0,-1,0,13,7,1,10,0
1,"I have eaten at Saul, many times, the food is ...",0,2,4,2,3,2,1,2,-1,0,3,15,12,6,0
2,I was very disappointed with this restaurant. ...,4,0,2,0,0,0,-1,0,-1,0,13,7,1,0,0
3,"Went on a 3 day oyster binge, with Fish bringi...",4,2,2,2,0,2,1,2,-1,0,15,9,3,6,0
4,Every time in New York I make it a point to vi...,4,2,0,2,2,2,-1,0,-1,0,15,3,9,0,0


In [35]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert = BertModel.from_pretrained("bert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [38]:
dataset = ReviewsDataset(final_df, tokenizer, 'labels1')

Starting Process ...
Using tokenizer on all texts ...
Texts padded or truncated to 218 length!
Finished!



In [39]:
inputs, labels = dataset[0]
print(inputs, labels)

{'input_ids': tensor([  101, 13325,  2013,  3025,  8466,  2023,  2109,  2000,  2022,  1037,
         2204,  2173,  1010,  2021,  2025,  2151,  2936,  1012,  2057,  1010,
         2045,  2020,  2176,  1997,  2149,  1010,  3369,  2012, 11501,  1011,
         1996,  2173,  2001,  4064,  1011,  1998,  1996,  3095,  6051,  2066,
         2057,  2020, 16625,  2006,  2068,  1998,  2027,  2020,  2200, 12726,
         1012,  2027,  2196,  2716,  2149, 19394,  5649, 27130,  1010,  6439,
         5567, 11186,  2005,  5699,  1010,  1998,  4711,  2256, 10447,  2006,
         1996,  2795,  1012,  1996,  2833,  2001, 10223,  6508,  1011,  2205,
         4086,  2030,  2205, 23592,  1998,  1996,  8810,  4714,  1012,  2044,
         2035,  2008,  1010,  2027, 10865,  2000,  2033,  2055,  1996,  2235,
         5955,  1012,  4468,  2023,  2173,   999,   102,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0, 

In [40]:
config.train_size = len(dataset) - config.val_size
train_ds, val_ds = random_split(dataset, [config.train_size, config.val_size])

In [41]:
train_loader = DataLoader(train_ds, config.batch_size, shuffle=True,
                          num_workers=config.num_workers,
                          pin_memory=config.pin_memory)


val_loader = DataLoader(val_ds, config.batch_size, shuffle=False,
                        num_workers=config.num_workers,
                        pin_memory=config.pin_memory)

In [42]:
for input, label in train_loader:
    print(input)
    print()
    print(label)
    break

  self.pid = os.fork()


{'input_ids': tensor([[  101,  1996, 28305,  ...,     0,     0,     0],
        [  101,  2673,  2001,  ...,     0,     0,     0],
        [  101,  2001, 10889,  ...,     0,     0,     0],
        ...,
        [  101,  2023,  2173,  ...,     0,     0,     0],
        [  101,  2307,  2796,  ...,     0,     0,     0],
        [  101,  1045,  2031,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

tensor([12, 15,  2,  2,  9,  6,  3,  2])


In [43]:
def get_one_hot(outputs, k=2):
    outputs = outputs.detach()
    x = torch.topk(outputs, k)
    for i, t in enumerate(outputs):
#         x = torch.topk(t, k[i])
        for j, _ in enumerate(t):
            if j in x.indices[i]:
                t[j] = 1
            else:
                t[j] = 0
            outputs[i] = t

    outputs.requires_grad = True
    return outputs.to(torch.float)

def get_accuracy(outputs, labels):
    preds = torch.argmax(outputs, dim=1)
    return (preds == labels).sum()

def one_hot_acc(one_hot_outputs, labels):
    result = torch.all(one_hot_outputs.eq(labels))
    return result.sum()

In [44]:
model = BertForABSA(bert)

## Training our Model(s)

In [45]:
def train(model, train_dataloader, val_dataloader, learning_rate, epochs):
    # track the time and history
    start = perf_counter()
    history = []
    # check for cuda use
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    if use_cuda:
        torch.cuda.empty_cache()

    # prepare optimizer, loss-criterion and lr-scheduler
#     criterion = nn.BCELoss()
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
    num_training_steps = epochs * len(train_dataloader)
    lr_scheduler = get_scheduler(name="linear", optimizer=optimizer,
                                 num_warmup_steps=0,
                                 num_training_steps=num_training_steps)

    # prepare a progress bar
    progress_bar = tqdm(range(num_training_steps))

    # move model to the GPU
    if use_cuda:
            # linear = linear.cuda()
            model = model.cuda()
            criterion = criterion.cuda()

    # Start Epoch wise training
    for epoch_num in range(epochs):
        epoch_start = perf_counter()
        total_acc_train = 0
        total_loss_train = 0

        # Training Phase
        model.train()
        for inputs, label in train_dataloader:
            label = label.to(torch.long)
            label = label.to(device)
            # label = torch.argmax(label, dim=1)

            inputs['attention_mask'] = inputs['attention_mask'].to(device)
            inputs['input_ids'] = inputs['input_ids'].to(device)
            inputs['token_type_ids'] = inputs['token_type_ids'].to(device)

            outputs = model(**inputs)
            ## k = label.sum(dim=1).to(torch.int)
            ## assert len(k) == config.batch_size
#           ##  one_hot_outputs = get_one_hot(outputs)

            # loss calculation step
            batch_loss = criterion(outputs, label)
            # batch_loss = criterion(one_hot_outputs, label)
            batch_loss.backward()

            # collate losses and acc.
            total_loss_train += batch_loss.item()
            acc = get_accuracy(outputs, label)
            # acc = one_hot_acc(one_hot_outputs, label)
            total_acc_train += acc

            # Update step
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

        # Validation Phase
        total_acc_val = 0
        total_loss_val = 0
        model.eval()
        with torch.no_grad():
            for val_input, val_label in val_dataloader:
                val_label = val_label.to(torch.long)
                val_label = val_label.to(device)
#                 val_label = torch.argmax(val_label, dim=1)

                val_input['attention_mask'] = val_input['attention_mask'].to(device)
                val_input['input_ids'] = val_input['input_ids'].to(device)
                val_input['token_type_ids'] = val_input['token_type_ids'].to(device)

                outputs = model(**val_input)
#                 k = label.sum(dim=1).to(torch.int)
#                 assert len(k) == config.batch_size
#                 one_hot_outputs = get_one_hot(outputs)

                batch_loss = criterion(outputs, val_label)
#                 batch_loss = criterion(one_hot_outputs, val_label)
                total_loss_val += batch_loss.item()

                acc = get_accuracy(outputs, val_label)
#                 acc = one_hot_acc(one_hot_outputs, label)
                total_acc_val += acc

        # measure epoch-time
        epoch_time = perf_counter() - epoch_start

        # print results
        print(f'\nEpochs: {epoch_num + 1}/{epochs} | Train Loss: {total_loss_train / len(train_ds): .3f} \
| Train Accuracy: {total_acc_train / len(train_ds): .3f} | Val Loss: {total_loss_val / len(val_ds): .3f} \
| Val Accuracy: {total_acc_val / len(val_ds): .3f} | Epoch Time: {epoch_time//60:.0f}m {epoch_time%60:.2f}s')

        # store results
        result = {'epoch': epoch_num + 1,
                  'train_loss': total_loss_train / len(train_ds),
                  'train_acc': total_acc_train / len(train_ds),
                  'val_loss': total_loss_val / len(val_ds),
                  'val_acc': total_acc_val / len(val_ds),
                  'epoch_time': epoch_time
                 }
        history.append(result)

    time_taken = perf_counter() - start
    print(f"\nTime Taken to train the model: {time_taken//60:.0f}m {time_taken%60:.2f}s")

    return history

In [46]:
def train_together(model, train_dataloader, val_dataloader, learning_rate, epochs):
    # track the time and history
    start = perf_counter()
    history = []
    # check for cuda use
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    if use_cuda:
        torch.cuda.empty_cache()

    # prepare optimizer, loss-criterion and lr-scheduler
    criterion = nn.BCELoss()
#     criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
    num_training_steps = epochs * len(train_dataloader)
    lr_scheduler = get_scheduler(name="linear", optimizer=optimizer,
                                 num_warmup_steps=0,
                                 num_training_steps=num_training_steps)

    # prepare a progress bar
    progress_bar = tqdm(range(num_training_steps))

    # move model to the GPU
    if use_cuda:
            # linear = linear.cuda()
            model = model.cuda()
            criterion = criterion.cuda()

    # Start Epoch wise training
    for epoch_num in range(epochs):
        epoch_start = perf_counter()
        total_acc_train = 0
        total_loss_train = 0

        # Training Phase
        model.train()
        for inputs, label in train_dataloader:
#             label = label.to(torch.long)
            label = label.to(device)
            # label = torch.argmax(label, dim=1)

            inputs['attention_mask'] = inputs['attention_mask'].to(device)
            inputs['input_ids'] = inputs['input_ids'].to(device)
            inputs['token_type_ids'] = inputs['token_type_ids'].to(device)

            outputs = model(**inputs)
            ## k = label.sum(dim=1).to(torch.int)
            ## assert len(k) == config.batch_size
            one_hot_outputs = get_one_hot(outputs)

            # loss calculation step
#             batch_loss = criterion(outputs, label)
            batch_loss = criterion(one_hot_outputs, label)
            batch_loss.backward()

            # collate losses and acc.
            total_loss_train += batch_loss.item()
#             acc = get_accuracy(outputs, label)
            try:
                acc = one_hot_acc(one_hot_outputs, label)
            except:
                acc = 0
            total_acc_train += acc

            # Update step
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

        # Validation Phase
        total_acc_val = 0
        total_loss_val = 0
        model.eval()
        with torch.no_grad():
            for val_input, val_label in val_dataloader:
#                 val_label = val_label.to(torch.long)
                val_label = val_label.to(device)
#                 val_label = torch.argmax(val_label, dim=1)

                val_input['attention_mask'] = val_input['attention_mask'].to(device)
                val_input['input_ids'] = val_input['input_ids'].to(device)
                val_input['token_type_ids'] = val_input['token_type_ids'].to(device)

                outputs = model(**val_input)
#                 k = label.sum(dim=1).to(torch.int)
#                 assert len(k) == config.batch_size
                one_hot_outputs = get_one_hot(outputs)

#                 batch_loss = criterion(outputs, val_label)
                batch_loss = criterion(one_hot_outputs, val_label)
                total_loss_val += batch_loss.item()

#                 acc = get_accuracy(outputs, val_label)
                try:
                    acc = one_hot_acc(one_hot_outputs, label)
                except:
                    acc = 0
                total_acc_val += acc

        # measure epoch-time
        epoch_time = perf_counter() - epoch_start

        # print results
        print(f'\nEpochs: {epoch_num + 1}/{epochs} | Train Loss: {total_loss_train / len(train_ds): .3f} \
| Train Accuracy: {total_acc_train / len(train_ds): .3f} | Val Loss: {total_loss_val / len(val_ds): .3f} \
| Val Accuracy: {total_acc_val / len(val_ds): .3f} | Epoch Time: {epoch_time//60:.0f}m {epoch_time%60:.2f}s')

        # store results
        result = {'epoch': epoch_num + 1,
                  'train_loss': total_loss_train / len(train_ds),
                  'train_acc': total_acc_train / len(train_ds),
                  'val_loss': total_loss_val / len(val_ds),
                  'val_acc': total_acc_val / len(val_ds),
                  'epoch_time': epoch_time
                 }
        history.append(result)

    time_taken = perf_counter() - start
    print(f"\nTime Taken to train the model: {time_taken//60:.0f}m {time_taken%60:.2f}s")

    return history

### Training 1st model for 1st AE and ASC

In [47]:
history = train(model, train_loader, val_loader, 5e-5, 5)

  0%|          | 0/130 [00:00<?, ?it/s]

KeyboardInterrupt: 

### Training 2nd model for 2nd AE and ASC

In [None]:
final_df.head()

Unnamed: 0,Review_Text,aspect1,sentiment1,aspect2,sentiment2,aspect3,sentiment3,aspect4,sentiment4,aspect5,sentiment5,labels1,labels2,labels3,labels5
0,Judging from previous posts this used to be a ...,5,1,3,1,1,1,4,1.0,1,,4,2,0,
1,"I have eaten at Saul, many times, the food is ...",1,3,5,3,4,3,2,3.0,1,,2,14,11,
2,I was very disappointed with this restaurant. ...,5,1,3,1,1,1,1,,1,,4,2,0,
3,"Went on a 3 day oyster binge, with Fish bringi...",5,3,3,3,1,3,2,3.0,1,,14,8,2,
4,Every time in New York I make it a point to vi...,5,3,1,3,3,3,1,,1,,14,2,8,


In [None]:
dataset2 = ReviewsDataset(final_df, tokenizer, label_col='labels2')
train_ds2, val_ds2 = random_split(dataset2, [config.train_size, config.val_size])

Starting Process ...
Using tokenizer on all texts ...
Texts padded or truncated to 218 length!
Finished!



In [None]:
i, l = train_ds2[0]
print(l)

tensor(1.)


In [None]:
train_loader2 = DataLoader(train_ds2, config.batch_size, shuffle=True,
                          num_workers=config.num_workers,
                          pin_memory=config.pin_memory)


val_loader2 = DataLoader(val_ds2, config.batch_size, shuffle=False,
                        num_workers=config.num_workers,
                        pin_memory=config.pin_memory)

In [None]:
bert2 = BertModel.from_pretrained("bert-base-uncased")
model2 = BertForABSA(bert2)

In [None]:
history2 = train(model2, train_loader2, val_loader2, 5e-5, 10)

### Training Together

In [None]:
# df.head()

In [None]:
# dataset3 = ReviewsDataset(df, tokenizer, 'one_hot', as_float=True)
# train_ds3, val_ds3 = random_split(dataset3, [config.train_size, config.val_size])

In [None]:
# list(df['one_hot'].values)[0]

In [None]:
# i, l = train_ds3[0]
# print(l)

In [None]:
# train_loader3 = DataLoader(train_ds3, config.batch_size, shuffle=True,
#                           num_workers=config.num_workers,
#                           pin_memory=config.pin_memory)


# val_loader3 = DataLoader(val_ds3, config.batch_size, shuffle=False,
#                         num_workers=config.num_workers,
#                         pin_memory=config.pin_memory)

In [None]:
# bert3 = BertModel.from_pretrained("bert-base-uncased")
# model3 = BertForABSA(bert3)

In [None]:
# for i, l in train_loader3:
#     print(l)
#     break

In [None]:
# history3 = train_together(model3, train_loader3, val_loader3, 5e-5, 5)

## Prediction

In [None]:
def predict(sentence):
    inputs = tokenizer(sentence, add_special_tokens=True, \
                                truncation=True, padding=True, \
                                return_tensors='pt')

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    inputs = {k: v.to(device) for k, v in inputs.items()}

    output1 = model(**inputs)
    preds1 = torch.argmax(output1, dim=1)

    output2 = model2(**inputs)
    preds2 = torch.argmax(output2, dim=1)

    a1 = preds1.item()//3
    s1 = preds1.item()%3

    a2 = preds2.item()//3
    s2 = preds2.item()%3

    aspect_conversion_dict = {'quality': 0, 'price': 1, 'service': 2, 'options': 3, 'general': 4}
    sentiment_conversion_dict = {2: 'positive', 1: 'neutral', 0: 'negative'}

    aspect1 = aspect_conversion_dict[a1]
    sentiment1 = sentiment_conversion_dict[s1]

    aspect2 = aspect_conversion_dict[a2]
    sentiment2 = sentiment_conversion_dict[s2]

    prediction = {aspect1: sentiment1, aspect2: sentiment2}

    return prediction #, a1, a2, s1, s2

In [None]:
# def predict_together(sentence):
#     inputs = tokenizer(sentence, add_special_tokens=True, \
#                                 truncation=True, padding=True, \
#                                 return_tensors='pt')

#     use_cuda = torch.cuda.is_available()
#     device = torch.device("cuda" if use_cuda else "cpu")
#     inputs = {k: v.to(device) for k, v in inputs.items()}

#     outputs = model(**inputs)
#     oh = get_one_hot(outputs)


In [None]:
sentence = "Only quibbles are so-so wine service, and while Prix-fixe is reasonable at $68, extra charges for additional dishes/tastings can be high."
# inputs = tokenizer(sentence)
# print(inputs)
prediction = predict(sentence)
print(prediction)

KeyError: ignored

In [None]:
test_df = pd.read_xml(config.test_path)
test_df.head()

AttributeError: ignored

In [None]:
sents_aspects = test_df['text'].apply(predict)

In [None]:
# APPEND ASPECTS AND POLARITIES IN THE TEST DATAFRAME
aspect1 = []
aspect2 = []
sentiment1 = []
sentiment2 = []
for result in sents_aspects:
    aspects = list(result.keys())
    sentiments = list(result.values())

    aspect1.append(aspects[0])
    if len(aspects) == 2:
        aspect2.append(aspects[1])
    else:
        aspect2.append(None)

    sentiment1.append(sentiments[0])
    if len(sentiments) == 2:
        sentiment2.append(sentiments[1])
    else:
        sentiment2.append(None)

In [None]:
test_df['aspect1'] = aspect1
test_df['sentiment1'] = sentiment1
test_df['aspect2'] = aspect2
test_df['sentiment2'] = sentiment2

In [None]:
test_df

### Saving Result to XML file

In [None]:
root = minidom.Document()
sentences = root.createElement('sentences')

for text, aspect1, sentiment1, aspect2, sentiment2 in zip(test_df['text'], test_df['aspect1'], test_df['sentiment1'], test_df['aspect2'], test_df['sentiment2']):
    sentence = root.createElement('sentence')
    text_child = root.createElement('text')
    text_child.appendChild(root.createTextNode(text))

    aspect_cats_child = root.createElement('aspectCategories')

    aspect_cat_child1 = root.createElement('aspectCategory')
    aspect_cat_child1.setAttribute('category', aspect1)
    aspect_cat_child1.setAttribute('polarity', sentiment1)

    aspect_cat_child2 = root.createElement('aspectCategory')
    aspect_cat_child2.setAttribute('category', aspect2)
    aspect_cat_child2.setAttribute('polarity', sentiment2)

    aspect_cats_child.appendChild(aspect_cat_child1)
    aspect_cats_child.appendChild(aspect_cat_child2)
    sentence.appendChild(text_child)
    sentence.appendChild(aspect_cats_child)

    sentences.appendChild(sentence)


root.appendChild(sentences)
test_xml = root.toprettyxml(indent ="\t")

with open(config.output_path, "w") as f:
    f.write(test_xml)

## Plotting Results

In [None]:
train_losses = [x['train_loss'] for x in history]
val_losses = [x['val_loss'] for x in history]
epochs = [x['epoch'] for x in history]

plt.locator_params(axis='x', nbins=5)
plt.plot(epochs, train_losses, label='Train-Losses')
plt.plot(epochs, val_losses, label='Validation-Losses')
plt.xlabel('Epcohs')
plt.ylabel('Losses')
plt.title('Losses vs Epochs')
plt.legend()

# print(train_loss)

In [None]:
def plot(history, name="HistoryPlot", figsize=(20, 9)):
    fig = plt.figure(figsize=figsize)
    epochs = [x['epoch'] for x in history]

    # Plotting Losses
    ax1 = fig.add_subplot(121)
    ax1.locator_params(axis='x', nbins=5)
    train_losses = [x['train_loss'] for x in history]
    val_losses = [x['val_loss'] for x in history]
    ax1.plot(epochs, train_losses, label='Train-Losses')
    ax1.plot(epochs, val_losses, label='Validation-Losses')
    plt.xlabel('Epochs')
    plt.ylabel('Losses')
    plt.title('Losses vs Epochs')
    plt.legend()

    # Plotting Accuracies
    ax2 = fig.add_subplot(122)
    ax2.locator_params(axis='x', nbins=5)
    train_accs = [x['train_acc'].cpu() for x in history]
    val_accs = [x['val_acc'].cpu() for x in history]
    ax2.plot(epochs, train_accs, label='Train-Accuracies')
    ax2.plot(epochs, val_accs, label='Validation-Accuracies')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracies')
    plt.title('Accuracies vs Epochs')
    plt.legend()

    fig.savefig('./'+name+".jpg")
    plt.show()

In [None]:
plot(history, name="HistoryPlot1")

In [None]:
plot(history2, name="HistoryPlot2")

## Save Model

In [None]:
import os
os.mkdir('./models')
torch.save(model.state_dict(), './models/model1.pt')
torch.save(model2.state_dict(), './models/model2.pt')