<a href="https://colab.research.google.com/github/kylehiroyasu/opinion-lab-group-1.3/blob/master/notebooks/Load_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup Notebook

In [1]:
import os
from pathlib import Path
import sys
colab = False

In [2]:
if colab:
    from getpass import getpass
    import urllib
    from google.colab import output

    user = input('User name: ')
    password = getpass('Password: ')
    password = urllib.parse.quote(password) # your password is converted into url format
    repo_name = "kylehiroyasu/opinion-lab-group-1.3"

    cmd_string = 'git clone https://{0}:{1}@github.com/{2}.git'.format(user, password, repo_name)

    os.system(cmd_string)
    # Removing the password from the variable
    cmd_string, password = "", "" 

    # Remove the output of this cell (removes authetication information)
    output.clear()

Change the directory to the repository and pull latest changes (if any). Only needed when you are on Google Colab

In [3]:
if colab:
    %cd opinion-lab-group-1.3/
    ! git pull
    ! ls

Only **execute** the next cells, if you are **local** and you are in the notebooks directory! This is not needed in Google Colab

In [4]:
%cd ..
! ls

/home/ibes222/Documents/Master/NLPLab/GitHub
data  notebooks  opinion  README.md  requirements.txt  src


In [5]:
if colab:
    %pip install -r requirements.txt
    output.clear()

## Constants

In [6]:
ROOT = Path(os.getcwd())
DATA = ROOT/'data'
SRC =  ROOT/'src'
RAW_DATA = DATA/'raw'
RAW_FILES = [
    'ABSA16_Laptops_Train_SB1.xml',
    'ABSA16_Laptops_Test_SB1_GOLD.xml',
    'ABSA16_Restaurants_Train_SB1.xml',
    'ABSA16_Restaurants_Test_SB1_GOLD.xml'
]
print(ROOT)

/home/ibes222/Documents/Master/NLPLab/GitHub


In [7]:
sys.path.append(str(SRC))

## Imports

In [8]:
import numpy as np
import preprocess

## Data Import and Preprocessing

All the data is stored in `data/raw` as `xml` files. The data is stored in an hierarchical format of course with information stored in tags and tag properties.

To make the data easier to work with we've created functionality to denormalize the datasets.

In [9]:
laptops_train = preprocess.load_data_as_df(RAW_DATA/RAW_FILES[0])
laptops_test = preprocess.load_data_as_df(RAW_DATA/RAW_FILES[1])

restaurants_train = preprocess.load_data_as_df(RAW_DATA/RAW_FILES[2])
restaurants_test = preprocess.load_data_as_df(RAW_DATA/RAW_FILES[3])

### Sample

In [10]:
restaurants_train.head()

Unnamed: 0,rid,entity,attribute,polarity,id,text,outofscope
0,1004293,RESTAURANT,GENERAL,negative,1004293:0,Judging from previous posts this used to be a ...,
1,1004293,SERVICE,GENERAL,negative,1004293:1,"We, there were four of us, arrived at noon - t...",
2,1004293,SERVICE,GENERAL,negative,1004293:2,"They never brought us complimentary noodles, i...",
3,1004293,FOOD,QUALITY,negative,1004293:3,The food was lousy - too sweet or too salty an...,
4,1004293,FOOD,STYLE_OPTIONS,negative,1004293:3,The food was lousy - too sweet or too salty an...,


# Model Training



In [11]:
import time
import math

import torch as t
import torch.nn as nn
from torch.utils.data import DataLoader, random_split
from flair.data import Sentence
from flair.embeddings import WordEmbeddings, BertEmbeddings

from Dataset import AspectDataset, dfToDataset, collate
from Model import Model
from Learners import Learner_Classification, Learner_Clustering
from Loss import KCL, MCL, Class2Simi

In [12]:
laptop_entities = {"BATTERY": 0, "COMPANY": 1, "CPU": 2, "DISPLAY": 3, "FANS_COOLING": 4, "GRAPHICS": 5, "HARDWARE": 6, "HARD_DISC": 7, "KEYBOARD": 8, "LAPTOP": 9, "MEMORY": 10, "MOTHERBOARD": 11, "MOUSE": 12, "MULTIMEDIA_DEVICES": 13, "OPTICAL_DRIVES": 14, "OS": 15, "PORTS": 16, "POWER_SUPPLY": 17, "SHIPPING": 18, "SOFTWARE": 19, "SUPPORT": 20, "WARRANTY": 21, "NaN": 22}
laptop_attributes = {"CONNECTIVITY": 0, "DESIGN_FEATURES": 1, "GENERAL": 2, "MISCELLANEOUS": 3, "OPERATION_PERFORMANCE": 4,"PORTABILITY": 5, "PRICE": 6, "QUALITY": 7, "USABILITY": 8, "NaN": 9}
restaurant_entities = {"AMBIENCE": 0, "DRINKS": 1, "FOOD": 2, "LOCATION": 3, "RESTAURANT": 4, "SERVICE": 5, "NaN": 6}
restaurant_attributes = {"GENERAL": 0, "MISCELLANEOUS": 1, "PRICES": 2, "QUALITY": 3, "STYLE_OPTIONS": 4, "NaN": 5}

glove_embeddings = WordEmbeddings('glove')
hidden_dim = 100
output_dim = len(restaurant_entities)

train_dataset = dfToDataset(restaurants_train, restaurant_entities, restaurant_attributes, glove_embeddings)
test_dataset = dfToDataset(restaurants_test, restaurant_entities, restaurant_attributes, glove_embeddings)

In [16]:
param = {
    "embedding_dim": hidden_dim,
    "output_dim": output_dim,
    "epochs": 40,
    "lr": 0.0025,
    "batch_size": 256,
    "cuda": False,
    "use_kcl": True,
    "with_supervised": False,
    "train_entities": True
}

model = Model(param["embedding_dim"], param["output_dim"])

validation_length = math.ceil(len(train_dataset) * 0.1)
train_length = len(train_dataset) - validation_length
datasetsList = random_split(train_dataset, [train_length, validation_length])
train_dataset = datasetsList[0]
validation_dataset = datasetsList[1]

dataloader = DataLoader(train_dataset, batch_size=param["batch_size"], shuffle=True, collate_fn=collate)
validloader = DataLoader(validation_dataset, batch_size=param["batch_size"], collate_fn=collate)

optimizer = t.optim.Adam(model.parameters(), lr=param["lr"])
learner_classification = Learner_Classification(nn.CrossEntropyLoss())
if param["use_kcl"]:
    learner_clustering = Learner_Clustering(KCL())
else:
    learner_clustering = Learner_Clustering(MCL())

if param["cuda"] and t.cuda.is_available():
    print("Using GPU")
    device = t.device('cuda:0')
    model.to(device)
else:
    print("Using CPU")
    param["cuda"] = False
    device = t.device('cpu')

for e in range(param["epochs"]):
    print("Epoch:", e)
    model.train()
    agg_cost = None
    for sentences, entities, attributes in dataloader:
        #TODO How can we possibly sample different amounts of similar and dissimilar samples?
        if param["train_entities"]:
            target = entities
        else:
            target = attributes
        if param["cuda"]:
            sentences = sentences.cuda()
            target = target.cuda()

        optimizer.zero_grad()
        
        output = []
        for sentence in sentences:
            output.append(model(t.unsqueeze(sentence, dim=0)))
        output = t.cat(output, dim=0)
        

        similarity = Class2Simi(target)
        loss = learner_clustering.calculate_criterion(output, similarity)
        if param["with_supervised"]:
            loss += learner_classification.calculate_criterion(output, target)

        loss.backward()
        if agg_cost is None:
            agg_cost = loss
        else:
            agg_cost += loss
        optimizer.step()
    model.eval()
    print(agg_cost)
    # TODO How do we do the evaluation, if we are not in the supervised case? Assign output to majority label? Compute centroids?


Using CPU
Epoch: 0
tensor(11.7875, grad_fn=<AddBackward0>)
Epoch: 1
tensor(11.4279, grad_fn=<AddBackward0>)
Epoch: 2
tensor(10.8503, grad_fn=<AddBackward0>)
Epoch: 3
tensor(10.4990, grad_fn=<AddBackward0>)
Epoch: 4
tensor(10.1413, grad_fn=<AddBackward0>)
Epoch: 5
tensor(9.7603, grad_fn=<AddBackward0>)
Epoch: 6
tensor(9.2305, grad_fn=<AddBackward0>)
Epoch: 7
tensor(8.7407, grad_fn=<AddBackward0>)
Epoch: 8
tensor(8.3216, grad_fn=<AddBackward0>)
Epoch: 9
tensor(8.0251, grad_fn=<AddBackward0>)
Epoch: 10
tensor(7.9133, grad_fn=<AddBackward0>)
Epoch: 11
tensor(7.7847, grad_fn=<AddBackward0>)
Epoch: 12
tensor(7.6341, grad_fn=<AddBackward0>)
Epoch: 13
tensor(7.5728, grad_fn=<AddBackward0>)
Epoch: 14
tensor(7.3847, grad_fn=<AddBackward0>)
Epoch: 15
tensor(7.4050, grad_fn=<AddBackward0>)
Epoch: 16
tensor(7.3305, grad_fn=<AddBackward0>)
Epoch: 17
tensor(7.4437, grad_fn=<AddBackward0>)
Epoch: 18
tensor(7.2604, grad_fn=<AddBackward0>)
Epoch: 19
tensor(7.2565, grad_fn=<AddBackward0>)
Epoch: 20
tenso