<a href="https://colab.research.google.com/github/kylehiroyasu/opinion-lab-group-1.3/blob/master/notebooks/Load_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup Notebook

In [None]:
import os
from pathlib import Path
import sys

In [None]:

# Clone the private course repository into the Colab runtime using
# interactively entered GitHub credentials.
import subprocess
import urllib.parse
from getpass import getpass

from google.colab import output

user = input('User name: ')
password = getpass('Password: ')
repo_name = "kylehiroyasu/opinion-lab-group-1.3"

# Percent-encode BOTH credentials with safe='' so characters such as
# '/', '@' or ':' cannot break the URL structure.
clone_url = 'https://{0}:{1}@github.com/{2}.git'.format(
    urllib.parse.quote(user, safe=''),
    urllib.parse.quote(password, safe=''),
    repo_name,
)

# Pass an argument list instead of a shell string so no part of the
# credentials is ever interpreted by a shell. The output is captured
# because git's messages may echo the credential-bearing URL.
clone_result = subprocess.run(['git', 'clone', clone_url], capture_output=True)

# Removing the credentials from the notebook namespace.
# NOTE: git still stores the clone URL (including credentials) in the
# repository's .git/config as the `origin` remote.
clone_url, password = "", ""

# Remove the output of this cell (removes authentication information)
output.clear()

# Report failure without printing anything that could contain credentials.
if clone_result.returncode != 0:
    print('git clone failed with exit code {0}'.format(clone_result.returncode))

Change into the repository directory and pull the latest changes (if any). This is only needed when you are on Google Colab.

In [None]:
# Switch the kernel's working directory to the cloned repository.
%cd opinion-lab-group-1.3/
# Pull any commits that were pushed since the clone.
! git pull
# List the directory contents as a quick check that we are in the right place.
! ls

Only **execute** the next cell (`%cd ..`) if you are running **locally** from inside the `notebooks` directory. It is not needed in Google Colab.

In [None]:
# Move the kernel's working directory one level up.
%cd ..
# List the directory contents as a quick check that we are in the right place.
! ls

In [None]:
%pip install -r requirements.txt
output.clear()

## Constants

In [None]:
# Project layout, resolved relative to the kernel's current working directory.
ROOT = Path.cwd()
DATA = ROOT.joinpath('data')
SRC = ROOT.joinpath('src')
RAW_DATA = DATA.joinpath('raw')

# Raw XML files, ordered: laptops train/test, then restaurants train/test.
RAW_FILES = [
    'ABSA16_Laptops_Train_SB1.xml',
    'ABSA16_Laptops_Test_SB1_GOLD.xml',
    'ABSA16_Restaurants_Train_SB1.xml',
    'ABSA16_Restaurants_Test_SB1_GOLD.xml',
]

# Show the resolved root so a wrong working directory is spotted early.
print(ROOT)

In [None]:
# Make the modules in `src` importable. Guarded so that re-running this cell
# does not keep appending duplicate entries to sys.path.
if str(SRC) not in sys.path:
    sys.path.append(str(SRC))

## Imports

In [None]:
import numpy as np
import preprocess

## Data Import and Preprocessing

All the data is stored in `data/raw` as `xml` files. The data is stored in a hierarchical format, with information stored in tags and tag properties.

To make the data easier to work with, we've created functionality to denormalize the datasets.

In [None]:
# Load every raw file in RAW_FILES order (laptops train/test, then
# restaurants train/test) and bind the four results to their names.
(
    laptops_train,
    laptops_test,
    restaurants_train,
    restaurants_test,
) = [preprocess.load_data_as_df(RAW_DATA / raw_name) for raw_name in RAW_FILES]

### Sample

In [None]:
# Display a small sample of the loaded restaurant training data.
restaurants_train.head()

# Model Training



In [None]:
import time
import math

import torch as t
import torch.nn as nn
from torch.utils.data import DataLoader, random_split
from flair.data import Sentence
from flair.embeddings import WordEmbeddings, BertEmbeddings

from Dataset import AspectDataset, dfToDataset
from Model import Model

In [None]:
# Label vocabularies: each label name is mapped to its position in the list.
laptop_entities = {
    label: index
    for index, label in enumerate([
        "BATTERY", "COMPANY", "CPU", "DISPLAY", "FANS_COOLING", "GRAPHICS",
        "HARDWARE", "HARD_DISC", "KEYBOARD", "LAPTOP", "MEMORY", "MOTHERBOARD",
        "MOUSE", "MULTIMEDIA_DEVICES", "OPTICAL_DRIVES", "OS", "PORTS",
        "POWER_SUPPLY", "SHIPPING", "SOFTWARE", "SUPPORT", "WARRANTY",
    ])
}
laptop_attributes = {
    label: index
    for index, label in enumerate([
        "CONNECTIVITY", "DESIGN_FEATURES", "GENERAL", "MISCELLANEOUS",
        "OPERATION_PERFORMANCE", "PORTABILITY", "PRICE", "QUALITY", "USABILITY",
    ])
}
restaurant_entities = {
    label: index
    for index, label in enumerate([
        "AMBIENCE", "DRINKS", "FOOD", "LOCATION", "RESTAURANT", "SERVICE",
    ])
}
restaurant_attributes = {
    label: index
    for index, label in enumerate([
        "GENERAL", "MISCELLANEOUS", "PRICES", "QUALITY", "STYLE_OPTIONS",
    ])
}

# Embeddings and model dimensions; the output size follows the number of
# restaurant entity labels.
glove_embeddings = WordEmbeddings('glove')
hidden_dim = 100
output_dim = len(restaurant_entities)

# Build the restaurant train/test datasets and attach the GloVe embeddings.
train_dataset = dfToDataset(
    restaurants_train, restaurant_entities, restaurant_attributes
)
train_dataset.addEmbeddings(glove_embeddings)

test_dataset = dfToDataset(
    restaurants_test, restaurant_entities, restaurant_attributes
)
test_dataset.addEmbeddings(glove_embeddings)

In [None]:
# Hyperparameters and switches for the training run.
param = {
    "embedding_dim": hidden_dim,
    "output_dim": output_dim,
    "epochs": 0,
    "lr": 0.00025,
    "cuda": False,
    "with_kcl": True,
    "with_supervised": True,
    "train_entities": True
}

network = Model(param["embedding_dim"], param["output_dim"])
if param["cuda"] and t.cuda.is_available():
    print("Using GPU")
    device = t.device('cuda:0')
else:
    print("Using CPU")
    param["cuda"] = False
    device = t.device('cpu')
# Move the model to the selected device (before building the optimizer) so
# that it lives on the same device as the batches fed to it below.
network = network.to(device)

# Hold out 10% (rounded up) of the training data for validation.
# NOTE: `train_dataset` is rebound to the training subset here, so re-running
# this cell without re-running the dataset cell splits the subset again.
validation_length = math.ceil(len(train_dataset) * 0.1)
train_length = len(train_dataset) - validation_length
datasetsList = random_split(train_dataset, [train_length, validation_length])
train_dataset = datasetsList[0]
validation_dataset = datasetsList[1]

dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True)
validloader = DataLoader(validation_dataset, batch_size=128)

optimizer = t.optim.Adam(network.parameters(), lr=param["lr"])
supervised_loss = nn.CrossEntropyLoss()

# NOTE(review): `kcl` and `mcl` are not defined or imported anywhere in the
# visible notebook - confirm where they come from before setting epochs > 0.
for e in range(param["epochs"]):
    network.train()
    for sentences, entities, attributes in dataloader:
        #TODO How can we possibly sample different amounts of similar and dissimilar samples?
        # Train against entity labels or attribute labels, depending on the switch.
        target = entities if param["train_entities"] else attributes
        # No-op on CPU; on GPU this puts the batch on the model's device.
        sentences = sentences.to(device)
        target = target.to(device)
        optimizer.zero_grad()

        # Named `prediction` rather than `output` so the google.colab `output`
        # helper used by other cells is not shadowed.
        prediction = network(sentences)

        if param["with_kcl"]:
            loss = kcl(prediction, target)
        else:
            loss = mcl(prediction, target)
        if param["with_supervised"]:
            loss += supervised_loss(prediction, target)

        loss.backward()
        optimizer.step()
    network.eval()
    # TODO How do we do the evaluation, if we are not in the supervised case? Assign output to majority label? Compute centroids?
