In [None]:
# Report the machine's RAM and GPU so the reader knows what runtime produced the results.
from psutil import virtual_memory
# NOTE(review): the message below says "available", but the value is virtual_memory().total
# (total physical memory), expressed in decimal gigabytes (bytes / 1e9).
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# NOTE(review): Google Drive is mounted only in the high-RAM branch, while a later cell
# builds root_path under /content/drive whenever IN_COLAB is true -- confirm that a
# standard-RAM Colab runtime is not expected to work.
if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!\n')
  from google.colab import drive
  drive.mount('/content/drive')

# IPython shell capture: gpu_info receives the output lines of `nvidia-smi`.
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
# The absence of a GPU is detected by looking for the substring 'failed' in that output.
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Your runtime has 27.3 gigabytes of available RAM

You are using a high-RAM runtime!

Sun Jul 31 23:41:07 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   65C    P8    11W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+--

## Libraries

In [None]:
# Detect whether the notebook is running on Google Colab.
# Only a failed import means "not on Colab"; any other exception is allowed to
# surface instead of being silently swallowed by a bare `except:`.
try:
    import google.colab  # noqa: F401 -- imported only to probe availability
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

In [None]:
if IN_COLAB:
    !pip install transformers
    !pip install datasets
    !pip install evaluate
    !pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.0-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 37.0 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 16.1 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 8.9 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 52.6 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling P

In [None]:
import gc
gc.collect()

import torch

import os
import pandas as pd
import numpy as np

from datasets import Dataset
from datasets.dataset_dict import DatasetDict

from transformers import AutoTokenizer, T5Tokenizer

from transformers import DataCollator, DataCollatorWithPadding, DataCollatorForLanguageModeling, DataCollatorForSeq2Seq
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForCausalLM, AutoModelForSeq2SeqLM, T5ForConditionalGeneration


from evaluate import load

## SemEval 2014 Dataset Load & Preprocessing

In [None]:
from datasets import load_dataset

dataset = load_dataset("Yaxin/SemEval2014")

Downloading builder script:   0%|          | 0.00/7.22k [00:00<?, ?B/s]

No config specified, defaulting to: sem_eval2014/All


Downloading and preparing dataset sem_eval2014/All to /root/.cache/huggingface/datasets/Yaxin___sem_eval2014/All/0.0.1/24f2a9b2b752a3d615013f4dbc87c5d192f5d93ec99a0240cb411698c77fdc48...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/170k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/145k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/4.75k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.48k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/49.8k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/35.0k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset sem_eval2014 downloaded and prepared to /root/.cache/huggingface/datasets/Yaxin___sem_eval2014/All/0.0.1/24f2a9b2b752a3d615013f4dbc87c5d192f5d93ec99a0240cb411698c77fdc48. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
#Check data sample
dataset['train'][10]

{'aspectCategories': [{'category': 'food', 'polarity': 'negative'}],
 'aspectTerms': [{'from': '126',
   'polarity': 'negative',
   'term': 'bacon',
   'to': '131'},
  {'from': '170', 'polarity': 'neutral', 'term': 'plate', 'to': '175'},
  {'from': '18', 'polarity': 'negative', 'term': 'mayonnaise', 'to': '28'},
  {'from': '41', 'polarity': 'negative', 'term': 'toast', 'to': '46'},
  {'from': '57', 'polarity': 'negative', 'term': 'ingredients', 'to': '68'},
  {'from': '73', 'polarity': 'neutral', 'term': 'cheese', 'to': '79'},
  {'from': '86', 'polarity': 'neutral', 'term': 'omelet', 'to': '92'}],
 'domain': 'restaurants',
 'sentenceId': '296',
 'text': 'They did not have mayonnaise, forgot our toast, left out ingredients (ie cheese in an omelet), below hot temperatures and the bacon was so over cooked it crumbled on the plate when you touched it.'}

In [None]:
#Convert huggingface datasets to pandas for data preprocessing
tr = pd.DataFrame(dataset['train'])
te = pd.DataFrame(dataset['test'])
val = pd.DataFrame(dataset['validation'])

In [None]:
#Create duplicate records based on number of aspect term labels in the dataset
def duplicateRecordsByListCol(df, by, min_val = None):
    """Repeat each row once per element of the list stored in column ``by``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a new frame is returned.
    by : str
        Name of a column whose values support ``len()`` (e.g. lists).
    min_val : int, optional
        Repeat count to use for rows whose list is empty. When ``None``
        such rows are repeated zero times, i.e. they are dropped.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with rows repeated (index labels are kept, hence
        duplicated) plus two helper columns: ``len`` (the repeat count) and
        ``record_idx`` (0-based position of a row within its duplicates).
    """
    # astype(int) keeps the counts usable as repeat counts even for an empty frame.
    repeat_counts = df[by].apply(len).astype(int)
    if min_val is not None:
        repeat_counts = repeat_counts.where(repeat_counts != 0, min_val)

    # Work on a copy so the caller's frame does not silently gain a 'len' column.
    out = df.copy()
    out['len'] = repeat_counts
    # .copy() detaches the result from the .loc selection, so the assignment
    # below cannot trigger SettingWithCopyWarning.
    out = out.loc[out.index.repeat(out['len'])].copy()
    out['record_idx'] = out.groupby(out.index).cumcount()
    return out

tr = duplicateRecordsByListCol(tr, by = 'aspectTerms', min_val = 1)
te = duplicateRecordsByListCol(te, by = 'aspectTerms', min_val = 1)
val = duplicateRecordsByListCol(val, by = 'aspectTerms', min_val = 1)

In [None]:
#Extract the aspect term for each row (reviews with multiple aspect terms were duplicated above, one row per term). Do the same for the polarities, storing each in its own new column
def extractRowWiseAspectAndPolarity(df):
    """Give every duplicated row its own aspect term and polarity.

    Rows that share an index label are numbered 0, 1, 2, ... and the n-th
    row picks the n-th entry of its ``aspectTerms`` list.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``aspectTerms`` column (lists of dicts carrying
        ``'term'`` and ``'polarity'`` keys) and a ``len`` helper column.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with a fresh 0..n-1 index, the helper columns ``len`` and
        ``record_idx`` removed, and two added columns ``aspect`` and
        ``polarity``. Rows whose ``aspectTerms`` list is empty get ``''``
        in both.
    """
    out = df.copy()
    out['record_idx'] = out.groupby(out.index).cumcount()

    # Plain iteration avoids positional indexing (x[0], x[1]) into a
    # label-indexed row Series, which newer pandas versions deprecate.
    pairs = [
        (terms[idx]['term'], terms[idx]['polarity']) if len(terms) != 0 else ('', '')
        for terms, idx in zip(out['aspectTerms'], out['record_idx'])
    ]
    out['aspect'] = [term for term, _ in pairs]
    out['polarity'] = [polarity for _, polarity in pairs]

    return out.drop(['len', 'record_idx'], axis=1).reset_index(drop = True)

tr = extractRowWiseAspectAndPolarity(tr)
te = extractRowWiseAspectAndPolarity(te)
val = extractRowWiseAspectAndPolarity(val)

In [None]:
tr.head()

Unnamed: 0,text,aspectTerms,aspectCategories,domain,sentenceId,aspect,polarity
0,But the staff was so horrible to us.,"[{'from': '8', 'polarity': 'negative', 'term':...","[{'category': 'service', 'polarity': 'negative'}]",restaurants,3121,staff,negative
1,"To be completely fair, the only redeeming fact...","[{'from': '57', 'polarity': 'positive', 'term'...","[{'category': 'food', 'polarity': 'positive'},...",restaurants,2777,food,positive
2,"The food is uniformly exceptional, with a very...","[{'from': '141', 'polarity': 'neutral', 'term'...","[{'category': 'food', 'polarity': 'positive'}]",restaurants,1634,menu,neutral
3,"The food is uniformly exceptional, with a very...","[{'from': '141', 'polarity': 'neutral', 'term'...","[{'category': 'food', 'polarity': 'positive'}]",restaurants,1634,food,positive
4,"The food is uniformly exceptional, with a very...","[{'from': '141', 'polarity': 'neutral', 'term'...","[{'category': 'food', 'polarity': 'positive'}]",restaurants,1634,kitchen,positive


In [None]:
#Convert the dataframes back to huggingface DatasetDict object
dataset = DatasetDict({'train': Dataset.from_pandas(tr), 'test': Dataset.from_pandas(te), 'validation': Dataset.from_pandas(val)})

In [None]:
def prepare_data(sample):
    """Attach the instruction-style prompt for one example under 'text_model'.

    The prompt states the domain, the comma-separated aspect categories, the
    review text and the aspect term, and ends with the open polarity slot.
    ``sample`` is updated in place and returned.
    """
    domain = sample['domain']
    if isinstance(domain, list):
        domain = ', '.join(domain)

    categories = ', '.join(entry['category'] for entry in sample["aspectCategories"])

    # str.join, like `+`, refuses non-string pieces instead of formatting them.
    sample['text_model'] = ''.join([
        'The domain is ', domain,
        ' and the category is ', categories, '.',
        ' The text given is: ', sample['text'],
        ' The aspect identified is: ', sample['aspect'], '.',
        ' The polarity of the identified aspect is: ',
    ])
    return sample

In [None]:
# Build the prompt on every split (train, test, validation), then drop the raw
# columns and let the prompt take over the "text" column name.
# The string `polarity` column is left as is; it is tokenized later as the target.
dataset = (
    dataset
    .map(prepare_data)
    .remove_columns(["aspectCategories", "domain", "aspectTerms", "text"])
    .rename_column("text_model", "text")
)

  0%|          | 0/8628 [00:00<?, ?ex/s]

  0%|          | 0/2360 [00:00<?, ?ex/s]

  0%|          | 0/251 [00:00<?, ?ex/s]

In [None]:
#Example
print(dataset['train'][0]['text'] + dataset['train'][0]['polarity'])

The domain is restaurants and the category is service. The text given is: But the staff was so horrible to us. The aspect identified is: staff. The polarity of the identified aspect is: negative


In [None]:
trials = ["t5-small", "bigscience/T0", "facebook/opt-125m", "facebook/opt-350m", "facebook/opt-1.3b"]

In [None]:
model_checkpoint = trials[0]

In [None]:
#Tokenizing - inputids & labels
tokenizer = T5Tokenizer.from_pretrained(model_checkpoint)

def tokenize_function_inputs(sample):
    """Tokenize a batch of prompts and their target polarities.

    ``sample`` is a batch (the ``map`` call below uses ``batched=True``):
    ``sample["text"]`` holds the prompts and ``sample["polarity"]`` the target
    strings. Three columns are added and the batch is returned:

    * ``input_ids``      -- padded/truncated prompt token ids
    * ``attention_mask`` -- marks real tokens vs. padding in ``input_ids``
    * ``labels``         -- padded/truncated target token ids
    """
    encoded_inputs = tokenizer(sample["text"], padding = True, truncation = True)
    sample['input_ids'] = encoded_inputs["input_ids"]
    # Keep the mask: without it the model receives the padded ids with no way
    # to tell the <pad> positions apart from real prompt tokens.
    sample['attention_mask'] = encoded_inputs["attention_mask"]

    # NOTE(review): padded label positions keep the pad token id, so they
    # presumably contribute to the loss; masking them with -100 would also
    # require changing how compute_accuracy decodes the labels.
    sample['labels'] = tokenizer(sample["polarity"], padding = True, truncation = True).input_ids
    return sample

tokenized_datasets = dataset.map(tokenize_function_inputs, batched=True)

Downloading spiece.model:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


  0%|          | 0/9 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
#Remove unwanted columns
tokenized_datasets = tokenized_datasets.remove_columns(["sentenceId", "aspect", "text", "polarity"])

In [None]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 8628
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 2360
    })
    validation: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 251
    })
})

In [None]:
#Check tokenization - text
tokenizer.decode(tokenized_datasets['train'][1]['input_ids'])

"The domain is restaurants and the category is food, anecdotes/miscellaneous. The text given is: To be completely fair, the only redeeming factor was the food, which was above average, but couldn't make up for all the other deficiencies of Teodora. The aspect identified is: food. The polarity of the identified aspect is:</s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>"

In [None]:
#Check tokenization - label
tokenizer.decode(tokenized_datasets['train'][1]['labels'])

'positive</s>'

## Model Training - T5 - small

In [None]:
batch_size = 4

In [None]:
if IN_COLAB:
    root_path = '/content/drive/Othercomputers/My MacBook Pro/Kevin Files/Sentiment Analysis/instructional_learning_based_fsabsa/'
else:
    root_path = os.getcwd()

In [None]:
model_name = model_checkpoint.split("/")[-1]
print(model_name)

t5-small


In [None]:
#Set training arguments
# The output directory is built with os.path.join so that it is correct both for
# the Colab root_path (which ends in '/') and for os.getcwd() (which does not);
# plain string concatenation glued the model name onto the last path component.
args = TrainingArguments(
    os.path.join(root_path, f"{model_name}-finetuned-absa-trial1", "checkpoints"),
    evaluation_strategy = "epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=12,
    weight_decay=0.01,
    warmup_ratio = 0.1,
    save_strategy='epoch',
    load_best_model_at_end = True,
    push_to_hub=False,
    eval_accumulation_steps  = 1
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
#Metrics function
accuracy_metric = load("accuracy")

def compute_metrics(eval_pred):
    """Accuracy for the Trainer's evaluation loop.

    Parameters
    ----------
    eval_pred : tuple
        ``(predictions, labels)``. ``predictions`` is either an array of
        logits or a tuple whose first element is the logits (the form
        ``compute_accuracy`` reads via ``.predictions[0]``).

    Returns
    -------
    dict
        ``{'accuracy': float}``. For 1-D labels this is the share of matching
        entries. For multi-dimensional labels (one row of token ids per
        example) a row counts as correct only when every position matches;
        positions labelled ``-100`` are ignored. Empty input yields ``0.0``.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # Reduce over the last axis: axis=1 is only right for 2-D (batch, classes)
    # logits and would pick sequence positions for 3-D logits.
    predicted_ids = np.argmax(np.asarray(predictions), axis=-1)
    labels = np.asarray(labels)
    if labels.size == 0:
        return {'accuracy': 0.0}

    hits = predicted_ids == labels
    if hits.ndim > 1:
        hits = (hits | (labels == -100)).reshape(hits.shape[0], -1).all(axis=1)
    return {'accuracy': float(hits.mean())}

In [None]:
#Load pretrained model
# model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)

In [None]:
# Build the Trainer: fine-tune on the train split and evaluate on the validation split.
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    # (debug) append .select(range(100)) to the line above to evaluate on the first 100 rows only
    tokenizer=tokenizer,
    # No data_collator is passed (`dclm` is not defined in the visible cells), so the Trainer's default is used.
    # compute_metrics is left disabled, so evaluation reports the validation loss only.
)

In [None]:
#Fit the model and evaluate
torch.cuda.empty_cache()
trainer.train()

***** Running training *****
  Num examples = 8628
  Num Epochs = 12
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 25884


Epoch,Training Loss,Validation Loss
1,0.4121,0.176657
2,0.3018,0.117808
3,0.2545,0.065275
4,0.2136,0.037608
5,0.2012,0.04162
6,0.1901,0.021478
7,0.1828,0.016648
8,0.1457,0.017313
9,0.148,0.017071
10,0.1229,0.016093


***** Running Evaluation *****
  Num examples = 251
  Batch size = 4
Saving model checkpoint to /content/drive/Othercomputers/My MacBook Pro/Kevin Files/Sentiment Analysis/knowledge_based_fsabsa/t5-small-finetuned-absa-trial1/checkpoints/checkpoint-2157
Configuration saved in /content/drive/Othercomputers/My MacBook Pro/Kevin Files/Sentiment Analysis/knowledge_based_fsabsa/t5-small-finetuned-absa-trial1/checkpoints/checkpoint-2157/config.json
Model weights saved in /content/drive/Othercomputers/My MacBook Pro/Kevin Files/Sentiment Analysis/knowledge_based_fsabsa/t5-small-finetuned-absa-trial1/checkpoints/checkpoint-2157/pytorch_model.bin
tokenizer config file saved in /content/drive/Othercomputers/My MacBook Pro/Kevin Files/Sentiment Analysis/knowledge_based_fsabsa/t5-small-finetuned-absa-trial1/checkpoints/checkpoint-2157/tokenizer_config.json
Special tokens file saved in /content/drive/Othercomputers/My MacBook Pro/Kevin Files/Sentiment Analysis/knowledge_based_fsabsa/t5-small-finetu

TrainOutput(global_step=25884, training_loss=0.3480309529275766, metrics={'train_runtime': 1974.8657, 'train_samples_per_second': 52.427, 'train_steps_per_second': 13.107, 'total_flos': 3558464798982144.0, 'train_loss': 0.3480309529275766, 'epoch': 12.0})

In [None]:
#Save best model
trainer.save_model()

Saving model checkpoint to /content/drive/Othercomputers/My MacBook Pro/Kevin Files/Sentiment Analysis/knowledge_based_fsabsa/t5-small-finetuned-absa-trial1/checkpoints
Configuration saved in /content/drive/Othercomputers/My MacBook Pro/Kevin Files/Sentiment Analysis/knowledge_based_fsabsa/t5-small-finetuned-absa-trial1/checkpoints/config.json
Model weights saved in /content/drive/Othercomputers/My MacBook Pro/Kevin Files/Sentiment Analysis/knowledge_based_fsabsa/t5-small-finetuned-absa-trial1/checkpoints/pytorch_model.bin
tokenizer config file saved in /content/drive/Othercomputers/My MacBook Pro/Kevin Files/Sentiment Analysis/knowledge_based_fsabsa/t5-small-finetuned-absa-trial1/checkpoints/tokenizer_config.json
Special tokens file saved in /content/drive/Othercomputers/My MacBook Pro/Kevin Files/Sentiment Analysis/knowledge_based_fsabsa/t5-small-finetuned-absa-trial1/checkpoints/special_tokens_map.json


In [None]:
#Train Accuracy
def compute_accuracy(model_trainer, tokenized_dataset, sample_set = 'train'):
    """Exact-match accuracy of decoded predictions against decoded labels.

    Parameters
    ----------
    model_trainer : Trainer
        Object whose ``predict(test_dataset=...)`` returns predictions with
        the logits as first element.
    tokenized_dataset : mapping
        Maps split names to tokenized datasets that have a ``labels`` column.
    sample_set : str
        Split to score (default ``'train'``).

    Returns
    -------
    float
        Share of examples whose decoded prediction equals the decoded label
        (special tokens skipped on both sides); ``nan`` for an empty split.

    NOTE(review): the split passed to ``predict`` still contains ``labels``, so
    the logits are presumably produced with the gold targets as decoder input
    rather than by free generation -- confirm before reporting this figure as
    generation accuracy.
    """
    split = tokenized_dataset[sample_set]
    logits = model_trainer.predict(test_dataset=split).predictions[0]
    output_ids = np.argmax(logits, axis=2)

    predicted_text = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    label_text = tokenizer.batch_decode(split['labels'], skip_special_tokens=True)

    if len(predicted_text) != len(label_text):
        raise ValueError('Number of predictions and labels differ')
    if len(label_text) == 0:
        return float('nan')

    # Compare the strings directly; this avoids positional x[0] == x[1] lookups
    # on a labelled row Series, which newer pandas versions deprecate.
    n_correct = sum(pred == gold for pred, gold in zip(predicted_text, label_text))
    return n_correct / len(label_text)

In [None]:
print('Train accuracy: ', compute_accuracy(model_trainer = trainer, tokenized_dataset = tokenized_datasets, sample_set = 'train'))

***** Running Prediction *****
  Num examples = 8628
  Batch size = 4


Train accuracy:  0.9607093184979137


In [None]:
print('Validation accuracy: ', compute_accuracy(model_trainer = trainer, tokenized_dataset = tokenized_datasets, sample_set = 'validation'))

***** Running Prediction *****
  Num examples = 251
  Batch size = 4


Validation accuracy:  0.9920318725099602


## Test on sample data

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from transformers import Text2TextGenerationPipeline

In [None]:
model_path = '/content/drive/Othercomputers/My MacBook Pro/Kevin Files/Sentiment Analysis/instructional_learning_based_fsabsa/best_model_dump'

In [None]:
ft_tokenizer = T5Tokenizer.from_pretrained(model_path)
ft_model = T5ForConditionalGeneration.from_pretrained(model_path)

In [None]:
absa_pipeline = Text2TextGenerationPipeline(model = ft_model, tokenizer = ft_tokenizer, framework = 'pt')

In [None]:
#With instructions
print('With instructions: ', absa_pipeline("The domain is covid-19 and the category is vaccine. The text given is: new guideline fully vaccinate covid-19 free enjoy libertiesa:. The aspect identified is: vaccinated. The polarity of the identified aspect is:"))

#Without instructions
print('Without instructions: ', absa_pipeline("new guideline fully vaccinate covid-19 free enjoy libertiesa:"))

With instructions:  [{'generated_text': 'positive'}]
Without instructions:  [{'generated_text': 'neutral'}]


In [None]:
#With instructions (this prompt carries the full domain/category/aspect template)
print('With instructions: ', absa_pipeline("The domain is covid-19 and the category is vaccine. The text given is: bruh i want to be vaccinated so i can go on vacation but sadly iÃ¢Â€Â™m in ontarioÃ¢Â€Â™s 100th lockdown. The aspect identified is: vaccinated. The polarity of the identified aspect is:"))

#Without instructions
print('Without instructions: ', absa_pipeline("bruh i want to be vaccinated so i can go on vacation but sadly iÃ¢Â€Â™m in ontarioÃ¢Â€Â™s 100th lockdown"))

Without instructions:  [{'generated_text': 'neutral'}]
Without instructions:  [{'generated_text': 'negative'}]


In [None]:
#With instructions
print('With instructions: ', absa_pipeline("The domain is covid-19 and the category is vaccine. The text given is: bruh i want to be vaccinated so i can go on vacation but sadly iÃ¢Â€Â™m in ontarioÃ¢Â€Â™s 100th lockdown . The aspect identified is: vaccination. The polarity of the identified aspect is:"))

#Without instructions
print('Without instructions: ', absa_pipeline("bruh i want to be vaccinated so i can go on vacation but sadly iÃ¢Â€Â™m in ontarioÃ¢Â€Â™s 100th lockdown"))

With instructions:  [{'generated_text': 'neutral'}]
Without instructions:  [{'generated_text': 'negative'}]


In [None]:
#With instructions
print('With instructions: ', absa_pipeline("The domain is covid-19 and the category is vaccine. The text given is: the âs new guidelines say, once fully vaccinated against covid-19, you will be free to enjoy some libertiesâ¦. The aspect identified is: vaccination. The polarity of the identified aspect is:"))

#Without instructions
print('Without instructions: ', absa_pipeline("the âs new guidelines say, once fully vaccinated against covid-19, you will be free to enjoy some libertiesâ¦"))

With instructions:  [{'generated_text': 'neutral'}]
Without instructions:  [{'generated_text': 'âs new guidelines say, once fully vaccinated against covid-19,'}]


In [None]:
#With instructions
print('With instructions: ', absa_pipeline("The domain is covid-19 and the category is vaccine. The text given is: ever since iÃ¢Â€Â™ve taken the vaccine iÃ¢Â€Â™ve become obsessed with taylor swiftÃ¢Â€Â™s music. i guess the vaccine gave me immunity *and* good tasteÃ°ÂŸÂ˜Â. The aspect identified is: vaccination. The polarity of the identified aspect is:"))

#Without instructions
print('Without instructions: ', absa_pipeline("ever since iÃ¢Â€Â™ve taken the vaccine iÃ¢Â€Â™ve become obsessed with taylor swiftÃ¢Â€Â™s music. i guess the vaccine gave me immunity *and* good tasteÃ°ÂŸÂ˜Â "))

With instructions:  [{'generated_text': 'positive'}]
Without instructions:  [{'generated_text': 'positive'}]


In [None]:
#With instructions
print('With instructions: ', absa_pipeline("The domain is covid-19 and the category is vaccine. The text given is: chest pain, paralysis, seizures: woman details horrific j&amp;j covid vaccine side effects from hospital bed #newsÃ¢Â€Â¦ https://t.co/awg3ehcezo. The aspect identified is: vaccination. The polarity of the identified aspect is:"))

#Without instructions
print('Without instructions: ', absa_pipeline("chest pain, paralysis, seizures: woman details horrific j&amp;j covid vaccine side effects from hospital bed #newsÃ¢Â€Â¦ https://t.co/awg3ehcezo"))

With instructions:  [{'generated_text': 'negative'}]
Without instructions:  [{'generated_text': 'negative'}]


In [None]:
one_shot_sample = "The domain is covid-19 and the category is vaccine. The text given is: how was ur day? it was ok!! i got my second covid vaccine so kinda tired rn. The aspect identified is: covid vaccine. The polarity of the identified aspect is: negative" +\
"\nThe domain is covid-19 and the category is vaccine. The text given is: had a long day after my booster dose. super tired after my vaccine shot. The aspect identified is: vaccine. The polarity of the identified aspect is: negative"

In [None]:
one_shot_sample = "The domain is covid-19 and the category is vaccine. The text given is: how was ur day? it was ok!! i got my second covid vaccine so kinda tired rn. The aspect identified is: covid vaccine. The polarity of the identified aspect is: negative" +\
"\nThe domain is covid-19 and the category is vaccine. The text given is: my vaccine booster dose was fine. It is my job that is tiring. The aspect identified is: vaccine. The polarity of the identified aspect is: positive"

In [None]:
shot2_sample = one_shot_sample + "\nThe domain is covid-19 and the category is vaccine. The text given is: had no issues today after my vaccine booster dose. super tired from work. The aspect identified is: booster. The polarity of the identified aspect is: "

In [None]:
print(shot2_sample)

The domain is covid-19 and the category is vaccine. The text given is: how was ur day? it was ok!! i got my second covid vaccine so kinda tired rn. The aspect identified is: covid vaccine. The polarity of the identified aspect is: negative
The domain is covid-19 and the category is vaccine. The text given is: my vaccine booster dose was fine. It is my job that is tiring. The aspect identified is: vaccine. The polarity of the identified aspect is: positive
The domain is covid-19 and the category is vaccine. The text given is: had no issues today after my vaccine booster dose. super tired from work. The aspect identified is: booster. The polarity of the identified aspect is: 


In [None]:
knowledge = 'Knowledge: getting vaccinated is a good thing. refusing vaccine is bad. development of vaccine is good. side effects due to vaccine are bad'

In [None]:
# Prefix three test prompts with the shared background-knowledge sentence.
# The leading '. ' closes that sentence so it no longer runs straight into
# "The text is:" (the concatenation used to read "...are badThe text is: ...").
test_sample1 = knowledge + '. The text is: new funding for covid vaccine approved. The aspect identified is vaccine. The polarity of the identified aspect is: '
test_sample2 = knowledge + '. The text is: planning to get vaccinated to today finally. The aspect identified is vaccine. The polarity of the identified aspect is: '
test_sample3 = knowledge + '. The text is: new guideline fully vaccinate covid-19 free enjoy libertiesa. The aspect identified is vaccine. The polarity of the identified aspect is: '

In [None]:
#With instructions
print('With instructions: ', absa_pipeline(test_sample1))
print('With instructions: ', absa_pipeline(test_sample2))
print('With instructions: ', absa_pipeline(test_sample3))

With instructions:  [{'generated_text': 'negative'}]
With instructions:  [{'generated_text': 'positive'}]
With instructions:  [{'generated_text': 'positive'}]


**Other domain sample check**

In [None]:
#With instructions
absa_pipeline("The domain is covid-19 and the category is vaccine. The text given is: To be completely fair, moderna vaccine has no side effects. I just had a bad day at work :p. The aspect identified is: moderna. The polarity of the identified aspect is:")

[{'generated_text': 'positive'}]

In [None]:
absa_pipeline("The domain is laptop and the category is features. The text given is: The new dell laptop is soo fast, the RAM is big and cool. I think the screen could have more brightness as well as better speakers. The aspect identified is: speakers. The polarity of the identified aspect is:")

[{'generated_text': 'positive'}]

In [None]:
absa_pipeline("The domain is laptop and the category is features. The text given is: The new dell laptop is soo fast, the RAM is big and cool. I think the screen could have more brightness as well as better speakers. The aspect identified is: speed. The polarity of the identified aspect is:")

[{'generated_text': 'positive'}]

In [None]:
absa_pipeline("The domain is restaurants and the category is generic. The text given is: Indian Coffee House is the place for indian cusine over here. The ambience captures the Indian vibe. The identified aspect is ambience. The polarity of the identified aspect is:")

[{'generated_text': 'positive'}]

In [None]:
absa_pipeline("The domain is taxis and the category is cab company. The text given is: The drivers are so rude but the service is not pricey so we have to deal with it. The identified aspect is service. The polarity of the identified aspect is:")

[{'generated_text': 'positive'}]

In [None]:
absa_pipeline("The domain is corporate and the category is upper management. The text given is: The director is the best person to probably ruin your sleep for months together. The identified aspect is service. The polarity of the identified aspect is:")

In [None]:
#With instructions
absa_pipeline("The domain is cab aggregators and the category is service. The text given is: I had an bumpy ride with my cab driver who was very friendly. The aspect identified is: ride. The polarity of the identified aspect is:")

[{'generated_text': 'negative'}]

In [None]:
#With instructions
absa_pipeline("The domain is cab aggregators and the category is service. The text given is: I had an bumpy ride with my cab driver who was very friendly. The aspect identified is: driver. The polarity of the identified aspect is:")

[{'generated_text': 'positive'}]

In [None]:
# #Model Zip
# import shutil
# shutil.make_archive('best_model_dump', 'zip', '/content/drive/Othercomputers/My MacBook Pro/Kevin Files/Sentiment Analysis/knowledge_based_fsabsa/t5-small-finetuned-absa-trial1/checkpoints/checkpoint-25884')

## Testing on the base model (without fine-tuning)

In [None]:
tkx = T5Tokenizer.from_pretrained("t5-small")
mdl = T5ForConditionalGeneration.from_pretrained("t5-small")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


Downloading pytorch_model.bin:   0%|          | 0.00/231M [00:00<?, ?B/s]

In [None]:
absa_pipeline_wofinteunte = Text2TextGenerationPipeline(model = mdl, tokenizer = tkx, framework = 'pt')

In [None]:
#With instructions
print('With instructions: ', absa_pipeline_wofinteunte("The domain is covid-19 and the category is vaccine. The text given is: bruh i want to be vaccinated so i can go on vacation but sadly iÃ¢Â€Â™m in ontarioÃ¢Â€Â™s 100th lockdown . The aspect identified is: vaccination. The polarity of the identified aspect is:"))

#Without instructions
print('Without instructions: ', absa_pipeline_wofinteunte("bruh i want to be vaccinated so i can go on vacation but sadly iÃ¢Â€Â™m in ontarioÃ¢Â€Â™s 100th lockdown"))

With instructions:  [{'generated_text': 'i want to be vaccinated so i can go on vacation but'}]
Without instructions:  [{'generated_text': 'i want to be vaccinated so i can go on vacation but unfortunately'}]
