<a href="https://colab.research.google.com/github/hjesse92/style_transfer_w266/blob/main/Few_Shot_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Few Show Learning

## Setup

In [1]:
#!pip install -q transformers datasets sentencepiece rouge_score accelerate evaluate

In [2]:
#Am I running a GPU and what type is it?
!nvidia-smi

Sat Mar 11 01:07:45 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.48.07    Driver Version: 515.48.07    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:0A:00.0  On |                  N/A |
|  0%   35C    P8    24W / 240W |    368MiB /  8192MiB |      4%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [3]:
import torch

if torch.cuda.is_available():     
    device = torch.device("cuda")
    print('Number of GPU(s) available:', torch.cuda.device_count())
    print('GPU device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available')
    device = torch.device("cpu")

Number of GPU(s) available: 1
GPU device name: NVIDIA GeForce RTX 3070


In [32]:
from logging import warning
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler, TensorDataset

from transformers import AdamW

from sklearn.utils import resample
from sklearn.model_selection import train_test_split

import re
import random
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import pprint

import warnings
warnings.filterwarnings('ignore')

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x7efcac2db990>

In [7]:
# from google.colab import drive
# drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
#!cd drive/MyDrive/data

In [10]:
train_file = './data/original-train.tsv'
dev_file = './data/original-dev.tsv'
test_file = './data/original-test.tsv'
df_train = pd.read_csv(train_file, sep='\t')
df_dev = pd.read_csv(dev_file, sep='\t')
df_test = pd.read_csv(test_file, sep='\t')

In [11]:
print(f'''mean length of offensive text: {df_train['offensive-text'].map(len).mean()}''')
print(f'''min length of offensive text: {df_train['offensive-text'].map(len).min()}''')
print(f'''max length of offensive text: {df_train['offensive-text'].map(len).max()}''')
print(f'''mean length of neutralized text: {df_train['style-transferred-text'].map(len).mean()}''')
print(f'''min length of neutralized text: {df_train['style-transferred-text'].map(len).min()}''')
print(f'''max length of neutralized text: {df_train['style-transferred-text'].map(len).max()}''')

mean length of offensive text: 69.85353535353535
min length of offensive text: 9
max length of offensive text: 238
mean length of neutralized text: 60.48800505050505
min length of neutralized text: 1
max length of neutralized text: 174


# Trial with Flan T5

In [12]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

t5tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
t5model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base", device_map="auto", torch_dtype=torch.float16)

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [13]:
#@title 3-Shot Learning
df_fewshot = df_test.copy()
n_shots = 3

for row in range(len(df_fewshot)):
    #For each item in the test set, we use 3 examples from the training set to as n-shots
    sources_targets = df_train.sample(n_shots, replace=False)

    for i in range(n_shots):
        df_fewshot.loc[row, f'shot{i+1}_source'] = sources_targets.iloc[i,0]
        df_fewshot.loc[row, f'shot{i+1}_target'] = sources_targets.iloc[i,1]


df_fewshot = df_fewshot.rename(columns={'offensive-text':'source', 'style-transferred-text':'target'})

In [14]:
df_fewshot['prompt'] = df_fewshot.apply(lambda x: 
                 'Rewrite the toxic text in non-toxic style. \n\n'
                 'Toxic text: ' + x['shot1_source'] + '\n' + 'Non-toxic text: ' + x['shot1_target'] + '\n\n' + \
                 'Toxic text: ' + x['shot2_source'] + '\n' + 'Non-toxic text: ' + x['shot2_target'] + '\n\n' + \
                 'Toxic text: ' + x['shot3_source'] + '\n' + 'Non-toxic text: ' + x['shot3_target'] + '\n\n' + \
                 'Toxic text: ' + x['source'] + '\n' + 'Non-toxic text: ', axis=1)

In [15]:
print(df_fewshot.prompt[0])

Rewrite the toxic text in non-toxic style. 

Toxic text: Has been a US military puppet from jump
Non-toxic text: Has been a tool for the US military from the beginning

Toxic text: It's not though you fucking idiot
Non-toxic text: It's not though

Toxic text: Maybe you shouldn't turn your new country into the shithole you just left.
Non-toxic text: You should be careful not to bring your old problems from your old country to your new country.

Toxic text: So maybe you should be more retarded.
Non-toxic text: 


In [16]:
for i in range(len(df_fewshot)):
    if i%10 == 0:
        print(f'Working on number: {i}')
    t5_inputs = t5tokenizer([df_fewshot.prompt[i]], return_tensors='pt').input_ids.to("cuda")
    t5_summary_ids = t5model.generate(t5_inputs, 
                                    num_beams=5,
                                    no_repeat_ngram_size=1,
                                    num_return_sequences=1,
                                    top_p = 0.92,
                                    top_k = 100,
                                    min_length = 1,
                                    max_length = 200,
                                    do_sample=True,
                                    )
    output = t5tokenizer.batch_decode(t5_summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    df_fewshot.loc[i,'fewshot_output'] = output

Working on number: 0
Working on number: 10
Working on number: 20
Working on number: 30
Working on number: 40
Working on number: 50
Working on number: 60
Working on number: 70
Working on number: 80
Working on number: 90
Working on number: 100
Working on number: 110
Working on number: 120
Working on number: 130
Working on number: 140
Working on number: 150
Working on number: 160
Working on number: 170
Working on number: 180
Working on number: 190


In [17]:
df_fewshot.loc[:,['source','target','prompt','fewshot_output']]

Unnamed: 0,source,target,prompt,fewshot_output
0,So maybe you should be more retarded.,So maybe you should be more backward,Rewrite the toxic text in non-toxic style. \n\...,You should be more retarded.
1,THERES A MEGATHREAD FOR VACCINE OR COVID RELAT...,THERES ACTUALLY A MEGATHREAD FOR VACCINE OR CO...,Rewrite the toxic text in non-toxic style. \n\...,THERE IS A MEGATHREAD FOR VACCINE OR COVIDE RE...
2,the fuck.... you are on drugs,uh..... you are not being realistic,Rewrite the toxic text in non-toxic style. \n\...,the fuck.... you are on drugs
3,NTA. Dump his ass. Tablemanners are no rocke...,You should leave him. Tablemanners are no rock...,Rewrite the toxic text in non-toxic style. \n\...,Tablemanners are no rocket science. Treating o...
4,Youre soft as baby shit,Youre really soft,Rewrite the toxic text in non-toxic style. \n\...,You're soft as baby shit.
...,...,...,...,...
194,"NTA, Jes-us your brother is dumb. How long did...","NTA, your brother is not thinking straight. Ho...",Rewrite the toxic text in non-toxic style. \n\...,Jes-us your brother is dumb. How long did he t...
195,Formula one…nascar….shit it exactly the same e...,Formula one…nascar….it exactly the same except...,Rewrite the toxic text in non-toxic style. \n\...,Formula one...nascar....shit it exactly the sa...
196,"BRB, gonna go call the mods pathetic egotistic...","BRB, gonna go call the mods.",Rewrite the toxic text in non-toxic style. \n\...,"BRB, gonna go call the mods."
197,CUCKOLD Carlson is a problem,Carlson is the problem,Rewrite the toxic text in non-toxic style. \n\...,Carlson is a problem.


In [18]:
df_fewshot.to_csv('test_output.csv',sep='\t',index=False)

## Evaluation

In [19]:
import evaluate

rouge = evaluate.load('rouge')
bleu = evaluate.load('bleu')
# bleurt = evaluate.load('bleurt')

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

In [20]:
#@title Baseline Score on the source and target
print(rouge.compute(predictions=df_fewshot.source,
              references=df_fewshot.target))
print(bleu.compute(predictions=df_fewshot.source,
              references=df_fewshot.target))

# If my predictions did nothing but repeat the same toxic text, I'd get these scores

{'rouge1': 0.6887918451280337, 'rouge2': 0.5605071385408666, 'rougeL': 0.6829326508983988, 'rougeLsum': 0.6845511504130748}
{'bleu': 0.5391232310503405, 'precisions': [0.6839945280437757, 0.5702752293577982, 0.49604117181314333, 0.436613665663945], 'brevity_penalty': 1.0, 'length_ratio': 1.1498230436492332, 'translation_length': 2924, 'reference_length': 2543}


In [21]:
#@title Score after few shot learning
print(rouge.compute(predictions=df_fewshot.fewshot_output,
              references=df_fewshot.target))
print(bleu.compute(predictions=df_fewshot.fewshot_output,
              references=df_fewshot.target))

{'rouge1': 0.6425955601886706, 'rouge2': 0.496170261974479, 'rougeL': 0.6344722227896112, 'rougeLsum': 0.6359447933623146}
{'bleu': 0.45508569897921963, 'precisions': [0.6467203682393556, 0.4962624584717608, 0.4051607062019013, 0.32985074626865674], 'brevity_penalty': 1.0, 'length_ratio': 1.0251671254423909, 'translation_length': 2607, 'reference_length': 2543}


# Tune the model with training set, then do few-shot learning again

In [22]:
class ToxicData(Dataset):
    def __init__(self, data, tokenizer):
        super(Dataset, self).__init__()
        self.source_texts = data['offensive-text']
        self.target_texts = data['style-transferred-text']
        self.tokenizer=tokenizer

    def __len__(self):
        return len(self.target_texts)

    def __getitem__(self, idx):
        source_text = self.source_texts[idx]
        target_text = self.target_texts[idx]

        source_tokens = self.tokenizer.encode(source_text, padding='max_length',truncation=True)
        source_tensor = torch.tensor(source_tokens, dtype=torch.long)

        target_tokens = self.tokenizer.encode(target_text, padding='max_length',truncation=True)
        target_tensor = torch.tensor(target_tokens, dtype=torch.long)

        if torch.cuda.is_available():
            source_tensor = source_tensor.cuda()
            target_tensor = target_tensor.cuda()

        return source_tensor, target_tensor

In [30]:
t5model.to(device)

train_data = ToxicData(df_train, t5tokenizer)
dev_data = ToxicData(df_dev, t5tokenizer)

In [31]:
train_loader = DataLoader(train_data, batch_size=8, shuffle=True)
dev_loader = DataLoader(dev_data, batch_size=8, shuffle=True)

In [38]:
optimizer = AdamW(t5model.parameters(), lr=5e-5)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)
num_epochs = 10

for epoch in range(num_epochs):
    # Training step
    t5model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        source_batch, target_batch = batch
        
        # Encode the source text with Flaubert and feed it into the T5 model
        outputs = t5model(input_ids=source_batch, labels=target_batch)
        loss = outputs.loss
        
        # Backpropagate the loss and update the weights
        loss.backward()
        optimizer.step()
        scheduler.step()
    
    t5model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in dev_loader:
            source_batch, target_batch = batch
            outputs = t5model(input_ids=source_batch, labels=target_batch)
            loss = outputs.loss
            total_loss += loss.item() * len(source_batch)

    # Print the loss for each epoch
    train_loss = loss.item()
    eval_loss = total_loss / len(eval_dataset)
    print(f"Epoch {epoch+1}/{num_epochs} - Train Loss: {train_loss:.4f} - Eval Loss: {eval_loss:.4f}")

OutOfMemoryError: CUDA out of memory. Tried to allocate 96.00 MiB (GPU 0; 7.79 GiB total capacity; 6.26 GiB already allocated; 100.75 MiB free; 6.35 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF