# Fine-tuning RoBERTa as a Gombrowicz-style classifier, with negative examples from the [style transfer paraphrase paper repo](https://github.com/martiansideofthemoon/style-transfer-paraphrase)
(11,000 sentences in the styles provided with the paper)

In [3]:
!pip install transformers[torch]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split

In [5]:
# Number of CUDA devices visible to PyTorch in this runtime (the recorded run reports 1).
torch.cuda.device_count()

1

In [6]:
import pandas as pd
# Base names of the pre-processed Gombrowicz book CSVs (semicolon-separated).
book_names = [
    'ferdydurke',
    'gombrowicz diary',
    'gombrowicz diary_2',
    'gombrowicz diary_3',
    'gombrowicz-cosmospdf',
]


def _read_book(book_name):
    """Load one processed book CSV from ./input/processed_books as a DataFrame."""
    return pd.read_csv(f"./input/processed_books/{book_name}.csv", sep=";")


# One frame holding the rows of every book, stacked in the order listed above.
df_g = pd.concat([_read_book(title) for title in book_names])

In [7]:
# Preview the first three rows of the concatenated book data.
df_g.head(3)

Unnamed: 0,context1,context2,context3,context4,context5,context6,context7,response
0,And this is only a foretaste of insolence to c...,"Published in late 1937, when its author was th...","The title of his first, Memoirs of a Time of ...",Perhaps this is why Gombrowicz opted for jabbe...,"That first book, whose title was pounced on by...",Had the title of his volume of fanciful storie...,Now he would really provoke.,"Published in late 1937, when its author was th..."
1,"Published in late 1937, when its author was th...","The title of his first, Memoirs of a Time of ...",Perhaps this is why Gombrowicz opted for jabbe...,"That first book, whose title was pounced on by...",Had the title of his volume of fanciful storie...,Now he would really provoke.,He would write an epic in defense of immaturity.,"The title of his first, Memoirs of a Time of ..."
2,"The title of his first, Memoirs of a Time of ...",Perhaps this is why Gombrowicz opted for jabbe...,"That first book, whose title was pounced on by...",Had the title of his volume of fanciful storie...,Now he would really provoke.,He would write an epic in defense of immaturity.,"As he declared toward the end of his life: ""Im...",Perhaps this is why Gombrowicz opted for jabbe...


In [10]:
import os
import numpy as np
import random

# Positive class (label 1): sentences from the 'response' column of the
# Gombrowicz books. Missing responses are dropped so that they cannot later be
# stringified into the literal text 'nan' and trained on as a style example.
# NOTE(review): presumably NaNs are why the texts are str()-coerced before
# tokenization — confirm against the CSVs.
data_g = df_g['response'].dropna().tolist()

# A dedicated, seeded generator makes the 11000-sentence draw repeatable on a
# fresh kernel without touching the global `random` state.
data = random.Random(42).sample(data_g, 11000)
labels_g = [1] * len(data)

style_names=['aae', 'bible', 'coha_1810-1830', 'coha_1890-1910', 'coha_1990-2000', 'english_tweets', 'joyce', 'lyrics', 'romantic_poetry', 'shakespeare', 'switchboard']


def _read_style_lines(style):
    """Return the lines of ./style_samples/<style>.txt, closing the file afterwards."""
    with open(os.path.join("./style_samples", style + ".txt"), "r") as handle:
        return handle.read().splitlines()


# Negative class (label 0): every line of every style file, as one flat list.
# Flattening in plain Python works even when the files have different line
# counts, where building a NumPy array from the ragged nested list would not.
data_o = [line for style in style_names for line in _read_style_lines(style)]
labels_o = [0] * len(data_o)

# Final corpus: Gombrowicz sentences first, other styles after, labels aligned.
data.extend(data_o)
labels = labels_g + labels_o

In [11]:
# 70/30 train/test split, then 80/20 train/validation split of the training part.
# A fixed random_state makes the splits repeatable across kernel restarts, and
# stratifying on the labels keeps the class ratio the same in every subset.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data, labels, test_size=0.3, random_state=42, stratify=labels
)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.2, random_state=42, stratify=train_labels
)

In [12]:
# Tokenizer and classifier are both loaded from the same pretrained checkpoint.
# The recorded load message reports that the classifier head weights are newly
# initialized, i.e. they are learned during the fine-tuning below.
checkpoint = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(checkpoint)
model = RobertaForSequenceClassification.from_pretrained(checkpoint)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight']
You should pr

In [13]:
def _encode(texts):
    """Tokenize a list of strings with truncation, padding and special tokens enabled."""
    return tokenizer(texts, truncation=True, padding=True, add_special_tokens=True)


# Coerce every sample to a plain str before handing it to the tokenizer.
train_texts = list(map(str, train_texts))
val_texts = list(map(str, val_texts))
test_texts = list(map(str, test_texts))

# One encoding per split, all produced with identical tokenizer settings.
train_encodings = _encode(train_texts)
val_encodings = _encode(val_texts)
test_encodings = _encode(test_texts)

In [14]:
class ClsDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with classification labels.

    Args:
        encodings: mapping from field name to a per-sample indexable sequence
            (anything exposing ``.items()``).
        labels: indexable sequence of labels, one per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of samples is defined by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, with the label under 'labels'."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample


# Wrap each split's encodings and labels in a ClsDataset (train, validation, test).
train_dataset, val_dataset, test_dataset = (
    ClsDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)

In [15]:
# Fine-tuning hyperparameters, gathered in one place before being handed over.
hyperparams = dict(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)
training_args = TrainingArguments(**hyperparams)

# NOTE(review): no evaluation schedule is configured here and the recorded log
# shows only a training-loss column, so val_dataset appears to go unused during
# training — confirm whether periodic evaluation was intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Kept as the cell's final expression so the TrainOutput summary is displayed.
trainer.train()



Step,Training Loss
10,0.6932
20,0.7116
30,0.6904
40,0.6908
50,0.6921
60,0.6959
70,0.6904
80,0.6929
90,0.675
100,0.6592


TrainOutput(global_step=2310, training_loss=0.21180257849895207, metrics={'train_runtime': 3413.4002, 'train_samples_per_second': 10.828, 'train_steps_per_second': 0.677, 'total_flos': 9724584606105600.0, 'train_loss': 0.21180257849895207, 'epoch': 3.0})

In [28]:
# Run a single example sentence through the fine-tuned classifier.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = model.to(device)
# Training leaves the model in training mode; switch to evaluation mode so that
# dropout is disabled and the prediction is deterministic.
model.eval()

inputs = tokenizer("Because, gentlemen, Siowacki—oh, what a great poet he was!", return_tensors="pt")

# The input tensors must live on the same device as the model.
inputs = {k: v.to(device) for k, v in inputs.items()}

with torch.no_grad():  # inference only: no gradients needed
    outputs = model(**inputs)
    logits = outputs.logits
    probabilities = torch.softmax(logits, dim=-1)
    # Index of the most probable class (1 = Gombrowicz, 0 = other styles,
    # following the labels assigned when the corpus was built).
    predicted_class = torch.argmax(probabilities, dim=-1)

print(predicted_class)

tensor([1], device='cuda:0')
