In [1]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd
# Base names of the semicolon-separated CSV files under ./input/processed_books/.
book_names = ['ferdydurke','gombrowicz diary', 'gombrowicz diary_2','gombrowicz diary_3', 'gombrowicz-cosmospdf']

# Read one frame per book, then stack them into a single frame.
book_frames = [
    pd.read_csv(f"./input/processed_books/{book_name}.csv", sep=";")
    for book_name in book_names
]
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# every file's rows restart at 0, so the index contains duplicates and a
# label-based lookup such as df_g.loc[0] returns one row per book.
df_g = pd.concat(book_frames, ignore_index=True)

In [3]:
# Show the first three rows as a quick check of the loaded columns.
df_g.head(n=3)

Unnamed: 0,context1,context2,context3,context4,context5,context6,context7,response
0,And this is only a foretaste of insolence to c...,"Published in late 1937, when its author was th...","The title of his first, Memoirs of a Time of ...",Perhaps this is why Gombrowicz opted for jabbe...,"That first book, whose title was pounced on by...",Had the title of his volume of fanciful storie...,Now he would really provoke.,"Published in late 1937, when its author was th..."
1,"Published in late 1937, when its author was th...","The title of his first, Memoirs of a Time of ...",Perhaps this is why Gombrowicz opted for jabbe...,"That first book, whose title was pounced on by...",Had the title of his volume of fanciful storie...,Now he would really provoke.,He would write an epic in defense of immaturity.,"The title of his first, Memoirs of a Time of ..."
2,"The title of his first, Memoirs of a Time of ...",Perhaps this is why Gombrowicz opted for jabbe...,"That first book, whose title was pounced on by...",Had the title of his volume of fanciful storie...,Now he would really provoke.,He would write an epic in defense of immaturity.,"As he declared toward the end of his life: ""Im...",Perhaps this is why Gombrowicz opted for jabbe...


In [4]:
import os
import numpy as np
import random

# Build the binary corpus: label 1 for sampled `response` texts from df_g,
# label 0 for every line of the other style sample files.
SAMPLE_SIZE = 11000
SAMPLE_SEED = 42

data_g = df_g['response'].tolist()
# A dedicated seeded generator makes the subsample reproducible across kernel
# restarts without touching the global `random` state. Like the unseeded call,
# this raises ValueError if data_g holds fewer than SAMPLE_SIZE items.
data = random.Random(SAMPLE_SEED).sample(data_g, SAMPLE_SIZE)
labels_g = [1] * len(data)

style_names=['aae', 'bible', 'coha_1810-1830', 'coha_1890-1910', 'coha_1990-2000', 'english_tweets', 'joyce', 'lyrics', 'romantic_poetry', 'shakespeare', 'switchboard']

# Read each file inside `with` so its handle is closed, and flatten in plain
# Python. np.array(list_of_lists).flatten() only flattens when every file has
# exactly the same number of lines; on ragged input it produces an array of
# lists (or raises on recent NumPy), which leaves one label per *file*
# instead of one per line.
data_o = []
for style in style_names:
    with open(os.path.join("./style_samples", style + ".txt"), "r") as style_file:
        data_o.extend(style_file.read().splitlines())
labels_o = [0] * len(data_o)

data.extend(data_o)
labels = labels_g + labels_o

# Texts and labels must stay aligned one-to-one for the split that follows.
assert len(data) == len(labels), "texts and labels are misaligned"

In [5]:
# Hold out 30% of the corpus for testing, then carve 20% of the remaining
# training portion off as a validation set.
# random_state pins both partitions so the test set (and the metrics computed
# on it) is the same on every run; stratify keeps the 0/1 label ratio equal in
# each part.
# NOTE(review): the classifier evaluated later is loaded from a saved
# checkpoint; confirm it was trained on this same seeded split, otherwise test
# texts may overlap with its training data.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data, labels, test_size=0.3, random_state=42, stratify=labels
)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.2, random_state=42, stratify=train_labels
)

In [9]:
from chatbot.classifier import Classifier

# Instantiate the project Classifier from the local checkpoint directory,
# on CUDA when torch reports it available and on CPU otherwise.
eval_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
classifier = Classifier(model_path='./roberta-checkpoint/', device=eval_device)

# One prediction per test text, kept in test_texts order so the list lines up
# with test_labels.
predictions = [classifier.get_prediction(sample) for sample in test_texts]

In [10]:
from sklearn.metrics import accuracy_score, f1_score

# Score the predictions against the test labels: plain accuracy, plus F1
# averaged over classes weighted by their support.
accuracy = accuracy_score(y_true=test_labels, y_pred=predictions)
f1 = f1_score(y_true=test_labels, y_pred=predictions, average='weighted')

print(f'Accuracy: {accuracy}')
print(f'F1 score: {f1}')

Accuracy: 0.95
F1 score: 0.94999499949995
