# RoBERTa Regression

In [2]:
import sys
sys.path.append('../')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tqdm
import torch
from collections import defaultdict, Counter
import random
import math
import pickle

import src.eval_metric
import src.model
import src.dataloader

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension and reload all modules before each cell runs,
# so edits to the local `src` package are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2
# Let pandas display up to 100 columns and 100 rows before truncating a frame.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)

In [3]:
# Read the training, validation (dev) and test splits, in that order.
train_df, valid_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../../data/training_data/train.csv",
        "../../data/training_data/dev.csv",
        "../../data/test_data/test_task1.csv",
    )
)

## Fine-tune model

In [4]:
# List the distinct corpora (text_name values) present in the validation split.
print(valid_df.text_name.unique())

['ZuCo1' 'ZuCo2' 'Provo' 'BSC' 'RSC' 'PAHEC' 'PoTeC' 'GECO-NL']


In [5]:
# Corpus used for the rest of the notebook: model construction, test-set
# filtering and the name of the predictions file.
text_name = "Provo"

In [6]:
# Build the project's trainer for the selected corpus.
# Per the warning recorded below, construction initialises an XLMRobertaModel
# from the `xlm-roberta-base` checkpoint; the unused `lm_head.*` weights are
# reported by the library as expected when loading into a different architecture.
model_trainer = src.model.ModelTrainer(text_name=text_name)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
# Fine-tune on the training split, passing the validation split alongside it,
# for up to 100 epochs.
# NOTE(review): the recorded run ended with KeyboardInterrupt, so the saved
# outputs come from a partially trained model.
# NOTE(review): the per-character/token lines flooding the output look like
# debug prints from inside the training code — confirm and remove them there.
model_trainer.train(train_df, valid_df, num_epochs=100)

▁
T
h
e
y
0
50 67 0
▁
I
t
0
71 91 0
▁
d
a
y
s
0
44 59 0
▁
W
i
t
h
0
74 95 0
▁
c
0
16 27 0
▁
I
n
0
27 40 0
▁
A
s
0
55 72 0
▁
B
u
t
0
18 29 0
▁
h
i
m
s
e
l
f
0
105 133 0
▁
S
h
e
0
8 14 0
▁
s
i
m
i
l
a
r
0
54 71 0
▁
T
h
e
0
67 87 0
▁
T
h
e
y
0
47 64 0
▁
e
a
r
l
y
0
59 76 0
▁
a
r
e
0
0 1 0
▁
b
e
c
a
u
s
e
0
77 98 0
▁
e
a
r
l
y
0
53 70 0
▁
i
s
0
99 125 0
▁
M
u
r
p
h
y
0
22 35 0
▁
I
n
0
104 130 0
▁
T
h
e
0
68 88 0
▁
v
e
n
t
i
l
a
t
i
o
n
0
83 106 0
▁
T
h
e
0
19 32 0
▁
h
u
m
a
n
0
13 21 0
▁
o
n
l
y
0
45 61 0
▁
i
n
0
78 100 0
▁
H
i
s
0
29 42 0
▁
t
h
a
t
0
75 96 0
▁
t
h
e
0
97 122 0
▁
E
v
e
n
t
u
a
l
0
58 75 0
▁
O
n
e
0
42 57 0
▁
P
e
o
p
l
e
0
33 48 0
▁
o
f
t
e
n
0
81 104 0
▁
B
0
86 110 0
▁
s
c
h
o
o
l
s
0
26 39 0
▁
O
t
h
e
r
0
62 81 0
▁
v
e
h
i
c
l
e
s
0
49 66 0
▁
a
c
c
o
u
n
t
0
66 86 0
▁
H
o
w
e
v
e
r
0
85 108 0
▁
T
h
e
0
80 103 0
▁
S
o
0
106 134 0
▁
N
o
0
96 121 0
▁
O
0
52 69 0
▁
i
n
0
35 50 0
▁
d
i
d
n
0
11 18 0
▁
O
p
e
n
0
84 107 0
▁
S
h
e
0
12 19 0
▁
w
a
s
0
5 10 0
▁
c
e
n
t
u
r
i
0
95 1

KeyboardInterrupt: 

## Make predictions

In [69]:
# Restrict the test set to the corpus selected by `text_name`.
#
# This cell overwrites `test_df`, so re-running it after changing `text_name`
# (without re-reading the CSV) finds no matching rows. Fail fast with an
# actionable message instead of handing an empty frame to the model, and leave
# `test_df` untouched in that case.
corpus_mask = test_df["text_name"] == text_name
if not corpus_mask.any():
    raise ValueError(
        f"No rows with text_name == {text_name!r} in test_df; "
        "re-run the data-loading cell before filtering for a different corpus."
    )
test_df = test_df[corpus_mask]

In [70]:
# Run the trainer's test routine on the filtered test set and keep its result
# (presumably a DataFrame of per-word predictions — confirm in src.model).
# NOTE(review): the recorded run raised
# "ValueError: need at least one array to concatenate"; presumably `test_df`
# had no rows for the selected corpus at that point — verify.
predict_df = model_trainer.test(test_df)

ValueError: need at least one array to concatenate

In [71]:
# Display the predictions.
# NOTE(review): the recorded output lists ZuCo2 rows although `text_name` is
# 'Provo' and the preceding test() cell raised, so this appears to be a stale
# `predict_df` left in the kernel from an earlier run — confirm with
# Restart Kernel -> Run All.
predict_df

Unnamed: 0,language,sentence_id,word_id,word,text_name,FFDAvg,FFDStd,TRTAvg,TRTStd
994,en,3,0.0,With,ZuCo2,14.201734,6.923609,19.918240,14.100842
995,en,3,1.0,his,ZuCo2,12.138766,3.690258,18.772739,9.896083
996,en,3,2.0,interest,ZuCo2,12.235614,4.245877,20.214228,13.504408
997,en,3,3.0,in,ZuCo2,12.606339,4.326952,15.290634,6.981796
998,en,3,4.0,race,ZuCo2,13.503754,5.206844,20.432705,12.336987
...,...,...,...,...,...,...,...,...,...
2116,en,346,22.0,known,ZuCo2,12.836598,4.043539,17.862795,8.498557
2117,en,346,23.0,as,ZuCo2,12.745499,4.463977,13.680372,5.261281
2118,en,346,24.0,the,ZuCo2,11.971578,3.904126,13.483535,5.500702
2119,en,346,25.0,Bush,ZuCo2,12.097002,3.669006,15.308820,6.885437


In [None]:
# Write the predictions for the selected corpus to "<text_name>_predictions.csv".
#
# `predict_df` can survive in the kernel from a run on a different corpus, in
# which case it would be saved under the wrong file name. Check that it contains
# rows for `text_name` only before writing.
# Assumes `predict_df` carries a `text_name` column, as in the recorded output
# of this notebook — TODO confirm against src.model.
predicted_corpora = set(predict_df["text_name"].unique())
if predicted_corpora != {text_name}:
    raise ValueError(
        f"predict_df holds predictions for {sorted(predicted_corpora)}, "
        f"expected only {text_name!r}; re-run the prediction cell before saving."
    )
predict_df.to_csv(text_name+"_predictions.csv", index=False)

In [None]:
# Score the predictions with the project's evaluation metric.
# NOTE(review): `predict_df` is produced from `test_df` (test_task1.csv), while
# `valid_df` is the dev split and is not filtered by `text_name`. Confirm that
# evaluate() aligns rows between the two frames, or that predictions were meant
# to be generated on the dev split for this comparison.
src.eval_metric.evaluate(predict_df, valid_df)