# **CommonLit Readability**

### **Import libraries**

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pylab import rcParams
import torch
import torch.nn.functional as F
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup, RobertaTokenizer, RobertaModel
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from torch.utils import data
from torch import nn, optim
from collections import defaultdict
import warnings
%matplotlib inline

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
pd.set_option("display.max_colwidth", None)

In [None]:
warnings.filterwarnings("ignore")

## **Data preparation**

### **Load data**

#### Load and describe the training data

In [None]:
filename = "../input/commonlitreadabilityprize/train.csv"
df_train = pd.read_csv(filename)
df_train = df_train.drop(["url_legal", "license"], axis = 1)

In [None]:
df_train.head(1)

In [None]:
df_train.info()

In [None]:
df_train['target'].describe()

In [None]:
std = df_train['target'].std()
mean = df_train['target'].mean()
print('mean:', mean)
print('std: ', std)

#### Load the test data

In [None]:
filename = "../input/commonlitreadabilityprize/test.csv"
df_test = pd.read_csv(filename)
df_test = df_test.drop(["url_legal", "license"], axis = 1)

In [None]:
df_test.head(1)

### **Transform data**

In [None]:
def to_string(row_text):
  """Flatten a multi-line excerpt into one line.

  Every newline character is turned into a single space and one extra space
  is put in front of the text, so a two-line input "a", "b" becomes " a b".

  Args:
    row_text: the excerpt as a string, possibly containing newlines.

  Returns:
    The single-line string (always starting with one space).
  """
  return " " + row_text.replace('\n', ' ')

#### Remove new lines from the training data

In [None]:
df_train['excerpt'] = df_train['excerpt'].apply(to_string)

In [None]:
df_train.head(1)

#### Remove new lines from the test data

In [None]:
df_test['excerpt'] = df_test['excerpt'].apply(to_string)

In [None]:
df_test.head(1)

### **Exploratory Data Analysis**

In [None]:
sns.set_style("darkgrid")
rcParams['figure.figsize'] = 9, 6

In [None]:
# Density of the readability targets.
# `fill` is the current name of the keyword; `shade` is deprecated in seaborn.
sns.kdeplot(df_train.target, fill=True, color="r")
plt.xlabel('Average ratings')
plt.show()

In [None]:
# Density of the per-excerpt standard errors.
# `fill` is the current name of the keyword; `shade` is deprecated in seaborn.
sns.kdeplot(df_train.standard_error, fill=True, color="r")
plt.xlabel('Standard errors')
plt.show()

In [None]:
x=df_train['target']
y=df_train['standard_error']
plt.scatter(x=x, y=y)
plt.annotate("remove", xy=(0, 0), arrowprops=dict(facecolor='orange', shrink=0.05), 
             xytext=(0.6, 0.3), textcoords='axes fraction', fontsize=12, weight='bold',
             horizontalalignment='right', verticalalignment='top', color='orange')
plt.xlabel('Targets')
plt.ylabel('Standard errors')
plt.show()

In [None]:
# Index labels of the rows whose target is exactly 0 (the point annotated
# "remove" in the scatter plot), then the frame without those rows.
ind = df_train.index[df_train['target'] == 0]
df_train = df_train.drop(index=ind)

Bin the target column into two groups using the standard deviation of the targets: excerpts whose target lies within one standard deviation of the mean form _group: 0_, and excerpts whose target is more than one standard deviation away from the mean form _group: 1_.

In [None]:
lower_bound = mean - std
upper_bound = mean + std
lower_bound, upper_bound

In [None]:
plt.scatter(x=df_train['target'], y=df_train['standard_error'])

plt.axvline(x=lower_bound, ymin=0, ymax=1, linewidth=1.5, linestyle="--", color='darkorchid')
plt.axvline(x=upper_bound, ymin=0, ymax=1, linewidth=1.5, linestyle="--", color='darkorchid')

plt.text(x=-1.3, y=0.60, s='group: 0', size='large', color='red')
plt.text(x=-3.3, y=0.45, s='group: 1', size='large', color='red')
plt.text(x=0.8, y=0.45, s='group: 1', size='large', color='red')

plt.xlabel('Targets')
plt.ylabel('Standard errors')
plt.show()

In [None]:
min_value = df_train["target"].min()
max_value = df_train["target"].max()
print("min: ",  min_value)
print("max: ",  max_value)

### **Group the data**

#### Add group name to the dataframe.

In [None]:
def group_by(row):
  """Assign a target value to group 0 or group 1.

  Args:
    row: a single target value.

  Returns:
    1 when the value is below ``lower_bound`` or above ``upper_bound``
    (both read from the notebook namespace), otherwise 0.
  """
  return int(row < lower_bound or row > upper_bound)

In [None]:
df_train['group'] = df_train['target'].apply(group_by)
df_train.head(1)

In [None]:
print("group 0:", len(df_train[df_train['group'] == 0]))
print("group 1:", len(df_train[df_train['group'] == 1]))

In [None]:
# `df_bert_embed` is not created anywhere in this notebook, so an unguarded
# lookup raises NameError on "Restart & Run All". Only compute the embedding
# width when that frame actually exists in the session; otherwise leave
# `input_size` as None.
_bert_embed = globals().get("df_bert_embed")
input_size = (
    len(_bert_embed['last_hidden_states'].head(1)[0])
    if _bert_embed is not None
    else None
)
input_size

### **Choose sequence length**

In [None]:
PRE_TRAINED_MODEL = 'roberta-base'

In [None]:
tokenizer = RobertaTokenizer.from_pretrained(PRE_TRAINED_MODEL, do_lower_case=True)

#### Sequence length for the training and test data

In [None]:
%%time

# For each data set: count the tokens of every excerpt, report the shortest and
# longest excerpt, and plot the distribution of token counts.
for dataset_label, frame in (("training data", df_train), ("test data", df_test)):
  excerpt_tokens = [len(tokenizer.tokenize(text)) for text in frame.excerpt]

  min_tokens, max_tokens = min(excerpt_tokens), max(excerpt_tokens)
  print(dataset_label,":")
  print("-" * 100)
  print('min ve max tokens:', min_tokens, max_tokens)
  print('\n')

  sns.histplot(excerpt_tokens)
  # Leave a margin of 50 tokens on both sides of the observed range.
  plt.xlim([min_tokens-50, max_tokens+50])
  plt.xlabel('Token count')
  plt.show()

  print('\n')


## **Dataset creation**

### **Parameters**

In [None]:
BS = 4
MAX_LEN = 320
EPOCHS = 5
RANDOM_SEED = 42
BIAS = False
SPLIT_RATIO = 0.2
DROPOUT = 0.3

WD = 0
LEARNING_RATE = 1e-6
NUM_WARMUP_PERCENTAGE = 0.1

In [None]:
# Remembers the scheduler handed out last, together with the optimizer and the
# data loader it was built for, so per-epoch calls continue a single schedule.
_scheduler_state = {}

def define_scheduler(data_loader):
  """Return the linear warm-up / linear decay scheduler for a training run.

  The schedule spans ``len(data_loader) * EPOCHS`` optimizer steps, of which
  the first ``NUM_WARMUP_PERCENTAGE`` fraction is warm-up. The training
  functions call this at the start of every epoch; building a fresh scheduler
  each time would restart the warm-up every epoch and never reach the later
  part of the decay. Therefore the scheduler created for a given
  (optimizer, data_loader) pair is reused until it has been stepped
  ``total_steps`` times.

  Args:
    data_loader: the training data loader; its length is the number of
      optimizer steps per epoch.

  Returns:
    The scheduler bound to the notebook-level ``optimizer``.
  """
  total_steps = len(data_loader) * EPOCHS

  scheduler = _scheduler_state.get("scheduler")
  same_run = (
      scheduler is not None
      and _scheduler_state.get("optimizer") is optimizer
      and _scheduler_state.get("data_loader") is data_loader
      # A fully consumed schedule means a new run is starting (e.g. the
      # training cell was executed again), so a new scheduler is needed.
      and scheduler.last_epoch < total_steps
  )
  if same_run:
    return scheduler

  scheduler = get_linear_schedule_with_warmup(
      optimizer,
      num_warmup_steps=int(NUM_WARMUP_PERCENTAGE*total_steps),
      num_training_steps=total_steps
  )
  _scheduler_state.update(
      scheduler=scheduler, optimizer=optimizer, data_loader=data_loader
  )

  return scheduler

### **Train-validation split**

In [None]:
def split_train_val(full_data, split_ratio):
  """Randomly split a frame into a training part and a validation part.

  The split is seeded with ``RANDOM_SEED`` and the shape of both parts is
  printed.

  Args:
    full_data: the data to split.
    split_ratio: passed to ``train_test_split`` as ``test_size``.

  Returns:
    A ``(training part, validation part)`` tuple.
  """
  parts = train_test_split(full_data, test_size=split_ratio, random_state=RANDOM_SEED)

  for caption, part in zip(("training data:", "validation data:"), parts):
    print(caption, part.shape)

  return parts[0], parts[1]

### **Encoding**

In [None]:
class ExcerptDataset(data.Dataset):
  """Map-style dataset that tokenizes one excerpt per item.

  Args:
    ids: indexable collection of excerpt ids.
    excerpts: indexable collection of excerpt texts, aligned with ``ids``.
    labels: indexable collection of labels, aligned with ``ids``; every entry
      must offer ``reshape`` (e.g. a NumPy scalar).
    label_dtype: torch dtype of the label tensor returned per item.
    tokenizer: object providing ``encode_plus`` (a Hugging Face tokenizer).
    max_len: length every encoded excerpt is padded or truncated to.
  """

  def __init__(self, ids, excerpts, labels, label_dtype, tokenizer, max_len):
    self.ids = ids
    self.excerpts = excerpts
    self.labels = labels
    self.label_dtype = label_dtype
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    """Number of excerpts in the dataset."""
    return len(self.excerpts)

  def __getitem__(self, item):
    """Encode the excerpt at position ``item``.

    Returns:
      A dict with the excerpt id and text (as ``str``), the flattened
      ``input_ids`` and ``attention_mask`` tensors, and the label as a
      one-element tensor of ``label_dtype``.
    """
    excerptid = str(self.ids[item])
    excerpt = str(self.excerpts[item])
    label = self.labels[item]

    # Use the tokenizer this dataset was constructed with, not whatever object
    # happens to be bound to the notebook-level name `tokenizer`.
    encoding = self.tokenizer.encode_plus(
        excerpt,
        max_length=self.max_len,
        truncation=True,
        add_special_tokens=True,
        padding='max_length',
        return_attention_mask=True,
        return_token_type_ids=False,
        return_tensors='pt'
    )

    return {
      'id': excerptid,
      'excerpt_text': excerpt,
      # return_tensors='pt' adds a leading batch axis; flatten() removes it.
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      # reshape(-1,) turns the scalar label into a one-element vector.
      'label': torch.tensor(label.reshape(-1,), dtype=self.label_dtype)
    }

### **Create data loaders**

In [None]:
def create_data_loader(df, label_name, dtype, tokenizer, max_len, batch_size=4, shuffle=True, num_workers=2):
  """Wrap a dataframe in an ``ExcerptDataset`` and a ``DataLoader``.

  A preview of the first batch (keys, tensor shapes and contents) is printed
  unless the frame has no rows.

  Args:
    df: frame with ``id`` and ``excerpt`` columns plus the label column.
    label_name: name of the column used as label.
    dtype: torch dtype of the label tensors.
    tokenizer: tokenizer handed to the dataset.
    max_len: padded / truncated sequence length.
    batch_size: number of excerpts per batch.
    shuffle: whether the loader reshuffles the data.
    num_workers: number of loader worker processes.

  Returns:
    The data loader.
  """
  dataset = ExcerptDataset(
    ids = df.id.to_numpy(),
    excerpts=df.excerpt.to_numpy(),
    labels=df[label_name].to_numpy(),
    label_dtype=dtype,
    tokenizer=tokenizer,
    max_len=max_len
  )

  # A frame can be empty, e.g. when every test excerpt was classified into the
  # other group. There is nothing to shuffle or preview then, and fetching a
  # first batch from an empty loader would raise.
  has_rows = len(dataset) > 0

  data_loader = data.DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=shuffle and has_rows,
    num_workers=num_workers
  )

  if not has_rows:
    print("empty dataset: no batch to preview")
    return data_loader

  data_item = next(iter(data_loader))

  print(data_item.keys())
  print('\n')
  print(data_item['input_ids'].shape)
  print(data_item['attention_mask'].shape)
  print(data_item['label'].shape)
  print('\n')
  print("input_ids:", data_item['input_ids'])
  print("attention_mask:", data_item['attention_mask'])
  print("labels:", data_item['label'])

  return data_loader

## **Training, Evaluation and Prediction**

### **1. Classification**

#### **Split train-validation for the classification model**

In [None]:
class_train_data, class_val_data = split_train_val(df_train, SPLIT_RATIO)

#### **Create data loaders for the classification model**

In [None]:
label_name = "group" 
dtype = torch.float

class_train_data_loader = create_data_loader(class_train_data, label_name, dtype, tokenizer, MAX_LEN, batch_size=BS, shuffle=True)
class_val_data_loader = create_data_loader(class_val_data, label_name, dtype, tokenizer, MAX_LEN, batch_size=BS, shuffle=True)

#### **Classify the data**

#### Classification model

In [None]:
class ExcerptClassification(nn.Module):
  """Binary group classifier: RoBERTa encoder, dropout, linear layer, sigmoid."""

  def __init__(self):
    super().__init__()
    self.roberta = RobertaModel.from_pretrained(PRE_TRAINED_MODEL)
    self.drop = nn.Dropout(p=DROPOUT)
    # One output unit per excerpt.
    self.linear = nn.Linear(self.roberta.config.hidden_size, 1)
    self.sigmoid = nn.Sigmoid()

  def forward(self, input_ids, attention_mask):
    """Return one sigmoid-activated score per input sequence."""
    # With return_dict=False the encoder returns a tuple; only its second
    # element is used as the sequence representation.
    _unused, sequence_repr = self.roberta(
        input_ids=input_ids,
        attention_mask=attention_mask,
        return_dict=False,
    )
    return self.sigmoid(self.linear(self.drop(sequence_repr)))

In [None]:
class_model = ExcerptClassification()

class_model = class_model.to(device)

#### Optimizer

In [None]:
# WD is declared and reported as a hyper-parameter of the run, so it has to be
# handed to the optimizer to actually take effect.
optimizer = AdamW(class_model.parameters(), lr=LEARNING_RATE, weight_decay=WD, correct_bias=BIAS)

# Binary cross-entropy on the sigmoid outputs of the classification model.
loss_fn = nn.BCELoss().to(device)

#### Training

In [None]:
def class_train(class_model, data_loader, loss_fn, optimizer, device, no_samples):
    """Train the classification model for one pass over ``data_loader``.

    Args:
        class_model: model called with ``input_ids`` and ``attention_mask``.
        data_loader: yields dicts with ``input_ids``, ``attention_mask`` and
            ``label`` tensors.
        loss_fn: called as ``loss_fn(outputs, labels)``.
        optimizer: optimizer stepped once per batch.
        device: device the batch tensors are moved to.
        no_samples: denominator used for the accuracy.

    Returns:
        ``(accuracy, mean batch loss)``; predictions are the rounded outputs.
    """
    scheduler = define_scheduler(data_loader)
    class_model = class_model.train()
    batch_losses = []
    n_correct = 0

    for batch in data_loader:
        groups = batch['label'].to(device)
        outputs = class_model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
        )

        loss = loss_fn(outputs, groups)
        batch_losses.append(loss.item())
        n_correct += torch.sum(outputs.round() == groups)

        # Back-propagate, update the weights, advance the learning-rate
        # schedule, then clear the gradients for the next batch.
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return n_correct.double() / no_samples, np.mean(batch_losses)

#### Evaluation

In [None]:
def class_evaluate(class_model, data_loader, loss_fn, device, no_samples):
    """Evaluate the classification model on ``data_loader`` without gradients.

    Args:
        class_model: model called with ``input_ids`` and ``attention_mask``.
        data_loader: yields dicts with ``input_ids``, ``attention_mask`` and
            ``label`` tensors.
        loss_fn: called as ``loss_fn(outputs, labels)``.
        device: device the batch tensors are moved to.
        no_samples: denominator used for the accuracy.

    Returns:
        ``(accuracy, mean batch loss, real_values, predicted_values)`` where
        the last two are lists with one tensor per evaluated sample.
    """
    class_model = class_model.eval()
    batch_losses, real_values, predicted_values = [], [], []
    n_correct = 0

    with torch.no_grad():
        for batch in data_loader:
            groups = batch['label'].to(device)
            outputs = class_model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
            )
            # Rounded outputs are the predicted group labels.
            predictions = outputs.round()

            batch_losses.append(loss_fn(outputs, groups).item())
            n_correct += torch.sum(predictions == groups)
            real_values.extend(groups)
            predicted_values.extend(predictions)

    return n_correct.double() / no_samples, np.mean(batch_losses), real_values, predicted_values

#### Run the model

In [None]:
%%time

# Per-epoch history of the classification run and the best validation accuracy
# seen so far.
results = defaultdict(list)
best_accuracy = 0

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 60)

    # One training pass over the training loader.
    class_train_acc, class_train_loss = class_train(
    class_model,
    class_train_data_loader,
    loss_fn, 
    optimizer,
    device,
    len(class_train_data)
    )

    print(f'Train loss {class_train_loss} accuracy {class_train_acc}')

    # Evaluation on the validation loader; also returns the true and predicted
    # labels of every validation sample.
    class_val_acc, class_val_loss, real_values, predicted_values = class_evaluate(
    class_model, 
    class_val_data_loader, 
    loss_fn, 
    device,
    len(class_val_data)
    )

    print(f'Val   loss {class_val_loss} accuracy {class_val_acc}')
    print('\n')

    results['class_train_acc'].append(class_train_acc)
    results['class_train_loss'].append(class_train_loss)
    results['class_val_acc'].append(class_val_acc)
    results['class_val_loss'].append(class_val_loss)

    # Keep the weights, and the labels / predictions used by the classification
    # report below, of the epoch with the highest validation accuracy.
    if class_val_acc > best_accuracy:
        name = "best_classification_model.pt"
        torch.save(class_model.state_dict(), './' + name)
        y = [x.item() for x in real_values]
        yhat = [x.item() for x in predicted_values]
        best_accuracy = class_val_acc

In [None]:
train_accuracy = results['class_train_acc']
validation_accuracy = results['class_val_acc']

max_train_accuracy= max(train_accuracy).item()
max_val_accuracy = max(validation_accuracy).item()

print("maximum train accuracy:", max_train_accuracy)
print("maximum validation accuracy:", max_val_accuracy)

print('\n')

print("model: ", PRE_TRAINED_MODEL)
print("batch size:", BS)
print("maximum sequence length:", MAX_LEN)
print("number of epochs:", EPOCHS)
print("random seed:", RANDOM_SEED)
print("learning rate:", LEARNING_RATE)
print("weight decay:", WD)
print("warmup percentage:", NUM_WARMUP_PERCENTAGE)
print("bias correction:", BIAS)
print("dropout:", DROPOUT)
print("split ratio:", SPLIT_RATIO)

#### Plot accuracy

In [None]:
plt.plot(results['class_train_acc'], label='classification train accuracy')
plt.plot(results['class_val_acc'], label='classification validation accuracy')
plt.title('Training results')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()
plt.ylim([0, 1])
plt.axhline(y=max_val_accuracy, linewidth=1.5, linestyle="--", color='darkorchid')

plt.show()

#### Classification report

In [None]:
group_names = [str(0), str(1)]

In [None]:
print(classification_report(y, yhat, target_names=group_names))

In [None]:
cm = confusion_matrix(y, yhat)
df_cm = pd.DataFrame(cm, index=group_names, columns=group_names)

hmap = sns.heatmap(df_cm, annot=True, fmt="d", cmap="Blues")
hmap.yaxis.set_ticklabels(hmap.yaxis.get_ticklabels(), rotation=0, ha='right')
hmap.xaxis.set_ticklabels(hmap.xaxis.get_ticklabels(), rotation=30, ha='right')
plt.ylabel('True group')
plt.xlabel('Predicted group')
plt.show()

#### **Predict for the validation and test data**

#### Load the best model

In [None]:
class_model = ExcerptClassification()
class_model.load_state_dict(torch.load('../input/commonlit-readability/best_classification_model_0.7743.pt', map_location=device))
class_model = class_model.to(device)

#### Create the data loader for the test data

In [None]:
df_test['group'] = 100
label_name = 'group'
dtype = torch.float

predict_test_data_loader = create_data_loader(df_test, label_name, dtype, tokenizer, MAX_LEN, batch_size=BS, shuffle=False)

#### Predict

In [None]:
def find_class_predictions(class_model, data_loader):
    """Run the classification model over ``data_loader`` and collect results.

    Args:
        class_model: model called with ``input_ids`` and ``attention_mask``.
        data_loader: yields dicts with ``id``, ``excerpt_text``, ``input_ids``,
            ``attention_mask`` and ``label`` entries.

    Returns:
        ``(ids, excerpts, predicted groups, labels)`` as four lists; the
        predicted groups are the rounded model outputs converted to ``int``.
    """
    ids, texts, groups, targets = [], [], [], []

    class_model = class_model.eval()

    with torch.no_grad():
        for batch in data_loader:
            batch_ids = batch['id']
            batch_texts = batch['excerpt_text']

            scores = class_model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
            )

            groups.extend(int(score.item()) for score in scores.round())
            ids.extend(batch_ids)
            texts.extend(batch_texts)

            labels = batch['label']
            if labels is not None:
                targets.extend(value.item() for value in labels)

    return ids, texts, groups, targets

#### Predict the class for the test data

In [None]:
id_test, excerpt_test, output_test, targets_test = find_class_predictions(class_model, predict_test_data_loader)

#### Return the output table

In [None]:
def output(id_data, excerpt_data, output_data, target_data):
  """Combine four parallel sequences into one results table.

  Args:
    id_data: excerpt ids.
    excerpt_data: excerpt texts.
    output_data: values stored in the ``group`` column.
    target_data: values stored in the ``target`` column.

  Returns:
    A DataFrame with the columns ``id``, ``excerpt``, ``group`` and
    ``target``; rows are paired positionally and stop at the shortest input.
  """
  column_names = ['id', 'excerpt', 'group', "target"]
  rows = list(zip(id_data, excerpt_data, output_data, target_data))
  return pd.DataFrame(rows, columns=column_names)

In [None]:
df_test_output = output(id_test, excerpt_test, output_test, targets_test)
df_test_output

#### **Save the results**

In [None]:
df_test_output.to_csv('./classified_test.csv')

### **2. Regression**

#### **Training dataset creation**

#### Split the data across groups

In [None]:
data_0 = df_train[df_train['group'] == 0]
data_1 = df_train[df_train['group'] == 1]

#### Train-validation split

In [None]:
reg_train_data_0, reg_val_data_0 = split_train_val(data_0, SPLIT_RATIO)
reg_train_data_1, reg_val_data_1 = split_train_val(data_1, SPLIT_RATIO)

#### **Create data loaders for the regression model**

In [None]:
label_name = "target"
dtype = torch.float

reg_train_data_loader_0 = create_data_loader(reg_train_data_0, label_name, dtype, tokenizer, MAX_LEN, batch_size=BS, shuffle=True)
reg_val_train_data_loader_0 = create_data_loader(reg_val_data_0, label_name, dtype, tokenizer, MAX_LEN, batch_size=BS, shuffle=True)

reg_train_data_loader_1 = create_data_loader(reg_train_data_1, label_name, dtype, tokenizer, MAX_LEN, batch_size=BS, shuffle=True)
reg_val_train_data_loader_1 = create_data_loader(reg_val_data_1, label_name, dtype, tokenizer, MAX_LEN, batch_size=BS, shuffle=True)

#### **Implement regression**

#### Regression model

In [None]:
class ExcerptRegression(nn.Module):
  """Readability regressor: RoBERTa encoder, dropout, single linear output."""

  def __init__(self):
    super().__init__()
    self.roberta = RobertaModel.from_pretrained(PRE_TRAINED_MODEL)
    self.drop = nn.Dropout(p=DROPOUT)
    # One unbounded output value per excerpt.
    self.linear = nn.Linear(self.roberta.config.hidden_size, 1)

  def forward(self, input_ids, attention_mask):
    """Return one regression value per input sequence."""
    # With return_dict=False the encoder returns a tuple; only its second
    # element is used as the sequence representation.
    _unused, sequence_repr = self.roberta(
        input_ids=input_ids,
        attention_mask=attention_mask,
        return_dict=False,
    )
    return self.linear(self.drop(sequence_repr))

In [None]:
reg_model = ExcerptRegression()

reg_model = reg_model.to(device)

#### **Define loss function RMSE**

In [None]:
class RMSELoss(nn.Module):
    """Root-mean-squared-error loss: the square root of ``nn.MSELoss``."""

    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()

    def forward(self,pred,y):
        """Return ``sqrt(mean((pred - y) ** 2))`` as a scalar tensor."""
        mean_squared_error = self.mse(pred, y)
        return mean_squared_error.sqrt()

#### Optimizer

In [None]:
LEARNING_RATE = 2e-5

In [None]:
# WD is declared and reported as a hyper-parameter of the run, so it has to be
# handed to the optimizer to actually take effect.
optimizer = AdamW(reg_model.parameters(), lr=LEARNING_RATE, weight_decay=WD, correct_bias=BIAS)

# Root-mean-squared error between predicted and true targets.
loss_fn = RMSELoss().to(device)

#### Training

In [None]:
def train(reg_model, data_loader, loss_fn, optimizer, device):
    """Train the regression model for one pass over ``data_loader``.

    Args:
        reg_model: model called with ``input_ids`` and ``attention_mask``.
        data_loader: yields dicts with ``input_ids``, ``attention_mask`` and
            ``label`` tensors.
        loss_fn: called as ``loss_fn(predictions, targets)`` on 1-D tensors.
        optimizer: optimizer stepped once per batch.
        device: device the batch tensors are moved to.

    Returns:
        The mean of the per-batch losses.
    """
    scheduler = define_scheduler(data_loader)
    reg_model = reg_model.train()
    batch_losses = []

    for batch in data_loader:
        targets = batch["label"].to(device)
        outputs = reg_model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
        )

        # Flatten both sides so predictions and targets pair up element-wise.
        loss = loss_fn(outputs.view(-1), targets.view(-1))
        batch_losses.append(loss.item())

        loss.backward()
        # Cap the gradient norm at 1.0 before the weight update.
        nn.utils.clip_grad_norm_(reg_model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return np.mean(batch_losses)

#### Evaluation

In [None]:
def evaluate(reg_model, data_loader, loss_fn, device):
    """Compute the mean per-batch loss of the regression model, without gradients.

    Args:
        reg_model: model called with ``input_ids`` and ``attention_mask``.
        data_loader: yields dicts with ``input_ids``, ``attention_mask`` and
            ``label`` tensors.
        loss_fn: called as ``loss_fn(predictions, targets)`` on 1-D tensors.
        device: device the batch tensors are moved to.

    Returns:
        The mean of the per-batch losses.
    """
    reg_model = reg_model.eval()
    batch_losses = []

    with torch.no_grad():
        for batch in data_loader:
            targets = batch["label"].to(device)
            outputs = reg_model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
            )
            # Flatten both sides so predictions and targets pair up element-wise.
            batch_loss = loss_fn(outputs.view(-1), targets.view(-1))
            batch_losses.append(batch_loss.item())

    return np.mean(batch_losses)

#### Run the model

In [None]:
%%time

results_by_group = {}
training_data = {'group_0': [reg_train_data_loader_0, reg_val_train_data_loader_0], 
                 'group_1': [reg_train_data_loader_1, reg_val_train_data_loader_1]}

for key, value in training_data.items():
    train_loader, val_loader = value

    # Every group gets its own model and optimizer. Reusing one model for both
    # groups would start group_1 from the weights (and optimizer state) left
    # behind by group_0, although the checkpoints are saved and later loaded as
    # two independent per-group models.
    # `optimizer` is rebound at notebook level on purpose: define_scheduler()
    # reads it from there.
    reg_model = ExcerptRegression().to(device)
    optimizer = AdamW(reg_model.parameters(), lr=LEARNING_RATE, weight_decay=WD, correct_bias=BIAS)

    epoch_results = defaultdict(list)
    # Any finite validation loss beats the initial value.
    min_loss = float("inf")

    print("TRAINING RESULTS FOR ", key, ":")
    print('*' * 50)
    print('\n')

    for epoch in range(EPOCHS):
        print(f'Epoch {epoch + 1}/{EPOCHS}')
        print('-' * 50)

        train_loss = train(reg_model, train_loader, loss_fn, optimizer, device)

        print(f'Train loss for: {train_loss}')
        print('\n')

        val_loss = evaluate(reg_model, val_loader, loss_fn, device)

        print(f'Validation loss: {val_loss}')
        print('\n')

        epoch_results['train_loss'].append(train_loss)
        epoch_results['validation_loss'].append(val_loss)

        # Keep the weights of the epoch with the lowest validation loss.
        if val_loss < min_loss:
            name = f"best_regression_model_for_{key}.pt"
            torch.save(reg_model.state_dict(), './' + name)
            min_loss = val_loss

    results_by_group[key] = epoch_results

#### Plot the results

In [None]:
group_0 = results_by_group['group_0']
group_1 = results_by_group['group_1']

print(group_0['train_loss'])
print(group_0['validation_loss'])
print('\n')

print(group_1['train_loss'])
print(group_1['validation_loss'])
print('\n')

group_0_minval = min(group_0['validation_loss'])
group_1_minval = min(group_1['validation_loss'])

print("minimum validation loss for group_0:", group_0_minval)
print("minimum validation loss for group_1:", group_1_minval)

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18,9))
fig.suptitle('Loss Functions for Groups 0 & 1', fontsize=18, fontweight='bold')
fig.tight_layout(pad=8)

group_0_index = group_0['validation_loss'].index(group_0_minval)
group_1_index = group_1['validation_loss'].index(group_1_minval)

x = range(1, EPOCHS+1)

ya1 = group_0['train_loss']
ya2 = group_0['validation_loss']

ax1.plot(x, ya1, label='train loss')
ax1.plot(x, ya2, label='validation loss')
ax1.set_title('Training history for group_0', fontsize=15)
ax1.set_xlabel('Epoch', fontsize=12)
ax1.set_ylabel('Loss', fontsize=12)
ax1.legend()
ax1.axvline(x=x[group_0_index], ymin=0, ymax=1, linewidth=1.5, linestyle="--", color='darkorchid')
ax1.axhline(y = group_0_minval, linewidth=1.5, linestyle="--", color='darkorchid')

yb1 = group_1['train_loss']
yb2 = group_1['validation_loss']

ax2.plot(x, yb1, label='train loss')
ax2.plot(x, yb2, label='validation loss')
ax2.set_title('Training history for group_1', fontsize=15)
ax2.set_xlabel('Epoch', fontsize=12)
ax2.set_ylabel('Loss', fontsize=12)
ax2.legend()
ax2.axvline(x=x[group_1_index], ymin=0, ymax=1, linewidth=1.5, linestyle="--", color='darkorchid')
ax2.axhline(y = group_1_minval, linewidth=1.5, linestyle="--", color='darkorchid')

plt.subplots_adjust(wspace=0.2)

plt.show()

print('\n')

min_val = (group_0_minval + group_1_minval) / 2

print(f"Average minimum loss value: {min_val}")

#### **Predict the results**

#### Load the best model for group_0

In [None]:
reg_model_0 = ExcerptRegression()
# map_location remaps the stored tensors onto the device of this session, so a
# checkpoint written from a GPU session also loads on a CPU-only machine (the
# classification checkpoint is loaded the same way).
reg_model_0.load_state_dict(torch.load('../input/best-model/best_regression_model_for_group_0_0.4022.pt', map_location=device))
reg_model_0 = reg_model_0.to(device)

#### Load the best model for group_1

In [None]:
reg_model_1 = ExcerptRegression()
# map_location remaps the stored tensors onto the device of this session, so a
# checkpoint written from a GPU session also loads on a CPU-only machine (the
# classification checkpoint is loaded the same way).
reg_model_1.load_state_dict(torch.load('../input/best-model/best_regression_model_for_group_1_0.4256.pt', map_location=device))
reg_model_1 = reg_model_1.to(device)

#### Load the test data and split across groups

In [None]:
df_test_0 = df_test_output[df_test_output['group'] == 0]
df_test_1 = df_test_output[df_test_output['group'] == 1]

#### Create data loader of the test data

In [None]:
label_name = "group" 
dtype = torch.float
BS = 8

reg_test_data_loader_0 = create_data_loader(df_test_0, label_name, dtype, tokenizer, MAX_LEN, batch_size=BS, shuffle=False)
reg_test_data_loader_1 = create_data_loader(df_test_1, label_name, dtype, tokenizer, MAX_LEN, batch_size=BS, shuffle=False)

#### Predict

In [None]:
def find_regression_predictions(reg_model, data_loader):
    """Run a regression model over ``data_loader`` and collect its outputs.

    Args:
        reg_model: model called with ``input_ids`` and ``attention_mask``.
        data_loader: yields dicts with ``id``, ``excerpt_text``, ``input_ids``
            and ``attention_mask`` entries.

    Returns:
        ``(ids, excerpts, predictions)`` as three lists; the predictions are
        the flattened model outputs as Python floats.
    """
    ids, texts, predictions = [], [], []

    reg_model = reg_model.eval()

    with torch.no_grad():
        for batch in data_loader:
            batch_ids = batch['id']
            batch_texts = batch['excerpt_text']

            outputs = reg_model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
            )

            predictions.extend(outputs.flatten().tolist())
            ids.extend(batch_ids)
            texts.extend(batch_texts)

    return ids, texts, predictions

In [None]:
id_data_0, excerpt_data_0, output_data_0 = find_regression_predictions(reg_model_0, reg_test_data_loader_0)
id_data_1, excerpt_data_1, output_data_1 = find_regression_predictions(reg_model_1, reg_test_data_loader_1)

In [None]:
df_predictions_0 = pd.DataFrame(list(zip(id_data_0, excerpt_data_0, output_data_0)), columns =['id', 'excerpt', 'target'])
df_predictions_0

In [None]:
df_predictions_1 = pd.DataFrame(list(zip(id_data_1, excerpt_data_1, output_data_1)), columns =['id', 'excerpt', 'target'])
df_predictions_1

#### Merge predicted test dataframes

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two frames
# the same way and, with ignore_index=True, renumbers the rows from 0.
df_predictions = pd.concat([df_predictions_0, df_predictions_1], ignore_index=True)
df_predictions

#### Format the test dataframe

In [None]:
df_predictions = df_predictions.drop(["excerpt"], axis = 1)
df_predictions = df_predictions.set_index('id')
df_predictions

In [None]:
index_list = df_test['id'].tolist()
index_list

In [None]:
df_predictions = df_predictions.loc[index_list]
df_predictions = df_predictions.reset_index()
df_predictions

#### **Save the results**

In [None]:
df_predictions.to_csv('./submission.csv', index=False, float_format='%.6f')