# Imports and setup

In [1]:
import pandas as pd
import numpy as np
import os
import random
import re

# Data processing.
import constants # constants.py
import dataset # dataset.py
import torch

# Model.
import models # models.py
import torch.nn as nn
from transformers import DistilBertModel

# Training.
import training # training.py
import utils # utils.py

# If you make a code change that doesn't get picked up by
# Jupyter notebook, try reloading like below:
# import importlib
# importlib.reload(training)

# Read the data
Skip this section if you've already run the notebook once and have the CSVs locally.

In [29]:
# Load the combined data through the project's dataset.py helper.
# NOTE(review): shuffle=True with no seed visible here -- confirm whether the
# helper seeds its shuffle, otherwise the split below differs on every run.
data_df = dataset.read_multiple_datasets(
    [1, 2, 3],
    'Creativity_Combined',
    shuffle=True,
)

In [30]:
# Hold out every row from position 1000 onward as the test set
# (roughly 203 test examples set aside).
test_df = data_df.iloc[1000:]

In [31]:
# Show the first held-out row as a sanity check.
print(test_df.head(n=1))

                                                 text  label
82  The final idea would be running shoes that wou...  4.225


In [32]:
def scramble(text):
    """Return ``text`` with its whitespace-separated words in random order.

    The words are split on whitespace, drawn without replacement via
    ``random.sample`` (so every word is kept exactly once), and re-joined
    with single spaces.
    """
    tokens = text.split()
    shuffled = random.sample(tokens, k=len(tokens))
    return " ".join(shuffled)

# Build a copy of the test set whose 'text' column is word-shuffled;
# the other columns are carried over unchanged.
scrambled_test_df = test_df.assign(text=test_df['text'].apply(scramble))

print(scrambled_test_df.head(1))

                                                 text  label
82  calories or few have a you you to spent alert ...  4.225


In [33]:
# Persist the scrambled test set to disk without an index column or header row.
scrambled_test_df.to_csv(
    'scrambled_ktest.csv',
    index=False,
    header=False,
)

In [40]:
def word_rep(word):
    """Return a fixed stand-in word chosen only by the length of ``word``.

    Lengths 1 through 10 map to a stand-in of exactly that length; any
    length of 11 or more maps to "accidentally". A zero-length input is
    returned unchanged.
    """
    # stand_ins[k - 1] is the replacement for a word of length k.
    stand_ins = (
        "a",
        "an",
        "and",
        "andy",
        "antic",
        "accent",
        "ancient",
        "accident",
        "accidents",
        "accidental",
    )
    size = len(word)
    if size == 0:
        return word
    if size > len(stand_ins):
        return "accidentally"
    return stand_ins[size - 1]

def homogenize(text):
    """Replace every whitespace-separated word of ``text`` via ``word_rep``.

    The replaced words are joined back together with single spaces.
    """
    return " ".join(map(word_rep, text.split()))

# Build a copy of the test set whose 'text' column is homogenized;
# the other columns are carried over unchanged.
hom_test_df = test_df.assign(text=test_df['text'].apply(homogenize))
print(hom_test_df.head(1))

                                                 text  label
82  and antic andy antic an ancient antic andy ant...  4.225


In [41]:
# Persist the homogenized test set to disk without an index column or header row.
hom_test_df.to_csv(
    'hom_ktest.csv',
    index=False,
    header=False,
)

## Preprocessing and transform into torchtext Dataset format.

In [45]:
# The second argument selects the perturbed test file: use
# 'scrambled_ktest.csv' or 'hom_ktest.csv'.
# Only the test split is kept; the first return value is discarded.
_, test_dataset = dataset.get_train_test_datasets(
    'ktrain.csv',
    'hom_ktest.csv',
    add=False,
)

# Test the trained model on held-out dataset.

In [46]:
# Pick the GPU when one is available, otherwise fall back to the CPU,
# then build the test iterator on that device.
device = torch.device(
    'cuda' if torch.cuda.is_available() else 'cpu'
)
# NOTE(review): 8 is presumably the batch size -- confirm in training.py.
test_iterator = training.get_iterator(test_dataset, 8, device)

In [None]:
# Load the best saved checkpoint and score it on the held-out test iterator.
bert = DistilBertModel.from_pretrained(constants.WEIGHTS_NAME)
# NOTE(review): 0.2 is presumably the dropout probability -- confirm in models.py.
model = models.BERTLinear(bert, constants.OUTPUT_DIM, 0.2)
# map_location remaps the stored tensors onto `device`, so a checkpoint that
# was saved on a GPU can still be loaded when only the CPU is available.
model.load_state_dict(
    torch.load("linear_best_valid_loss.pt", map_location=device)
)
model.to(device)
model.eval()  # switch the model to evaluation mode
# If you change the criterion, make sure it matches with the training criterion in training.py
# reduction='sum' is the supported spelling of the deprecated size_average=False.
criterion = nn.MSELoss(reduction='sum')
criterion = criterion.to(device)
test_loss, test_corr = training.evaluate(model, test_iterator, criterion)
print(test_loss)
print(test_corr)

# Misc other stuff

Link to the trainer class: https://huggingface.co/transformers/main_classes/trainer.html



Default training arguments: https://huggingface.co/transformers/main_classes/trainer.html#transformers.TrainingArguments

Batch size per device: 8

Epoch: 3



This should be the model I used to generate my initial results: https://huggingface.co/transformers/model_doc/distilbert.html#distilbertforsequenceclassification
"DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks."