In [1]:
import numpy as np
import pickle
import bcolz

# Use GloVe embeddings
# Based on code by Martín Pellarolo
# https://medium.com/@martinpella/how-to-use-pre-trained-word-embeddings-in-pytorch-71ca59249f76
words = []
idx = 0
word2idx = {}
vectors = bcolz.carray(np.zeros(1), rootdir="6B.50.dat", mode='w')

# Go through GloVe embedding file and create dictionary of embeddings
with open("glove.6B.50d.txt", "rb") as file:
    for line in file:
        line = line.decode().split()
        word = line[0]
        words.append(word)
        word2idx[word] = idx
        idx += 1
        vect = np.array(line[1:]).astype(np.float)
        vectors.append(vect)

# Store embeddings in file for later use
vectors = bcolz.carray(vectors[1:].reshape((400000, 50)), rootdir="6B.50.dat", mode='w')
vectors.flush()
pickle.dump(words, open("6B.50_words.pkl", "wb"))
pickle.dump(word2idx, open("6B.50_idx.pkl", "wb"))

In [8]:
import bcolz
import os
import codecs
import math
from collections import Counter, defaultdict
import random
import pickle
import numpy as np
import torch

# Load GloVe embeddings
vectors = bcolz.open("6B.50.dat")[:]
words = pickle.load(open("6B.50_words.pkl", "rb"))
word2idx = pickle.load(open("6B.50_idx.pkl", "rb"))

# Create GloVe dictionary
glove = defaultdict(lambda: np.zeros(50))
for word in words:
    glove[word] = vectors[word2idx[word]]


levels = [0, 1, 2, 3, 4]
data = []
counts = {}
total_words = {}

# input text example class
# contains the raw text, label, and embeddings
class InputExample(object):

    def __init__(self, text, label, embeddings):
        self.text = text
        self.label = label
        self.embeddings = embeddings

    def summary(self):
        return "[" + str(self.label) + " : " + self.text[0:20] + "]"

# Function to load data
def load_data(n_data):
    global data
    data = []
    current_id = 0
    
    path = "articles"
    for filename in os.listdir(path):
        if filename.endswith(".txt"):
            file = open(os.path.join(path, filename), encoding="utf-8")
            
            # Extract level from file name
            level = float(filename[-5:-4])
            
            # Exclude the few level 5 examples
            if level == 5:
                continue
            
            # Preprocess data to work with pretrained embeddings
            text = file.read().lower()
            text = text.replace(",", " ,").replace("?", " ?").replace(".", " .").replace("!", " !")
            
            # Create list of embeddings of the text
            embeddings = []
            for word in text.split():
                embeddings.append(glove[word])
            embeddings = torch.Tensor(embeddings)
            
            # Create example object
            data.append(InputExample(text, level, embeddings))
            
            # Stop if enough examples were loaded
            current_id += 1
            if current_id >= n_data:
                break
    
    print("Data loaded.")


In [None]:
load_data(3000)
for i in range(10):
    print(data[i].summary())
    
# Check that the data is in the right format
print(len(data), "samples loaded")
print(data[0].embeddings)
print(data[0].label)

In [None]:
test_data = []
train_data = []

# Split data into train and test sets
def split_data():
    global test_data
    global train_data
    global data
    n_test_data = math.ceil(len(data)*0.1)
    test_data = data[0:n_test_data]
    train_data = data[n_test_data:]

split_data()
print(len(train_data), " training examples")
print(len(test_data), " test examples")

In [11]:
import torch, pickle, os, sys, random, time
from torch import nn, optim
import numpy as np

# RNN class, based on homework code
class RNNLM(nn.Module):
    def __init__(self, params):
        super(RNNLM, self).__init__()
        self.d_emb = params['d_emb']
        self.d_hid = params['d_hid']
        self.n_layer = 1
        self.batch_size = params['batch_size']
        
        self.rnn = nn.RNN(self.d_emb, self.d_hid, self.n_layer, batch_first=True)
        self.predict = nn.Linear(self.d_hid, 1)
        
    def forward(self, batch):
        
        # Pass input to RNN
        output, hn = self.rnn(batch)
        
        # Predict based on final RNN state
        prediction = self.predict(hn).squeeze(1).squeeze(1)
        
        return prediction

In [14]:
import dill

# Training loop, based on homework code
def train_lm(params, net):
    
    # MSE loss
    criterion = nn.MSELoss()
    criterion.to(device)
    
    optimizer = optim.Adam(net.parameters(), lr=params['learning_rate'])
   
    num_examples = len(train_data)
    
    for epoch in range(params['epochs']):
        ep_loss = 0.
        start_time = time.time()
        random.shuffle(train_data)
        net.train()
                 
        for i in range(num_examples):
            
            # Make 1-example batch
            batch = train_data[i].embeddings[None, :, :]
            
            # Get model prediction
            output = net(batch)
            
            # Get actual label
            target = torch.Tensor([train_data[i].label])
            
            # Calculate loss
            loss = criterion(output, target)
            
            # Optimize
            loss.backward()
            torch.nn.utils.clip_grad_norm_(net.parameters(), 3)
            optimizer.step()
            optimizer.zero_grad()
            ep_loss += loss
        
        net.eval()
        print('epoch: %d, loss: %0.2f, time: %0.2f sec ' % (epoch, ep_loss, time.time()-start_time))
        dill.dump_session('latest_epoch.db')

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

params = {}
params['d_emb'] = 50
params['d_hid'] = 64
params['batch_size'] = 1
params['epochs'] = 20
params['learning_rate'] = 0.001

RNNnet = RNNLM(params)
RNNnet.to(device)
train_lm(params, RNNnet)

In [142]:
# Evaluate model on test set

total_error = 0.0
total_squared_error = 0.0
total_correct = 0

for test_example in test_data:
    output, hn = RNNnet.rnn(test_example.embeddings[None, :, :])
    predicted = float(RNNnet.predict(hn).squeeze(1).squeeze(1))
    actual = test_example.label
    print(round(predicted), round(actual))
    error = abs(predicted - actual)
    total_error += error
    total_squared_error += error*error
    if round(predicted) == round(actual):
        total_correct += 1

print("Test MAE: ", total_error/len(test_data))
print("Test MSE: ", total_squared_error/len(test_data))
print("Test Accuracy: ", total_correct/len(test_data))

0 0
0 1
3 2
2 3
3 4
0 0
0 1
2 2
2 3
4 4
2 0
2 1
2 2
2 3
4 4
0 0
3 1
3 2
3 3
3 4
1 0
1 1
1 2
3 3
4 4
1 0
1 1
1 2
4 3
3 4
0 0
2 1
2 2
2 3
3 4
0 0
1 1
1 2
3 3
3 4
2 0
2 1
2 2
2 3
2 4
1 0
1 1
1 2
3 3
3 4
2 0
2 1
2 2
3 3
4 4
0 0
1 1
2 2
2 3
2 4
0 0
1 1
2 2
3 3
2 4
2 0
2 1
2 2
2 3
2 4
2 0
2 1
2 2
2 3
2 4
2 0
2 1
2 2
2 3
3 4
0 0
1 1
1 2
3 3
4 4
1 0
1 1
1 2
1 3
1 4
2 0
2 1
2 2
2 3
3 4
1 0
2 1
2 2
2 3
4 4
1 0
2 1
2 2
2 3
4 4
2 0
2 1
2 2
2 3
4 4
0 0
0 1
3 2
3 3
4 4
1 0
1 1
2 2
3 3
3 4
1 0
1 1
2 2
3 3
3 4
2 0
2 1
2 2
2 3
2 4
Test MAE:  0.773814968764782
Test MSE:  0.9784882211084528
Test Accuracy:  0.4230769230769231
