In [None]:
import os
import codecs
import math
from collections import Counter
import random

# Class labels as strings; the classifier's output layer has one unit per entry.
levels = [str(level) for level in range(5)]

# Example containers; load_data() appends InputExample objects to train_data.
train_data = list()
test_data = list()

# Bookkeeping dictionaries (nothing in the visible cells populates them).
counts = dict()
total_words = dict()

class InputExample:
    """A raw text sample together with its label."""

    def __init__(self, text, label):
        """Store the text and its label unchanged."""
        self.text, self.label = text, label

    def summary(self):
        """Return "[<label>,<first 20 characters of text>]" for quick inspection."""
        preview = self.text[:20]
        # join() (like string concatenation) requires label and text to be str.
        return "".join(("[", self.label, ",", preview, "]"))
    
def load_data():
    """Load up to 100 labelled articles from the "articles" directory into train_data.

    Only files ending in ".txt" are considered. The label is the single
    character immediately before the ".txt" extension; files whose label
    is "5" are skipped. Each accepted file becomes one InputExample(text, label).

    train_data is emptied first, so re-running the cell replaces the loaded
    examples instead of appending duplicates.

    Raises:
        FileNotFoundError: if the "articles" directory does not exist.
    """
    max_examples = 100
    path = "articles"

    # Start from a clean slate so the call is idempotent across cell re-runs.
    train_data.clear()

    for filename in os.listdir(path):
        # Ignore everything that is not an article file; previously such entries
        # fell through and re-used the file handle and label of an earlier file.
        if not filename.endswith(".txt"):
            continue

        level = filename[-5:-4]
        if level == "5":
            continue

        # The context manager closes the handle even if reading fails.
        with open(os.path.join(path, filename), encoding="utf-8") as file:
            text = file.read()

        train_data.append(InputExample(text, level))
        if len(train_data) >= max_examples:
            break

    print("Data loaded.")

In [None]:
load_data()

# Preview the first example. Guard the lookup so an empty corpus yields a
# readable message instead of an IndexError traceback.
if train_data:
    print(train_data[0].summary())
else:
    print("No training examples were loaded.")
print(len(train_data))

In [None]:
# Encoded examples; create_features() appends one InputFeatures per training example.
features = []


class InputFeatures:
    """Container for the encoded form of a single example."""

    def __init__(self, input_ids, input_mask, segment_ids, label_ids):
        """Store the four fields exactly as given."""
        (self.input_ids,
         self.input_mask,
         self.segment_ids,
         self.label_ids) = (input_ids, input_mask, segment_ids, label_ids)

# Placeholders; create_features() declares both names global and overwrites them.
vocab = list()
sequence_length = int()

# Encoded sequences longer than this many ids are truncated by create_features().
max_words = 1000

def create_features():
    """Encode every example in train_data as an InputFeatures stored in features.

    Side effects on module globals:
        vocab: replaced by a list whose first three entries are "<SOS>",
            "<PAD>" and "<EOS>" (ids 0, 1, 2), followed by the corpus words
            in sorted order, so ids are stable between runs.
        sequence_length: set to the common length of every encoded sequence,
            i.e. min(longest text in words + 1 for "<SOS>", max_words).
        features: cleared, then filled with one InputFeatures per example.

    Encoding of one example:
        input_ids:   <SOS> id, one id per whitespace-separated word, then
                     <PAD> ids up to the common length.
        input_mask:  1 for <SOS> and real words, 0 for padding.
        segment_ids: all ones, same length as input_ids.
        label_ids:   a one-element list holding the one-hot label vector.
    All three sequences are truncated to max_words entries when longer.

    Raises:
        KeyError: if an example's label is not one of "0".."4".
    """
    global vocab
    global sequence_length

    label_vectors = {
        "0": [1,0,0,0,0],
        "1": [0,1,0,0,0],
        "2": [0,0,1,0,0],
        "3": [0,0,0,1,0],
        "4": [0,0,0,0,1],
    }

    # Special tokens go first so that their ids are fixed; building the vocab
    # from a bare set left ids 0 and 1 pointing at arbitrary words.
    special_tokens = ["<SOS>", "<PAD>", "<EOS>"]
    sos_id = special_tokens.index("<SOS>")
    pad_id = special_tokens.index("<PAD>")

    # Tokenise once and reuse the result for both the vocab and the encoding.
    tokenized = [example.text.split() for example in train_data]
    max_length = max((len(words) for words in tokenized), default=0)

    corpus_words = set()
    for words in tokenized:
        corpus_words.update(words)
    vocab = special_tokens + sorted(corpus_words.difference(special_tokens))

    # Dictionary lookup instead of list.index(), which is linear per word.
    word_to_id = {word: index for index, word in enumerate(vocab)}

    # Every sequence carries a leading <SOS>, hence the extra slot.
    padded_length = max_length + 1

    # Rebuild from scratch so re-running the cell does not duplicate features.
    features.clear()
    for current_index, (example, words) in enumerate(zip(train_data, tokenized), start=1):
        n_padding = max_length - len(words)

        # Padding is appended once, after all words of the example.
        input_ids = [sos_id] + [word_to_id[word] for word in words] + [pad_id] * n_padding
        input_mask = [1] * (len(words) + 1) + [0] * n_padding
        segment_ids = [1] * padded_length

        if len(input_ids) > max_words:
            input_ids = input_ids[:max_words]
            input_mask = input_mask[:max_words]
            segment_ids = segment_ids[:max_words]

        feature = InputFeatures(input_ids, input_mask, segment_ids, [label_vectors[example.label]])
        features.append(feature)

        # Fraction of examples processed so far.
        print(current_index / len(train_data))

    sequence_length = min(padded_length, max_words)

    print("Max Length: ", sequence_length)
    print("Features created.")


In [None]:
# Build the vocabulary and encode every loaded example
# (assigns the globals vocab and sequence_length and appends to features).
create_features()

In [None]:
# Rebound by split_data(): the held-out slice and the remainder used for training.
test_features, train_features = [], []

def split_data():
    """Partition features into test_features and train_features.

    The first ceil(20%) of features become the test set and the remainder
    the training set; both globals are rebound to new list slices.
    """
    global test_features, train_features

    cut = math.ceil(0.2 * len(features))
    test_features, train_features = features[:cut], features[cut:]

# Perform the split, then report how many features landed in each subset.
split_data()
print(len(train_features), " training examples")
print(len(test_features), " test examples")

In [None]:
import torch, pickle, os, sys, random, time
from torch import nn, optim
import numpy as np

# Number of entries in vocab at the time this cell runs; later passed to the
# model as params['vocab_size'], where it sizes the embedding table.
vocab_size = len(vocab)
print("vocab size: ", vocab_size)

In [None]:
class RNNLM(nn.Module):
    """Embedding -> single-layer RNN -> linear head with one logit per entry in levels."""

    def __init__(self, params):
        """Build the layers.

        Args:
            params: mapping with the keys 'vocab_size', 'd_emb', 'd_hid'
                and 'batch_size'.
        """
        super().__init__()
        self.vocab_size, self.d_emb, self.d_hid = (
            params['vocab_size'],
            params['d_emb'],
            params['d_hid'],
        )
        self.n_layer = 1
        # Stored for reference only; no layer below reads it.
        self.batch_size = params['batch_size']

        self.encoder = nn.Embedding(self.vocab_size, self.d_emb)
        self.rnn = nn.RNN(
            self.d_emb,
            self.d_hid,
            self.n_layer,
            batch_first=True,
        )
        self.fc = nn.Linear(self.d_hid, len(levels))

    def forward(self, batch):
        """Return the logits computed from the RNN's final hidden state.

        Args:
            batch: integer tensor of token ids; with batch_first=True the
                RNN treats the first axis as the batch axis.
        """
        embedded = self.encoder(batch)
        _, final_hidden = self.rnn(embedded)
        # Drop the leading axis of the hidden state (size 1 because n_layer == 1).
        return self.fc(final_hidden.squeeze(0))

In [None]:
def train_lm(params, net):
    """Train net on the global train_features, one example per optimisation step.

    Uses plain SGD with BCEWithLogitsLoss against each feature's label_ids and
    clips the gradient norm to 3 before every step. train_features is shuffled
    in place at the start of each epoch.

    Args:
        params: mapping providing 'epochs' and 'learning_rate'.
        net: module called with a LongTensor built from [feature.input_ids];
            its output must have the same shape as the tensor built from
            feature.label_ids.

    Side effects:
        Prints one summary line per epoch (summed loss and wall-clock time)
        and leaves net in eval mode when training finishes.
    """
    # Inputs and targets must live on the same device as the model; deriving it
    # from the parameters avoids a CPU/GPU mismatch when the model is on CUDA.
    net_device = next(net.parameters()).device

    criterion = nn.BCEWithLogitsLoss().to(net_device)
    optimizer = optim.SGD(net.parameters(), lr=params['learning_rate'])

    for epoch in range(params['epochs']):
        ep_loss = 0.
        start_time = time.time()
        random.shuffle(train_features)
        net.train()

        for feature in train_features:
            batch = torch.LongTensor([feature.input_ids]).to(net_device)
            targets = torch.Tensor(feature.label_ids).to(net_device)

            optimizer.zero_grad()
            output = net(batch)
            loss = criterion(output, targets)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(net.parameters(), 3)
            optimizer.step()

            # .item() yields a plain float, so the running total does not keep
            # tensors (and their autograd history) alive.
            ep_loss += loss.item()

        # Report inside the loop so every epoch is logged, not just the last.
        print('epoch: %d, loss: %0.2f, time: %0.2f sec ' % (epoch, ep_loss, time.time()-start_time))

    net.eval()


In [None]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

# Model and optimisation hyper-parameters consumed by RNNLM and train_lm.
params = {
    'vocab_size': vocab_size,
    'd_emb': 64,
    'd_hid': 64,
    'batch_size': 1,
    'epochs': 30,
    'learning_rate': 0.001,
}

# Build the model, move it to the chosen device and train it.
RNNnet = RNNLM(params)
RNNnet.to(device)
train_lm(params, RNNnet)

In [None]:
# Evaluate the trained model on the held-out features.
# Inputs are moved to the model's device so this also works when the model
# lives on the GPU, and gradients are disabled because nothing is trained here.
eval_device = next(RNNnet.parameters()).device
RNNnet.eval()

n_correct = 0
with torch.no_grad():
    for test_feature in test_features:
        batch = torch.LongTensor([test_feature.input_ids]).to(eval_device)
        # Call the model itself rather than re-implementing its forward pass.
        value, index = RNNnet(batch).max(1)
        index = int(index)
        # label_ids wraps a single one-hot row, so the flat argmax is the class index.
        actual = np.argmax(test_feature.label_ids)
        print(index, actual)
        if index == actual:
            n_correct += 1

# An empty test set has no defined accuracy; report NaN instead of dividing by zero.
if test_features:
    print("test accuracy: ", n_correct/len(test_features))
else:
    print("test accuracy: ", float("nan"))