In [1]:
# Supress unnecessary warnings so that presentation looks clean
import warnings
from sklearn import model_selection
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch import optim
from torch.utils.data import TensorDataset, DataLoader

warnings.filterwarnings('ignore')

google_colab=False

In [2]:
if google_colab:
    # Google Colab stuff
    from google.colab import drive
    drive.mount('/content/drive')

In [3]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Patrick\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Patrick\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
class Dataset:
    def __init__(self, path):
        
        if google_colab:
            self.df = pd.read_csv("/content/drive/My Drive/Colab Notebooks/NLP/data/training_data.csv")
        else:
            self.df = pd.read_csv("training_data.csv")
        self.data = self.df.to_numpy()

        self.stopwords = set(stopwords.words('english'))
        self.essay_id = self.data[:,0]
        self.text = self.data[:,1]
        self.scores = self.data[:,2:8]
        self.new_data = []
        self.new_scores = []
        self.vocab = set()
        self.word_to_id = None

    def preprocess(self):
        for i in range(len(self.essay_id)):
            text = self.text[i].lower()
            text = " ".join([word for word in text.split() if '@' not in word])
            text = word_tokenize(text)
            text = [word for word in text if word not in self.stopwords]
            self.text[i] = text 

    def create_vocab(self):
        for line in self.text:
            for word in line:
                self.vocab.add(word)

        self.vocab = sorted(list(self.vocab))
        self.word_to_id = {word:i for i, word in enumerate(self.vocab)}

    def text_num(self):
        for i, line in enumerate(self.text):
            x = []
            for word in line:
                x.append(self.word_to_id[word])
            self.text[i] = x

    def create_chunks(self):
        for idx in range(len(self.essay_id)):
            ess = self.text[idx]
            n = len(ess)
            self.new_data.append([ess[:n//3]])
            self.new_data.append([ess[n//3:2*n//3]])
            self.new_data.append([ess[2*n//3:]])
            self.new_scores.append(self.scores[idx])
            self.new_scores.append(self.scores[idx])
            self.new_scores.append( self.scores[idx])

import pandas as pd
import nltk
#nltk.download()
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np


dataset = Dataset("/content/drive/My Drive/Colab Notebooks/NLP/data/training_data.csv")
print(dataset.text.shape)
dataset.preprocess()
dataset.create_vocab()
# print(len(dataset.vocab))
# print(dataset.vocab[:10])
# print(dataset.word_to_id['!'])
# dataset.create_chunks()
# print(dataset.new_data.shape)
# print(dataset.text[0])
"""
print(len(dataset.new_data), len(dataset.essay_id))
print("0: ", dataset.new_data[0, 0])
print("1: ", dataset.new_data[1, 0])
print("2: ", dataset.new_data[2, 0])
print(dataset.text[0])
"""

(723,)


'\nprint(len(dataset.new_data), len(dataset.essay_id))\nprint("0: ", dataset.new_data[0, 0])\nprint("1: ", dataset.new_data[1, 0])\nprint("2: ", dataset.new_data[2, 0])\nprint(dataset.text[0])\n'

In [5]:
dataset.text_num()
dataset.create_chunks()
train_text = dataset.new_data

max_len = 0
for row in train_text:
  leng = len(row[0])
  if leng > max_len:
    max_len = leng
max_len

train_text = [train_text[i][0] for i in range(len(train_text))]

# Padding with space (0)
for i in range(len(train_text)):
    while len(train_text[i])<=max_len:
        train_text[i].append(0)
        
train_label = dataset.new_scores
len(train_label)

train_x, valid_x, train_y, valid_y = model_selection.train_test_split(train_text, train_label)


In [6]:
train_seq_x = np.array(train_x)
valid_seq_x = np.array(valid_x)

train_y = np.array(train_y)[:,0].astype(float)
valid_y = np.array(valid_y)[:,0].astype(float)

vocab_len = len(dataset.vocab)
print(vocab_len)

print(train_seq_x.shape, train_y.shape)

print(valid_seq_x.shape, valid_y.shape)

# train_y = train_y.astype(float)
# valid_y = valid_y.astype(float)

12520
(1626, 186) (1626,)
(543, 186) (543,)


In [7]:
# device = torch.device('cpu')
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

if google_colab:
    train_seq_x1 = torch.from_numpy(train_seq_x).to(device)
    train_y1 = torch.from_numpy(train_y).to(device)
    valid_seq_x1 = torch.from_numpy(valid_seq_x).to(device)
    valid_y1 = torch.from_numpy(valid_y).to(device)
else:
    train_seq_x1 = torch.from_numpy(train_seq_x).to(torch.int64).to(device)
    train_y1 = torch.from_numpy(train_y).to(torch.int64).to(device)
    valid_seq_x1 = torch.from_numpy(valid_seq_x).to(torch.int64).to(device)
    valid_y1 = torch.from_numpy(valid_y).to(torch.int64).to(device)

batch_size = 100
train_loader = DataLoader(TensorDataset(train_seq_x1, train_y1), batch_size = batch_size, shuffle = True)
valid_loader = DataLoader(TensorDataset(valid_seq_x1, valid_y1), batch_size = batch_size, shuffle = False)

In [8]:
class LSTM_Model(nn.Module):
    def __init__(self, vocab_size, embedding_size, hidden_num, output_num, layer_num):
      super().__init__()
      self.vocab_size = vocab_size
      self.layer_num = layer_num
      self.hidden_num = hidden_num

      self.embedding = nn.Embedding(vocab_size, embedding_size)
      self.lstm = nn.LSTM(embedding_size, hidden_num, layer_num)
      self.fc = nn.Linear(hidden_num, output_num)
      self.relu = nn.ReLU()

      self.hidden = (torch.randn(self.layer_num, max_len+1, self.hidden_num).cuda(), torch.randn(self.layer_num, max_len+1, self.hidden_num).cuda())
    
    def forward(self, word_seq):
      word_emb = self.embedding(word_seq)
      self.hidden = (torch.randn(self.layer_num, max_len+1, self.hidden_num).cuda(), torch.randn(self.layer_num, max_len+1, self.hidden_num).cuda())
      lstm_out,self.hidden = self.lstm(word_emb, self.hidden)
      lstm_out = lstm_out.contiguous().view(-1, self.hidden_num)
      fc_out = self.fc(lstm_out)
      relu_out = self.relu(fc_out)
      relu_out = relu_out.view(batch_size, -1) 
      relu_out = relu_out[:,-1]
      return relu_out

In [9]:
train_seq_x.shape

(1626, 186)

In [10]:
def train(data_loader, classifier, loss_function, optimizer):
    classifier.train()
    loss = 0
    losses = []
    prediction_list = []
    accuracy = 0
    accuracies = []
    for i, (texts, labels) in enumerate(data_loader):
      
        if(texts.shape[0] != batch_size):
            break
        labels = labels.float()
        texts = texts.cuda()
        labels = labels.cuda()
        optimizer.zero_grad()
        predictions = classifier(texts)
        # print(predictions.type(), labels.type())
        loss = loss_function(predictions, labels)
        loss.backward()
        optimizer.step()
        losses.append(loss.item()) 
        prediction_list.append([(predictions[i].item(), labels[i].item()) for i in range(len(predictions))])
        # print(losses)       
    return prediction_list, sum(losses)/len(losses)

In [11]:
n_vocab = vocab_len# + 1000
# n_vocab = len(embedding_matrix)
n_embed = 1000
n_hidden = 256
n_output = 1
n_layers = 2

rnn_model = LSTM_Model(n_vocab, n_embed, n_hidden, n_output, n_layers)
rnn_model.cuda()
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(rnn_model.parameters(), lr=0.001)
epochs = 5


In [12]:
def validation(data_loader, classifier, loss_function, optimizer):
    classifier.eval()
    loss = 0
    losses = []
    prediction_list = []
    accuracy = 0
    accuracies = []
    for i, (texts, labels) in enumerate(data_loader):
      
        if(texts.shape[0] != batch_size):
            break
        labels = labels.float()
        texts = texts.cuda()
        labels = labels.cuda()
        optimizer.zero_grad()
        predictions = classifier(texts)
        # print(predictions.type(), labels.type())
        loss = loss_function(predictions, labels)
        losses.append(loss.item()) 
        prediction_list.append([(predictions[i].item(), labels[i].item()) for i in range(len(predictions))])
        # print(losses)       
    return prediction_list, sum(losses)/len(losses)

In [13]:
def get_accuracy(predictions):
    correct = 0
    total = 0
    for batch in predictions:
        for item in batch:
            if int(round(item[0])) == int(item[1]):
                correct += 1
            total += 1
    return correct/total

In [34]:
epochs = 200

training_losses = []
validation_losses = []
for epoch in range(0, epochs):
    print("epoch:", epoch + 1)
    train_predictions, training_loss = train(train_loader, rnn_model, loss_function, optimizer)
    validation_predictions, validation_loss = validation(valid_loader, rnn_model, loss_function, optimizer)
    print("training_loss:", training_loss)
    print("validation_loss:", validation_loss)
    
    print("train_accuracy:", get_accuracy(train_predictions))
    print("validation_accuracy:", get_accuracy(validation_predictions))
    
    
    


epoch: 1
training_loss: 2.4656851701438427
validation_loss: 0.7427350759506226
train_accuracy: 0.315
validation_accuracy: 0.264
epoch: 2
training_loss: 0.6506977863609791
validation_loss: 0.5784127533435821
train_accuracy: 0.530625
validation_accuracy: 0.584
epoch: 3
training_loss: 0.5892071221023798
validation_loss: 0.5945849597454071
train_accuracy: 0.543125
validation_accuracy: 0.588
epoch: 4
training_loss: 0.5617428384721279
validation_loss: 0.570533174276352
train_accuracy: 0.55
validation_accuracy: 0.59
epoch: 5
training_loss: 0.5428692288696766
validation_loss: 0.566394180059433
train_accuracy: 0.555
validation_accuracy: 0.588
epoch: 6
training_loss: 0.5627081245183945
validation_loss: 0.563042676448822
train_accuracy: 0.545
validation_accuracy: 0.588
epoch: 7
training_loss: 0.5657030865550041
validation_loss: 0.5605938792228699
train_accuracy: 0.54625
validation_accuracy: 0.588
epoch: 8
training_loss: 0.541326055303216
validation_loss: 0.550516015291214
train_accuracy: 0.55
val

training_loss: 0.5139880739152431
validation_loss: 0.500239896774292
train_accuracy: 0.5525
validation_accuracy: 0.588
epoch: 127
training_loss: 0.5142284948378801
validation_loss: 0.5092838883399964
train_accuracy: 0.5575
validation_accuracy: 0.588
epoch: 128
training_loss: 0.5075231976807117
validation_loss: 0.5023239612579345
train_accuracy: 0.55375
validation_accuracy: 0.588
epoch: 129
training_loss: 0.5140943117439747
validation_loss: 0.49658443927764895
train_accuracy: 0.555
validation_accuracy: 0.59
epoch: 130
training_loss: 0.511064924299717
validation_loss: 0.5049850821495057
train_accuracy: 0.556875
validation_accuracy: 0.59
epoch: 131
training_loss: 0.5102081038057804
validation_loss: 0.49988868832588196
train_accuracy: 0.554375
validation_accuracy: 0.59
epoch: 132
training_loss: 0.5114692058414221
validation_loss: 0.49880281686782835
train_accuracy: 0.558125
validation_accuracy: 0.588
epoch: 133
training_loss: 0.511620070785284
validation_loss: 0.5004469335079194
train_accu

KeyboardInterrupt: 

In [None]:
train_predictions

In [None]:
import matplotlib as plt


    