In [1]:
import torch
import torchtext
import torchdata
import portalocker
import pandas as pd

RANDOM_STATE = 30255
BATCH_SIZE = 16

In [2]:
df = pd.read_csv('../data/preprocessed_data.csv')
df = df[['CLASS', 'PREPROCESSED']]
df = df.dropna()
df['PREPROCESSED'] = df['PREPROCESSED'].str.replace(r'<[^<>]*>', '', regex=True) # drop HTML tags


from sklearn import preprocessing

le = preprocessing.LabelEncoder()
le.fit(df['CLASS'])
df['LABEL'] = le.transform(df['CLASS'])
df.head()

display(df['CLASS'].value_counts())
df = df[['LABEL', 'PREPROCESSED']]

Renewable Energy Sources                       8117
Geosciences                                     142
Environmental Sciences                           98
Energy Storage, Conversion, and Utilization      55
Name: CLASS, dtype: int64

In [3]:
df['LABEL'].value_counts(dropna=False)

3    8117
2     142
1      98
0      55
Name: LABEL, dtype: int64

In [4]:
from sklearn.model_selection import train_test_split

def split_data(df, random_state):
    
    # Split the data into training, testing, and validation sets
    train_data, test_data = train_test_split(df, test_size=0.3, random_state=random_state)
    train_data, val_data = train_test_split(train_data, test_size=0.25, random_state=random_state)

    # Convert the sets into iterable
    train_iter = iter(train_data.values.tolist())
    test_data = iter(test_data.values.tolist())
    val_data = iter(val_data.values.tolist())
    
    return train_iter, test_data, val_data

train_iter, test_data, val_data = split_data(df, RANDOM_STATE)

In [5]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tokenizer = get_tokenizer('basic_english')

def yield_tokens(train_iter):
    for _, text in train_iter:
        yield tokenizer(text)

    
vocab = build_vocab_from_iterator(
    yield_tokens(train_iter), specials=["<unk>"], min_freq=10)

In [6]:
# Task 1
print(f"Task 1: The number of words in the Vocab object is {len(vocab)}.")

# # Task 2
stoi_dict = vocab.get_stoi()
word = "energy"
print(f"Task 2: The index of the word '{word}' is {stoi_dict[word]}.")

# # Task 3
itos_dict = vocab.get_itos()
idx = 500
print(f"Task 3: The word at index 500 is '{itos_dict[idx]}'.")

# # Task 4:
word = "<unk>"
print(f"Task 4: The index of the word '{word}' is {stoi_dict[word]}. Resetting default index to this value.")
vocab.set_default_index(stoi_dict[word])

Task 1: The number of words in the Vocab object is 4666.
Task 2: The index of the word 'energy' is 4.
Task 3: The word at index 500 is 'strong'.
Task 4: The index of the word '<unk>' is 0. Resetting default index to this value.


In [7]:
from torch.utils.data import DataLoader
from collections import Counter

def collate_into_bow(batch):
    '''
    Generates a tensor of batch labels and a tensor of relative token frequencies.
    
    arg:
    - batch: List of tuples, first element of tuple
        is a label, second element is text
    - assumes that Vocab object is created
    tr
    Returns:
    - Tensor (1D; same length as batch) showing text labels (indexed to 0)
    - Tensor (2D; rows are the length of batch, columns are length of Vocab object)
        showing the relative frequency of each token within the text
    '''
    # get tensor dimensions
    k = len(batch)
    m = len(vocab)
    
    # initialize empty tensors
    tensor_labels = torch.zeros((k, ), dtype=torch.int64)
    tensor_rf = torch.zeros((k, m))

    # iterate over batch
    for idx, (label, txt) in enumerate(batch):

        # get individual tokens
        txt_split = txt.split(" ")

        # get indices for each token
        txt_indices = vocab.lookup_indices(txt_split)

        # get frequencies for eacch token
        idx_freq_dict = dict(Counter(txt_indices))

        # update tensor with frequency of each token
        tensor_rf[idx, list(idx_freq_dict.keys())] += torch.tensor(list(idx_freq_dict.values()))

    # normalize so that rows sum to 1
    tensor_row_sum = tensor_rf.sum(dim=1, keepdim=True)
    tensor_rf = tensor_rf / tensor_row_sum
    
    return tensor_labels, tensor_rf

In [8]:
from torch.utils.data import IterableDataset

class MyIterableDataset(IterableDataset):
    def __init__(self, data):
        self.data = data
    
    def __iter__(self):
        # Return an iterator over your data
        return iter(self.data)

In [9]:
train_iter, test_data, val_data = split_data(df, RANDOM_STATE)
dataloader = DataLoader(MyIterableDataset(train_iter), batch_size=BATCH_SIZE, shuffle=False, 
                        collate_fn=collate_into_bow)
for idx, (lt, tt) in enumerate(dataloader):
    print(idx, lt.shape, tt.shape)
    if idx == 4: 
        break

0 torch.Size([16]) torch.Size([16, 4666])
1 torch.Size([16]) torch.Size([16, 4666])
2 torch.Size([16]) torch.Size([16, 4666])
3 torch.Size([16]) torch.Size([16, 4666])
4 torch.Size([16]) torch.Size([16, 4666])


In [10]:
# Write a BoWClassifier class with one single linear layer

from torch import nn
import torch.nn.functional as F

class BoWClassifier(nn.Module):
    def __init__(self, num_labels, vocab_size):
        super(BoWClassifier, self).__init__()
        
        # create affine map
        self.linear = nn.Linear(vocab_size, num_labels)
        
    def forward(self, bow_vec):
        # single linear layer
        return F.log_softmax(self.linear(bow_vec), dim=1)

In [11]:
train_data, _, _ = split_data(df, RANDOM_STATE)
num_labels = len(set([label for (label, text) in train_data]))
vocab_size = len(vocab)
model = BoWClassifier(num_labels, vocab_size)

In [12]:
import time

loss_function = torch.nn.NLLLoss()

def train_an_epoch(dataloader, optimizer):
    model.train() # Sets the module in training mode.
    log_interval = 500

    for idx, (label, text) in enumerate(dataloader):
        model.zero_grad()
        log_probs = model(text)
        loss = loss_function(log_probs, label)
        loss.backward()
        optimizer.step()
        if idx % log_interval == 0 and idx > 0:
            print(f'At iteration {idx} the loss is {loss:.3f}.')

In [13]:
# helper function to compute accuracy

def get_accuracy(dataloader):
    '''
    Compute accuracy rate of model. Generate
    model predictions, compare to true labels,
    and compute accuracy.
    
    args:
    - dataloader (object)
    
    Returns: An accuracy rate (float)
    '''
    
    # switch to evaluation mode
    model.eval()
    
    # initialize counters
    correct_count = 0.0
    example_count = 0.0
    
    with torch.no_grad():
        i = 0
        # unpack dataloader
        for tl, tt in dataloader:
            
            # get the size of the batch
            example_count += tl.shape[0]

            # get predicted values (label with highest probability)
            model_result = model(tt)
            tensor_pred = model_result.argmax(dim=1) 

            # count how often predictions match true labels
            correct_count_batch = (tensor_pred == tl).sum().item()
            correct_count += correct_count_batch
            
            i += 1
    
    if example_count == 0:
        print("correct_count:", correct_count, "iter number:", i)
    return correct_count / example_count

In [14]:
def get_dataloaders(df, RANDOM_STATE, collate_fn):
    train_iter, test_data, val_data = split_data(df, RANDOM_STATE)
    train_dataloader = DataLoader(MyIterableDataset(train_iter), batch_size=BATCH_SIZE, shuffle=False, 
                        collate_fn=collate_fn)
    test_dataloader = DataLoader(MyIterableDataset(test_data), batch_size=BATCH_SIZE, 
                        collate_fn=collate_fn)
    val_dataloader = DataLoader(MyIterableDataset(val_data), batch_size=BATCH_SIZE, 
                        collate_fn=collate_fn)
    
    return train_dataloader, test_dataloader, val_dataloader

In [17]:
# TRAINING
import matplotlib.pyplot as plt
%matplotlib inline

EPOCHS = 3 # epoch
optimizer = torch.optim.SGD(model.parameters(), lr=3)

test_accuracies=[]
train_accuracies=[] # added
valid_accuracies=[]

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    
    train_dataloader, _, _ = get_dataloaders(df, RANDOM_STATE, collate_into_bow)
    train_an_epoch(train_dataloader, optimizer)
    
    train_dataloader, test_dataloader, val_dataloader = get_dataloaders(df, RANDOM_STATE, collate_into_bow)
    train_accuracy = get_accuracy(train_dataloader) # added
    test_accuracy = get_accuracy(test_dataloader)
    val_accuracy = get_accuracy(val_dataloader)

    train_accuracies.append(train_accuracy)
    train_accuracies.append(test_accuracy)
    valid_accuracies.append(val_accuracy)
    time_taken = time.time() - epoch_start_time
    print('--')
    print(f'Time taken: {time_taken:.3f}.')
    print(f'After epoch {epoch} the train accuracy is {train_accuracy:.3f}.')
    print(f'After epoch {epoch} the test accuracy is {test_accuracy:.3f}.')
    print(f'After epoch {epoch} the validation accuracy is {val_accuracy:.3f}.')
    print('--')

--
Time taken: 3.014.
After epoch 1 the train accuracy is 1.000.
After epoch 1 the test accuracy is 1.000.
After epoch 1 the validation accuracy is 1.000.
--
--
Time taken: 2.723.
After epoch 2 the train accuracy is 1.000.
After epoch 2 the test accuracy is 1.000.
After epoch 2 the validation accuracy is 1.000.
--
--
Time taken: 2.704.
After epoch 3 the train accuracy is 1.000.
After epoch 3 the test accuracy is 1.000.
After epoch 3 the validation accuracy is 1.000.
--


In [18]:
# add pre-trained embeddings

from itertools import combinations
from torchtext.vocab import GloVe

# Save GloVe data in a cache
VECTOR_CACHE_DIR = '../.vector_cache'

glove = GloVe(name='6B', cache = VECTOR_CACHE_DIR)

In [19]:
glove_size = glove.dim

def collate_into_cbow(batch):
    '''
    Generates a tensor of batch labels and a tensor of mean GloVe embeddings for each token.
    
    arg:
    - batch: List of tuples, first element of tuple is a label, second element is text
    - assumes that GloVe object is created
    
    Returns:
    - Tensor (1D; same length as batch) showing text labels (indexed to 0)
    - Tensor (2D; rows are the length of batch, columns are size of GloVe embeddings)
        showing the average of GloVe embeddings for each text
    '''
    # get tensor dimensions
    k = len(batch)
    m = glove.dim

    # initialize empty tensors
    tensor_labels = torch.zeros((k, ), dtype=torch.int64)
    tensor_glove = torch.zeros((k, m))

    # iterate over batch
    for idx, (label, txt) in enumerate(batch):

        # get individual tokens
        txt_split = txt.split(" ")

        # get GloVe embeddings for each token
        txt_embedding = glove.get_vecs_by_tokens(txt_split)
        
        # update tensor with average of GloVe embeddings
        tensor_glove[idx, :] = txt_embedding.mean(dim=0)

    return tensor_labels, tensor_glove

In [20]:
EPOCHS = 3 # epoch
optimizer = torch.optim.SGD(model.parameters(), lr=3)
model = BoWClassifier(num_labels, glove_size)

val_accuracies=[]
train_accuracies=[] # added
test_accuracies=[] # added
for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    
    train_dataloader, _, _ = get_dataloaders(df, RANDOM_STATE, collate_into_cbow)
    train_an_epoch(train_dataloader, optimizer)
    
    train_dataloader, test_dataloader, val_dataloader = get_dataloaders(df, RANDOM_STATE, collate_into_cbow)
    train_accuracy = get_accuracy(train_dataloader) # added
    test_accuracy = get_accuracy(test_dataloader)
    val_accuracy = get_accuracy(val_dataloader)

    train_accuracies.append(train_accuracy)
    train_accuracies.append(test_accuracy)
    valid_accuracies.append(val_accuracy)
    time_taken = time.time() - epoch_start_time
    print('--')
    print(f'Time taken: {time_taken:.3f}.')
    print(f'After epoch {epoch} the train accuracy is {train_accuracy:.3f}.')
    print(f'After epoch {epoch} the test accuracy is {test_accuracy:.3f}.')
    print(f'After epoch {epoch} the validation accuracy is {val_accuracy:.3f}.')
    print('--')

--
Time taken: 8.869.
After epoch 1 the train accuracy is 0.346.
After epoch 1 the test accuracy is 0.342.
After epoch 1 the validation accuracy is 0.359.
--
--
Time taken: 8.473.
After epoch 2 the train accuracy is 0.346.
After epoch 2 the test accuracy is 0.342.
After epoch 2 the validation accuracy is 0.359.
--
--
Time taken: 11.211.
After epoch 3 the train accuracy is 0.346.
After epoch 3 the test accuracy is 0.342.
After epoch 3 the validation accuracy is 0.359.
--
