1. Pre-process the input data 


Reference:

https://medium.com/@bijil.subhash/code-walkthrough-of-word2vec-pytorch-implementation-3a9ca0ad55a7

In [107]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset

In [108]:
file_path = '/Users/irsaashraf/Desktop/UChicago/Spring_23/Advanced ML/Project/Irsa_project/train.csv'
df = pd.read_csv(file_path)
df


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [109]:
class CustomDataset(Dataset):
    def __init__(self, data_file_path, transform=None):
        self.data = pd.read_csv(data_file_path)
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        sample = self.data.iloc[index]
        # Extract the relevant columns from the CSV file
        text = sample["comment_text"]
        label = sample["toxic"]
        # Apply any transformations to the data
        if self.transform:
            text, label = self.transform(text, label)
        return text, label

Apply transformation to dataset 

In [110]:
train_data = CustomDataset(file_path)
train_data

<__main__.CustomDataset at 0x7fea829b4b50>

In [111]:
len(train_data)

159571

In [112]:
type(train_data)

__main__.CustomDataset

In [113]:
text_lst = [] 
label_lst = []
for i in range(len(train_data) - 1):
    text, label = train_data[i]
    text_lst.append(text)
    label_lst.append(label)

### Tokenize dataset

In [114]:
def yield_tokens(train_iter):
    '''
    Documentation:
    https://pytorch.org/text/stable/_modules/torchtext/data/utils.html#get_tokenizer
    Reference:
    https://medium.com/@bijil.subhash/code-walkthrough-of-word2vec-pytorch-implementation-3a9ca0ad55a7
    '''
    tokenizer = get_tokenizer("basic_english", language="en")
    for text, labels in train_iter:
        yield tokenizer(text)

# def get_english_tokenizer():
#     """
#     Documentation:
#     https://pytorch.org/text/stable/_modules/torchtext/data/utils.html#get_tokenizer
#     Reference:
#     https://medium.com/@bijil.subhash/code-walkthrough-of-word2vec-pytorch-implementation-3a9ca0ad55a7
#     """
#     tokenizer = get_tokenizer("basic_english", language="en")
#     return tokenizer


def build_vocab(data_iter, MIN_WORD_FREQUENCY):
    """
    Builds vocabulary from iterator
    """
    vocab = build_vocab_from_iterator(yield_tokens(data_iter), specials=["<unk>"], min_freq=MIN_WORD_FREQUENCY)
    vocab.set_default_index(vocab["<unk>"])
    
    return vocab

In [115]:
MIN_WORD_FREQUENCY = 1000

vocab = build_vocab(train_data, MIN_WORD_FREQUENCY)

In [116]:
# To view the token to index mapping 

# vocab.get_stoi()

### Create utility functions for collating 

In [117]:
MAX_SEQUENCE_LENGTH = 256
CBOW_N_WORDS = 4

tokenizer = get_tokenizer("basic_english", language="en")
text_pipeline = lambda x: vocab(tokenizer(x))


def collate_cbow(batch, text_pipeline):
    """
    https://medium.com/@bijil.subhash/code-walkthrough-of-word2vec-pytorch-implementation-3a9ca0ad55a7
    
    Collate_fn for CBOW model to be used with Dataloader.
    `batch` is expected to be list of text paragrahs.
    
    Context is represented as N=CBOW_N_WORDS past words 
    and N=CBOW_N_WORDS future words.
    
    Long paragraphs will be truncated to contain
    no more that MAX_SEQUENCE_LENGTH tokens.
    
    Each element in `batch_input` is N=CBOW_N_WORDS*2 context words.
    Each element in `batch_output` is a middle word.
    """
    batch_input, batch_output = [], []
    for text, labels in batch:
        text_tokens_ids = text_pipeline(text)

        if len(text_tokens_ids) < CBOW_N_WORDS * 2 + 1:
            continue

        if MAX_SEQUENCE_LENGTH:
            text_tokens_ids = text_tokens_ids[:MAX_SEQUENCE_LENGTH]

        for idx in range(len(text_tokens_ids) - CBOW_N_WORDS * 2):
            token_id_sequence = text_tokens_ids[idx : (idx + CBOW_N_WORDS * 2 + 1)]
            output = token_id_sequence.pop(CBOW_N_WORDS)
            input_ = token_id_sequence
            batch_input.append(input_)
            batch_output.append(output)

    batch_input = torch.tensor(batch_input, dtype=torch.long)
    batch_output = torch.tensor(batch_output, dtype=torch.long)
    return batch_input, batch_output

In [149]:
# ## DELETE ## 

# for text, labels in train_data:
#     text_tokens_ids = text_pipeline(text)

In [148]:
# text_tokens_ids

### Dataloader

In [125]:
from torch.utils.data import DataLoader
from functools import partial

batch_size = 16
collate_fn = collate_cbow

dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=False, 
                        collate_fn=partial(collate_fn, text_pipeline=text_pipeline))

# dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=False, 
#                         collate_fn=collate_cbow(train_data, text_pipeline))

# dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=False, 
#                         collate_fn=collate_cbow(train_data, text_pipeline))

### Build Classifier

https://www.cse.chalmers.se/~richajo/nlp2019/l2/Text%20classification%20using%20a%20CBoW%20representation.html

In [135]:
# https://www.cse.chalmers.se/~richajo/nlp2019/l2/Text%20classification%20using%20a%20CBoW%20representation.html

class CBoWClassifier(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, num_labels, dropout=0.5):
        super().__init__()   
        # Embedding layer: we specify the vocabulary size and embedding dimensionality.        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # A linear output layer.
        self.top_layer = nn.Linear(embedding_dim, num_labels)
        
    def forward(self, docs):
        # The words in the documents are encoded as integers. The shape of the documents
        # tensor is (max_len, n_docs), where n_docs is the number of documents in this batch,
        # and max_len is the maximal length of a document in the batch.

        # First look up the embeddings for all the words in the documents.
        # The shape is now (max_len, n_docs, emb_dim).
        embedded = self.embedding(docs)

        # Compute the mean of word embeddings over the documents.
        # The shape is now (n_docs, emb_dim)
        cbow = embedded.mean(dim=0)

        # Apply the dropout layer. (This is only used during training, not during testing.)
#         cbow_drop = self.dropout(cbow)

        # Finally, compute the output scores.
#         scores = self.top_layer(cbow_drop)
        scores = self.top_layer(cbow)
        

        return scores

### Train the model

In [137]:
vocab_size=len(vocab)
embedding_dim=900
num_labels=2

model = CBoWClassifier(vocab_size=vocab_size, embedding_dim=embedding_dim, num_labels=num_labels)

In [138]:
model

CBoWClassifier(
  (embedding): Embedding(1094, 900)
  (top_layer): Linear(in_features=900, out_features=2, bias=True)
)

In [139]:
import time

loss_function = torch.nn.NLLLoss()

def train_an_epoch(dataloader, optimizer):
    model.train() # Sets the module in training mode.
    log_interval = 500
    num_nan_batch = 0

    for idx, (text, label) in enumerate(dataloader):
        model.zero_grad()
        log_probs = model(text)

        has_nan = torch.isnan(log_probs).any().item()
        if has_nan:
            num_nan_batch += 1
            # print(text)
            # print(label)
            continue

        loss = loss_function(log_probs, label)
        print("loss: ", loss)
        loss.backward()
        optimizer.step()
        if idx % log_interval == 0 and idx > 0:
            print(f'At iteration {idx} the loss is {loss:.3f}.')
        if idx % 5000 == 0 and idx > 0:
            print("Skip", num_nan_batch, "batches containing nan probability")
            break

### Create functions for accuracy  

In [141]:
import torch.optim as optim
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [142]:
def get_accuracy(dataloader):
    model.eval()
    with torch.no_grad():

        total_count = 0
        total_correct = 0
        
        for idx, (label, text) in enumerate(dataloader):
            total_count += len(label)
            log_probs = model(text)
            preds = torch.argmax(log_probs, dim=1)
            total_correct += sum(preds == label)

        return total_correct / total_count

In [146]:
# from torch.utils.data.dataset import random_split
# test_data = pd.read_csv('/Users/irsaashraf/Desktop/UChicago/Spring_23/Advanced ML/Project/Irsa_project/test.csv')
# test_data


In [147]:
# test_labels = pd.read_csv('/Users/irsaashraf/Desktop/UChicago/Spring_23/Advanced ML/Project/Irsa_project/test_labels.csv')
# test_labels

In [158]:
from torch.utils.data.dataset import random_split

BATCH_SIZE = 8 # batch size for training
  
# train_valid_data, test_data = AG_NEWS()
train_valid_data = list(train_data)
num_train = int(len(train_valid_data) * 0.95)
num_valid = len(train_valid_data) - num_train
train_data, valid_data = random_split(
    train_valid_data, [num_train, num_valid])

train_dataloader = DataLoader(train_data, batch_size=BATCH_SIZE,
                              shuffle=True, 
                              collate_fn=partial(collate_fn, text_pipeline=text_pipeline))
valid_dataloader = DataLoader(valid_data, batch_size=BATCH_SIZE,
                              shuffle=False, 
                              collate_fn=partial(collate_fn, text_pipeline=text_pipeline))
test_dataloader = DataLoader(test_data, batch_size=BATCH_SIZE,
                             shuffle=False, 
                             collate_fn=partial(collate_fn, text_pipeline=text_pipeline))

### Training

In [159]:
import matplotlib.pyplot as plt
%matplotlib inline

EPOCHS = 15 # epoch
optimizer = torch.optim.SGD(model.parameters(), lr=3)

accuracies=[]
for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train_an_epoch(train_dataloader, optimizer)
    accuracy = get_accuracy(valid_dataloader)
    accuracies.append(accuracy)
    time_taken = time.time() - epoch_start_time
    print()
    print(f'After epoch {epoch} the validation accuracy is {accuracy:.3f}.')
    print()
    
plt.plot(range(1, EPOCHS+1), accuracies)

ValueError: Expected input batch_size (8) to match target batch_size (629).

In [140]:
# def get_accuracy(dataloader):
#     model.eval()
#     with torch.no_grad():
#         ## WRITE YOUR CODE BELOW.
#         acc = 0
#         num_examples = 0
#         for idx, (label, text) in enumerate(dataloader):
#             acc += torch.sum(label == text.argmax(1))
#             num_examples += len(text)
#         accuracy = acc/num_examples
#         return accuracy 