# Model Training

In [3]:
# required library imports & variable declarations

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline


# Single seed value shared by the stochastic steps of this notebook (dataframe
# shuffling, the stratified train/validation split and the PyTorch RNGs) so
# that results are reproducible between runs.
RANDOM_SEED = 1024

## Data Preparation

### Reading Cleaned Data

In [4]:
import pandas as pd


# directory holding the cleaned data files, relative to the location of this notebook
DATA_PATH = './scripts/data'

# both portions are read from compressed CSV files that live inside DATA_PATH
df_train_full = pd.read_csv(DATA_PATH + '/train_cleaned.csv.gz')
df_test_full = pd.read_csv(DATA_PATH + '/test_cleaned.csv.gz')

# preview the first rows of the training portion
df_train_full.head()

Unnamed: 0,headline,headline_preprocessed,category
0,Medicare Supplemental Policies: Do You Need One?,medicare supplemental policies do you need one,WELLNESS
1,7 Tips For You And Your Dog This July 4th,7 tips for you and your dog this july 4th,GREEN & ENVIRONMENT
2,The Best Hotel-Hosted Super Bowl Parties In La...,the best hotelhosted super bowl parties in las...,TRAVEL
3,"Even If You Lose The Weight, Obesity May Still...",even if you lose the weight obesity may still ...,HEALTHY LIVING
4,Cocaine Cowboy 'White Boy Rick' Could Be Relea...,cocaine cowboy white boy rick could be release...,CRIME


### Subsampling Data
Our dataset is large (159k+ rows for train and 40k+ for test portions). Needless to say, creating multiple models and tuning them will take a long time. What should we do then?<br>
One simple trick I use in such cases is to create a smaller, representative subsample of the dataset and do the hyperparameter tuning on it. Later on, I can take the parameters I discovered and use them on the full training data or the entire dataset. Although this is not an ideal practice, it provides a good starting point when resources are limited.

In [5]:
# Shuffle the whole dataframe twice before subsampling: the first pass uses
# RANDOM_SEED, the second pass uses its square as the seed. The index is
# rebuilt after each pass so the rows end up numbered 0..n-1 in shuffled order.
df_train_full = (
    df_train_full
    .sample(frac=1, random_state=RANDOM_SEED)
    .reset_index(drop=True)
    .sample(frac=1, random_state=(RANDOM_SEED ** 2))
    .reset_index(drop=True)
)

To make sure that the subsampled data is representative of, and similar to, our original full dataset, it is widely advised to use __stratified__ sampling in all sampling scenarios.

In [6]:
from sklearn.model_selection import train_test_split


# name of the label column and the absolute number of rows wanted in each subsample
column_target = 'category'
desired_subsample_size_train, desired_subsample_size_valid = 24000, 6000

# draw two disjoint subsamples (training and validation) from the full training
# data; stratifying on the label column keeps the class proportions of the
# full dataset in both of them
df_train_subsample, df_valid_subsample = train_test_split(
    df_train_full,
    stratify=df_train_full[column_target],
    train_size=desired_subsample_size_train,
    test_size=desired_subsample_size_valid,
    random_state=RANDOM_SEED,
)

### Class Labels

In [13]:
# map every category name to an integer id, numbered in order of first
# appearance in the (shuffled) full training dataframe
category_map = {
    category_name: category_id
    for category_id, category_name in enumerate(df_train_full['category'].unique())
}

# the label names, listed in the same order as their ids
class_labels = [category_name for category_name in category_map]
print(f'Classification Labels:\n{class_labels}')

Classification Labels:
['PARENTING', 'GREEN & ENVIRONMENT', 'WORLD NEWS', 'WOMEN', 'ENTERTAINMENT', 'COMEDY', 'STYLE & BEAUTY', 'FOOD & DRINK', 'WELLNESS', 'COLLEGE', 'HEALTHY LIVING', 'TRAVEL', 'POLITICS', 'TECH', 'IMPACT', 'HOME & LIVING', 'BUSINESS', 'CRIME', 'MEDIA', 'BLACK VOICES', 'WEDDINGS', 'QUEER VOICES', 'MONEY', 'SPORTS', 'ARTS & CULTURE', 'LATINO VOICES', 'SCIENCE', 'GOOD NEWS', 'DIVORCE', 'WEIRD NEWS', 'RELIGION', 'FIFTY', 'EDUCATION']


## Creating Language Model Class
Just like convolutional neural networks that enable feature extraction from images, language models like BERT, RoBERTa, etc. allow us to get a contextual embedding (a vector representation) of an input text through the power of transformers and attention mechanisms, which are pretty sophisticated neural network architecture designs. Our final model will utilize a language model inside. First we get the vector representation of the input from the language model, and then pass it to the final layers of the neural network. In fact, the model training phase consists of fine-tuning the weights of our model's additional hidden layers for our classification task.

But before getting into building our neural network, we need to create a class that will allow us to handle multiple language models in an easy way.

In [7]:
from typing import Union
from transformers import AutoTokenizer, AutoModel


class LanguageModel():
    '''
    Bundles a pretrained transformer model with its tokenizer and the maximum
    token length that should be used when tokenizing inputs for it.

    Attributes:
        model: the model loaded with `AutoModel.from_pretrained`, moved to the
            requested device and switched to evaluation mode
        tokenizer: the tokenizer loaded with `AutoTokenizer.from_pretrained`
        tokenizer_max_length (int or None): token length to tokenize inputs with
    '''
    def __init__(self, model_name: str, device: str, tokenizer_max_length: int=None):
        '''
        Loads the model and the tokenizer registered under `model_name`.

        Args:
            model_name (str): name or path passed to `from_pretrained`
            device (str): device the model is moved to
            tokenizer_max_length (int, optional): initial token length, defaults to None
        '''
        self.model = AutoModel.from_pretrained(model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.tokenizer_max_length = tokenizer_max_length

        # model initialization
        self.model.to(device)
        self.model.eval()


    def get_embedding_size(self) -> int:
        '''Returns the hidden size reported by the configuration of the model.'''
        return self.model.config.hidden_size

    def set_tokenizer_max_length(self, length: int) -> None:
        '''Stores `length` as the token length to be used for tokenization (no validation is done).'''
        self.tokenizer_max_length = length

    def estimate_tokenizer_max_length(self, df: pd.DataFrame, text_column_name: str, estimation_type: str = None, adjustment_coefficient: float = 1.0) -> Union[int, dict]:
        '''
        Extracts the token length needed for language model based on desired estimation approach.
        Uses a text column of a dataframe for the estimation of the token length.

        Note that 'max' and 'min' are estimates: they are the token lengths of the
        texts with the most and the fewest whitespace-separated words, which are
        not necessarily the texts with the most and the fewest tokens.

        Args:
            df (pd.DataFrame): the dataframe to be used for the estimation
            text_column_name (str): column title of the text column in the dataframe
            estimation_type (str, optional): 'avg', 'max' or 'min' :: defaults to None
            adjustment_coefficient (float): a coefficient to multiply the estimated token length by, defaults to 1.0 :: e.g. 1.2 => 20% larger token length

        Returns:
            int or dict: the estimated token length for the requested `estimation_type`,
            or a dict with 'max', 'min' and 'avg' keys when `estimation_type` is None

        Raises:
            ValueError: if `adjustment_coefficient` is not positive, if `estimation_type`
                is not one of the supported values, or if the text column has no rows
        '''
        if adjustment_coefficient <= 0.0:
            raise ValueError("adjustment_coefficient must be a positive float.")

        if estimation_type is not None:
            # the shared validator also accepts positive integers (it is meant for
            # token lengths), which make no sense as an estimation type
            if not isinstance(estimation_type, str):
                raise ValueError("Wrong value provided, please check 'estimation_type' parameter.\n"
                                 "It must be one of 'avg', 'max' or 'min' values.")
            self.validate_tokenizer_max_length(estimation_type, identifier='estimation_type')

        all_text = df[text_column_name]
        if len(all_text) == 0:
            # fail early with a clear message instead of an argmax / division error
            raise ValueError(f"Cannot estimate the token length, column '{text_column_name}' has no rows.")

        results = dict()
        # find the rows with the most and least count of words
        word_counts = all_text.str.split().str.len()
        text_with_max_words = all_text.iloc[word_counts.argmax()]
        text_with_min_words = all_text.iloc[word_counts.argmin()]
        # tokenize them and add special tokens, for example `[CLS]` and `[SEP]`
        token_ids_most  = self.tokenizer.encode(text_with_max_words, add_special_tokens=True)
        token_ids_least = self.tokenizer.encode(text_with_min_words, add_special_tokens=True)
        results['max'] = round(len(token_ids_most) * adjustment_coefficient)
        results['min'] = round(len(token_ids_least) * adjustment_coefficient)

        # do not continue any further if we don't need to estimate 'avg'
        if estimation_type in ['max', 'min']:
            return results[estimation_type]

        # 'avg' or altogether: every text has to be tokenized for the mean
        tokens_sum = sum(
            len(self.tokenizer.encode(text, add_special_tokens=True))
            for text in all_text
        )
        # the mean is rounded to whole tokens first and only then scaled
        results['avg'] = round(round(tokens_sum / len(all_text)) * adjustment_coefficient)

        return results['avg'] if estimation_type == 'avg' else results

    def validate_tokenizer_max_length(self, token_length: Union[int, str], identifier='token_length') -> bool:
        '''
        Checks that `token_length` is either a positive integer or one of the
        'avg', 'max' and 'min' keywords.

        Args:
            token_length (int or str): the value to be checked
            identifier (str): parameter name mentioned in the error message

        Returns:
            bool: True when the value is valid

        Raises:
            ValueError: if the value has a wrong type or is out of the allowed values
        '''
        if isinstance(token_length, str):
            if token_length not in ['avg', 'max', 'min']:
                raise ValueError(f"Wrong value provided, please check '{identifier}' parameter.\n"
                                 "It must be one of 'avg', 'max' or 'min' values.")
            return True
        # bool is a subclass of int, so it has to be rejected explicitly
        if isinstance(token_length, bool) or not isinstance(token_length, int):
            raise ValueError(f"Wrong value provided, please check '{identifier}' parameter.")
        if token_length < 1:
            raise ValueError(f"Wrong value provided, please check '{identifier}' parameter.\n"
                             "It must be a positive integer.")
        return True

## Building PyTorch Modules

In [15]:
import os
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader, Sampler, WeightedRandomSampler


# GPU support
use_gpu = True
cuda_available = torch.cuda.is_available()
device = torch.device('cuda:0' if (cuda_available and use_gpu) else 'cpu')
print('PyTorch runs on => ', device, '\nPyTorch version => ', torch.__version__)
# only touch the CUDA cache when the GPU is both requested and available
if use_gpu and cuda_available:
    torch.cuda.empty_cache()

# Leave one CPU thread free for the rest of the system.
# The starting thread count is remembered on the first run, so re-running
# this cell does not lower the thread count again and again, and the result
# is clamped to 1 because `torch.set_num_threads` rejects non-positive values.
if '_TORCH_INITIAL_NUM_THREADS' not in globals():
    _TORCH_INITIAL_NUM_THREADS = torch.get_num_threads()
torch.set_num_threads(max(1, _TORCH_INITIAL_NUM_THREADS - 1))

# reproducibility (almost)
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed_all(RANDOM_SEED)
# ask cuDNN for deterministic algorithms and disable its auto-tuner
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

PyTorch runs on =>  cuda:0 
PyTorch version =>  1.10.0+cu113


### Dataset Class

In [None]:
import re
import string


# pytorch dataset class
class TextClassificationDataset(Dataset):
    '''
    Map-style dataset that tokenizes one text of a dataframe per item and
    returns it together with its label.

    Args:
        df (pd.DataFrame): the dataframe holding the texts and the labels
        text_column_name (str): column title of the text column
        label_column_name (str): column title of the label column
        language_model (LanguageModel): provides the tokenizer and the token length
            (both are read once, when the dataset is created)
        preprocess (bool): when True, the texts are cleaned with `preprocess_text`
            and stored in a `<text_column_name>_preprocessed` column of `self.df`;
            the dataframe passed by the caller is left untouched
    '''
    def __init__(self, df: pd.DataFrame, text_column_name: str, label_column_name: str, language_model: 'LanguageModel', preprocess: bool = False):
        self.tokenizer = language_model.tokenizer
        self.tokenizer_max_length = language_model.tokenizer_max_length
        if preprocess:
            text_column = f'{text_column_name}_preprocessed'
            # work on a copy so that the caller's dataframe is not modified
            df = df.copy()
            df[text_column] = df[text_column_name].map(lambda value: self.preprocess_text(str(value)))
        else:
            # the raw text column is used as is (not the label column)
            text_column = text_column_name
        self.df = df
        self.titles  = df[text_column].to_numpy()
        self.targets = df[label_column_name].to_numpy()
        self.preprocess_enabled = preprocess

    def __len__(self):
        '''Returns the number of rows of the dataframe.'''
        return len(self.df)

    def __getitem__(self, idx):
        '''
        Tokenizes the text at position `idx`.

        Returns:
            tuple: a dict with the flattened 'token_ids' and 'attention_mask'
            tensors, and the label as a `torch.long` tensor
        '''
        text = str(self.titles[idx])
        label = self.targets[idx]
        # pad or truncate every text to the same `tokenizer_max_length`
        text_encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.tokenizer_max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        sample = {
            'token_ids': text_encoding['input_ids'].flatten(),
            'attention_mask': text_encoding['attention_mask'].flatten()}
        # NOTE(review): assumes the label column already holds integer class ids;
        # category names would have to be mapped to ids beforehand - confirm with the caller
        output = torch.tensor(label, dtype=torch.long)
        return (sample, output)

    def preprocess_text(self, text: str) -> str:
        '''
        Lowercases the text, drops the `#` sign of hashtags, removes punctuation
        and collapses repeated spaces.
        '''
        text = text.lower()  # convert to lowercase
        text = re.sub(r'(#)(\S+)', r' \2', text)  # remove hashtags sign
        text = re.sub('[%s]' % re.escape(string.punctuation), '', text)  # remove punctuations
        text = re.sub(' +', ' ', text)  # replace multiple whitespaces with a single space
        text = text.strip()  # remove leading and trailing whitespaces
        return text

### Dataloader Class

In [None]:
# Dataloader class
class TextClassificationDataLoader(DataLoader):
    '''
    `DataLoader` subclass that only changes the default batch size to 128.

    Args:
        dataset (Dataset): the dataset to load the batches from
        batch_size (int): number of samples per batch, defaults to 128
        shuffle (bool): forwarded to `DataLoader`, defaults to False
        sampler (Sampler, optional): forwarded to `DataLoader`, defaults to None
        num_workers (int): forwarded to `DataLoader`, defaults to 0
    '''
    def __init__(self, dataset: Dataset, batch_size: int = 128, shuffle: bool = False, sampler: Sampler = None, num_workers: int = 0):
        # collect the forwarded options first, then hand them over in one go
        loader_options = {
            'batch_size': batch_size,
            'shuffle': shuffle,
            'sampler': sampler,
            'num_workers': num_workers,
        }
        super().__init__(dataset, **loader_options)