# Introduction

The task is to predict library by question title on StackOverflow. For solving this task I fine-tune pretrained transformer-based models like BERT (or some variations).

P.S. If you have any issues with plots on GitHub pls [check this notebook on Kaggle](https://www.kaggle.com/code/xyinspired/framework-prediction-stackoverflow/notebook).

# Imports

In [1]:
!pip install lets_plot transformers -q

[0m

In [2]:
from lets_plot import *
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
from sklearn import metrics

import torch.nn.functional as F
import numpy as np
import random
import pandas as pd
import os
import torch

In [3]:
np.random.seed(42)

In [4]:
LetsPlot.setup_html()

# SetUp

In [5]:
if torch.cuda.is_available():    
    device = torch.device("cuda")
    print(f'Using GPU : {torch.cuda.get_device_name(0)}')
else:
    device = torch.device("cpu")
    print(f'Using CPU')

Using GPU : Tesla P100-PCIE-16GB


# Data

## Exploration

In [6]:
path_to_train_data = '/kaggle/input/stackoverlow-data/train.csv'
path_to_test_data = '/kaggle/input/stackoverlow-data/test.csv'

In [7]:
train_data = pd.read_csv(path_to_train_data)
test_data = pd.read_csv(path_to_test_data)

In [8]:
print(train_data.shape)
train_data.head()

(134024, 3)


Unnamed: 0,title,lib,id
0,Is there a way to sort strings in alphabetical...,functools,84026
1,Maintaining history by soft deleting the row o...,django,8930
2,Wave on a string analysis,numpy,133474
3,Regular expression find word but NOT if it's p...,re,34429
4,TextBlob - Loop over articles to calculate pol...,pandas,82106


As we can see there are no null values in our train dataset. Also we can notice that some questions have the same title and at all we got 24 libraries (classes).

In [9]:
print(f'Null values:\n{train_data.isna().sum()}')
print(f'Unique:\n{train_data.nunique()}')

Null values:
title    0
lib      0
id       0
dtype: int64
Unique:
title    134014
lib          24
id       134024
dtype: int64


In [10]:
print(test_data.shape)
test_data.head()

(33506, 2)


Unnamed: 0,title,id
0,cython: pass a 2D numpy array to cdef function,119056
1,What parameters need to be put in the code in ...,63881
2,Get a classification report stating the class ...,162795
3,How to interpolate and smooth data with step-l...,149951
4,Numpy: concatenate different dtypes whilst pre...,90130


In [11]:
label_to_id = {k : v for v, k in enumerate(set(train_data.lib))}
id_to_label = {v : k for k, v in label_to_id.items()}

label_to_id

{'scipy': 0,
 'subprocess': 1,
 'selenium': 2,
 'flask': 3,
 'sys': 4,
 're': 5,
 'sklearn': 6,
 'json': 7,
 'pandas': 8,
 'matplotlib': 9,
 'collections': 10,
 'csv': 11,
 'os': 12,
 'time': 13,
 'urllib': 14,
 'math': 15,
 'functools': 16,
 'numpy': 17,
 'itertools': 18,
 'random': 19,
 'datetime': 20,
 'django': 21,
 'tensorflow': 22,
 'requests': 23}

In [12]:
frequency = pd.DataFrame({
   'Library' : list(label_to_id.keys()),
   'Count' : [len(train_data[train_data.lib == lib]) for lib in list(label_to_id.keys())]
})


In [13]:
ggplot(frequency, aes(x=frequency.Library, weight=frequency.Count, fill=frequency.Library)) + \
    geom_bar() + labs(x='Library', y='Count')

  shapely_geos_version, geos_capi_version_string


## Preprocessing

In [14]:
def extract_label_vector(label, total_labels):
    labels = np.zeros(total_labels)
    labels[label] = 1
    return labels.tolist()

In [15]:
train_data.lib = train_data.lib.map(lambda x : label_to_id[x])
train_data['labels'] = train_data.lib.map(lambda x : extract_label_vector(x, len(label_to_id.keys())))
train_data.head(10)

Unnamed: 0,title,lib,id,labels
0,Is there a way to sort strings in alphabetical...,16,84026,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,Maintaining history by soft deleting the row o...,21,8930,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,Wave on a string analysis,17,133474,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,Regular expression find word but NOT if it's p...,5,34429,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
4,TextBlob - Loop over articles to calculate pol...,8,82106,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ..."
5,Creating a multi-index from csv census data,5,191617,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
6,Faster way to remove duplicates from a very la...,18,140810,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
7,How do I add values to a list stored as a dict...,10,36324,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
8,Python open file in specific application using...,12,189392,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
9,"Python can't open text file: ""FileNotFoundError""",12,45889,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [16]:
text_data = train_data.title.values
labels_data = list(train_data.labels)

In [17]:
model_name = 'distilbert-base-uncased'
# model_name = 'bert-base-cased'

## Tokenizing text data

In [18]:
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [19]:
max_len = np.zeros(len(text_data))
for i in range(len(text_data)):
    input_ids = tokenizer.encode(text_data[i], add_special_tokens=True)
    max_len[i] = len(input_ids)
print('Max length: ', max_len.max())

Max length:  78.0


In [20]:
input_ids = []
attention_masks = []

for text in text_data:
    encoded_dict = tokenizer.encode_plus(
                        text,                     
                        add_special_tokens = True,
                        max_length = 128,          
                        pad_to_max_length = True,
                        truncation=True,
                        return_attention_mask = True,  
                        return_tensors = 'pt')
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels_tensor = torch.tensor(labels_data)



In [21]:
val_size = 0.1

In [22]:
dataset = TensorDataset(input_ids, attention_masks, labels_tensor)
train_dataset, val_dataset = random_split(dataset, [1 - val_size, val_size])

print(f'Train set : {len(train_dataset)}')
print(f'Validation set : {len(val_dataset)}')

Train set : 120622
Validation set : 13402


In [23]:
batch_size = 64

train_dataloader = DataLoader(
            train_dataset,  
            sampler = RandomSampler(train_dataset), 
            batch_size = batch_size)

validation_dataloader = DataLoader(
            val_dataset, 
            sampler = SequentialSampler(val_dataset), 
            batch_size = batch_size)

print(f"train : {len(train_dataloader)}")
print(f'val : {len(validation_dataloader)}')

train : 1885
val : 210


In [24]:
model = DistilBertForSequenceClassification.from_pretrained(
    model_name,
    problem_type="multi_class_classification",
    num_labels = len(label_to_id.keys()), 
    output_attentions = False,
    output_hidden_states = False, 
)

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classi

In [25]:
model.to(device) # fix to device

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

We fine-tune our model, not training from scratch so we don't need too much epochs not to overfit (or use EarlyStop)

In [26]:
learning_rate = 4e-5
epochs = 6

optimizer = AdamW(model.parameters(),
                  lr = learning_rate)



In [27]:
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 300,
                                            num_training_steps = total_steps)

# Training and Validation

In [28]:
def compute_f1_macro(out, pred):
    probs = F.softmax(pred, dim=1)
    pred = torch.argmax(probs, dim=1)
    out = torch.argmax(out, dim=1)
    return metrics.f1_score(pred, out, average='macro')

In [29]:
def compute_precision(out, pred):
    probs = F.softmax(pred, dim=1)
    pred = torch.argmax(probs, dim=1)
    out = torch.argmax(out, dim=1)
    return metrics.precision_score(pred.numpy(), out.numpy(), average='micro')

In [30]:
def compute_recall(out, pred):
    probs = F.softmax(pred, dim=1)
    pred = torch.argmax(probs, dim=1)
    out = torch.argmax(out, dim=1)
    return metrics.recall_score(pred.numpy(), out.numpy(), average='micro')

In [31]:
def get_accuracy_from_logits(logits, labels):
    probs = F.softmax(logits, dim=1)
    output = torch.argmax(probs, dim=1)
    labels = torch.argmax(labels, dim=1)
    acc = (output == labels).float().mean()
    return acc

In [32]:
loss_function = torch.nn.CrossEntropyLoss()

In [33]:
print('Training started...')

np.random.seed(42)
random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)

training_stats = []
    
for epoch_i in range(epochs):
    print()
    print('#-----------------------#')
    print(f'     Epoch : {epoch_i + 1} / {epochs}')
    print('#-----------------------#')

    model.train()
    total_train_loss = 0
        
    for step, batch in enumerate(train_dataloader):
        batch_input_ids = batch[0].to(device)
        batch_input_mask = batch[1].to(device)
        batch_labels = batch[2].float().to(device)

        model.zero_grad()        
            
        result = model(batch_input_ids, 
                        attention_mask=batch_input_mask, 
                        labels=batch_labels,
                        return_dict=True)

        logits = result.logits
        loss = loss_function(logits, batch_labels)

        total_train_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)            

    print(f'Average train loss : {avg_train_loss:.3f}')
    print()
    print('Validation started...')
    print()

    model.eval()

    total_eval_loss = 0
    total_eval_f1_macro = 0
    total_precision = 0
    nb_eval_steps = 0

    for batch in validation_dataloader:
        batch_input_ids = batch[0].to(device)
        batch_input_mask = batch[1].to(device)
        batch_labels = batch[2].float().to(device)

        with torch.no_grad():        
            result = model(batch_input_ids, 
                            attention_mask=batch_input_mask,
                            labels=batch_labels,
                            return_dict=True)
            
        logits = result.logits
        loss = loss_function(logits, batch_labels)

        total_eval_loss += loss.item()

        logits = logits.detach().cpu()
        label_ids = batch_labels.to('cpu').detach()

        total_eval_f1_macro += compute_f1_macro(logits, label_ids)
        total_precision += compute_precision(logits, label_ids)


        
    avg_val_f1_macro = total_eval_f1_macro / len(validation_dataloader)
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    avg_val_precision = total_precision / len(validation_dataloader)
    
    print(f'Average validation loss : {avg_val_loss:.3f}')
    print('Average validation metrics:')
    print('----------------')
    print(f'Precision : {avg_val_precision:.3f}')
    print(f'f1-score macro : {avg_val_f1_macro:.3f}')

    training_stats.append(
            {
                'epoch': epoch_i + 1,
                'train_loss': avg_train_loss,
                'valid_loss': avg_val_loss,
                'val_f1_macro' : avg_val_f1_macro,
                'val_precision': avg_val_precision
})
    
print()
print('Training finished...')

Training started...

#-----------------------#
     Epoch : 1 / 6
#-----------------------#
Average train loss : 1.616

Validation started...

Average validation loss : 1.304
Average validation metrics:
----------------
Precision : 0.623
f1-score macro : 0.526

#-----------------------#
     Epoch : 2 / 6
#-----------------------#
Average train loss : 1.226

Validation started...

Average validation loss : 1.259
Average validation metrics:
----------------
Precision : 0.636
f1-score macro : 0.537

#-----------------------#
     Epoch : 3 / 6
#-----------------------#
Average train loss : 1.076

Validation started...

Average validation loss : 1.258
Average validation metrics:
----------------
Precision : 0.635
f1-score macro : 0.536

#-----------------------#
     Epoch : 4 / 6
#-----------------------#
Average train loss : 0.937

Validation started...

Average validation loss : 1.297
Average validation metrics:
----------------
Precision : 0.631
f1-score macro : 0.535

#--------------

In [34]:
train_loss = [i['train_loss'] for i in training_stats]
val_loss = [i['valid_loss'] for i in training_stats]
epochs = [i['epoch'] for i in training_stats]
val_f1_macro = [i['val_f1_macro'] for i in training_stats]
loss_stats = pd.DataFrame({
    'epoch' : epochs,
    'train_loss' : train_loss,
    'val_loss' : val_loss,
    'val_f1_macro' : val_f1_macro
})

In [35]:
bunch = GGBunch()
plot = ggplot(loss_stats) + geom_path(aes('epoch', 'train_loss'), size=1.3, color='blue') + ggsize(500, 400) + ggtitle('Average Train Loss')
bunch.add_plot(plot, 100, 0)
plot = ggplot(loss_stats) + geom_path(aes('epoch', 'val_loss'), size=1.3, color='red') + ggsize(500, 400) + ggtitle('Average Validation Loss')
bunch.add_plot(plot, 700, 0)
bunch.show()

# Inference

In [36]:
def get_pred(logits):
    probs = F.softmax(logits, dim=1)
    return torch.argmax(probs, dim=1)

In [37]:
def submit_csv(model, tokenizer, test_data):
    preds = np.zeros(len(test_data))
    model.eval()
    
    for i, row in test_data.iterrows():
        inputs = tokenizer(row.title, return_tensors="pt")
        model.to('cpu')
        with torch.no_grad():
            logits = model(**inputs).logits
            preds[i] = get_pred(logits)
        
    csv_data = pd.DataFrame({
        'id' : test_data.id.values,
        'lib': preds.astype(int)
    })
        
    csv_data.lib = csv_data.lib.map(lambda x : id_to_label[x])
    path = os.path.join(os.getcwd(), "submission.csv")
    csv_data.to_csv(path, index=False)
    
    print("Submission successful!")
    return csv_data

In [38]:
test_data.head()

Unnamed: 0,title,id
0,cython: pass a 2D numpy array to cdef function,119056
1,What parameters need to be put in the code in ...,63881
2,Get a classification report stating the class ...,162795
3,How to interpolate and smooth data with step-l...,149951
4,Numpy: concatenate different dtypes whilst pre...,90130


In [39]:
submit = submit_csv(model, tokenizer, test_data)

submit.head()

Submission successful!


Unnamed: 0,id,lib
0,119056,numpy
1,63881,requests
2,162795,sklearn
3,149951,scipy
4,90130,numpy
