In [None]:
# Mount Google Drive at /content/gdrive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
#Initiating the GPU

!pip install gputil
!pip install psutil
!pip install humanize
import os
import GPUtil as GPU

GPUs = GPU.getGPUs()

def printn():
  """Print host RAM and GPU memory usage for the current process.

  Prints two lines: free system RAM plus this process's resident size,
  then free/used/utilisation/total memory of the first GPU reported by
  GPUtil. If GPUtil reports no GPU, a placeholder line is printed instead
  of raising.
  """
  # The cell pip-installs these packages but never imports them, so import
  # them here to keep the function self-contained.
  import psutil
  import humanize

  process = psutil.Process(os.getpid())
  # virtual_memory is a function; it must be called to get the stats object.
  print ("Gen RAM free:"+ humanize.naturalsize(psutil.virtual_memory().available), " | Proc size:"+ humanize.naturalsize(process.memory_info().rss))

  # Query again so the numbers describe the moment of the call rather than
  # the moment the module-level GPUs list was built.
  current_gpus = GPU.getGPUs()
  if not current_gpus:
    print ("GPU RAM free: n/a (no GPU detected)")
    return

  gpu = current_gpus[0]
  # memoryUtil is a fraction, so it is reported as a percentage; the total is
  # given its own placeholder (the original passed it without one).
  print ("GPU RAM free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))

In [None]:
#Importing required Python libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import re
import nltk

nltk.download("punkt")
from nltk.tokenize import word_tokenize, sent_tokenize

from html.parser import HTMLParser
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

import torch
from torch.utils.data import TensorDataset, DataLoader, random_split, RandomSampler, SequentialSampler

import time
import datetime
import warnings
warnings.filterwarnings('ignore')

In [None]:
#Enabling cuda

# Pick the compute device once: CUDA when it is usable, otherwise the CPU.
if not torch.cuda.is_available():
  print ("No GPU available, using the CPU instead")
  device = torch.device("cpu")
else:
  device = torch.device("cuda")
  print ("There are %d GPUs available"% torch.cuda.device_count())
  print ("We will use the GPU:", torch.cuda.get_device_name(0))

There are 1 GPUs available
We will use the GPU: Tesla T4


In [None]:
#Display the elapsed time when loading data into the model, rounded to the nearest second

def format_time(elapsed):
  """Return a duration in seconds as the string form of a timedelta.

  The value is rounded to the nearest whole second first, e.g. 65.4
  becomes '0:01:05'.
  """
  whole_seconds = int(round(elapsed))
  as_delta = datetime.timedelta(seconds = whole_seconds)
  return str(as_delta)


In [None]:
def get_accuracy(pred,true):
  """Fraction of examples whose arg-max score column equals the true label.

  pred: 2-D array of scores; the predicted class is the arg-max over axis 1.
  true: array of labels with a .flatten() method; flattened before comparing.
  """
  predicted_classes = np.argmax(pred, axis = 1).flatten()
  expected_classes = true.flatten()
  hits = predicted_classes == expected_classes
  return np.sum(hits)/ len(expected_classes)

In [None]:
#Loading the data 

# Read the CSV (the path placeholder must be replaced with a real path) and
# keep only the first 150,000 rows.
df = pd.read_csv(<mention path>)
df = df[:150000]
# df.info() runs before the drop below, so its summary still lists 'Unnamed: 0'.
df.info()
# Remove the 'Unnamed: 0' column from df in place.
df.drop('Unnamed: 0', axis = 1, inplace = True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 6 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  150000 non-null  int64 
 1   Category    150000 non-null  object
 2   Topics      150000 non-null  object
 3   Content     150000 non-null  object
 4   Tag         150000 non-null  object
 5   Votes       150000 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 6.9+ MB


In [None]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,Category,Topics,Content,Tag,Votes
0,Sql,"When to use single quotes, double quotes, and ...",I am trying to learn the best way ...,"['mysql', 'sql', 'quotes']",642
1,Sql,SQL injection that gets around mysql_real_esca...,Is there an SQL injection possibil...,"['php', 'mysql', 'sql', 'security', 'sql-injec...",654
2,Sql,SQL select only rows with max value on a colum...,I have this table for documents (s...,"['mysql', 'sql', 'aggregate-functions', 'great...",1256
3,Sql,Simulating group_concat MySQL function in Micr...,I'm trying to migrate a MySQL-base...,"['sql', 'sql-server', 'sql-server-2005', 'stri...",347
4,Sql,Select first row in each GROUP BY group?,"As the title suggests, I'd like to...","['sql', 'sqlite', 'postgresql', 'group-by', 'g...",1366


In [None]:
# Horizontal bar chart of how many rows carry each Category value.
df['Category'].value_counts().plot(kind = 'barh', figsize = (10,10))

In [None]:
# Number of rows per Category value.
df['Category'].value_counts()

In [None]:
#Cleaning steps: decode HTML entities, lower-case, and remove HTML tags, hashtags, @ symbols and U+FFFD replacement characters

def text_cleaning(text):
  """Normalise one raw text string before tokenisation.

  Steps, applied in this order:
    1. decode HTML entities (e.g. '&amp;' -> '&'),
    2. lower-case,
    3. strip HTML tags,
    4. remove hashtags,
    5. remove '@' characters,
    6. remove U+FFFD replacement characters.

  Args:
    text: the string to clean.

  Returns:
    The cleaned string.
  """
  # HTMLParser.unescape() was deprecated in Python 3.4 and removed in 3.9;
  # html.unescape() is the supported replacement. Imported locally because
  # the notebook's import cell only brings in html.parser.HTMLParser.
  import html

  text = html.unescape(text)
  text = text.lower()
  text = re.sub(r'<[^>]+>', '', text)                          # HTML tags
  text = re.sub(r'(?:\#+[\w_]+[\w\*_\-]*[\w_]+)', '', text)    # hashtags
  text = re.sub('@','', text)                                  # @ symbols
  text = re.sub(r'(?:[\ufffd]+)', '', text)                    # replacement chars
  return text

In [None]:
# Run text_cleaning over every value of the 'Topics' column.
topics = df['Topics'].apply(text_cleaning)

In [None]:
#Loading the transformer model, tokenizer

try:
  import transformers as ppb
except:
  !pip install transformers
  import transformers as ppb

from transformers import AdamW, BertConfig, get_linear_schedule_with_warmup

tokenizer = ppb.DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = ppb.DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels = df['Category'].nunique())
model.to(device)

In [None]:
#Tokenizing the topics

label_encoder = LabelEncoder()
inputs = []
attention_masks = []

# Encode each topic into exactly one fixed-length row, so that row i of the
# resulting tensors corresponds to row i of df['Category'].
for topic in topics:
  encoded_dict = tokenizer.encode_plus(
      topic,
      add_special_tokens = True,
      truncation = True,
      max_length = 512,
      # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
      padding = 'max_length',
      # `return_overflowing_tokens`/`stride` were dropped: the overflow was never
      # read, and extra overflow rows would break the 1:1 alignment with labels.
      return_attention_mask = True,
      return_tensors = 'pt',
  )
  inputs.append(encoded_dict['input_ids']) #Adding encoded sentence and its attention mask to the list
  attention_masks.append(encoded_dict['attention_mask'])

# Concatenate the per-topic tensors along dim 0. Nothing is moved to the GPU
# here; batches are transferred to `device` later, when they are consumed.
inputs = torch.cat(inputs, dim = 0)
attention_masks = torch.cat(attention_masks, dim = 0)
labels = label_encoder.fit_transform(df['Category']) #Encoding the labels
labels = torch.tensor(labels)

# Fail early and clearly if the encoded rows and labels ever get out of step.
assert inputs.size(0) == attention_masks.size(0) == labels.size(0), \
    "encoded inputs, attention masks and labels must have the same number of rows"


In [None]:
#Preparing training and validation dataset

batch_size = 16
# random_split runs before the training cell seeds the global RNGs, so it gets
# its own seeded generator; otherwise the train/val/test membership (and the
# reported test metrics) would change on every run.
split_seed = 42

dataset = TensorDataset(inputs, attention_masks, labels)

# 70% train / 10% validation / remainder test. Sizes come from the dataset
# itself so they always sum to len(dataset), as random_split requires.
train_size = int(len(dataset)*0.7)
val_size = int(len(dataset)*0.1)
test_size = len(dataset) - train_size - val_size
train_set, val_set, test_set = random_split(
    dataset,
    [train_size, val_size, test_size],
    generator = torch.Generator().manual_seed(split_seed),
)

# Training batches are drawn in random order; validation and test batches are
# read sequentially.
train_loader = DataLoader(train_set, sampler = RandomSampler(train_set), batch_size = batch_size)
val_loader = DataLoader(val_set, sampler = SequentialSampler(val_set), batch_size = batch_size)
test_loader = DataLoader(test_set, sampler = SequentialSampler(test_set), batch_size = batch_size)

In [None]:
# Optimizer over all model parameters.
epochs = 4
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# The scheduler is stepped once per training batch, so the schedule length is
# (number of batches) * (number of epochs); no warmup steps are used.
total_steps = len(train_loader)*epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps = 0,
    num_training_steps = total_steps,
)

In [None]:
#Training model

# Seed every RNG in use so shuffling and dropout are repeatable.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# One dict of loss/accuracy/timing figures is appended per epoch.
training_log = []
for epoch in range(epochs):
  # ---------------- training pass ----------------
  time_start= time.time()
  total_train_loss = 0
  print ('Begin training')
  print('')
  model.train()
  for step, batch in enumerate(train_loader):
    # Progress line every 10 batches (skipping batch 0).
    if step%10 == 0 and not step == 0:
      elapsed = format_time(time.time()- time_start)
      print('Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_loader),elapsed))

    b_inputs = batch[0].to(device)
    b_attention_masks = batch[1].to(device)
    b_label = batch[2].to(device)

    # Clear gradients left over from the previous step. The optimizer was built
    # from model.parameters(), so one zero_grad call covers every parameter.
    optimizer.zero_grad()

    # Index the output instead of tuple-unpacking it: indexing works both for a
    # plain tuple and for the dict-like output object of newer transformers
    # releases (unpacking the latter yields its keys, not its values).
    outputs = model(b_inputs, attention_mask = b_attention_masks, labels= b_label)
    loss = outputs[0]

    total_train_loss +=loss.item()
    loss.backward()

    # Clip the gradient norm to 1.0 before the update.
    torch.nn.utils.clip_grad_norm_(model.parameters(),1.0)

    optimizer.step()
    scheduler.step()

  avg_train_loss = total_train_loss/len(train_loader)
  training_time= format_time(time.time() - time_start)
  print('')
  print('Avg training loss: {0:.2f}'.format(avg_train_loss))
  print('Training epoch time: {:}'.format(training_time))
  print('')

  # ---------------- validation pass ----------------
  print('Begin validation')
  time_start = time.time()
  model.eval()
  total_eval_acc = 0
  total_eval_loss = 0
  for batch in val_loader:
    # Use the selected `device` (not a hard-coded .cuda()) so this also runs on CPU.
    b_inputs = batch[0].to(device)
    b_attention_masks = batch[1].to(device)
    b_label = batch[2].to(device)

    # No gradients are needed for evaluation.
    with torch.no_grad():
      outputs = model(b_inputs, attention_mask = b_attention_masks, labels=b_label)
    loss = outputs[0]
    logits = outputs[1]

    total_eval_loss +=loss.item()
    logits = logits.detach().cpu().numpy()
    label = b_label.to('cpu').numpy()

    total_eval_acc += get_accuracy(logits, label)

  # Both figures are means of per-batch values.
  avg_val_accuracy = total_eval_acc/len(val_loader)
  print("Accuracy: {0:.2f}". format(avg_val_accuracy))

  avg_val_loss = total_eval_loss/len(val_loader)
  validation_time= format_time(time.time()- time_start)

  print("Validation Loss: {0:.2f}".format(avg_val_loss))
  print("Validation time: {:}".format(validation_time))

  training_log.append(
      {
          'epoch': epoch+1,
          'Training Loss': avg_train_loss,
          'Valid.Loss': avg_val_loss,
          'Valid.Accur.': avg_val_accuracy,
          'Training time':training_time,
          'Validation time': validation_time
      }
  )
print('Training complete')




In [None]:
#Model evaluation

# Run the test set through the model and collect, batch by batch, the raw
# logits and the corresponding true labels as NumPy arrays.
model.eval()
pred_labels = []
true_labels = []
for batch in test_loader:
  b_inputs = batch[0].to(device)
  b_attention_masks = batch[1].to(device)
  b_label = batch[2].to(device)

  # No labels are passed, so the first output element is the logits.
  with torch.no_grad():
    outputs = model(b_inputs, attention_mask = b_attention_masks)
  logits = outputs[0]

  logits = logits.detach().cpu().numpy()
  # Use a batch-local name: assigning to `labels` here would overwrite the full
  # label tensor the dataset was built from and break a re-run of that cell.
  batch_true_labels = b_label.to('cpu').numpy()

  pred_labels.append(logits)
  true_labels.append(batch_true_labels)




In [None]:
# Flatten the per-batch logit arrays into one row per example, then take the
# arg-max over axis 1 to get the predicted class of each example.
flat_pred_labels = []
for batch_logits in pred_labels:
  flat_pred_labels.extend(batch_logits)
flat_pred_labels = np.argmax(flat_pred_labels, axis = 1).flatten()

# Flatten the per-batch true-label arrays into one list of labels.
flat_true_labels = []
for batch_true in true_labels:
  flat_true_labels.extend(batch_true)

print(classification_report(flat_true_labels, flat_pred_labels))




              precision    recall  f1-score   support

           0       0.85      0.96      0.90      7155
           1       0.70      0.10      0.18      1058
           2       0.97      0.99      0.98     10313
           3       0.89      0.88      0.89      3417
           4       0.98      0.96      0.97      8057

    accuracy                           0.93     30000
   macro avg       0.88      0.78      0.78     30000
weighted avg       0.93      0.93      0.92     30000



In [None]:
#Saving the model
# import os
# output_dir = <mention path>

# if not os.path.exists(output_dir):
#   os.makedirs(output_dir)

# print("Saving model to %s"% output_dir)

# model_to_save = model.module if hasattr(model,'module') else model
# model_to_save.save_pretrained(output_dir)
# tokenizer.save_pretrained(output_dir)