<a href="https://colab.research.google.com/github/karunaprakash062/multiclass/blob/main/MultiClassHTBP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [146]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [147]:
!pip install -q -U transformers

In [148]:
from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification,TFBertForSequenceClassification
from transformers import AutoTokenizer
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizerFast
PRETRAINED_LM = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(PRETRAINED_LM)

In [149]:
data = pd.read_json('/content/drive/MyDrive/News_Category_Dataset_v3.json', lines=True)

In [150]:
data = pd.DataFrame(data, columns= ['headline','category','short_description'])

In [151]:
data.head()

Unnamed: 0,headline,category,short_description
0,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...
1,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...
2,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha..."
3,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to..."
4,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...


In [152]:
data=data.loc[(data['category']=='POLITICS') | (data['category']=='ENTERTAINMENT') | (data['category']=='SPORTS') | (data['category']=='TECH')]

In [153]:
data.head()

Unnamed: 0,headline,category,short_description
13,Twitch Bans Gambling Sites After Streamer Scam...,TECH,One man's claims that he scammed people on the...
17,"Maury Wills, Base-Stealing Shortstop For Dodge...",SPORTS,"Maury Wills, who helped the Los Angeles Dodger..."
20,Golden Globes Returning To NBC In January Afte...,ENTERTAINMENT,"For the past 18 months, Hollywood has effectiv..."
21,Biden Says U.S. Forces Would Defend Taiwan If ...,POLITICS,President issues vow as tensions with China rise.
24,‘Beautiful And Sad At The Same Time’: Ukrainia...,POLITICS,An annual celebration took on a different feel...


In [154]:
data['category'].count()

60145

In [155]:
text_data=np.array(data['short_description'])

In [156]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

In [157]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [158]:
stop_words=stopwords.words('english')
lemmatizer=WordNetLemmatizer()

In [159]:
cleaned_data=[]

In [160]:
for text in text_data:
  text=text.lower()
    #statement1-removing special characters and Punctuation
  sentence=re.sub(r'[^\w\s]','',text)
    #statement2-removing numbers from text
  sentence=re.sub(r'[0-9]+','',sentence)
    #statement3-removing html tags and urls from text
  sentence = re.sub(r'<.*?>', '', sentence)
    #statement4-removing urls from text
  clean_text = re.sub(r'http\S+', '',sentence)
    #statement5-text to word tokenization
  sentence=nltk.word_tokenize(sentence)
    #statement6-removing stop_words
  i=0
  while i<3:
    sentence = [word for word in sentence if word.lower() not in stop_words]
    i=i+1
  lst=[]
   #statement7-lemmatization
  for word in sentence:
    lst.append(lemmatizer.lemmatize(word))
  #statement8-joing words to form sentence
  cleaned_text=' '.join(lst)
  cleaned_data.append(cleaned_text)

In [161]:
data['cleaned_text']=cleaned_data

In [162]:
data['cleaned_text'].head()

13    one man claim scammed people platform caused s...
17    maury will helped los angeles dodger win three...
20    past month hollywood effectively boycotted glo...
21               president issue vow tension china rise
24    annual celebration took different feel russia ...
Name: cleaned_text, dtype: object

In [205]:
train, val = train_test_split(data, test_size=0.2,random_state=500)

In [164]:
train.shape

(48116, 4)

In [165]:
val.shape

(12029, 4)

In [166]:
train = train.groupby('category').apply(lambda x: x.sample(1500)).reset_index(drop=True)
val = val.groupby('category').apply(lambda x: x.sample(300)).reset_index(drop=True)

In [167]:
def encode(docs):
    '''
    This function takes list of texts and returns input_ids and attention_mask of texts
    '''
    encoded_dict = tokenizer.batch_encode_plus(docs, add_special_tokens=True, max_length=128, padding='max_length',
                            return_attention_mask=True, truncation=True, return_tensors='pt')
    input_ids = encoded_dict['input_ids']
    attention_masks = encoded_dict['attention_mask']
    return input_ids, attention_masks

In [168]:
train_input_ids, train_att_masks = encode(train['cleaned_text'].values.tolist())
valid_input_ids, valid_att_masks = encode(val['cleaned_text'].values.tolist())

In [169]:
train.category = pd.Categorical(train.category)
val.category = pd.Categorical(val.category)

In [170]:
train['label'] = train.category.cat.codes
val['label'] = val.category.cat.codes

In [171]:
import torch
train_y = torch.LongTensor(train['label'].values.tolist())
valid_y = torch.LongTensor(val['label'].values.tolist())
train_y.size(),valid_y.size()

(torch.Size([6000]), torch.Size([1200]))

In [172]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

BATCH_SIZE = 32
train_dataset = TensorDataset(train_input_ids, train_att_masks, train_y)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=BATCH_SIZE)

valid_dataset = TensorDataset(valid_input_ids, valid_att_masks, valid_y)
valid_sampler = SequentialSampler(valid_dataset)
valid_dataloader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=BATCH_SIZE)

In [173]:
from transformers import BertForSequenceClassification
N_labels = len(train.label.unique())
model = BertForSequenceClassification.from_pretrained(PRETRAINED_LM,
                                                      num_labels=N_labels,
                                                      output_attentions=False,
                                                      output_hidden_states=False)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [174]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [175]:
model = model.cuda()

In [176]:
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

EPOCHS = 100
LEARNING_RATE = 2e-5

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
scheduler = get_linear_schedule_with_warmup(optimizer, 
             num_warmup_steps=0,
            num_training_steps=len(train_dataloader)*EPOCHS )

In [177]:
class EarlyStopping:
    def __init__(self, patience=7, verbose=False, delta=0, path='checkpoint.pt', trace_func=print):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.delta = delta
        self.path = path
        self.trace_func = trace_func

    def __call__(self, val_loss, model):
        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            self.trace_func(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        if self.verbose:
            self.trace_func(f'Validation loss decreased ({self.best_score:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)


In [178]:
from torch.nn.utils import clip_grad_norm_
from tqdm.notebook import tqdm
import numpy as np
import math

train_loss_per_epoch = []
val_loss_per_epoch = []

early_stopping = EarlyStopping(patience=3, verbose=True)


for epoch_num in range(EPOCHS):
    print('Epoch: ', epoch_num + 1)
    '''
    Training
    '''
    model.train()
    train_loss = 0
    for step_num, batch_data in enumerate(tqdm(train_dataloader,desc='Training')):
        input_ids, att_mask, labels = [data.to(device) for data in batch_data]
        output = model(input_ids = input_ids, attention_mask=att_mask, labels= labels)
        
        loss = output.loss
        train_loss += loss.item()

        model.zero_grad()
        loss.backward()
        del loss

        clip_grad_norm_(parameters=model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

    train_loss_per_epoch.append(train_loss / (step_num + 1))              


    '''
    Validation
    '''
    model.eval()
    valid_loss = 0
    valid_pred = []
    with torch.no_grad():
        for step_num_e, batch_data in enumerate(tqdm(valid_dataloader,desc='Validation')):
            input_ids, att_mask, labels = [data.to(device) for data in batch_data]
            output = model(input_ids = input_ids, attention_mask=att_mask, labels= labels)

            loss = output.loss
            valid_loss += loss.item()
   
            valid_pred.append(np.argmax(output.logits.cpu().detach().numpy(),axis=-1))
        
    val_loss_per_epoch.append(valid_loss / (step_num_e + 1))
    valid_pred = np.concatenate(valid_pred)

    '''
    Loss message
    '''
    print("{0}/{1} train loss: {2} ".format(step_num+1, math.ceil(len(train) / BATCH_SIZE), train_loss / (step_num + 1)))
    print("{0}/{1} val loss: {2} ".format(step_num_e+1, math.ceil(len(val) / BATCH_SIZE), valid_loss / (step_num_e + 1)))

    early_stopping(valid_loss / (step_num_e + 1), model)
    if early_stopping.early_stop:
        print("Early stopping, epoch:", epoch_num + 1)
        break

Epoch:  1


Training:   0%|          | 0/188 [00:00<?, ?it/s]

Validation:   0%|          | 0/38 [00:00<?, ?it/s]

188/188 train loss: 0.956717865898254 
38/38 val loss: 0.7025288217946103 
Validation loss decreased (-0.702529 --> 0.702529).  Saving model ...
Epoch:  2


Training:   0%|          | 0/188 [00:00<?, ?it/s]

Validation:   0%|          | 0/38 [00:00<?, ?it/s]

188/188 train loss: 0.6557340073458692 
38/38 val loss: 0.6770133564346715 
Validation loss decreased (-0.677013 --> 0.677013).  Saving model ...
Epoch:  3


Training:   0%|          | 0/188 [00:00<?, ?it/s]

Validation:   0%|          | 0/38 [00:00<?, ?it/s]

188/188 train loss: 0.5112265080530592 
38/38 val loss: 0.7277190693114933 
EarlyStopping counter: 1 out of 3
Epoch:  4


Training:   0%|          | 0/188 [00:00<?, ?it/s]

Validation:   0%|          | 0/38 [00:00<?, ?it/s]

188/188 train loss: 0.3687850481414415 
38/38 val loss: 0.8415804050470653 
EarlyStopping counter: 2 out of 3
Epoch:  5


Training:   0%|          | 0/188 [00:00<?, ?it/s]

Validation:   0%|          | 0/38 [00:00<?, ?it/s]

188/188 train loss: 0.2715748767665726 
38/38 val loss: 0.9154053353949597 
EarlyStopping counter: 3 out of 3
Early stopping, epoch: 5


In [179]:
label_names=train.category.unique()
from sklearn.metrics import classification_report
print('classifiation report')
print(classification_report(valid_pred, val['label'].to_numpy(), target_names=label_names))

classifiation report
               precision    recall  f1-score   support

ENTERTAINMENT       0.56      0.72      0.63       236
     POLITICS       0.72      0.77      0.74       278
       SPORTS       0.86      0.63      0.72       409
         TECH       0.78      0.85      0.81       277

     accuracy                           0.73      1200
    macro avg       0.73      0.74      0.73      1200
 weighted avg       0.75      0.73      0.73      1200



In [206]:
test = data.sample(250)

In [207]:
test['label'] = 0

In [208]:
test.count()

headline             250
category             250
short_description    250
cleaned_text         250
label                250
dtype: int64

In [209]:
test_input_ids, test_att_masks = encode(test['cleaned_text'].values.tolist())

In [210]:
test_y = torch.LongTensor(test['label'].values.tolist())

In [211]:
test_y.shape

torch.Size([250])

In [212]:
test_dataset = TensorDataset(test_input_ids, test_att_masks, test_y)
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=BATCH_SIZE)

In [213]:
from sklearn.metrics import accuracy_score

In [218]:
model.eval()
test_loss = 0
test_pred = []
with torch.no_grad():
    for step_num_t, batch_data in enumerate(tqdm(test_dataloader,desc='Testing')):
        input_ids, att_mask, labels = [data.to(device) for data in batch_data]
        output = model(input_ids = input_ids, attention_mask=att_mask, labels= labels)

        loss = output.loss
        test_loss += loss.item()
   
        test_pred.append(np.argmax(output.logits.cpu().detach().numpy(),axis=-1))
        
test_loss /= (step_num_t + 1)
test_pred = np.concatenate(test_pred)
test_labels = np.concatenate([batch[2].cpu().numpy() for batch in test_dataloader])

print("Test loss:", test_loss)

test_acc = accuracy_score(test_labels, test_pred)
print("Test accuracy:", test_acc)


Testing:   0%|          | 0/8 [00:00<?, ?it/s]

Test loss: 3.9908021986484528
Test accuracy: 0.208
