# BERT Model for context based analysis

Importing required libraries


In [None]:
import numpy as np
import pandas as pd
import time
import datetime
import gc
import random
from nltk.corpus import stopwords
import re

import torch
import torch.nn as nn
from tqdm import tqdm
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler,random_split
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, recall_score, precision_score

import transformers
from transformers import BertForSequenceClassification, AdamW, BertConfig,BertTokenizer,get_linear_schedule_with_warmup

Setting up GPU or CPU

In [57]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

Dataset loading and preprocessing

In [58]:
df=pd.read_csv("/kaggle/input/text-data/en_curlie_notcheck_all.csv",encoding="ISO-8859-1",on_bad_lines='skip', skiprows="")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 403214 entries, 0 to 403213
Data columns (total 9 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   Unnamed: 0            403214 non-null  int64 
 1   Url                   403214 non-null  object
 2   Title                 399925 non-null  object
 3   Article               351417 non-null  object
 4   Text                  366386 non-null  object
 5   Image_Name            102778 non-null  object
 6   response_status_code  403214 non-null  int64 
 7   is_dead               403214 non-null  bool  
 8   prob                  403214 non-null  int64 
dtypes: bool(1), int64(3), object(5)
memory usage: 25.0+ MB


In [59]:
df.info()
df= df.sample(frac=1, random_state=42)
df=df.iloc[:4000]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 403214 entries, 0 to 403213
Data columns (total 9 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   Unnamed: 0            403214 non-null  int64 
 1   Url                   403214 non-null  object
 2   Title                 399925 non-null  object
 3   Article               351417 non-null  object
 4   Text                  366386 non-null  object
 5   Image_Name            102778 non-null  object
 6   response_status_code  403214 non-null  int64 
 7   is_dead               403214 non-null  bool  
 8   prob                  403214 non-null  int64 
dtypes: bool(1), int64(3), object(5)
memory usage: 25.0+ MB


In [60]:

df['Title'] = df['Title'].fillna("none")
df['Text'] = df['Text'].fillna("none")
#df['Article'] = df['Article'].fillna("none")
df['Image_Name'] = df['Image_Name'].fillna("none")
df = df.dropna(subset=['is_dead'], axis=0)
df['is_dead'] = df['is_dead'].astype(bool)
# Convert non-numeric values to NaN
df['response_status_code'] = pd.to_numeric(df['response_status_code'], errors='coerce')

# Drop rows with NaN values in the 'response_status_code' column
df = df.dropna(subset=['response_status_code'])

# Convert 'response_status_code' column to integer
df['response_status_code'] = df['response_status_code'].astype(int)

In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4000 entries, 118232 to 375761
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Unnamed: 0            4000 non-null   int64 
 1   Url                   4000 non-null   object
 2   Title                 4000 non-null   object
 3   Article               3444 non-null   object
 4   Text                  4000 non-null   object
 5   Image_Name            4000 non-null   object
 6   response_status_code  4000 non-null   int64 
 7   is_dead               4000 non-null   bool  
 8   prob                  4000 non-null   int64 
dtypes: bool(1), int64(3), object(5)
memory usage: 285.2+ KB


In [62]:
def sanitize_text(input_text):
    # Remove tabs, commas, and other special characters
    sanitized_text = re.sub(r'[\t,;:!\'"<>?~`@#$%^&*()\-_+=\[\]{}|\\\/]', ' ', input_text)
    
    # Replace multiple spaces with a single space
    sanitized_text = re.sub(r'\s+', ' ', sanitized_text)
    
    return sanitized_text


In [63]:
df['Text'] = df['Text'].apply(lambda x:sanitize_text(x))

In [64]:
#df['Concatenated'] = '[' + df['Url']+ ']' + '[' + df['Title'] + ']' + df['Text'] +df['Image_Name']  
#'[SEP]' #+df['Article']+
df['Concatenated'] = '[CLS]'  + df['Title'] +  '[SEP]'+ df['Text'] + '[SEP]' +df['Article']+ '[SEP]' +df['Image_Name']


In [None]:
df['Concatenated'] = df['Concatenated'].apply(lambda x:sanitize_text(x))

In [65]:
df.to_csv("text.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Url,Title,Article,Text,Image_Name,response_status_code,is_dead,prob,Concatenated
118232,166112,https://www.emtsinc.com,EMTS Inc,All Paramedic programs offered in partnership ...,Call today 877 385 EMTS 3687 Dean College Fran...,logo_emts.pngtitle_fromemt.pngicon_fb.pngicon_...,200,False,0,[CLS]EMTS Inc[SEP]Call today 877 385 EMTS 3687...
86853,134733,https://www.visitwamego.com,Visit Wamego Kansas - Small Town. Big Experience.,Every year the town brings to life one of Amer...,October 1st 2022 Every year the town brings to...,none,200,False,0,[CLS]Visit Wamego Kansas - Small Town. Big Exp...
389691,171882,https://www.tamars.co.uk/mvnirieyogtjdqqseoqtk...,21st Century Back Care - TAMARS Treatment â 404,,Page not found,21_C_Logo.svg21_C_Logo.svg,404,True,0,[CLS]21st Century Back Care - TAMARS Treatment...
216080,263960,https://www.introllingmills.com,A Global Leader in Metal Processing Equipment ...,What We Do We work closely with our customers ...,A GLOBAL LEADER IN METAL PROCESSING EQUIPMENT ...,none,200,False,0,[CLS]A Global Leader in Metal Processing Equip...
59200,107080,https://www.bondware.com,Websites for News Marketing and Commerce,Our software provides all the tools you need t...,none,MV_thumbnail-2.png,200,False,0,[CLS]Websites for News Marketing and Commerce[...


In [66]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4000 entries, 118232 to 375761
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Unnamed: 0            4000 non-null   int64 
 1   Url                   4000 non-null   object
 2   Title                 4000 non-null   object
 3   Article               3444 non-null   object
 4   Text                  4000 non-null   object
 5   Image_Name            4000 non-null   object
 6   response_status_code  4000 non-null   int64 
 7   is_dead               4000 non-null   bool  
 8   prob                  4000 non-null   int64 
 9   Concatenated          4000 non-null   object
dtypes: bool(1), int64(3), object(6)
memory usage: 316.4+ KB


In [67]:
df.loc[ df['response_status_code'] <= 205, 'response_status_code'] = int(0)
df.loc[df['response_status_code'] > 205 , 'response_status_code'] = int(1)
df.head()

Unnamed: 0.1,Unnamed: 0,Url,Title,Article,Text,Image_Name,response_status_code,is_dead,prob,Concatenated
118232,166112,https://www.emtsinc.com,EMTS Inc,All Paramedic programs offered in partnership ...,Call today 877 385 EMTS 3687 Dean College Fran...,logo_emts.pngtitle_fromemt.pngicon_fb.pngicon_...,0,False,0,[CLS]EMTS Inc[SEP]Call today 877 385 EMTS 3687...
86853,134733,https://www.visitwamego.com,Visit Wamego Kansas - Small Town. Big Experience.,Every year the town brings to life one of Amer...,October 1st 2022 Every year the town brings to...,none,0,False,0,[CLS]Visit Wamego Kansas - Small Town. Big Exp...
389691,171882,https://www.tamars.co.uk/mvnirieyogtjdqqseoqtk...,21st Century Back Care - TAMARS Treatment â 404,,Page not found,21_C_Logo.svg21_C_Logo.svg,1,True,0,[CLS]21st Century Back Care - TAMARS Treatment...
216080,263960,https://www.introllingmills.com,A Global Leader in Metal Processing Equipment ...,What We Do We work closely with our customers ...,A GLOBAL LEADER IN METAL PROCESSING EQUIPMENT ...,none,0,False,0,[CLS]A Global Leader in Metal Processing Equip...
59200,107080,https://www.bondware.com,Websites for News Marketing and Commerce,Our software provides all the tools you need t...,none,MV_thumbnail-2.png,0,False,0,[CLS]Websites for News Marketing and Commerce[...


In [68]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [69]:
df['Length'] = df['Concatenated'].apply(lambda x: len(tokenizer.encode(x, add_special_tokens=True)))

# Drop rows where the length is greater than 450
df = df[df['Length'] <= 450]

Token indices sequence length is longer than the specified maximum sequence length for this model (580 > 512). Running this sequence through the model will result in indexing errors


In [70]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3998 entries, 118232 to 375761
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Unnamed: 0            3998 non-null   int64 
 1   Url                   3998 non-null   object
 2   Title                 3998 non-null   object
 3   Article               3442 non-null   object
 4   Text                  3998 non-null   object
 5   Image_Name            3998 non-null   object
 6   response_status_code  3998 non-null   int64 
 7   is_dead               3998 non-null   bool  
 8   prob                  3998 non-null   int64 
 9   Concatenated          3998 non-null   object
 10  Length                3998 non-null   int64 
dtypes: bool(1), int64(4), object(6)
memory usage: 347.5+ KB


In [71]:
print(df)

        Unnamed: 0                                                Url  \
118232      166112                            https://www.emtsinc.com   
86853       134733                        https://www.visitwamego.com   
389691      171882  https://www.tamars.co.uk/mvnirieyogtjdqqseoqtk...   
216080      263960                    https://www.introllingmills.com   
59200       107080                           https://www.bondware.com   
...            ...                                                ...   
115944      163824                              https://www.acoem.org   
153896      201776                      https://www.hanschristian.org   
116357      164237                          https://www.nobivacbb.com   
47703        95583                          https://www.smartgoat.com   
375761      149710    https://www.ydcpa.com/wbnbibzfxmyzamvfwezlnwrjl   

                                                    Title  \
118232                                           EMTS Inc   
8

In [74]:
Content = df.Concatenated.values
labels = df.response_status_code.values
#print(Content)
print(labels)

[0 0 1 0 0 0 0 0 1 0 1 1 0 1 0 0 1 0 0 0 1 1 0 0 1 0 1 0 0 0 0 1 0 0 0 1 0
 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 1 1 1 0 0 0 1 0 1 0 0 1 0 0 0
 0 0 0 0 0 0 0 1 1 0 0 1 1 0 1 0 0 1 1 0 1 0 1 0 0 0 0 0 1 1 0 0 1 0 1 0 0
 1 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 1 0 0 1 1
 1 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0
 0 0 0 1 1 1 0 0 0 0 0 0 1 0 1 1 1 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 0 0 1 1 0 1 0 1 0 0 0 0 0 0 1 0 1 0 1 0 0 0 1 0 1 0 0 0 0 1 0 1 0 0 0
 1 1 1 0 0 0 1 1 0 0 1 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0
 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 1 0 1 0 1 1 1 0 0 0 0 0
 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 1 0 1 0 1 0 0 1
 1 1 0 1 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0 1 0 0
 0 0 1 1 0 0 1 1 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 1 0 0 0 1 0 1 0 0 0
 0 0 1 0 1 0 0 0 0 0 1 0 

In [76]:
print(' Original: ', Content[0])

# Print the sentence split into tokens.
print('Tokenized: ', tokenizer.tokenize(Content[0]))

# Print the sentence mapped to token ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(Content[0])))

 Original:  [CLS]EMTS Inc[SEP]Call today 877 385 EMTS 3687 Dean College Franklin MA McNeilly EMS Educators Danvers MA Nantucket Fire Dept Nantucket MA Dean College Franklin MA McNeilly EMS Educators Danvers MA Nantucket Fire Dept Nantucket MA DEAN EMTS CONSORTIUM PROGRAM OUTCOMES 2020 National Registry Cognitive Exam Pass Rate 84 Program Retention Rate 75 Job Placement for Graduates 100 All Paramedic programs offered in partnership by EMTS Inc. and Dean College are sponsored by the Dean EMTS Consortium. The Dean EMTS Consortium is accredited by the Commission on Accreditation of Allied Health Education Programs www.caahep.org upon the recommendation of the Committee on Accreditation of Educational Programs for the Emergency Medical Services Professions
Tokenized:  ['[CLS]', 'em', '##ts', 'inc', '[SEP]', 'call', 'today', '87', '##7', '385', 'em', '##ts', '36', '##8', '##7', 'dean', 'college', 'franklin', 'ma', 'mc', '##neil', '##ly', 'ems', 'educators', 'dan', '##vers', 'ma', 'nan', '##

In [77]:
max_len = 0

# For every sentence...
for sent in Content:
    #print(sent)
   
    #Tokenize the text and add `[CLS]` and `[SEP]` tokens.
    input_ids = tokenizer.encode(sent, add_special_tokens=True)

    # Update the maximum sentence length.
    max_len = max(max_len, len(input_ids))
    if len(input_ids) >450 :
        print(sent)

print('Max sentence length: ', max_len)

Max sentence length:  401


In [78]:
input_ids = []
attention_masks = []

# For every webpage...
for sent in Content:
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = max_len,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        truncation=True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    # Add the encoded sentence to the list.    
    input_ids.append(encoded_dict['input_ids'])
    
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

# Print sentence 0, now as a list of IDs.
print('Original: ', Content[0])
print('Token IDs:', input_ids[0])



Original:  [CLS]EMTS Inc[SEP]Call today 877 385 EMTS 3687 Dean College Franklin MA McNeilly EMS Educators Danvers MA Nantucket Fire Dept Nantucket MA Dean College Franklin MA McNeilly EMS Educators Danvers MA Nantucket Fire Dept Nantucket MA DEAN EMTS CONSORTIUM PROGRAM OUTCOMES 2020 National Registry Cognitive Exam Pass Rate 84 Program Retention Rate 75 Job Placement for Graduates 100 All Paramedic programs offered in partnership by EMTS Inc. and Dean College are sponsored by the Dean EMTS Consortium. The Dean EMTS Consortium is accredited by the Commission on Accreditation of Allied Health Education Programs www.caahep.org upon the recommendation of the Committee on Accreditation of Educational Programs for the Emergency Medical Services Professions
Token IDs: tensor([  101,   101,  7861,  3215,  4297,   102,  2655,  2651,  6584,  2581,
        24429,  7861,  3215,  4029,  2620,  2581,  4670,  2267,  5951,  5003,
        11338, 27276,  2135, 29031, 19156,  4907, 14028,  5003, 16660, 

In [79]:
# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(input_ids, attention_masks, labels)

# Create a  train-validation split.

# Calculate the number of samples to include in each set.
train_size = int(0.6 * len(dataset))
#val_size = int(0.4 * len(dataset))
val_size = len(dataset)  - train_size

# Divide the dataset by randomly selecting samples.
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

2,398 training samples
1,600 validation samples


In [80]:

# The DataLoader needs to know our batch size for training, so we specify it 
# here. For fine-tuning BERT on a specific task, the authors recommend a batch 
# size of 16 or 32.
batch_size = 16

# Create the DataLoaders for our training and validation sets.
# We'll take training samples in random order. 
train_dataloader = DataLoader(
            train_dataset,  # The training samples.
            sampler = RandomSampler(train_dataset), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
            val_dataset, # The validation samples.
            sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

In [81]:
# Load BertForSequenceClassification, the pretrained BERT model with a single 
# linear classification layer on top. 
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 2,
    output_attentions = True,
    output_hidden_states = True,
)

# if device == "cuda:0":
# # Tell pytorch to run this model on the GPU.

#     model = model.cuda()
model = model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [82]:
optimizer = AdamW(model.parameters(),
                  lr = 3e-5,#this learning rate gives best results this has already been worked around
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )



In [83]:
# Number of training epochs. The BERT authors recommend between 2 and 4. 
epochs = 4
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [84]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [85]:
def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

## Training the model 

In [87]:


seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
training_stats = []

# Measure the total training time for the whole run.
total_t0 = time.time()

# For each epoch...
for epoch_i in range(epochs):
    
    # ========================================
    #               Training
    # ========================================
    # Perform one full pass over the training set.
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    # Measure how long the training epoch takes.
    t0 = time.time()
    total_train_loss = 0
    model.train()
    
    # Wrap the train_dataloader with tqdm for the progress bar
    train_iterator = tqdm(train_dataloader, desc="Training Iteration")
    
    for step, batch in enumerate(train_iterator):
        # Unpack this training batch from our dataloader.
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        optimizer.zero_grad()
        output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = output.loss
        total_train_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        # Update the tqdm progress bar description with the current loss
        train_iterator.set_description(f"Training Loss: {loss.item():.4f}")
    
    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)
    
    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))
    
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.
    print("")
    print("Running Validation...")
    t0 = time.time()
    model.eval()
    total_eval_accuracy = 0
    best_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0
    
    # Wrap the validation_dataloader with tqdm for the progress bar
    val_iterator = tqdm(validation_dataloader, desc="Validation Iteration")
    threshold = 0.8
    for batch in val_iterator:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        
        with torch.no_grad():
            output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        
        loss = output.loss
        total_eval_loss += loss.item()
        logits = output.logits
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        total_eval_accuracy += flat_accuracy(logits, label_ids)
        
        # Update the tqdm progress bar description with the current loss
        predictions = []
        for probability in logits:
            if probability[1] >= threshold:
                predictions.append(1)
            else:
                predictions.append(0)
        accuracy = np.mean(predictions == label_ids)
        precision, recall, f1, support = precision_recall_fscore_support(label_ids, predictions, average='binary')
        val_iterator.set_description(f"Validation Loss: {loss.item():.4f}, Threshold: {threshold}, Accuracy: {accuracy:.2f}, Precision: {precision:.2f}, Recall: {recall:.2f}")
    
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    validation_time = format_time(time.time() - t0)
    
    if avg_val_accuracy > best_eval_accuracy:
        torch.save(model, 'bert_model')
        best_eval_accuracy = avg_val_accuracy
    
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time() - total_t0)))



Training...


Training Loss: 0.0196: 100%|██████████| 150/150 [01:36<00:00,  1.55it/s]



  Average training loss: 0.18
  Training epoch took: 0:01:37

Running Validation...


Validation Loss: 0.3882, Threshold: 0.8, Accuracy: 0.88, Precision: 1.00, Recall: 0.50: 100%|██████████| 100/100 [00:22<00:00,  4.52it/s]


  Accuracy: 0.97

Training...


Training Loss: 0.0610: 100%|██████████| 150/150 [01:37<00:00,  1.54it/s]



  Average training loss: 0.08
  Training epoch took: 0:01:37

Running Validation...


Validation Loss: 0.6615, Threshold: 0.8, Accuracy: 0.88, Precision: 1.00, Recall: 0.50: 100%|██████████| 100/100 [00:22<00:00,  4.52it/s]


  Accuracy: 0.97

Training...


Training Loss: 0.0006: 100%|██████████| 150/150 [01:36<00:00,  1.55it/s]



  Average training loss: 0.03
  Training epoch took: 0:01:37

Running Validation...


Validation Loss: 0.6992, Threshold: 0.8, Accuracy: 0.88, Precision: 1.00, Recall: 0.50: 100%|██████████| 100/100 [00:22<00:00,  4.53it/s]


  Accuracy: 0.98

Training...


Training Loss: 0.0070: 100%|██████████| 150/150 [01:36<00:00,  1.55it/s]



  Average training loss: 0.01
  Training epoch took: 0:01:37

Running Validation...


Validation Loss: 0.7010, Threshold: 0.8, Accuracy: 0.88, Precision: 1.00, Recall: 0.50: 100%|██████████| 100/100 [00:22<00:00,  4.53it/s]


  Accuracy: 0.98

Training complete!
Total training took 0:07:59 (h:mm:ss)


## To test
To test on some dataset or labelled websites enter the csv in DFT 

In [88]:
torch.cuda.empty_cache()

In [89]:
dft=pd.read_csv("/kaggle/input/text-data/en_curlie_notcheck_all.csv")#"/kaggle/input/bert-soft404/Traindf.csv"
dft.info()
dft= dft.sample(frac=1, random_state=42)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 403214 entries, 0 to 403213
Data columns (total 9 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   Unnamed: 0            403214 non-null  int64 
 1   Url                   403214 non-null  object
 2   Title                 399925 non-null  object
 3   Article               351417 non-null  object
 4   Text                  366386 non-null  object
 5   Image_Name            102778 non-null  object
 6   response_status_code  403214 non-null  int64 
 7   is_dead               403214 non-null  bool  
 8   prob                  403214 non-null  int64 
dtypes: bool(1), int64(3), object(5)
memory usage: 25.0+ MB


In [90]:
dft.head()

Unnamed: 0.1,Unnamed: 0,Url,Title,Article,Text,Image_Name,response_status_code,is_dead,prob
118232,166112,https://www.emtsinc.com,EMTS Inc,All Paramedic programs offered in partnership ...,Call today 877-385-EMTS (3687) Dean College-Fr...,logo_emts.pngtitle_fromemt.pngicon_fb.pngicon_...,200,False,0
86853,134733,https://www.visitwamego.com,Visit Wamego Kansas - Small Town. Big Experience.,Every year the town brings to life one of Amer...,October 1st 2022 Every year the town brings to...,,200,False,0
389691,171882,https://www.tamars.co.uk/mvnirieyogtjdqqseoqtk...,21st Century Back Care - TAMARS Treatment — 404,,Page not found,21_C_Logo.svg21_C_Logo.svg,404,True,0
216080,263960,https://www.introllingmills.com,A Global Leader in Metal Processing Equipment ...,What We Do We work closely with our customers ...,A GLOBAL LEADER IN METAL PROCESSING EQUIPMENT ...,,200,False,0
59200,107080,https://www.bondware.com,Websites for News Marketing and Commerce,Our software provides all the tools you need t...,,MV_thumbnail-2.png,200,False,0


In [91]:

dft['Title'] = dft['Title'].fillna("none")
dft['Text'] = dft['Text'].fillna("none")
dft['Image_Name'] = dft['Image_Name'].fillna("none")
dft = dft.dropna(subset=['is_dead'], axis=0)
dft['is_dead'] = dft['is_dead'].astype(bool)
# Convert non-numeric values to NaN
#dft['response_status_code'] = pd.to_numeric(dft['response_status_code'], errors='coerce')

# Drop rows with NaN values in the 'response_status_code' column
#dft = df.dropna(subset=['response_status_code'])

# Convert 'response_status_code' column to integer
#dft['response_status_code'] = dft['response_status_code'].astype(int)

In [92]:
dft.head()

Unnamed: 0.1,Unnamed: 0,Url,Title,Article,Text,Image_Name,response_status_code,is_dead,prob
118232,166112,https://www.emtsinc.com,EMTS Inc,All Paramedic programs offered in partnership ...,Call today 877-385-EMTS (3687) Dean College-Fr...,logo_emts.pngtitle_fromemt.pngicon_fb.pngicon_...,200,False,0
86853,134733,https://www.visitwamego.com,Visit Wamego Kansas - Small Town. Big Experience.,Every year the town brings to life one of Amer...,October 1st 2022 Every year the town brings to...,none,200,False,0
389691,171882,https://www.tamars.co.uk/mvnirieyogtjdqqseoqtk...,21st Century Back Care - TAMARS Treatment — 404,,Page not found,21_C_Logo.svg21_C_Logo.svg,404,True,0
216080,263960,https://www.introllingmills.com,A Global Leader in Metal Processing Equipment ...,What We Do We work closely with our customers ...,A GLOBAL LEADER IN METAL PROCESSING EQUIPMENT ...,none,200,False,0
59200,107080,https://www.bondware.com,Websites for News Marketing and Commerce,Our software provides all the tools you need t...,none,MV_thumbnail-2.png,200,False,0


In [93]:
dft.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 403214 entries, 118232 to 121958
Data columns (total 9 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   Unnamed: 0            403214 non-null  int64 
 1   Url                   403214 non-null  object
 2   Title                 403214 non-null  object
 3   Article               351417 non-null  object
 4   Text                  403214 non-null  object
 5   Image_Name            403214 non-null  object
 6   response_status_code  403214 non-null  int64 
 7   is_dead               403214 non-null  bool  
 8   prob                  403214 non-null  int64 
dtypes: bool(1), int64(3), object(5)
memory usage: 28.1+ MB


In [94]:
df['Concatenated'] = '[CLS]'  + df['Title'] +  '[SEP]'+ df['Text'] + '[SEP]' +df['Article']+ '[SEP]' +df['Image_Name']
df['Concatenated'] = df['Concatenated'].apply(lambda x:sanitize_text(x))

In [95]:
dft['Length'] = dft['Concatenated'].apply(lambda x: len(tokenizer.encode(x, add_special_tokens=True)))

# Drop rows where the length is greater than 450
dft = dft[dft['Length'] <= 450]

In [96]:
# Convert the values based on the condition
dft.loc[dft['response_status_code'] <= 205, 'response_status_code'] = 0
dft.loc[dft['response_status_code'] > 205, 'response_status_code'] = 1

# Check the unique values in the 'response_status_code' column
unique_values = dft['response_status_code'].unique()
print(unique_values)


[0 1]


In [97]:
dft.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 403096 entries, 118232 to 121958
Data columns (total 11 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   Unnamed: 0            403096 non-null  int64 
 1   Url                   403096 non-null  object
 2   Title                 403096 non-null  object
 3   Article               351323 non-null  object
 4   Text                  403096 non-null  object
 5   Image_Name            403096 non-null  object
 6   response_status_code  403096 non-null  int64 
 7   is_dead               403096 non-null  bool  
 8   prob                  403096 non-null  int64 
 9   Concatenated          403096 non-null  object
 10  Length                403096 non-null  int64 
dtypes: bool(1), int64(4), object(6)
memory usage: 34.2+ MB


In [98]:
dft

Unnamed: 0.1,Unnamed: 0,Url,Title,Article,Text,Image_Name,response_status_code,is_dead,prob,Concatenated,Length
118232,166112,https://www.emtsinc.com,EMTS Inc,All Paramedic programs offered in partnership ...,Call today 877-385-EMTS (3687) Dean College-Fr...,logo_emts.pngtitle_fromemt.pngicon_fb.pngicon_...,0,False,0,[CLS]EMTS Inc[SEP]Call today 877-385-EMTS (368...,192
86853,134733,https://www.visitwamego.com,Visit Wamego Kansas - Small Town. Big Experience.,Every year the town brings to life one of Amer...,October 1st 2022 Every year the town brings to...,none,0,False,0,[CLS]Visit Wamego Kansas - Small Town. Big Exp...,61
389691,171882,https://www.tamars.co.uk/mvnirieyogtjdqqseoqtk...,21st Century Back Care - TAMARS Treatment — 404,,Page not found,21_C_Logo.svg21_C_Logo.svg,1,True,0,[CLS]21st Century Back Care - TAMARS Treatment...,33
216080,263960,https://www.introllingmills.com,A Global Leader in Metal Processing Equipment ...,What We Do We work closely with our customers ...,A GLOBAL LEADER IN METAL PROCESSING EQUIPMENT ...,none,0,False,0,[CLS]A Global Leader in Metal Processing Equip...,124
59200,107080,https://www.bondware.com,Websites for News Marketing and Commerce,Our software provides all the tools you need t...,none,MV_thumbnail-2.png,0,False,0,[CLS]Websites for News Marketing and Commerce[...,21
...,...,...,...,...,...,...,...,...,...,...,...
259178,307058,https://www.parklet.co.uk,Rent a Parking Space or Garage from ParkLet,Reading Berkshire RG1 An allocated parking spa...,Login / sign up short term and long term parki...,none,0,False,0,[CLS]Rent a Parking Space or Garage from ParkL...,129
365838,132610,https://www.aschendorf.com/jxdfuxjbxzpirgjmrar...,404 - Seite nicht gefunden,Die gesuchte Seite konnte nicht gefunden werde...,Die gesuchte Seite konnte nicht gefunden werde...,none,1,True,0,[CLS]404 - Seite nicht gefunden[SEP]Die gesuch...,53
131932,179812,https://multihull.co.uk,M.i Cats The specialists in Catamarans & Multi...,MiCats are the official and exclusive UK Chann...,MiCats are the official and exclusive UK Chann...,none,0,False,0,[CLS]M.i Cats The specialists in Catamarans & ...,165
146867,194747,https://www.pantoscripts.me.uk,pantoscripts – PANTO SCRIPTS pantomime scripts...,“Many thanks for the script I have had a read ...,PANTO SCRIPTS pantomime scripts for am dram an...,none,0,False,0,[CLS]pantoscripts – PANTO SCRIPTS pantomime sc...,153


In [99]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=False)

In [100]:
Content1 = dft.Concatenated.values
labels1 = dft.response_status_code.values
print(Content1[0])
print (len(labels1))

[CLS]EMTS Inc[SEP]Call today 877-385-EMTS (3687) Dean College-Franklin MA McNeilly EMS Educators-Danvers MA Nantucket Fire Dept-Nantucket MA Dean College-Franklin MA McNeilly EMS Educators-Danvers MA Nantucket Fire Dept-Nantucket MA DEAN-EMTS CONSORTIUM PROGRAM OUTCOMES 2020 National Registry Cognitive Exam Pass Rate 84% Program Retention Rate 75% Job Placement for Graduates 100% All Paramedic programs offered in partnership by EMTS Inc. and Dean College are sponsored by the Dean-EMTS Consortium. The Dean-EMTS Consortium is accredited by the Commission on Accreditation of Allied Health Education Programs ( www.caahep.org ) upon the recommendation of the Committee on Accreditation of Educational Programs for the Emergency Medical Services Professionslogo_emts.pngtitle_fromemt.pngicon_fb.pngicon_mail.png
403096


In [101]:
max_len = 0

# For every sentence...
print(len(Content1))
for sent in Content1:
    #print(sent)
   
    #Tokenize the text and add `[CLS]` and `[SEP]` tokens.
    input_ids = tokenizer.encode(sent, add_special_tokens=True)

    # Update the maximum sentence length.
    max_len = max(max_len, len(input_ids))
    if len(input_ids) >450 :
        print(sent)

print('Max sentence length: ', max_len)

403096
Max sentence length:  441


In [None]:
input_ids1 = []
attention_masks1 = []

# For every webpage...
for sent in Content1:
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = max_len,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        truncation=True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
    # Add the encoded sentence to the list.    
    input_ids1.append(encoded_dict['input_ids'])
    
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks1.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids1 = torch.cat(input_ids1, dim=0)
attention_masks1 = torch.cat(attention_masks1, dim=0)
labels1 = torch.tensor(labels1)

# Print sentence 0, now as a list of IDs.
print('Original: ', len(Content1))
      
print('Token IDs:', input_ids1[0])



In [None]:
dataset = TensorDataset(input_ids1, attention_masks1, labels1)

In [None]:
test_dataset = TensorDataset(input_ids1, attention_masks1)
test_dataloader = DataLoader(
            test_dataset, # The validation samples.
            sampler = SequentialSampler(test_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

In [None]:
model = torch.load('/kaggle/input/text-data/bert_model1')

In [None]:
predictions = []
# Wrap the test_dataloader with tqdm for the progress bar
test_iterator = tqdm(test_dataloader, desc="Testing Iteration")

for batch in test_iterator:
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    with torch.no_grad():
        output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        logits = output.logits
        logits = logits.detach().cpu().numpy()
        pred_flat = np.argmax(logits, axis=1).flatten()
        predictions.extend(list(pred_flat))


In [None]:
# Calculate accuracy
accuracy = accuracy_score(labels1,predictions)
print("Accuracy:", accuracy)

# Calculate recall
recall = recall_score(labels1, predictions)
print("Recall:", recall)

# Calculate precision
precision = precision_score(labels1, predictions)
print("Precision:", precision)


# Compute the confusion matrix
cm = confusion_matrix(labels1, predictions)

# Plot the confusion matrix
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.colorbar()
tick_marks = np.arange(len(np.unique(labels)))
plt.xticks(tick_marks, np.unique(labels))
plt.yticks(tick_marks, np.unique(labels))
#plt.xticks(tick_marks, ['200 OK', '404 Error'])
#plt.yticks(tick_marks, ['200 OK', '404 Error'])

plt.xlabel('Predicted Label')
plt.ylabel('True Label')

# Add text annotations
thresh = cm.max() / 2.0
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        plt.text(j, i, format(cm[i, j], 'd'),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

plt.tight_layout()
plt.show()
