In [1]:
# A dependency of the preprocessing for BERT inputs
!pip install -U "tensorflow-text==2.13.*"
!pip install "tf-models-official==2.13.*"
!pip install transformers
!pip install torch torchvision torchaudio

Collecting typing-extensions<4.6.0,>=3.6.6 (from tensorflow<2.14,>=2.13.0->tensorflow-text==2.13.*)
  Using cached typing_extensions-4.5.0-py3-none-any.whl.metadata (8.5 kB)
Using cached typing_extensions-4.5.0-py3-none-any.whl (27 kB)
Installing collected packages: typing-extensions
  Attempting uninstall: typing-extensions
    Found existing installation: typing_extensions 4.10.0
    Uninstalling typing_extensions-4.10.0:
      Successfully uninstalled typing_extensions-4.10.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torch 2.2.1 requires typing-extensions>=4.8.0, but you have typing-extensions 4.5.0 which is incompatible.[0m[31m
[0mSuccessfully installed typing-extensions-4.5.0
Collecting typing-extensions>=4.8.0 (from torch)
  Using cached typing_extensions-4.10.0-py3-none-any.whl.metadata (3.0 kB)
Using cached typing_extensions-4.10.0-py3-non

In [2]:
import os
import shutil

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
import torch
from tqdm.notebook import tqdm

from transformers import BertTokenizer
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
tf.get_logger().setLevel('ERROR')

2024-04-01 22:53:41.600860: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
df = pd.read_csv('animes.csv', usecols=['synopsis', 'genre'])

df.head()

Unnamed: 0,synopsis,genre
0,Following their participation at the Inter-Hig...,"['Comedy', 'Sports', 'Drama', 'School', 'Shoun..."
1,Music accompanies the path of the human metron...,"['Drama', 'Music', 'Romance', 'School', 'Shoun..."
2,The Abyss—a gaping chasm stretching down into ...,"['Sci-Fi', 'Adventure', 'Mystery', 'Drama', 'F..."
3,"""In order for something to be obtained, someth...","['Action', 'Military', 'Adventure', 'Comedy', ..."
4,After helping revive the legendary vampire Kis...,"['Action', 'Mystery', 'Supernatural', 'Vampire']"


In [4]:
df.dropna(subset = ['synopsis'], inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18336 entries, 0 to 19310
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   synopsis  18336 non-null  object
 1   genre     18336 non-null  object
dtypes: object(2)
memory usage: 429.8+ KB


In [5]:
from pprint import pprint
#importing defaultdict
from collections import defaultdict
#saving total genres here.
hash_map = defaultdict(lambda: 0)
#saving each row's genre in genrelist
genrelist = []
for text in df['genre']:
    templist = text[2:len(text)-2].split("', '")
    for i in range(len(templist)):
        if templist[i] == '':
            templist.pop(i)
            break
    genrelist.append(templist)
    for j in templist:
        hash_map[j] += 1
        
# Adding genrelist Column
df['genre_list'] = genrelist
all_genres = list(hash_map)
df.drop(columns=['genre'], inplace=True)
len(all_genres)

43

In [6]:
#Attempt to insert the optimizer 

from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import precision_score, recall_score, f1_score, multilabel_confusion_matrix

# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(all_genres))
max_length = 512  # Changed max_length to 256
tokenized_texts = []
labels = []
criterion = nn.BCEWithLogitsLoss()
for synopsis, genre_list in zip(df['synopsis'], df['genre_list']):
    tokenized_text = tokenizer.encode_plus(synopsis, add_special_tokens=True, max_length=max_length, padding='max_length', truncation=True)
    tokenized_texts.append(tokenized_text)

    # Encode labels based on genre_list using one-hot encoding
    label = torch.zeros(len(all_genres))
    for genre in genre_list:
        label[all_genres.index(genre)] = 1
    labels.append(label)

input_ids = torch.tensor([tokenized_text['input_ids'] for tokenized_text in tokenized_texts])
attention_masks = torch.tensor([tokenized_text['attention_mask'] for tokenized_text in tokenized_texts])
labels = torch.stack(labels)

# Split data into training and validation sets
train_inputs, val_inputs, train_masks, val_masks, train_labels, val_labels = train_test_split(
    input_ids, attention_masks, labels, random_state=42, test_size=0.2
)

# Create DataLoader for training and validation sets
batch_size = 8  # Adjusted batch size for shorter sequences
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_loader = DataLoader(val_data, batch_size=batch_size)
from transformers import AdamW, get_linear_schedule_with_warmup


optimizer = AdamW(model.parameters(),
                  lr=1e-5, 
                  eps=1e-8,
                  no_deprecation_warning=True)
                  
epochs = 5

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=len(train_loader)*epochs)
mcm_dict = {}
epoch_dict = {}
for epoch in tqdm(range(epochs), desc='Epochs'):
    model.train()
    running_loss = 0.0
    
    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}', leave=False):
        input_ids, attention_masks, labels = batch
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(input_ids, attention_mask=attention_masks)
        logits = outputs.logits
        
        # Compute loss
        loss = criterion(logits, labels)
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        scheduler.step()
        running_loss += loss.item()
    
    # Calculate average training loss for the epoch
    avg_train_loss = running_loss / len(train_loader)
    
    # Validation phase
    model.eval()
    val_loss = 0.0
    correct_predictions = 0
    total_predictions = 0
    all_predicted_labels=[]
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f'Validation Epoch {epoch + 1}', leave=False):
            input_ids, attention_masks, labels = batch
            outputs = model(input_ids, attention_mask=attention_masks)
            logits = outputs.logits
            
            # Compute loss
            loss = criterion(logits, labels)
            val_loss += loss.item()
            # Compute accuracy
            probabilities = torch.sigmoid(logits)
            predicted_labels = (probabilities > 0.5).to(torch.float32)
            correct_predictions += torch.sum(predicted_labels == labels).item()
            total_predictions += labels.size(0) * labels.size(1)
            all_predicted_labels.extend(predicted_labels.tolist())
            #print(torch.sum(predicted_labels == labels).item())
            #print(labels.size(0) * labels.size(1))
    
    avg_val_loss = val_loss / len(val_loader)
    accuracy = correct_predictions / total_predictions
    #print(f'correct predictions: {correct_predictions}, total predictions: {total_predictions}, accuracy: {accuracy}')
    y_true = val_labels.cpu().numpy()
    y_pred = np.array(all_predicted_labels)
    mcm = multilabel_confusion_matrix(y_true,y_pred)
    
    mcm_dict[f'epoch_{epoch}'] = mcm
    precision = precision_score(labels.cpu(), predicted_labels.cpu(), average='micro', zero_division=np.nan)
    recall = recall_score(labels.cpu(), predicted_labels.cpu(), average='micro', zero_division=np.nan)
    f1 = f1_score(labels.cpu(), predicted_labels.cpu(), average='micro', zero_division=np.nan)
    print(f'Epoch_{epoch}:')
    print(f'Training Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}, Accuracy: {accuracy:.4f}')
    print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}')
    epoch_dict[f'epoch_{epoch}'] = {
        'Training Loss': avg_train_loss,
        'Validation Loss': avg_val_loss,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-score': f1
    }
# Save the fine-tuned model
model.save_pretrained('./fine_tuned_bert_model')
tokenizer.save_pretrained('./fine_tuned_bert_model')


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epochs:   0%|                                                                              | 0/5 [00:00<?, ?it/s]
Epoch 1:   0%|                                                                          | 0/1834 [00:00<?, ?it/s][A
Epoch 1:   0%|                                                               | 1/1834 [00:31<15:50:20, 31.11s/it][A
Epoch 1:   0%|                                                               | 2/1834 [01:01<15:45:04, 30.95s/it][A
Epoch 1:   0%|                                                               | 3/1834 [01:31<15:29:31, 30.46s/it][A
Epoch 1:   0%|▏                                                              | 4/1834 [02:02<15:26:57, 30.39s/it][A
Epoch 1:

Epoch_0:
Training Loss: 0.2338, Validation Loss: 0.1865, Accuracy: 0.9368
Precision: 0.6667, Recall: 0.1667, F1-score: 0.2667



Epoch 2:   0%|                                                                          | 0/1834 [00:00<?, ?it/s][A
Epoch 2:   0%|                                                               | 1/1834 [00:31<16:14:32, 31.90s/it][A
Epoch 2:   0%|                                                               | 2/1834 [01:00<15:15:38, 29.99s/it][A
Epoch 2:   0%|                                                               | 3/1834 [01:29<15:03:50, 29.62s/it][A
Epoch 2:   0%|▏                                                              | 4/1834 [01:57<14:36:02, 28.72s/it][A
Epoch 2:   0%|▏                                                              | 5/1834 [02:26<14:40:16, 28.88s/it][A
Epoch 2:   0%|▏                                                              | 6/1834 [02:55<14:45:14, 29.06s/it][A
Epoch 2:   0%|▏                                                              | 7/1834 [03:23<14:33:58, 28.70s/it][A
Epoch 2:   0%|▎                                                

Epoch_1:
Training Loss: 0.1732, Validation Loss: 0.1648, Accuracy: 0.9427
Precision: 0.6667, Recall: 0.1667, F1-score: 0.2667



Epoch 3:   0%|                                                                          | 0/1834 [00:00<?, ?it/s][A
Epoch 3:   0%|                                                               | 1/1834 [00:34<17:23:36, 34.16s/it][A
Epoch 3:   0%|                                                               | 2/1834 [01:05<16:32:55, 32.52s/it][A
Epoch 3:   0%|                                                               | 3/1834 [01:35<15:55:07, 31.30s/it][A
Epoch 3:   0%|▏                                                              | 4/1834 [02:07<16:04:48, 31.63s/it][A
Epoch 3:   0%|▏                                                              | 5/1834 [02:38<15:57:03, 31.40s/it][A
Epoch 3:   0%|▏                                                              | 6/1834 [03:09<15:54:05, 31.32s/it][A
Epoch 3:   0%|▏                                                              | 7/1834 [03:41<15:54:30, 31.35s/it][A
Epoch 3:   0%|▎                                                

Epoch_2:
Training Loss: 0.1532, Validation Loss: 0.1550, Accuracy: 0.9464
Precision: 0.5000, Recall: 0.1667, F1-score: 0.2500



Epoch 4:   0%|                                                                          | 0/1834 [00:00<?, ?it/s][A
Epoch 4:   0%|                                                               | 1/1834 [00:32<16:20:42, 32.10s/it][A
Epoch 4:   0%|                                                               | 2/1834 [01:01<15:35:52, 30.65s/it][A
Epoch 4:   0%|                                                               | 3/1834 [01:31<15:17:49, 30.08s/it][A
Epoch 4:   0%|▏                                                              | 4/1834 [02:01<15:22:09, 30.23s/it][A
Epoch 4:   0%|▏                                                              | 5/1834 [02:31<15:22:47, 30.27s/it][A
Epoch 4:   0%|▏                                                              | 6/1834 [03:02<15:24:12, 30.34s/it][A
Epoch 4:   0%|▏                                                              | 7/1834 [03:32<15:18:10, 30.15s/it][A
Epoch 4:   0%|▎                                                

Epoch_3:
Training Loss: 0.1414, Validation Loss: 0.1495, Accuracy: 0.9479
Precision: 0.5000, Recall: 0.1667, F1-score: 0.2500



Epoch 5:   0%|                                                                          | 0/1834 [00:00<?, ?it/s][A
Epoch 5:   0%|                                                               | 1/1834 [00:33<16:56:10, 33.26s/it][A
Epoch 5:   0%|                                                               | 2/1834 [01:03<16:06:51, 31.67s/it][A
Epoch 5:   0%|                                                               | 3/1834 [01:35<16:07:02, 31.69s/it][A
Epoch 5:   0%|▏                                                              | 4/1834 [02:05<15:44:23, 30.96s/it][A
Epoch 5:   0%|▏                                                              | 5/1834 [02:36<15:40:46, 30.86s/it][A
Epoch 5:   0%|▏                                                              | 6/1834 [03:06<15:35:55, 30.72s/it][A
Epoch 5:   0%|▏                                                              | 7/1834 [03:36<15:30:14, 30.55s/it][A
Epoch 5:   0%|▎                                                

Epoch_4:
Training Loss: 0.1350, Validation Loss: 0.1486, Accuracy: 0.9480
Precision: 0.5000, Recall: 0.2500, F1-score: 0.3333


('./fine_tuned_bert_model/tokenizer_config.json',
 './fine_tuned_bert_model/special_tokens_map.json',
 './fine_tuned_bert_model/vocab.txt',
 './fine_tuned_bert_model/added_tokens.json')

In [7]:
print(mcm_dict.keys())

dict_keys(['epoch_0', 'epoch_1', 'epoch_2', 'epoch_3', 'epoch_4'])


In [None]:
Epoch_0:
Training Loss: 0.2338, Validation Loss: 0.1865, Accuracy: 0.9368
Precision: 0.6667, Recall: 0.1667, F1-score: 0.2667

Epoch_1:
Training Loss: 0.1732, Validation Loss: 0.1648, Accuracy: 0.9427
Precision: 0.6667, Recall: 0.1667, F1-score: 0.2667

Epoch_2:
Training Loss: 0.1532, Validation Loss: 0.1550, Accuracy: 0.9464
Precision: 0.5000, Recall: 0.1667, F1-score: 0.2500

Epoch_3:
Training Loss: 0.1414, Validation Loss: 0.1495, Accuracy: 0.9479
Precision: 0.5000, Recall: 0.1667, F1-score: 0.2500

Epoch_4:
Training Loss: 0.1350, Validation Loss: 0.1486, Accuracy: 0.9480
Precision: 0.5000, Recall: 0.2500, F1-score: 0.3333