In [None]:
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder

# Load the training split of the Kaggle movie-genre dataset into a DataFrame.
train_data = pd.read_csv(
    filepath_or_buffer='/kaggle/input/movie-genre-prediction/train.csv',
)

<div style="background-color: #E8EAF6; padding: 20px; border-radius: 10px; box-shadow: 2px 2px 10px rgba(0, 0, 0, 0.1);">
    <h2 style="font-family: 'Verdana'; color: #3A405A;">🔍 PyTorch & Bert-Base-Uncased</h2>
    <p style="font-size: 18px; font-family: 'Verdana'; color: #3A405A; line-height: 1.5em;">This is my first approach to pretrained models. Before trying pretrained models, I used TensorFlow and classifiers such as CatBoost, with an accuracy of 0.30.</p>
</div>

In [None]:

# The "id" column is not used by anything below, so drop it.
train_data = train_data.drop(columns=["id"])

label_encoder = LabelEncoder()

# Encode the genre strings as integer class ids.
y_train_encoded = label_encoder.fit_transform(train_data['genre'])

# Model selection
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
embedding_model = BertModel.from_pretrained(model_name)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
embedding_model.to(device)

# Tokenize "<synopsis> <movie_name>" for every row.
max_length = 12
# A missing synopsis or movie_name would make the concatenation NaN for that
# row, and the tokenizer only accepts strings; treat missing fields as empty.
concatenated_text = (
    train_data['synopsis'].fillna("") + " " + train_data['movie_name'].fillna("")
)
encoded_inputs = tokenizer(
    list(concatenated_text),
    padding='max_length',
    truncation=True,
    max_length=max_length,
    return_attention_mask=True,
)

# Build the training DataLoader from (input_ids, attention_mask, label) triples.
train_dataset = TensorDataset(
    torch.tensor(encoded_inputs['input_ids']),
    torch.tensor(encoded_inputs['attention_mask']),
    # CrossEntropyLoss expects class-index targets as int64 (long) tensors.
    torch.tensor(y_train_encoded, dtype=torch.long),
)
train_loader = DataLoader(train_dataset, batch_size=12, shuffle=True)

# Network definition
class CustomClassifier(nn.Module):
    """Linear classification head on top of an encoder's first-token state.

    The encoder is called as ``embedding_model(input_ids, attention_mask=...)``
    and must return an object exposing ``last_hidden_state``; its
    ``config.hidden_size`` sets the input width of the linear layer.
    """

    def __init__(self, embedding_model, num_classes):
        """Store the encoder and create the ``hidden_size -> num_classes`` layer."""
        super().__init__()
        hidden_size = embedding_model.config.hidden_size
        self.embedding_model = embedding_model
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the logits computed from position 0 of ``last_hidden_state``."""
        encoder_output = self.embedding_model(input_ids, attention_mask=attention_mask)
        # Index 0 along the second axis: the first token of every sequence
        # (presumably the [CLS] token for BERT-style inputs).
        first_token_state = encoder_output.last_hidden_state[:, 0]
        return self.fc(first_token_state)

# Build the classifier (one output per encoded genre) and move it to the device.
num_classes = len(label_encoder.classes_)
model = CustomClassifier(embedding_model, num_classes)
model.to(device)

# Loss and optimizer: cross-entropy over the logits, AdamW over all parameters.
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.AdamW(params=model.parameters(), lr=1e-5)

# Training loop: fine-tunes the whole model and reports a running accuracy
# measured on the training batches while the weights are being updated.
num_epochs = 4  # change the number of epochs as needed
model.train()
for epoch in range(num_epochs):
    # The label uses num_epochs so the displayed total always matches the loop bound.
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}", leave=False)
    total_correct = 0
    total_samples = 0
    # Defined up front so the epoch summary still works if the loader yields no batches.
    accuracy = 0.0

    for batch in progress_bar:
        optimizer.zero_grad()
        input_ids, attention_mask, labels = [item.to(device) for item in batch]
        logits = model(input_ids, attention_mask)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()

        # Running accuracy over the batches seen so far in this epoch.
        _, predicted = torch.max(logits, 1)
        total_correct += (predicted == labels).sum().item()
        total_samples += labels.size(0)
        accuracy = total_correct / total_samples

        progress_bar.set_postfix({"loss": loss.item(), "accuracy": accuracy})

    # Per-epoch accuracy summary.
    print(f'Epoch {epoch + 1} - Accuracy: {accuracy:.4f}')



# Evaluation pass with gradients disabled.
# NOTE(review): this iterates train_loader, so the reported figure is accuracy
# on the training data, not on a held-out split.
model.eval()
total_correct, total_samples = 0, 0
with torch.no_grad():
    progress_bar = tqdm(train_loader, desc="Evaluating", leave=False)
    for batch in progress_bar:
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)
        logits = model(input_ids, attention_mask)
        # Index of the largest logit per row is the predicted class.
        predicted = torch.max(logits, 1).indices
        matches = predicted == labels
        total_correct += matches.sum().item()
        total_samples += labels.shape[0]
        running_accuracy = total_correct / total_samples
        progress_bar.set_postfix({"accuracy": running_accuracy})

accuracy = total_correct / total_samples
print(f'Final Accuracy: {accuracy:.4f}')


<center>
    <img src="https://img.freepik.com/free-photo/cute-ai-generated-cartoon-bunny_23-2150288886.jpg" style="width: 250px; height: 250px; border-radius: 10px; box-shadow: 5px 5px 15px rgba(0, 0, 0, 0.1);">
</center>

<div style="background-color: #FFF3E0; padding: 20px; border-radius: 10px; box-shadow: 2px 2px 10px rgba(0, 0, 0, 0.1);">
    <h3 style="font-family: 'Verdana'; color: #FF5733;">OPEN TO COMMENTS & EDITS</h3>
    