In [12]:
# This Python 3 environment comes with many helpful analytics libraries installed.
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory.
# Running this cell (by clicking run or pressing Shift+Enter) prints the full path of
# every file found under '/kaggle/input', which is the directory the loop below walks.

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/); it is preserved as output when you create a version using "Save & Run All".
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session.

/kaggle/input/smallsongs2/l_df.csv


In [13]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

In [14]:

def load_song_data(data_file, bert_model_name, max_length):
    """Read the song CSV and tokenize every lyric up front.

    Args:
        data_file: Path of a CSV that has `S_Lyric`, `Genre_Index` and `Genre` columns.
        bert_model_name: Name handed to `BertTokenizer.from_pretrained`.
        max_length: Length each lyric is padded / truncated to by the tokenizer.

    Returns:
        A tuple `(encodings, labels, ids_genres)` where `encodings` is a list with
        one tokenizer output per row, `labels` is the list of `Genre_Index` values
        cast to `int`, and `ids_genres` is a DataFrame indexed by `Genre_Index`
        holding the distinct `(Genre_Index, Genre)` pairs.
    """
    songs = pd.read_csv(data_file)
    tokenizer = BertTokenizer.from_pretrained(bert_model_name)

    def encode(lyric):
        # Same tokenizer settings for every row so all encodings share one length.
        return tokenizer(lyric, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True)

    encodings = [encode(lyric) for lyric in songs['S_Lyric']]
    labels = list(map(int, songs['Genre_Index']))

    # Lookup table from numeric genre id to genre name.
    genre_pairs = songs[['Genre_Index', 'Genre']].drop_duplicates()
    ids_genres = genre_pairs.set_index('Genre_Index')
    return encodings, labels, ids_genres

In [23]:
class TextClassificationDataset(Dataset):
    """Dataset that pairs already-tokenized samples with their labels.

    Despite the parameter name, each element of `texts` is indexed with
    'input_ids' and 'attention_mask' in `__getitem__`, so it must be a
    tokenizer output (or any mapping of tensors), not a raw string.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts, self.labels = texts, labels
        # Kept as attributes, but no method of this class reads them.
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the flattened token ids, attention mask and label tensor for `idx`."""
        sample = self.texts[idx]
        target = self.labels[idx]
        return {
            'input_ids': sample['input_ids'].flatten(),
            'attention_mask': sample['attention_mask'].flatten(),
            'label': torch.tensor(target),
        }
        
class BERTClassifier(nn.Module):
    """Pretrained BERT encoder followed by dropout and one linear layer.

    Args:
        bert_model_name: Name handed to `BertModel.from_pretrained`.
        num_classes: Output size of the linear classification layer.
    """

    def __init__(self, bert_model_name, num_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(p=0.1)
        # The head maps the encoder's hidden size straight to class logits.
        hidden_size = self.bert.config.hidden_size
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return class logits computed from the encoder's `pooler_output`."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        regularized = self.dropout(encoded.pooler_output)
        return self.fc(regularized)

def train(model, data_loader, optimizer, scheduler, device):
    """Run one pass over `data_loader`, updating the model after every batch.

    Args:
        model: Module called as `model(input_ids=..., attention_mask=...)` that
            returns class logits.
        data_loader: Iterable of batches; each batch is a mapping with
            'input_ids', 'attention_mask' and 'label' tensors.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch, after the
            optimizer.
        device: Device the batch tensors are moved to.

    Returns:
        The mean cross-entropy loss over the batches as a float, or NaN when
        `data_loader` yields no batches. Existing callers that ignore the
        return value are unaffected.
    """
    model.train()
    # Built once: the criterion is stateless, so re-creating it per batch is wasted work.
    criterion = nn.CrossEntropyLoss()
    total_loss = 0.0
    num_batches = 0
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
        num_batches += 1
    if num_batches == 0:
        return float('nan')
    return total_loss / num_batches

def evaluate(model, data_loader, device):
    """Score the model on every batch of `data_loader` without tracking gradients.

    Args:
        model: Module called as `model(input_ids=..., attention_mask=...)` that
            returns class logits.
        data_loader: Iterable of batches with 'input_ids', 'attention_mask' and
            'label' tensors.
        device: Device the batch tensors are moved to.

    Returns:
        A tuple `(accuracy, report)` produced by `accuracy_score` and
        `classification_report` over all collected labels and predictions.
    """
    model.eval()
    predicted, expected = [], []
    with torch.no_grad():
        for batch in data_loader:
            token_ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['label'].to(device)
            logits = model(input_ids=token_ids, attention_mask=mask)
            # The index of the largest logit in each row is the predicted class.
            best = torch.max(logits, dim=1).indices
            predicted += best.cpu().tolist()
            expected += targets.cpu().tolist()
    accuracy = accuracy_score(expected, predicted)
    report = classification_report(expected, predicted)
    return accuracy, report

def predict_sentiment(text, model, tokenizer, device, ids_genres, max_length=128):
    """Predict the genre name for a single lyric.

    Args:
        text: Either a raw string, which is tokenized here with `tokenizer`, or
            an already-tokenized mapping with 'input_ids' and 'attention_mask'
            tensors (the only form the previous implementation accepted).
        model: Module called as `model(input_ids=..., attention_mask=...)` that
            returns class logits.
        tokenizer: Callable used to tokenize `text` when it is a string; not
            used otherwise.
        device: Device the input tensors are moved to.
        ids_genres: DataFrame indexed by genre id; the first column of the
            matching row is returned.
        max_length: Padding / truncation length used when tokenizing a string.

    Returns:
        The value in the first column of `ids_genres` for the predicted class id.
    """
    model.eval()
    if isinstance(text, str):
        # Raw text: tokenize it the same way the training data was tokenized.
        encoding = tokenizer(text, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True)
    else:
        encoding = text
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    if input_ids.dim() == 1:
        # A flattened single sample lacks the batch dimension that the
        # `dim=1` reduction below relies on.
        input_ids = input_ids.unsqueeze(0)
        attention_mask = attention_mask.unsqueeze(0)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs, dim=1)
    # `.iloc[0]` is an explicit positional lookup; `[0]` on a label-indexed
    # Series relies on a deprecated positional fallback.
    return ids_genres.loc[preds.item()].iloc[0]

In [16]:
# Set up parameters
bert_model_name = 'bert-base-uncased'

max_length = 128
batch_size = 16
num_epochs = 10
learning_rate = 2e-5
seed = 42  # one fixed seed so a fresh "Run All" reproduces the same split and metrics

# Seed torch's global RNG before any model or DataLoader is created, so the
# classifier head initialization and shuffled batch order are repeatable too.
torch.manual_seed(seed)

data_file = "/kaggle/input/smallsongs2/l_df.csv"
# NOTE: despite its name, `texts` holds the pre-tokenized encodings returned by
# load_song_data, not raw lyric strings.
texts, labels, ids_genres = load_song_data(data_file, bert_model_name, max_length)

# A fixed random_state makes the train/validation partition identical across runs.
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=seed)

In [24]:

tokenizer = BertTokenizer.from_pretrained(bert_model_name)
# train_texts / val_texts are already tokenized; the tokenizer and max_length
# are only passed because the dataset constructor takes them.
train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length)
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_length)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

# One output logit per distinct genre id.
num_classes = ids_genres.shape[0]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BERTClassifier(bert_model_name, num_classes)
# DataParallel splits each batch across the visible GPUs. Note that the
# wrapper's state_dict keys carry a "module." prefix.
model = torch.nn.DataParallel(model).to(device)

# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are pinned to the old implementation's defaults (1e-6 and 0.0),
# because torch's own defaults (1e-8 and 0.01) would change the training recipe.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0)
# The scheduler is stepped once per batch, so the schedule spans every batch of every epoch.
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [18]:
# Only the last expression of a cell is displayed, so a bare `num_classes` on
# an earlier line is silently discarded; print it explicitly instead.
print(f"num_classes = {num_classes}")
ids_genres

Unnamed: 0_level_0,Genre
Genre_Index,Unnamed: 1_level_1
0,Country
1,Hip-Hop
2,Electronic
3,R&B


In [None]:
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train(model, train_dataloader, optimizer, scheduler, device)
    # Train-set accuracy is reported next to validation accuracy to expose overfitting.
    train_accuracy, train_report = evaluate(model, train_dataloader, device)
    print(f"Train Accuracy: {train_accuracy:.4f}")
    accuracy, report = evaluate(model, val_dataloader, device)
    print(f"Validation Accuracy: {accuracy:.4f}")
    # Save the underlying module's weights: a DataParallel wrapper prefixes
    # every state_dict key with "module.", which makes the checkpoint fail to
    # load into a plain (unwrapped) model.
    model_to_save = model.module if isinstance(model, nn.DataParallel) else model
    torch.save(model_to_save.state_dict(), f"version-{epoch}-acc-{accuracy:.4f}.pth")
    print(report)

Epoch 1/10
Train Accuracy: 0.7780
Validation Accuracy: 0.6829
              precision    recall  f1-score   support

           0       0.76      0.56      0.65       710
           1       0.85      0.80      0.83      1946
           2       0.48      0.69      0.56       873
           3       0.61      0.58      0.60      1432

    accuracy                           0.68      4961
   macro avg       0.68      0.66      0.66      4961
weighted avg       0.70      0.68      0.69      4961

Epoch 2/10
Train Accuracy: 0.9077
Validation Accuracy: 0.7148
              precision    recall  f1-score   support

           0       0.81      0.57      0.67       710
           1       0.88      0.81      0.84      1946
           2       0.55      0.66      0.60       873
           3       0.61      0.70      0.65      1432

    accuracy                           0.71      4961
   macro avg       0.71      0.68      0.69      4961
weighted avg       0.73      0.71      0.72      4961

Epoch 

In [None]:
#!rm /kaggle/working/*