In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import model_utils as mu
from gensim.models import Word2Vec
import numpy as np
import torch
import torch.nn as nn
from tqdm.autonotebook import tqdm

EMBEDDINGS_SIZE = 50
NUM_SEQUENCES_PER_BATCH = 128

# Load the pre-trained model
model = SentenceTransformer('all-MiniLM-L6-v2')


In [2]:
#load sentences
similar_song_lyrics= pd.read_csv("data/processed_data.csv") 

similar_song_lyrics.head()


Unnamed: 0,title,tag,artist,year,views,features,lyrics,id,song,similar_title,similar_artist,similar_song
0,Killa Cam,rap,Cam'ron,2004,173166,"{""Cam\\'ron"",""Opera Steve""}","[Chorus: Opera Steve & Cam'ron]\nKilla Cam, Ki...",1,Killa Cam by Cam'ron,Bubble Music,Cam'ron,Bubble Music by Cam'ron
1,Killa Cam,rap,Cam'ron,2004,173166,"{""Cam\\'ron"",""Opera Steve""}","[Chorus: Opera Steve & Cam'ron]\nKilla Cam, Ki...",1,Killa Cam by Cam'ron,Get Down,Cam'ron,Get Down by Cam'ron
2,Killa Cam,rap,Cam'ron,2004,173166,"{""Cam\\'ron"",""Opera Steve""}","[Chorus: Opera Steve & Cam'ron]\nKilla Cam, Ki...",1,Killa Cam by Cam'ron,The King,Jim Jones,The King by Jim Jones
3,Killa Cam,rap,Cam'ron,2004,173166,"{""Cam\\'ron"",""Opera Steve""}","[Chorus: Opera Steve & Cam'ron]\nKilla Cam, Ki...",1,Killa Cam by Cam'ron,freestyle,The Diplomats,freestyle by The Diplomats
4,Killa Cam,rap,Cam'ron,2004,173166,"{""Cam\\'ron"",""Opera Steve""}","[Chorus: Opera Steve & Cam'ron]\nKilla Cam, Ki...",1,Killa Cam by Cam'ron,Santana the great,The Diplomats,Santana the great by The Diplomats


In [3]:
lyrics, song_title = similar_song_lyrics["lyrics"].tolist(), similar_song_lyrics["similar_title"].tolist()
processed_lyrics = []
for lyric in lyrics:
    processed_lyrics.append(mu.preprocess_sentence(lyric))

In [4]:
#Create word embeddings 
embeddings = model.encode(processed_lyrics, batch_size=NUM_SEQUENCES_PER_BATCH, convert_to_tensor=True)


In [5]:
embeddings[:5]

tensor([[-0.0408, -0.0494, -0.0330,  ..., -0.0270, -0.0302, -0.0979],
        [-0.0408, -0.0494, -0.0330,  ..., -0.0270, -0.0302, -0.0979],
        [-0.0408, -0.0494, -0.0330,  ..., -0.0270, -0.0302, -0.0979],
        [-0.0408, -0.0494, -0.0330,  ..., -0.0270, -0.0302, -0.0979],
        [-0.0408, -0.0494, -0.0330,  ..., -0.0270, -0.0302, -0.0979]],
       device='mps:0')

In [6]:
print(len(processed_lyrics))
print(embeddings.shape)


637
torch.Size([637, 384])


In [7]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

# Fit and transform the labels to integers
y_encoded = label_encoder.fit_transform(song_title)

# Convert to a PyTorch tensor
y_tensor = torch.tensor(y_encoded, dtype=torch.long)

In [8]:
#word_dataloader_train, word_dataloader_test = mu.create_dataloaders(embeddings, y_tensor, NUM_SEQUENCES_PER_BATCH)


In [9]:
# 10 points

class FFNN(nn.Module):
    """
    A class representing our implementation of a Feed-Forward Neural Network.
    You will need to implement two methods:
        - A constructor to set up the architecture and hyperparameters of the model
        - The forward pass
    """
    
    def __init__(self, vocab_size: int, embedding_size: int, hidden_units=128, device: str = "mps"):
        """
        Initialize a new untrained model. 
        
        You can change these parameters as you would like.
        Once you get a working model, you are encouraged to
        experiment with this constructor to improve performance.
        
        Params:
            vocab_size: The number of words in the vocabulary
            ngram: The value of N for training and prediction.
            embedding_layer: The previously trained embedder. 
            hidden_units: The size of the hidden layer.
        """        
        super().__init__()
        # YOUR CODE HERE
        # we recommend saving the parameters as instance variables
        # so you can access them later as needed
        # (in addition to anything else you need to do here)
        
		# Saving parameters as instance variables
        self.vocab_size = vocab_size
        #self.ngram = ngram
        self.hidden_units = hidden_units
         # Embedding size
		# Save embedding size

        #embedding_size = embedding_layer.embedding_dim
        
		# Defining layers
        self.flatten = nn.Flatten() # Useful later to flatten array of ngram-1 after embedding before passing it to the linear layer
        self.linear_relu_stack = nn.Sequential(
			nn.Linear(in_features=embedding_size, out_features=hidden_units, bias=True),
			nn.ReLU(),
			nn.Linear(in_features=hidden_units, out_features=vocab_size, bias=True)
		)

        self.to(device)
        
    def forward(self, X: list) -> torch.tensor:
        """
        Compute the forward pass through the network.
        This is not a prediction, and it should not apply softmax.

        Params:
            X: the input data

        Returns:
            The output of the model; i.e. its predictions.
        
        """
        # YOUR CODE HERE
        flat_embedded = self.flatten(X)
        logits = self.linear_relu_stack(flat_embedded)
        return logits


In [10]:
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")


In [11]:
# 10 points

# Defining a training function that goes over every batch per epoch
def train_one_epoch(dataloader, nn_model, optimizer, loss_fn):
    epoch_loss = 0

    for data in dataloader:
        # Separating the input + label pair for each instance
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)

		# Zeroing gradients for every batch
        optimizer.zero_grad()
        
		# Make predictions for this batch
        outputs = nn_model(inputs)
        
		# Compute loss and gradients
        batch_loss = loss_fn(outputs, labels)
        batch_loss.backward()
        
		# Adjust learning weights
        optimizer.step()
        
		# Adding to epoch loss
        epoch_loss += batch_loss.item() # Covert scalar tensor into floating-point

    return epoch_loss

# Defining a general training function that goes over all the epochs
def train(dataloader, input_model, epochs: int = 1, lr: float = 0.001) -> None:
    """
    Our model's training loop.
    Print the cross entropy loss every epoch.
    You should use the Adam optimizer instead of SGD.

    When looking for documentation, try to stay on PyTorch's website.
    This might be a good place to start: https://pytorch.org/tutorials/beginner/introyt/trainingyt.html 
    They should have plenty of tutorials, and we don't want you to get confused from other resources.

    Params:
        dataloader: The training dataloader
        model: The model we wish to train
        epochs: The number of epochs to train for
        lr: Learning rate 
    """
    # YOUR CODE HERE
    # you will need to initialize an optimizer and a loss function, which you should do
    # before the training loop
    
    optimizer = torch.optim.Adam(input_model.parameters(), lr=lr) # Adam optimizer instead of SGD
    loss_fn = torch.nn.CrossEntropyLoss() # Multinomial Cross Entropy Loss that applies log-softmax internally and computes the negative log likelihood
    
    n_batches = len(dataloader)
    
	# Making sure gradient tracking is on before start training
    input_model.train()
    
    for epoch in tqdm(range(epochs)):
        epoch_loss = train_one_epoch(dataloader, input_model, optimizer, loss_fn)
        avg_epoch_loss = epoch_loss/n_batches
        print(f"Epoch: {epoch}, Loss: {avg_epoch_loss}\n")

    # print out the epoch number and the current average loss after each epoch
    # you can use tqdm to print out a progress bar

In [12]:
nn_model = FFNN(vocab_size=len(song_title) , embedding_size=384)
nn_model.to(device)



FFNN(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=384, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=637, bias=True)
  )
)

In [13]:
#train(word_dataloader_train, nn_model, epochs=5, lr=0.01)


In [None]:
# 3 points

# make a function that does your full *training* pipeline
# This is essentially pulling the pieces that you've done so far earlier in this 
# notebook into a single function that you can call to train your model

def full_pipeline(x,y,
                batch_size:int = NUM_SEQUENCES_PER_BATCH, hidden_units = 128, epochs = 1,
                lr = 0.001, test_pct = 0.1
                ) -> FFNN:
    """
    Run the entire pipeline from loading embeddings to training.
    You won't use the test set for anything.

    Params:
        data: The raw data to train on, parsed as a list of lists of tokens
        word_embeddings_filename: The filename of the Word2Vec word embeddings
        batch_size: The batch size to use
        hidden_units: The number of hidden units to use
        epochs: The number of epochs to train for
        lr: The learning rate to use
        test_pct: The proportion of samples to use in the test set.

    Returns:
        The trained model.
    """
    # Loading embeddings
    
	# Define vocab size from embedder
    vocab_size = len(y)
    
	# Prepare training samples
    #X = torch.tensor(embeddings, dtype=torch.float32)

    x_train, x_test, y_train, y_test = mu.split_dataset(embeddings, y)

	# Create training dataloader
    dataloader_train, dataloader_test = mu.create_dataloaders(x_train, x_test, batch_size, test_pct, num_sequences_per_batch=NUM_SEQUENCES_PER_BATCH)

	# Create FFNN model
    nn_model = FFNN(vocab_size=vocab_size, embedding_size=384, hidden_units=hidden_units)

	# Train our model
    train(dataloader=dataloader_train, input_model=nn_model, epochs=epochs, lr=lr)

    return model, dataloader_test

In [15]:
print(embeddings.shape)  # Should be [num_samples, embedding_dim]


torch.Size([637, 384])


In [16]:
base_word_model, test_dataloader = full_pipeline(x=embeddings, y=y_tensor, epochs=200)


TypeError: create_dataloaders() missing 1 required positional argument: 'num_sequences_per_batch'

In [None]:
base_word_model.eval()  # Set the model to evaluation mode
base_word_model.to("cpu")  # Move the model to CPU for inference if needed
# use the model to classify test data set
def evaluate_model(model, dataloader):
    model.eval()  # Set the model to evaluation mode
    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs, labels = inputs.to("cpu"), labels.to("cpu")  # Move to CPU if needed
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = correct / total
    print(f'Accuracy of the model on the test set: {accuracy * 100:.2f}%')
    return accuracy

In [None]:
# Evaluate the model on the test set
accuracy = evaluate_model(base_word_model, test_dataloader)
# Save the trained model
torch.save(base_word_model.state_dict(), "trained_ffnn_model.pth")
print("Model saved as 'trained_ffnn_model.pth'")
# Load the model for future use
print(accuracy)