In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Neil\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Neil\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
df = pd.read_csv('./archive/spotify_songs.csv')

In [3]:
'''
The following code is copy and pasted from Participation Activity #2, "feedforward_LM_activity", with slight adjustment in the 
training data. We also changed the amount of epochs that we used from 10 to 20.
'''
import torch
from torch import nn
from torch import optim

import random

# This is a function that prints the number of trainable parameters 
# of a model.
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# This functions prints all parameters (and their gradients) of a model.
def print_parameters(model):
    for name, param in model.named_parameters():
        print(name)
        print(param.data)
        print(param.grad)

In [4]:
#Defines a bigram language model
class FeedforwardLM(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.hidden_layer = nn.Linear(embedding_dim, hidden_dim)
        self.output_layer = nn.Linear(hidden_dim, vocab_size)
        
        self.relu = nn.ReLU()
        
    def forward(self, input):

        embedding = self.embedding(input)
        hidden_rep = self.relu(self.hidden_layer(embedding))
        output = self.output_layer(hidden_rep)
        
        return output

In [5]:
def load_data(filename, old_vocab=False):
    
    # Load data, convert text into tensors, construct vocabulary, return data and vocab
    if not old_vocab:
        vocab = {'<UNK>': 0}
    else:
        vocab = old_vocab
    data = list()
    file = open(filename, encoding='utf-8')
    for line in file:
        line_text = line.split()
        line_text = ['<s>'] + line_text + ['</s>']
        
        if not old_vocab:
            # form vocabulary
            for word in line_text:
                if word not in vocab:
                    vocab[word] = len(vocab)
        
        # add words to data
        for i, word in enumerate(line_text):
            if i < len(line_text) - 1:
                if word in vocab:
                    idx1 = vocab[word]
                else: 
                    idx1 = vocab['<UNK>']
                if line_text[i + 1] in vocab:
                    idx2 = vocab[line_text[i + 1]]
                else: 
                    idx2 = vocab['<UNK>']
                data.append((torch.tensor(idx1), torch.tensor(idx2)))
                
    return vocab, data

In [6]:
# 1) Load the data, and shuffle the training data.
vocab, train_data = load_data('train_lyrics.txt')
print(len(vocab))
_, dev_data = load_data('dev_lyrics.txt', vocab)
print(len(vocab))
_, test_data = load_data('test_lyrics.txt', vocab)
print(len(vocab))

print(train_data[0])
random.shuffle(train_data)
print(train_data[0])

832
832
832
(tensor(1), tensor(2))
(tensor(485), tensor(56))


In [7]:
# 2) Initialize our model.

our_lm = FeedforwardLM(len(vocab), 10, 15)
count_parameters(our_lm)


21797

In [9]:
def validate(model, data):
    
    model.eval()
    
    # Implement validation function
    av_loss = 0
    for (x, y) in data[:1000]:
        
        # a) calculate probs / get an output
        y_raw = model(x)
        
        # b) compute loss
        loss = ce(y_raw.unsqueeze(0),y.unsqueeze(0))
        av_loss += loss

    av_loss = av_loss/len(data[:1000])
    
    print("Average loss: " + str(av_loss))
    

In [10]:
# 3) Now we train our model.

epochs = 20
ce = nn.CrossEntropyLoss()
softmax = nn.Softmax(dim=0)
optimizer = optim.SGD(our_lm.parameters(), lr=0.1)

for i in range(epochs):
    print('### Epoch: ' + str(i+1) + ' ###')
    av_loss = 0
    our_lm.train()
    for (x, y) in train_data[:10000]:
        optimizer.zero_grad()
        
        # a) calculate probs / get an output
        y_raw = our_lm(x)
        y_hat = softmax(y_raw)
        
        # b) compute loss
        loss = ce(y_raw.unsqueeze(0),y.unsqueeze(0))
        av_loss += loss
        
        # c) get the gradient
        loss.backward()

        # d) update the weights
        optimizer.step()
    validate(our_lm, dev_data)
    print(av_loss/len(train_data[:10000]))

### Epoch: 1 ###
Average loss: tensor(6.6326, grad_fn=<DivBackward0>)
tensor(4.7961, grad_fn=<DivBackward0>)
### Epoch: 2 ###
Average loss: tensor(6.9438, grad_fn=<DivBackward0>)
tensor(4.3230, grad_fn=<DivBackward0>)
### Epoch: 3 ###
Average loss: tensor(7.2048, grad_fn=<DivBackward0>)
tensor(3.9856, grad_fn=<DivBackward0>)
### Epoch: 4 ###
Average loss: tensor(7.5044, grad_fn=<DivBackward0>)
tensor(3.7286, grad_fn=<DivBackward0>)
### Epoch: 5 ###
Average loss: tensor(7.7607, grad_fn=<DivBackward0>)
tensor(3.5551, grad_fn=<DivBackward0>)
### Epoch: 6 ###
Average loss: tensor(7.9974, grad_fn=<DivBackward0>)
tensor(3.4105, grad_fn=<DivBackward0>)
### Epoch: 7 ###
Average loss: tensor(8.4696, grad_fn=<DivBackward0>)
tensor(3.3055, grad_fn=<DivBackward0>)
### Epoch: 8 ###
Average loss: tensor(8.4718, grad_fn=<DivBackward0>)
tensor(3.2082, grad_fn=<DivBackward0>)
### Epoch: 9 ###
Average loss: tensor(8.7441, grad_fn=<DivBackward0>)
tensor(3.1281, grad_fn=<DivBackward0>)
### Epoch: 10 ###
A

In [11]:
# 1) Choose a starting word/words

starting_words = ["We're"]
def pred_word(starting_words):
    # 2) Find the index of the word, create a tensor, and pass it through the LM
    with torch.no_grad():
        input_tensor = torch.tensor([vocab[word] for word in starting_words]).unsqueeze(0)  # Convert starting words to tensor
        output_tensor = our_lm(input_tensor)  # Pass the tensor through the language model

    # 3) Find the predicted next word from the LM's output
    predicted_next_word_index = torch.argmax(output_tensor).item()

    # 4) Decode the next word and print it
    for word, index in vocab.items():
        if index == predicted_next_word_index:
            print("Predicted next word:", word)
            break

    return word
x = pred_word(starting_words)
print(x)

Predicted next word: know
know


#Makes a sentence using the most common word after the inputed word for an input of x "sentence_len" amount of words, which in our case is 10
def Make_Sentence(starting_words, sentence_len, sentence, curr_len = 0):
    #Base case of recursive function
    if curr_len == sentence_len:
        return sentence
    #predictes the next word based off the previous word
    starting_words_next = pred_word(starting_words)
    sentence.append(starting_words_next)
    Make_Sentence([starting_words_next], sentence_len, sentence, curr_len +1)
    return sentence

In [17]:
x = Make_Sentence(["baby"], 10, ["baby"])
print(x)


Predicted next word: for
Predicted next word: ya
Predicted next word: That
Predicted next word: morning
Predicted next word: coffee,
Predicted next word: brewed
Predicted next word: it
Predicted next word: for
Predicted next word: ya
Predicted next word: That
['baby', 'for', 'ya', 'That', 'morning', 'coffee,', 'brewed', 'it', 'for', 'ya', 'That']


In reality, we used 4 different words for our song that we used in the presentation "I", "oh", "baby", and "back". However, since we reran the code since then with a different training and test set, we cannot reproduce this result. But the final result was as follows:

['I', "can't", 'prove', 'it', 'haunts', 'her', 'I', 'guess', 'on', 'Come', 'on']

['oh', 'Switch', 'it', 'haunts', 'her', 'I', 'guess', 'on', 'Come', 'on', 'Come']

['baby', 'You', 'can', 'You', 'can', 'You', 'can', 'You', 'can', 'You', 'can']

['back', 'of', 'you', 'for', 'ya', 'I', 'guess', 'on', 'Come', 'on', 'Come']