In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
import json
import nltk

In [2]:
from preprocess import utils, vocabulary

## Download dummy data

In [6]:
USE_DUMMY_DATA = False

In [7]:
if USE_DUMMY_DATA:
    nltk.download('brown') #sample corpus from nltk
    corpus_object = nltk.corpus.brown
    words = corpus_object.words() #singe list of words ['Friday','an','investigation','of',"Atlanta's",...]

## Load lyrics

In [3]:
lyrics = pd.read_csv('../data/external/songdata.csv', usecols=['text'])

In [4]:
lyrics.head()

Unnamed: 0,text
0,"Look at her face, it's a wonderful face \nAnd..."
1,"Take it easy with me, please \nTouch me gentl..."
2,I'll never know why I had to go \nWhy I had t...
3,Making somebody happy is a question of give an...
4,Making somebody happy is a question of give an...


In [5]:
full_text = lyrics.text.str.cat()
words = full_text.split(' ')

In [38]:
lyrics_full = pd.read_csv('../data/external/lyrics.csv', usecols=['lyrics'])
lyrics_full.columns = ['text']

In [40]:
lyrics_full.head()

Unnamed: 0,text
0,"Oh baby, how you doing?\nYou know I'm gonna cu..."
1,"playin' everything so easy,\nit's like you see..."
2,If you search\nFor tenderness\nIt isn't hard t...
3,"Oh oh oh I, oh oh oh I\n[Verse 1:]\nIf I wrote..."
4,"Party the people, the people the party it's po..."


In [41]:
full_text = lyrics.text.str.cat()
words = full_text.split(' ')

## Load into vocab object for exploration

In [42]:
# "canonicalize_word" performs a few tweaks to the token stream of
# the corpus.  For example, it replaces digits with DG allowing numbers
# to aggregate together when we count them below.
# You can read the details in utils.py if you're really curious.
token_feed = (utils.canonicalize_word(w) for w in words)

# Collect counts of tokens and assign wordids.
vocab = vocabulary.Vocabulary(token_feed)
print("Vocabulary size: {:,}".format(vocab.size))

# Print out some (debugging) statistics to make sure everything went
# as we expected.  (Unsurprisingly, you should see "the" as the most popular word.)
print("Most common unigrams:")
for word, count in vocab.unigram_counts.most_common(10):
    print("\"{:s}\": {:,}".format(word, count))

Vocabulary size: 246,660
Most common unigrams:
"": 2,254,106
"the": 448,461
"you": 329,508
"
": 298,900
"to": 273,355
"i": 257,440
"a": 236,144
"me": 172,838
"and": 163,873
"
i": 154,413


### Language Modeling

In [43]:
from collections import defaultdict
import pickle

def normalize_counter(c):
    """Given a dictionary of <item, counts>, return <item, fraction>."""
    total = sum(c.values())
    return {w:float(c[w])/total for w in c}


class SimpleTrigramLM(object):
    def __init__(self, words, probas_file=None):
        """Build our simple trigram model."""
        #if pre-defined model is provided, use that as probabilities
        if probas_file:
            with open('{}.pkl'.format(file_name), 'rb') as main_dict:
                self.probas = pickle.load(main_dict)
        
        else:
            # Raw trigram counts over the corpus. 
            # c(w | w_1 w_2) = self.counts[(w_2,w_1)][w]
            self.counts = defaultdict(lambda: defaultdict(lambda: 0.0))

            # Iterate through the word stream once.
            w_1, w_2 = None, None
            for word in words:
                if w_1 is not None and w_2 is not None:
                    # Increment trigram count.
                    self.counts[(w_2,w_1)][word] += 1
                # Shift context along the stream of words.
                w_2 = w_1
                w_1 = word
            
            # Normalize so that for each context we have a valid probability
            # distribution (i.e. adds up to 1.0) of possible next tokens.
            self.probas = defaultdict(lambda: defaultdict(lambda: 0.0))
            for context, ctr in self.counts.items():
                self.probas[context] = normalize_counter(ctr)
            
    def next_word_proba(self, word, seq):
        """Compute p(word | seq)"""
        context = tuple(seq[-2:])  # last two words
        return self.probas[context].get(word, 0.0)
    
    def predict_next(self, seq):
        """Sample a word from the conditional distribution."""
        context = tuple(seq[-2:])  # last two words
        pc = self.probas[context]  # conditional distribution
        words, probs = zip(*pc.items())  # convert to list
        return np.random.choice(words, p=probs)
    
    def score_seq(self, seq, verbose=False):
        """Compute log probability (base 2) of the given sequence."""
        score = 0.0
        count = 0
        # Start at third word, since we need a full context.
        for i in range(2, len(seq)):
            if (seq[i] == "<s>" or seq[i] == "</s>"):
                continue  # Don't count special tokens in score.
            s = np.log2(self.next_word_proba(seq[i], seq[i-2:i]))
            score += s
            count += 1
            # DEBUG
            if verbose:
                print("log P({:s} | {:s}) = {.03f}".format(seq[i], " ".join(seq[i-2:i]), s))
        return score, count
    
    def generate_text(self, max_length=40):
        seq = ["<s>", "<s>"]
        for i in range(max_length):
            seq.append(self.predict_next(seq))
            # Stop at end-of-sentence
            if seq[-1] == "</s>": break
        print(" ".join(seq))

In [44]:
# np.array(list(corpus_object.sents())) = array([list(['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'primary', 'election', 'produced', '``', 'no', 'evidence', '.']),
                                        #        list(['The', 'jury', 'further', 'said', 'in', 'term-end', 'was', 'conducted', '.']))],
                                        #        list([])...,
                                        #    dtype=object)

### Train test split

In [45]:
train_sents, test_sents = utils.get_train_test_sents(lyrics.text, split=0.8, shuffle=True)

Loaded 57,650 sentences (1.52059e+07 tokens)
Training set: 46,120 sentences (12,159,307 tokens)
Test set: 11,530 sentences (3,046,572 tokens)


In [46]:
vocab = vocabulary.Vocabulary(utils.canonicalize_word(w) for w in utils.flatten(train_sents))
print("Train set vocabulary: %d words" % vocab.size)

Train set vocabulary: 190324 words


### Preprocessing

In [47]:
def sents_to_tokens(sents):
    """Returns a flattened list of the words in the sentences, with padding for a trigram model."""
    padded_sentences = (["<s>", "<s>"] + s + ["</s>"] for s in sents)
    # This will canonicalize words, and replace anything not in vocab with <unk>
    return np.array([utils.canonicalize_word(w, wordset=vocab.wordset) 
                     for w in utils.flatten(padded_sentences)], dtype=object)

train_tokens = sents_to_tokens(train_sents)
#test_tokens = sents_to_tokens(test_sents)

### Train

In [48]:
print("Building trigram LM...",)
lm = SimpleTrigramLM(train_tokens)
print("Built trigram LM...")

Building trigram LM...
Built trigram LM...


## Save trained probabilities

In [49]:
file_name='../data/models/trigram-weights'

In [50]:
with open('{}.pkl'.format(file_name), 'wb') as outfile:
    pickle.dump(dict(lm.probas), outfile)

## Load in pre-trained probabilities

In [51]:
with open('{}.pkl'.format(file_name), 'rb') as main_dict:
    lm.probas = pickle.load(main_dict)

In [52]:
print("Building trigram LM...",)
lm = SimpleTrigramLM(words=None, probas_file='../data/models/trigram-weights')
print("Built trigram LM...")

Building trigram LM...
Built trigram LM...


## Generate Text

In [54]:
lm.generate_text(max_length=1000)

<s> <s> there's a light where  
he takes after each pause]  
q.b., real niggas, bravehearts, c'mon  
c'mon baby, cry baby)  
oh black and white seed  
  
somebody bigger  
anything you have to go  
don't make waves don't make me see.  
driving school,  
his breath had stopped  
for i will say we're gonna miss me  
i sing, i've been waiting for a while, a while  
you in white  
well baby don't ya love lockdown  
now they're draggin' her feet  
eating from silver blue jewel  
lord, they're coming to your way,  
i'll take you through the sphere upon which i stand  
  
i can't say the words of guilt she picks on me  
fingers down the highway  
the stumble in the hands of the year  
let's start a family that's gathered in the ice still on my knees  
now's the time has come now in your face  
it's addressed to your heart love)?  
girl, you'll be there either way  
  
let's hear the choo-choo train that makes the world wasn't glowing  
why is life this good for?  
well this crazy world (crazy