In [1]:
from textgenrnn.textgenrnn import textgenrnn, utils

Using TensorFlow backend.


In [2]:
import os
import pandas as pd

# Base name shared by the dataset directory and the model created later on.
reviews_f = 'reviews_and_metadata'

# Datasets are resolved relative to the working directory:
#   <cwd>/../datasets/<reviews_f>_5yrs
BASE_DIR = os.getcwd()
DATA_DIR = os.path.join(
    BASE_DIR,
    '..',
    'datasets',
    f'{reviews_f}_5yrs',
)

# Load the training split, report its size and preview the first rows.
train_f = os.path.join(DATA_DIR, 'train.json')
train_df = pd.read_json(train_f)
print(f'num_reviews: {len(train_df)}')
train_df.head()

num_reviews: 15900


Unnamed: 0,acousticness,artist,artist_id,audio_features,author,content,content_sentences,danceability,date_published,dek,...,sitename,song_title,speechiness,spotify_genres,spotify_id,tempo,time_signature,total_pages,valence,word_count
0,0.637,Cass McCombs,2iUVQjheBnvOt8vaBrxXJz,"{'danceability': 0.485, 'energy': 0.5740000000...",Zach Frimmel,photo by Rachel Pony Cassells\nEvery Monday th...,[photo by Rachel Pony Cassells Every Monday th...,0.485,2016-10-19T00:00:00.000Z,,...,KEXP Blog,Opposite House,0.0295,"[alternative rock, art pop, chamber pop, dream...",2XpsvLcddOSFanKBCZCEBR,153.076,4.0,1,0.611,487
1,0.0151,Disclosure,6nS5roXSAGhTGr34W6n7Et,"{'danceability': 0.502, 'energy': 0.732, 'key'...",Clayton Warwick,Disclosure\nLatch (TEEMID X Daniela Andrade Ed...,"[<artist> <song_title, >, There's no doubt tha...",0.502,2014-08-06T00:00:00.000Z,,...,The Music Ninja,Latch (TEEMID X Daniela Andrade Edition),0.168,"[house, pop, tropical house]",1BltsyC5W3SAABdxyrDXwi,122.025,4.0,1,0.536,192
2,0.256,Klangstof,25lIYhqIj0R1AnnmqsTrtO,"{'danceability': 0.491, 'energy': 0.45, 'key':...",,How does one know when the love is real? What ...,"[How does one know when the love is real?, Wha...",0.491,,,...,Indie Shuffle,Everest,0.0297,[vapor soul],6i7n4C2mSx6WEmfFEhPWYf,153.964,4.0,1,0.308,113
3,0.367,Skizzy Mars,00Z3UDoAQwzvGu13HoAM7J,"{'danceability': 0.5640000000000001, 'energy':...",,I still remember when I was first convinced to...,[I still remember when I was first convinced t...,0.564,,,...,Indie Shuffle,Be Lazy,0.346,"[pop, pop rap, rap]",0qZDyzZkeFfPseTXUPnH4C,99.228,4.0,1,0.446,185
4,0.7,Promises Ltd.,2JgUCWMkArdLIENeaJcJ0Y,"{'danceability': 0.645, 'energy': 0.734, 'key'...",,"""Days of Lavender"" becomes a haunting, shiftin...","[""Days of Lavender"" becomes a haunting, shifti...",0.645,,,...,Indie Shuffle,Days Of Lavender (AMTRAC REMIX),0.0439,[vapor soul],2AtsONzSPfEHzeGpXaqNEQ,114.976,4.0,1,0.0306,100


In [3]:
import re
import unicodedata

def unicode_to_ascii(s):
    """Strip combining marks (accents) from ``s``.

    The string is decomposed with NFD normalization and every character whose
    Unicode category is ``Mn`` (non-spacing mark) is dropped. Characters that
    have no decomposition are left untouched, so despite the name the result
    is not guaranteed to be pure ASCII.

    Approach taken from http://stackoverflow.com/a/518232/2809427
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

def normalize_string(s):
    """Lowercase ``s`` and reduce it to single-space-separated word tokens.

    Steps, in order:
      1. lowercase, trim and strip accents via ``unicode_to_ascii``;
      2. delete apostrophes (so "he's" becomes "hes");
      3. replace every remaining non-word character with a space;
      4. collapse whitespace runs and trim the ends.

    Word characters include digits and underscores, so a token such as
    ``song_title`` is kept intact.
    """
    text = unicode_to_ascii(s.lower().strip())
    text = text.replace("'", "")
    # Punctuation (including . ! ?) is a non-word character, so it simply
    # becomes a space here; no separate punctuation-spacing pass is required.
    text = re.sub(r"\W", " ", text)
    # After the substitution the only whitespace left is plain spaces.
    return " ".join(text.split())

def normalize(line):
    """Trim surrounding whitespace from ``line`` and return ``normalize_string`` of it."""
    return normalize_string(line.strip())

# Word-count bounds for the sentences kept as training text; both ends are
# exclusive, and the count is taken on the raw (not yet normalized) sentence.
min_gen_length = 10
max_gen_length = 200

# Flatten the per-review sentence lists, keep the sentences within the bounds
# and normalize the ones that remain.
sents = [
    normalize(sentence)
    for review_sentences in train_df.content_sentences
    for sentence in review_sentences
    if min_gen_length < len(sentence.split()) < max_gen_length
]
sents[:10]

['photo by rachel pony cassells every monday through friday we deliver a different song as part of our song of the day podcast subscription',
 'this podcast features exclusive kexp in studio performances unreleased songs and recordings from independent artists that our djs think you should hear',
 'todays song featured on the midday show with cheryl waters is song_title by artist from his eight studio record and debut on anti records entitled mangy love',
 'artist has been humbly hiding out under the underground with tip of the iceberg fame for fifteen years now',
 'aside from his melodious easy listening psych rock that religiously flirts with reverb and lyrical slow jams the la based musician is known for being reclusive but respected mellow but meticulous dissent prone but decent',
 'with eight studio albums now to his name the last five previously on domino records hes recently released mangy love out on anti records which is yet another peak of perfection in his seemingly valley l

In [4]:
# One normalized sentence per line.
sents_out = '\n'.join(sents)

# Destination of the newline-delimited training text, next to train.json.
train_text_f = os.path.join(DATA_DIR, 'train.txt')

# Preview the beginning of the text that will be written.
sents_out[:2000]

'photo by rachel pony cassells every monday through friday we deliver a different song as part of our song of the day podcast subscription\nthis podcast features exclusive kexp in studio performances unreleased songs and recordings from independent artists that our djs think you should hear\ntodays song featured on the midday show with cheryl waters is song_title by artist from his eight studio record and debut on anti records entitled mangy love\nartist has been humbly hiding out under the underground with tip of the iceberg fame for fifteen years now\naside from his melodious easy listening psych rock that religiously flirts with reverb and lyrical slow jams the la based musician is known for being reclusive but respected mellow but meticulous dissent prone but decent\nwith eight studio albums now to his name the last five previously on domino records hes recently released mangy love out on anti records which is yet another peak of perfection in his seemingly valley less career\na so

In [5]:
# Persist the training text. The encoding is pinned to UTF-8 instead of the
# platform's locale default: the normalization keeps every Unicode word
# character, so the text may contain non-ASCII characters that a narrower
# default encoding could fail to write.
with open(train_text_f, 'w', encoding='utf-8') as f:
    f.write(sents_out)

In [6]:
# Model architecture settings.
# NOTE(review): 'max_words' is defined here but is not forwarded by the
# training call in this notebook — confirm whether that is intentional.
model_cfg = dict(
    rnn_size=128,
    rnn_layers=4,
    rnn_bidirectional=True,
    max_length=10,
    max_words=500000,
    dim_embeddings=300,
    word_level=True,
)

# Training-run settings. 'line_delimited' selects which training entry point
# is used by the training cell.
# NOTE(review): 'sample_epochs' is not forwarded by the training call in this
# notebook — confirm whether it is still needed.
train_cfg = dict(
    line_delimited=True,
    num_epochs=10,
    gen_epochs=2,
    sample_epochs=1,
    batch_size=512,
    train_size=1.0,
    dropout=0.1,
    max_gen_length=200,
    validation=False,
    is_csv=False,
)

In [7]:
from keras.callbacks import Callback

class save_loss(Callback):
    """Keras callback that records the training loss after every epoch.

    Args:
        losses: Caller-owned list. One loss value is appended to it, in
            place, at the end of each epoch that reports a ``'loss'`` entry.
    """

    def __init__(self, losses):
        # Run the base initializer first so the callback carries whatever
        # attributes the Callback base class sets up for the training loop.
        super().__init__()
        self.losses = losses

    def on_epoch_end(self, epoch, logs=None):
        """Append this epoch's ``'loss'`` value from ``logs`` to ``self.losses``.

        ``logs`` is optional, matching the base-class hook signature. Nothing
        is recorded when ``logs`` is missing or carries no ``'loss'`` entry.
        """
        if logs and 'loss' in logs:
            self.losses.append(logs['loss'])

In [8]:
textgen = textgenrnn(name=reviews_f)

# Select the training entry point based on the 'line_delimited' flag.
if train_cfg['line_delimited']:
    train_function = textgen.train_from_file
else:
    train_function = textgen.train_from_largetext_file

# Filled in place by the save_loss callback, one entry per epoch.
losses = []

# Config entries that are forwarded to the training call.
# NOTE(review): model_cfg['max_words'] and train_cfg['sample_epochs'] are not
# forwarded — confirm whether the library defaults are intended.
train_keys = (
    'num_epochs', 'gen_epochs', 'batch_size', 'train_size',
    'dropout', 'max_gen_length', 'validation', 'is_csv',
)
model_keys = (
    'rnn_layers', 'rnn_size', 'rnn_bidirectional',
    'max_length', 'dim_embeddings', 'word_level',
)
train_kwargs = {key: train_cfg[key] for key in train_keys}
train_kwargs.update({key: model_cfg[key] for key in model_keys})

# NOTE(review): upstream textgenrnn's train_from_file appears to default to
# header=True, which would skip the first line of the text file — verify
# against the textgenrnn version in use.
train_function(
    file_path=str(train_text_f),
    new_model=True,
    callbacks=[save_loss(losses)],
    **train_kwargs)

84,642 texts collected.
Training new model w/ 4-layer, 128-cell Bidirectional LSTMs
Training on 1,970,991 word sequences.
Epoch 1/10
Epoch 2/10
####################
Temperature: 0.2
####################
the track is a slow burner that is so sultry and soulful that is simply stunning

the track is a slow burner that is so beautiful and catchy

the track is a slow burner that showcases the bands ability to blend the of the original and the chorus is the perfect soundtrack for the summer

####################
Temperature: 0.5
####################
the track is one of the most fragile parts of the track that the band have taken a remix of artist s single

the track is a great upbeat track that is simply beautiful and catchy as it does

the track is a beautiful piece of electronica that is steeped in emotion and emotion

####################
Temperature: 1.0
####################
i up a lot is there to be coming out of this year

the original super funky groovy track reads turns about what pu

In [9]:
# Per-epoch training losses appended by the save_loss callback during the
# training run above.
losses

[5.156938241283192,
 4.553611818181288,
 4.279892824940571,
 4.074083189804481,
 3.896409230530804,
 3.730400722904557,
 3.5700648325268216,
 3.420578568469211,
 3.267748231360063,
 3.126224206249012]

In [10]:
# Sample from the trained model; per the output below this prints 10 texts
# for each temperature setting.
textgen.generate_samples(n=10)

####################
Temperature: 0.2
####################
the track is a slow burner with a smooth bassline and a driving beat

the song is a slow burner with a downtempo beat and smooth vocals

the song is a slow burner with a downtempo pulse that is so danceable

the track is a slow burner with a smooth bassline and a driving beat

the track is a very special place to start a project

the track is a very chill piece of work which builds up to a euphoric climax

the track is a slow burner with a downtempo beat that is surely going to capture the atmosphere of the original

the song is a slow burner with a downtempo rhythm and a very lush vibe

the track is a slow burner with a smooth bassline and a driving beat

the track is a slow burner with a downtempo beat and a smooth house groove

####################
Temperature: 0.5
####################
the track is a soft but textured vibe with a driving beat that makes the track more upbeat than the original

the way she human and fall into