In [10]:
import pandas as pd
import os
# Read data into papers
papers = pd.read_csv('../Experiments/metadata/songsMetadata.csv')
# Print head
papers.head()

Unnamed: 0,artistId,geniusName,songShortcut,title,songId,url,lyrics,releaseDate
0,11774,20Syl,20SYL1,100% Autoproduction by Hocus Pocus,160272,https://genius.com/Hocus-pocus-100-autoproduct...,\n \n \n [Scrat...,000000000
1,11774,20Syl,20SYL2,100 Grammes de Peur by Hocus Pocus,108615,https://genius.com/Hocus-pocus-100-grammes-de-...,\n \n \n On la ...,000000000
2,11774,20Syl,20SYL3,10 que tu penses by Hocus Pocus,2319837,https://genius.com/Hocus-pocus-10-que-tu-pense...,\n \n \n [Intro...,"January 1, 1998"
3,11774,20Syl,20SYL4,10 YRS by End of the Weak France (Ft. Artik (Q...,438039,https://genius.com/End-of-the-weak-france-10-y...,\n \n \n [Artik...,"February 26, 2014"
4,11774,20Syl,20SYL6,73 Touches by Hocus Pocus,61676,https://genius.com/Hocus-pocus-73-touches-lyrics,\n \n \n [Coupl...,000000000


In [11]:
# Remove the columns
papers = papers.drop(columns=['artistId', 'geniusName', 'songShortcut', 
                              'title', 'songId', 'url', 'releaseDate'], axis=1)
                              
# Print out the first rows of papers
papers.head()

Unnamed: 0,lyrics
0,\n \n \n [Scrat...
1,\n \n \n On la ...
2,\n \n \n [Intro...
3,\n \n \n [Artik...
4,\n \n \n [Coupl...


In [12]:
# Load the regular expression library
import re
# Remove punctuation
papers['paper_text_processed'] = papers['lyrics'].map(lambda x: re.sub('[,\.!?]', '', x))
# Convert the titles to lowercase
papers['paper_text_processed'] = papers['paper_text_processed'].map(lambda x: x.lower())
# Print out the first rows of papers
papers['paper_text_processed'].head()

0    \n          \n            \n            [scrat...
1    \n          \n            \n            on la ...
2    \n          \n            \n            [intro...
3    \n          \n            \n            [artik...
4    \n          \n            \n            [coupl...
Name: paper_text_processed, dtype: object

In [13]:
import gensim
from gensim.utils import simple_preprocess
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations
data = papers.paper_text_processed.values.tolist()
data_words = list(sent_to_words(data))
print(data_words[:1][0][:30])

['scratch', 'dj', 'greem', 'ca', 'va', 'faire', 'un', 'sacre', 'boucan', 'refrain', 'cambia', 'syl', 'autoproduction', 'pour', 'un', 'son', 'pur', 'sang', 'empire', 'sans', 'empereur', 'ca', 'va', 'faire', 'un', 'sacre', 'boucan', 'couplet', 'cambia', 'cigare']


In [14]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [15]:
# NLTK Stop words
# import nltk
# nltk.download('stopwords')
import nltk
from nltk.corpus import stopwords
stop_words = stopwords.words('english') + stopwords.words('french')
# stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]
def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]
def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [17]:
import spacy
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)
# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)
# Initialize spacy 'fr' model, keeping only tagger component (for efficiency)
nlp = spacy.load('fr_core_news_lg', disable=['parser', 'ner'])
# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams)
print(data_lemmatized[:1])

[['scratch_dj', 'greem', 'aller', 'sacre_boucan', 'refrain', 'cambia', 'autoproduction', 'pur', 'sang', 'empire', 'empereur', 'aller', 'sacre_boucan', 'couplet', 'cambia', 'cigare', 'bord', 'levre', 'verre', 'whisky', 'bureau', 'secretair', 'dessou', 'repense', 'dernier', 'blaireau', 'entube', 'dire', 'tenir', 'tube', 'recette', 'campagne', 'pub', 'sacem', 'reste', 'aime', 'producteur', 'rap', 'aller', 'croire', 'ca', 'gene', 'contraire', 'contrat', 'enfiler', 'perle', 'ecoute', 'nouveau', 'merde', 'groupe', 'rever', 'percer', 'bon', 'petit', 'truc', 'toucher', 'mal', 'montrer', 'fesse', 'faire', 'jeter', 'tact', 'producteur', 'charmer', 'vente', 'veux', 'seducteur', 'tenir', 'mec', 'venir', 'entrer', 'cassette', 'main', 'pouvoir', 'tube', 'prochain', 'veu', 'gamin', 'syl', 'vouloir', 'piscine', 'studio', 'sinon', 'cher', 'champagne', 'maillot', 'bain', 'plaisant', 'champagne', 'faire', 'voix', 'girl', 'enfin', 'voir', 'presente', 'tas', 'star', 'vouloir', 'phat', 'genre', 'limousine',

In [134]:
from gensim.models import CoherenceModel

# supporting function
def compute_coherence_values(corpus, dictionary, k, a, b):
    
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b)
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
    perplexity = lda_model.log_perplexity(corpus)
    return [coherence_model_lda.get_coherence(), perplexity]


import gensim.corpora as corpora
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)
# Create Corpus
texts = data_lemmatized
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]
# View
print(corpus[:1])
num_of_docs = len(corpus)
    
import numpy as np
import tqdm
grid = {}
grid['Validation_Set'] = {}
# Topics range
min_topics = 2
max_topics = 30
step_size = 1
topics_range = range(min_topics, max_topics, step_size)
# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')
# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')
# Validation sets
num_of_docs = len(corpus)
corpus_sets = [# gensim.utils.ClippedCorpus(corpus, num_of_docs*0.25), 
               # gensim.utils.ClippedCorpus(corpus, num_of_docs*0.5), 
               gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)), 
               corpus]
corpus_title = ['75% Corpus', '100% Corpus']
model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': [],
                 'Perplexity': []
                }
# Can take a long time to run
if 1 == 1:
    pbar = tqdm.tqdm(total=540)
    
    # iterate through validation corpuses
    for i in range(len(corpus_sets)):
        # iterate through number of topics
        for k in topics_range:
            # iterate through alpha values
            for a in alpha:
                # iterare through beta values
                for b in beta:
                    # get the coherence score for the given parameters
                    results = compute_coherence_values(corpus=corpus_sets[i], dictionary=id2word, 
                                                  k=k, a=a, b=b)
                    # Save the model results
                    model_results['Validation_Set'].append(corpus_title[i])
                    model_results['Topics'].append(k)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    model_results['Coherence'].append(results[0])
                    model_results['Perplexity'].append(results[1])
                    
                    pbar.update(1)
    pd.DataFrame(model_results).to_csv('lda_tuning_results.csv', index=False)
    pbar.close()

[[(0, 1), (1, 1), (2, 1), (3, 6), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 5), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 2), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 5), (24, 1), (25, 2), (26, 3), (27, 1), (28, 2), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 2), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 2), (53, 1), (54, 1), (55, 2), (56, 3), (57, 3), (58, 1), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 5), (69, 1), (70, 1), (71, 1), (72, 1), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1), (78, 1), (79, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1), (85, 1), (86, 1), (87, 1), (88, 1), (89, 1), (90, 1), (91, 1), (92, 1), (93, 1), (94, 1), (95, 1), (96, 2), (97, 1), (98, 1), (99, 2), (100, 1), (101, 1), (102, 1), (103, 3), (104, 1), (105, 1), (106, 1), (107, 1), (108, 1), (109, 1), (110, 1)

  0%|          | 0/540 [04:16<?, ?it/s]
1221it [15:53:18, 79.01s/it] 

  0%|          | 0/540 [01:23<?, ?it/s]


ValueError: Stop argument for islice() must be None or an integer: 0 <= x <= sys.maxsize.