In [None]:
import enchant
import unidecode

# Initialise the spell-checking dictionaries: three English locale tags
# (en, en_US, en_CA) and three French ones (fr, fr_FR, fr_CA).
d_eng0 = enchant.Dict("en")
d_eng1 = enchant.Dict("en_US")
d_eng2 = enchant.Dict("en_CA")
d_fra0 = enchant.Dict("fr")
d_fra1 = enchant.Dict("fr_FR")
d_fra2 = enchant.Dict("fr_CA")


# Words rejected by checkWord() are appended here. Despite its name this is a
# list holding the rejected words themselves, not a numeric counter.
nbrOfDiscarded = []


def engDictCheck(word):
    """Return True when at least one of the English dictionaries accepts `word`."""
    english_dicts = (d_eng0, d_eng1, d_eng2)
    # The generator keeps the short-circuit: later dictionaries are only
    # queried when the earlier ones rejected the word.
    return any(dictionary.check(word) for dictionary in english_dicts)


def fraDictCheck(word):
    """Return True when at least one of the French dictionaries accepts `word`."""
    french_dicts = (d_fra0, d_fra1, d_fra2)
    # The generator keeps the short-circuit: later dictionaries are only
    # queried when the earlier ones rejected the word.
    return any(dictionary.check(word) for dictionary in french_dicts)


def dictCheck(word):
    """Return a truthy value when `word` is accepted by the English or the French dictionaries."""
    english_hit = engDictCheck(word)
    if english_hit:
        return english_hit
    # Only consult the French dictionaries when the English ones said no.
    return fraDictCheck(word)
    

def unidecodeCheck(word):
    """Try to repair a French word whose accents are wrong.

    The accents are stripped from `word`. When stripping changes nothing the
    word is not handled here and False is returned. Otherwise the fr_FR
    dictionary is asked for suggestions for the unaccented spelling, and the
    first suggestion is returned if it differs from that spelling only by its
    accents. In every other case the function returns False.
    """
    stripped = unidecode.unidecode(word)

    # A word without any accent is outside the scope of this repair.
    if stripped == word:
        return False

    candidates = d_fra1.suggest(stripped)

    # Only the top suggestion is considered; it must be the same letters
    # once its own accents are removed.
    if candidates and unidecode.unidecode(candidates[0]) == stripped:
        return candidates[0]

    return False
            
            
def checkWord(word):
    """Validate `word` against the dictionaries, trying a few repairs.

    The following attempts are made in order, and the first accepted spelling
    is returned:

    1. the word exactly as given;
    2. the word with a capital first letter;
    3. for words starting with "l", "j" or "d": the word without that first
       letter, then the accent-repaired form of that remainder;
    4. the accent-repaired form of the whole word.

    Returns the accepted spelling (a string), or False when every attempt
    fails. Rejected non-empty words are appended to the module-level
    `nbrOfDiscarded` list; empty tokens are rejected without being recorded.
    """
    # An empty token is not a word, and word[0] below would raise IndexError.
    if not word:
        return False

    # Check if the word can be found in french or english dictionaries
    if dictCheck(word):
        return word

    # Check with capital letter
    capitalized = word.capitalize()
    if dictCheck(capitalized):
        return capitalized

    # Remove first letter
    if word[0] in ("l", "j", "d"):
        trimmed = word[1:]
        if trimmed:
            if dictCheck(trimmed):
                return trimmed

            # Accent repair is applied to the trimmed remainder here; the
            # untrimmed word is handled by the fallback just below.
            repaired = unidecodeCheck(trimmed)
            if repaired is not False:
                return repaired

    # In french, check if only the accents were wrong
    repaired = unidecodeCheck(word)
    if repaired is not False:
        return repaired

    # Record the rejected word in the module-level list
    nbrOfDiscarded.append(word)

    return False



In [None]:
def extract_words(content,dirname):
    """Split `content` into words and store them in `<dirname>/words.txt`.

    Words are the whitespace-separated tokens of `content` (spaces, tabs and
    newlines all act as separators), including the final token even when no
    whitespace follows it. They are written as one comma-separated line,
    overwriting any existing file.

    Note: a token that itself contains a comma is written unchanged, so it
    will be broken apart (possibly into empty pieces) by any reader that
    splits the file on ",".

    Parameters:
        content: the full text to tokenise.
        dirname: directory that receives `words.txt`.
    """
    # str.split() with no argument drops empty tokens and surrounding
    # whitespace, so no extra filtering is needed.
    words = content.split()

    # Write all lyrics to a file inside dir
    newFileName = dirname + "/words.txt"

    # The context manager closes the file even if the write fails.
    with open(newFileName, "w") as f:
        f.write(",".join(words))
    

In [None]:
from random import randint
import numpy as np

def evaluateText(dirnames,nbrOfPull,words_threshold):
    """Estimate the vocabulary richness of each artist directory.

    For every directory in `dirnames` the comma-separated word list in
    `<dirname>/words.txt` is read. Directories holding fewer than
    `words_threshold` words are skipped. For the others, `nbrOfPull` random
    windows of `words_threshold` consecutive words are drawn, and in each
    window the number of distinct spellings accepted by `checkWord` is
    counted. One summary line per artist is printed.

    Parameters:
        dirnames: iterable of directory paths (the artist name is the last
            "/"-separated component).
        nbrOfPull: number of random windows drawn per artist.
        words_threshold: window size in words, and minimum word count for an
            artist to be evaluated.

    Returns:
        A list with one `(artistName, mean, median, variance, maxNbr, minNbr)`
        tuple per evaluated directory, in the same field order as the printed
        line. `variance` is the standard deviation relative to the rounded
        mean, in percent (0 when the mean is 0).
    """
    summaries = []

    for dirname in dirnames:

        fName = dirname + "/words.txt"

        # Load the artist's word list; the context manager closes the handle.
        with open(fName,"r") as f:
            content = f.read().split(',')
        nbrWords = len(content)

        if nbrWords < words_threshold:
            continue

        nbrUniqueWords_Array = []

        for _ in range(nbrOfPull):

            # Raw words already accepted in this window. They are tracked
            # separately from the accepted spellings because checkWord may
            # return a spelling that differs from the raw word.
            acceptedRaw = set()
            uniqueWords = set()
            startIndex = randint(0, nbrWords - words_threshold)

            for word in content[startIndex:startIndex + words_threshold]:
                # Empty tokens (e.g. from consecutive commas) are not words.
                if not word or word in acceptedRaw:
                    continue

                checkResult = checkWord(word)
                if checkResult is not False:
                    acceptedRaw.add(word)
                    uniqueWords.add(checkResult)

            nbrUniqueWords_Array.append(len(uniqueWords))

        artistName = dirname.split("/")[-1]

        mean = round(np.average(nbrUniqueWords_Array))
        median = round(np.median(nbrUniqueWords_Array))
        if mean == 0:
            # No accepted word in any window: avoid dividing by zero.
            variance = 0
        else:
            variance = round(round((np.std(nbrUniqueWords_Array)/(1.0*mean)),4)*100)
        maxNbr = np.amax(nbrUniqueWords_Array)
        minNbr = np.amin(nbrUniqueWords_Array)

        print(artistName + " - mean=" + str(mean) + ", median=" + str(median) + ", var(%)=" + str(variance) + ", max=" + str(maxNbr) + ", min=" + str(minNbr))

        summaries.append((artistName, mean, median, variance, maxNbr, minNbr))

    return summaries


In [None]:
from random import randint
import glob
import numpy as np
from threading import Thread

# Every entry matching ../data/* is treated as one artist directory
dirPath = "../data/*"
dirnames = glob.glob(dirPath)

# Arguments forwarded to evaluateText for every slice
words_threshold = 7500
nbrOfPull = 20

# Cut the directory list into four contiguous, roughly equal slices
nbrDirNames = len(dirnames)
thread1_end = round(nbrDirNames/4.0)
thread2_end = round(2.0*nbrDirNames/4.0)
thread3_end = round(3.0*nbrDirNames/4.0)

dirnames1 = dirnames[:thread1_end]
dirnames2 = dirnames[thread1_end:thread2_end]
dirnames3 = dirnames[thread2_end:thread3_end]
dirnames4 = dirnames[thread3_end:]

# One worker thread per slice, all running evaluateText
threads = [
    Thread(target=evaluateText, args=[chunk,nbrOfPull,words_threshold])
    for chunk in (dirnames1, dirnames2, dirnames3, dirnames4)
]
process1, process2, process3, process4 = threads

for process in threads:
    process.start()

# Block until every worker has finished
for process in threads:
    process.join()

In [None]:
# Rank the entries of `results` by their second field (the mean), highest
# first, and print them as a numbered list.
# NOTE(review): `results` is only assigned (to an empty list) in a later cell
# and nothing visible in this notebook appends to it - confirm where the
# (name, mean, median, var, max, min) tuples are expected to come from.
sorted_results = sorted(results, key=lambda tup: tup[1])[::-1]

index = 1
for res in sorted_results:
    print("".join([
        str(index), ". ", res[0],
        " - mean=", str(res[1]),
        ", median=", str(res[2]),
        ", var=", str(res[3]),
        ", max=", str(res[4]),
        ", min=", str(res[5]),
    ]))
    index += 1

In [None]:
import glob

# Path to the directory containing all the artists
dirPath = "../data/*"
dirnames = glob.glob(dirPath)

results = []


# Build <dirname>/words.txt for every artist directory. evaluateText reads
# those files, so this extraction has to run before the evaluation.
for dirname in dirnames:

    # Load the artist's complete lyrics text; the context manager closes the
    # handle so one open file is not leaked per directory.
    with open(dirname + "/all_lyrics.txt","r") as f:
        content = f.read()

    extract_words(content,dirname)

    print(dirname)
    