In [None]:
import enchant
import unidecode

# Initialise the spell-checking dictionaries once, at import time.
# Three English and three French variants are loaded; the *DictCheck helpers
# below accept a word as soon as any variant of a language accepts it.
d_eng0, d_eng1, d_eng2 = [
    enchant.Dict(tag) for tag in ("en", "en_US", "en_CA")
]
d_fra0, d_fra1, d_fra2 = [
    enchant.Dict(tag) for tag in ("fr", "fr_FR", "fr_CA")
]


def engDictCheck(word):
    """Return True when at least one of the English dictionaries accepts ``word``.

    The dictionaries are queried in the order d_eng0, d_eng1, d_eng2 and the
    lookup stops at the first one that accepts the word.
    """
    english_dicts = (d_eng0, d_eng1, d_eng2)
    return any(dictionary.check(word) for dictionary in english_dicts)


def fraDictCheck(word):
    """Return True when at least one of the French dictionaries accepts ``word``.

    The dictionaries are queried in the order d_fra0, d_fra1, d_fra2 and the
    lookup stops at the first one that accepts the word.
    """
    french_dicts = (d_fra0, d_fra1, d_fra2)
    return any(dictionary.check(word) for dictionary in french_dicts)


def dictCheck(word):
    """Tell whether ``word`` is accepted by the English or the French dictionaries.

    English is tried first; the French lookup only runs when English rejects
    the word.
    """
    english_hit = engDictCheck(word)
    return english_hit if english_hit else fraDictCheck(word)
    

def unidecodeCheck(word):
    """Try to repair the accents of ``word`` with the fr_FR dictionary.

    Args:
        word: The token whose accents may be wrong.

    Returns:
        The first suggestion of the fr_FR dictionary when it is made of the
        same letters as ``word`` once both are transliterated by unidecode;
        ``False`` otherwise, including when transliteration leaves ``word``
        unchanged.
    """
    stripped = unidecode.unidecode(word)

    # Transliteration changed nothing, so there are no accents to repair.
    if stripped == word:
        return False

    # Only the top-ranked suggestion of the French dictionary is considered.
    candidates = d_fra1.suggest(stripped)
    if not candidates:
        return False

    best = candidates[0]
    # Accept the suggestion only if it differs from the input by accents alone.
    return best if stripped == unidecode.unidecode(best) else False
            
            
def checkWord(word):
    """Resolve ``word`` to a spelling accepted by the English/French dictionaries.

    The following attempts are made in order, and the first success wins:

    1. the word exactly as given;
    2. the word capitalized (``str.capitalize``);
    3. the word without its first letter, when that letter is ``l``, ``j``
       or ``d``;
    4. an accent-corrected spelling proposed by ``unidecodeCheck``.

    Args:
        word: The token to validate.

    Returns:
        The accepted spelling as a ``str``, or ``False`` when every attempt
        fails or ``word`` is empty.
    """
    # An empty token cannot be a word: ``word[0]`` below would raise
    # IndexError, and pyenchant refuses to spell-check an empty string.
    if not word:
        return False

    # Check if the word can be found in the French or English dictionaries.
    if dictCheck(word):
        return word

    # Check with a capital letter.
    capitalized = word.capitalize()
    if dictCheck(capitalized):
        return capitalized

    # Drop a leading l / j / d and retry with the remainder.
    if word[0] in ("l", "j", "d"):
        trimmed = word[1:]
        if trimmed and dictCheck(trimmed):
            return trimmed

    # Check whether only the accents were wrong.
    # NOTE(review): this same call, with the same ``word`` argument, used to be
    # repeated inside the first-letter branch above, which only ran the slow
    # suggest() lookup twice for an identical result. It may have been meant
    # to receive ``trimmed`` instead -- confirm with the author before
    # changing it, since that would alter the computed statistics.
    uniCheck = unidecodeCheck(word)
    if uniCheck is not False:
        return uniCheck

    return False



In [None]:
from random import randint
import numpy as np

def _summarizeCounts(counts):
    """Return (mean, median, std, max, min) of ``counts``; the first three are rounded."""
    return (
        round(np.average(counts)),
        round(np.median(counts)),
        round(np.std(counts)),
        np.amax(counts),
        np.amin(counts),
    )


def _countUniqueWords(words, resolved):
    """Count the distinct valid words of one sample window.

    Args:
        words: The tokens of the window.
        resolved: Cache mapping a raw token to the tuple
            ``(checkWord result, accepted by English, accepted by French)``.
            It is filled in place so that a token is only looked up once,
            however many windows contain it.

    Returns:
        A tuple ``(total, french, english)`` of distinct-word counts.
    """
    seen = set()
    nbrFra = 0
    nbrEng = 0

    for word in words:
        # Empty tokens (e.g. produced by a trailing comma) are not words.
        if not word:
            continue

        if word not in resolved:
            checkResult = checkWord(word)
            if checkResult is False:
                resolved[word] = (False, False, False)
            else:
                resolved[word] = (
                    checkResult,
                    engDictCheck(checkResult),
                    fraDictCheck(checkResult),
                )

        checkResult, isEng, isFra = resolved[word]

        # Deduplicate on the resolved spelling, not on the raw token:
        # checkWord may return a different string (capitalized, trimmed or
        # accent-corrected), and testing the raw token against the stored
        # spellings would count such a word once per occurrence.
        if checkResult is False or checkResult in seen:
            continue
        seen.add(checkResult)

        if isEng:
            nbrEng += 1
        if isFra:
            nbrFra += 1

    return len(seen), nbrFra, nbrEng


def evaluateText(dirnames,min_nbr_of_words):
    """Write vocabulary statistics for every artist directory in ``dirnames``.

    For each directory, ``<dirname>/all_words.txt`` is read as a
    comma-separated list of tokens. Directories holding fewer than
    ``min_nbr_of_words`` tokens are skipped. For the others, several windows
    of ``min_nbr_of_words`` consecutive tokens are drawn at random positions;
    in each window the distinct words accepted by ``checkWord`` are counted
    (overall, French, English). The mean, median, standard deviation, max and
    min of these counts are written to
    ``<dirname>/<min_nbr_of_words>words_statistics2.txt`` as ``label,value``
    lines, and the last path component of the directory is printed.

    Args:
        dirnames: Iterable of artist directory paths.
        min_nbr_of_words: Size of each sample window, and minimum number of
            tokens an artist must have to be evaluated.

    Returns:
        None. Results depend on the state of the ``random`` module.
    """
    statLabels = ("average", "median", "standard deviation", "max", "min")

    for dirname in dirnames:

        fName = dirname + "/all_words.txt"

        # Load the artist; the context manager closes the file on every path.
        with open(fName, "r") as f:
            content = f.read().split(',')
        nbrWords = len(content)

        if nbrWords < min_nbr_of_words:
            continue

        # Scale the number of samples with the size of the corpus.
        nbr_of_samples = int(round((nbrWords/(1.0*min_nbr_of_words))*3)) + 1

        nbrUniqueWords_Array = []
        nbrUniqueWords_fra_Array = []
        nbrUniqueWords_eng_Array = []

        # Dictionary lookups are slow; share their results between samples.
        resolved = {}

        for _ in range(nbr_of_samples):
            startIndex = randint(0, nbrWords - min_nbr_of_words)
            window = content[startIndex:startIndex + min_nbr_of_words]

            nbrUniqueWords, nbrUniqueWords_fra, nbrUniqueWords_eng = (
                _countUniqueWords(window, resolved)
            )
            nbrUniqueWords_Array.append(nbrUniqueWords)
            nbrUniqueWords_fra_Array.append(nbrUniqueWords_fra)
            nbrUniqueWords_eng_Array.append(nbrUniqueWords_eng)

        artistName = dirname.split("/")[-1]

        # Write the statistics to a file inside the artist directory.
        newFileName = dirname + "/"  + str(min_nbr_of_words) + "words_statistics2.txt"
        with open(newFileName, "w") as f:
            f.write("nbr_words_checked," + str(min_nbr_of_words) + "\n")
            f.write("nbr_of_samples," + str(nbr_of_samples) + "\n")
            f.write("nbr_of_words,"+ str(nbrWords) + "\n")

            # Same five statistics for the total, French and English counts.
            sections = (
                ("", nbrUniqueWords_Array),
                ("francais ", nbrUniqueWords_fra_Array),
                ("english ", nbrUniqueWords_eng_Array),
            )
            for prefix, counts in sections:
                for label, value in zip(statLabels, _summarizeCounts(counts)):
                    f.write(prefix + label + "," + str(value) + "\n")

        print(artistName)


In [None]:
from random import randint
import glob
import numpy as np
from threading import Thread

# Glob pattern matching every artist directory
dirPath = "data/*"
dirnames = glob.glob(dirPath)

# Sample window size, also the minimum corpus size for an artist to be kept
min_nbr_of_words = 3500

# Cut the artist list into four consecutive slices, one per worker thread
nbrDirNames = len(dirnames)
thread1_end, thread2_end, thread3_end = [
    round(quarter * nbrDirNames / 4.0) for quarter in (1.0, 2.0, 3.0)
]

dirnames1 = dirnames[:thread1_end]
dirnames2 = dirnames[thread1_end:thread2_end]
dirnames3 = dirnames[thread2_end:thread3_end]
dirnames4 = dirnames[thread3_end:]


print("artist,average,median,std,max,min")

# Start one thread per slice, keeping track of them so they can be joined
threads = []
for chunk in (dirnames1, dirnames2, dirnames3, dirnames4):
    worker = Thread(target=evaluateText, args=[chunk, min_nbr_of_words])
    worker.start()
    threads.append(worker)

process1, process2, process3, process4 = threads

# Wait until every slice has been processed
for process in threads:
    process.join()
    