# Evaluation

In [1]:
import pickle
import numpy as np
import nltk
from nltk.translate import bleu_score 
from collections import Counter
from collections import defaultdict
import string
import re
import copy
#!pip install pronouncing
import pronouncing
import numpy as np
import os
import glob
import csv
import time


In [2]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\justi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\justi\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
# some basic preprocessing techniques
def prepString(s):
    '''removes punctuation other than apostrophes from string'''
    return str(s).lower().translate({ord(c): None for c in string.punctuation if c not in ("'")})

def removePunc(s):
    '''removes punctuation from string'''
    return str(s).lower().translate({ord(c): None for c in string.punctuation})

def removeMarkupWords(s):
    '''removes positional words generated in lyrics'''
    s = str(s).lower()
    for term in ['xbol-10','xbol-11','xbol-12','xbol-13','xbol-14','xbol-15','xbol-16','xbol-17','xbol-18','xbol-19'
                 ,'xbol-20','xbol-21','xbol-22','xbol-23','xbol-24','xbol-25','xbol-26','xbol-27','xbol-28','xbol-29'
                 ,'xbol-30','xbol-1','xbol-2','xbol-3','xbol-4','xbol-5','xbol-6','xbol-7','xbol-8','xbol-9'
                 ,'xgenre','xtitle','xeol','xbol','xeos','xbos','[verse-1]','[verse-2]','[chorus]']:
        s = str(s).replace(term,'')
    return s

## Trigram perplexity (v0.1)

In [4]:

def load_model(file_name):
    '''input file name (without extention)'''
    # load model
    m1_pkl = open("../data/models/" + file_name + ".pkl", "rb")
    model = pickle.load(m1_pkl)
    m1_pkl.close()
    return model
    
    
def get_entropy(text, model, _n=3, _lpad = ['<s>'], _rpad = ['<s>']):
    ''' calculate average log probability of each word in text, given context
        
        IMPORTANT NOTE:  For initial implementation, we do not have bigram or unigram prob in our dict 'model',
                         and this handles missing or unknown entries naively
    '''
    
    e = 0.0
    padded_string = "<s> " + example1 + " <s>"
    text = padded_string.split(' ')
    for i in range(_n - 1, len(text)):
        context = tuple(text[i - _n + 1:i])
        token = text[i]
        #print(context,token)
        #print(e)
        e += -np.log2(model.get(context,dict()).get(token,0.0000001))  # this is a poor placeholder until we get backoff dicts
    entropy = e / float(len(text) - (_n - 1))
    return entropy

def get_perplexity(text, model, _n = 3, _lpad = ['<s>'], _rpad = ['<s>']):
    return np.power(2,get_entropy(text=text, model=model , _n=_n, _lpad=_lpad, _rpad=_rpad))



In [5]:
example1 = "Mary had a little lamb whose fleece was white as snow, and everywhere that Mary went the sheep was sure to go"

model1 = load_model("trigram-weights")

get_perplexity(example1,model1,3)

3258828.7202875423

## Prosodic

In [6]:
## Well, this was a BLACKHOLE OF WASTED TIME

In [7]:
#!pip install prosodic

In [8]:
#import prosodic as p
#text = p.Text("Shall I compare thee to a summer's day?")
#text.parse()


## POS Tagging

In [9]:
# POS tagging and aggregation to % of text
def nltkPOS(text,verbose=False):
    '''For an input text, return absolute difference from published proportions, between 0 and 1.'''
    
    # define lookups
    mapping = {'CC':'CC','DT':'DT','PDT':'DT','WDT':'DT','IN':'IN','JJ':'JJ','JJR':'JJ','JJS':'JJ'
               ,'NN':'NN','NNS':'NN','NNP':'NN','NNPS':'NN','LS':'OT','CD':'OT','EX':'OT','FW':'OT'
               ,'POS':'OT','UH':'OT','RB':'RB','RBR':'RB','RBS':'RB','WRB':'RB','TO':'TO','MD':'VB'
               ,'RP':'VB','VB':'VB','VBD':'VB','VBG':'VB','VBN':'VB','VBP':'VB','VBZ':'VB','PRP':'WP'
               ,'PRP$':'WP','WP':'WP','WP$':'WP'}
    comp_dict = {'CC':0.0212,'DT':0.0982,'IN':0.0998,'JJ':0.0613,'NN':0.3051,'RB':0.0766,'TO':0.0351
                 ,'VB':0.285,'WP':0.0058,'OT':0.012}

    # initialize
    pos_cnt = Counter()
    total_word_cnt = 0
    pos_dict = defaultdict(float) 
    pos_dict['adjustment'] = 0
    absdiff = 0
    
    # prepare data  
    text = prepString(removeMarkupWords(text))
    tokenized_text = nltk.word_tokenize(text)
    tag_list = nltk.pos_tag(tokenized_text)
    
    if not tag_list:
        raise ValueError("Please provide more complete text")
    
    # initial proportions
    for t in tag_list:
        pos_cnt[t[1]] +=1
        total_word_cnt +=1
    pos_raw_dict = {k: v/float(total_word_cnt) for k,v in dict(pos_cnt).items()}
        
    # adjust for items missing in mapping (mostly punctuation)    
    for k,v in pos_raw_dict.items():
        if k in mapping:
            pos_dict[mapping[k]] += v 
        else:
            pos_dict['adjustment'] += v
    for k,v in pos_dict.items():
        pos_dict[k] = pos_dict[k]/(1-pos_dict['adjustment'])
    del pos_dict['adjustment']
    
    # compare to observed ratios, calculate absolute difference
    for k in comp_dict.keys():
        absdiff += abs(comp_dict[k] - pos_dict.get(k,0))
        if verbose==True: 
            print(k,"- benchmark:",comp_dict[k],", text:",pos_dict.get(k,0),"abs diff:",abs(comp_dict[k] - pos_dict.get(k,0)))
    return absdiff

In [10]:
text = "I jump into a lake and keep swimming." #  The fluffy dog went to the north and first left."
nltkPOS(text)

0.5789

In [11]:
long = "Deep throat is a Python program that can synthesize speech. A simple approach to unrestricted text-to-speech translation uses a small set of letter-to-sound rules, each rule specifying a pronunciation for one or more letters in some context. Deep throat features a small set of letter-to-sound rules that translate English text to phonemes producing usably accurate pronunciations of words. Deep throat can produce sounds by combining stored representations of phoneme sounds in accordance with generated phoneme translations. It can output these sounds to computer sound hardware using PortAudio and it can save them to sound file. \
Deep throat can accept text as a command line option argument, from a pipe and it can be set into an interactive mode.\
Deep throat can be set to read the date and time in various ways, such as in a loop. It can translate text to phonemes, it can translate specified phonemes to sounds and it can translate numbers to English text. It can engage visual and sound analyses."
nltkPOS(long)

0.2898853658536585

## Rhyme

In [12]:

phoneText = "as I walk through the valley of the garden of death"
newText = "I take a look at my life and realize there's nothing left"
hickoryText = "Hickory Dickory Dock,\nThe mouse ran up the clock.\nThe clock struck one,\nThe mouse ran down!\nHickory Dickory Dock."
# convert to phonemes
#phones = [pronouncing.phones_for_word(word) for word in prepString(phoneText).split()]
#phones

In [13]:
exampleWord = 'orange'
examplePhones = pronouncing.phones_for_word(exampleWord)[0]
exampleRhymePart = pronouncing.rhyming_part(examplePhones)

In [14]:
if len(exampleRhymePart.split()) > 1:
    rp2 = exampleRhymePart.split()
else: 
    rp2 = exampleRhymePart
rp2

['AO1', 'R', 'AH0', 'N', 'JH']

In [15]:
exampleRhymePart.split()[0:1]

['AO1']

In [16]:
# calculate rhyme density.  Hopeful enhancements include:
# 1) extending rhymeType
# 2) adding text-to-phoneme and applying for tokens not in CMU dictionary
# 3) improving the calculation by taking into consideration probability of rhymes
# 4) removing repeat tokens from consideration to avoid rewarding repeated words

def calcRhymeDensity(text,rhymeType='perfect',rhymeLocation='all',lineStartStop=(1,-2),printExamples=False):
    '''calculates rhyme density (count of rhymes over n-1 words). \n\n
    
       _parameters_
       text: input text for measurement
       rhymeType: 'perfect' is a perfect rhyme, 'vowel' is a rhyming in the vowel sound + stress only
       rhymeLocation: choose to look at 'all' text, 'section' by line numbers, or 'end' (last word in each line)    
       lineStartStop: tuple of (start,stop) line numbers
       printExamples: if True, print most common values of the selected rhymeType
       
       _returns_
       rhyme_cnt: count of rhymes of specified rhymeType and rhymeLocation
       wordCount: count of words of specified rhymeType and rhymeLocation
       rhymeDensity: rhyme_cnt/float(wordCount-1)
    '''
    # restrict location to (end=last word, internal line = line, all= full text)
    # count tokens
    # 
    
    # initialize
    rhymePart_cnt = Counter()
    rhyme_cnt = 0
    
    # prepare data
    text = prepString(removeMarkupWords(text))
    
    if rhymeLocation == 'all':
        words = text.split()
    
    if rhymeLocation == 'end':
        lines = text.split("\n")
        words = [line.split()[-1] for line in lines if len(line.split())>0]
    
    if rhymeLocation == 'section':
        lines = text.split("\n")
        words = [line.split()[-1] for line in lines[lineStartStop[0]:lineStartStop[1]+1] if len(line.split())>0]
    
    # 
    wordCount = len(words)
    #print(words)
    for word in words:
        pros = pronouncing.phones_for_word(word)
        if pros:     
            phonelist = pros[0]  #using first pronunciation for now
            if len(phonelist) > 0:
                if rhymeType == 'perfect':
                    rhymePart_cnt[pronouncing.rhyming_part(phonelist)] +=1

                #if rhymeType == 'rime':
                #    pass
                #if rhymeType == 'soft':
                #    pass
                #if rhymeType == 'consonant':
                #    pass

                elif rhymeType == 'vowel':
                    rhymePart_cnt[pronouncing.rhyming_part(phonelist).split()[0]] +=1
    
    for v in rhymePart_cnt.values():
        rhyme_cnt += v-1
    
    if wordCount>1: 
        rhymeDensity = rhyme_cnt/float(wordCount-1)
    else:
        rhymeDensity = 0.0
    
    if printExamples == True:
        print(rhymePart_cnt.most_common(5))
    
    return rhymeDensity, rhyme_cnt, wordCount

   

In [17]:
calcRhymeDensity(newText,rhymeType='perfect',printExamples=True)

[('AY1', 2), ('EY1 K', 1), ('AH0', 1), ('UH1 K', 1), ('AE1 T', 1)]


(0.09090909090909091, 1, 12)

In [18]:
with open('../data/lyrics/current/samp1.txt') as sampf:
    samp = sampf.read()
    text = samp
    text = removeMarkupWords(text)
    text = prepString(text)
    #print(text)
    lines = text.split("\n")
    #print(lines)
    words = [line.split()[-1] for line in lines if len(line.split())>0]
    #print(words)
    print(calcRhymeDensity(text,rhymeType='perfect',rhymeLocation='end',printExamples=True))

[('OW1', 2), ('UW1', 2), ('AY1', 1), ('IH1 S', 1)]
(0.4, 2, 6)


In [19]:
calcRhymeDensity(newText,rhymeType='vowel')

(0.36363636363636365, 4, 12)

In [20]:
pronouncing.search(exampleRhymePart)

['orange', 'orange-green', 'orangeburg', 'oranges']

In [21]:
calcRhymeDensity(hickoryText,rhymeType='vowel',rhymeLocation='end')

(0.5, 2, 5)

## BLEU

In [22]:
def bleu(ref_list,candidateText,nGram=4,nGramType='cumulative',shouldSmooth=True):
    '''calculates BLEU score 
    
        _parameters_
        ref_list: expects a list of reference texts to compare (as strings)
        candidateText: the new text needing to be scored
        nGram: choose between 1-4.  Determines which ngram(s) to use in the scoring
        nGramType: 'cumulative' uses a simple average of all ngrams from 1 to nGram
        shouldSmooth: if False, calculates the BLEU score without smoothing. Recommended to use smoothing (set to True)
        
        _returns_
        score: BLEU score using nGram settings input, smoothed by default (can be turned off)
    '''
    
    # basic checks
    if nGram not in [1,2,3,4]:
        raise ValueError('nGram must be between 1 and 4')
    
    if nGramType not in ['cumulative','exclusive']:
        raise ValueError('nGramType must either be cumulative (average of nGrams less than n) or exclusive (1=unigram, etc.)')
    
    # pre-score
    weight_dict = {('cumulative',1):(1,0,0,0)
                  ,('cumulative',2):(.5,.5,0,0)
                  ,('cumulative',3):(.3333,.3333,.3333,0)
                  ,('cumulative',4):(.25,.25,.25,.25)
                  ,('exclusive',1):(1,0,0,0)
                  ,('exclusive',2):(0,1,0,0)
                  ,('exclusive',3):(0,0,1,0)
                  ,('exclusive',4):(0,0,0,1)}
    candidate = [removePunc(str(removeMarkupWords(candidateText))).split()]
    references = [[removePunc(str(removeMarkupWords(ref))).split() for ref in ref_list]]
    weights = weight_dict[(nGramType,nGram)]
       
    
    # scoring
    if shouldSmooth==True:
        smoother = bleu_score.SmoothingFunction().method5
    else:
        smoother = None
    score = bleu_score.corpus_bleu(references, candidate, weights, smoothing_function=smoother)
    #print(score)
    return score


In [23]:
bleu([newText],phoneText,4,'cumulative')

0.0692255179440046

In [24]:
bleu([phoneText],hickoryText,4,'cumulative')

0.0769800358919501

## Meter

In [25]:
def findLineStress(line):
    '''find accentual stress of a given line, based on CMU dict.  Still a bit unclever.
    
    _parameters_
    line: line of text
    
    _returns_
    parselist: list of potential stresses after parsing. 0 is unstressed, 1 is primary stress, 2 is secondary stress (middle)
    syllableLengths: list of syllable lengths corresponding to the parses in parselist
    wordCount: count of words in the line 
    '''
    line = prepString(removeMarkupWords(line))
    words = line.split()
    wordCount = len(words)
    parses = ['']
    for word in words:
        pros = pronouncing.phones_for_word(word)
        if pros:
            for phonelist in [pronouncing.phones_for_word(word)]:           
                stressOptions = copy.deepcopy(parses)
                currLen = len(parses)
                newparse = []
                # I don't really need to loop through pronunciations, just distinct stress patterns, so a little inefficient here
                for pronunciation in phonelist:
                    wordStress = pronouncing.stresses(pronunciation)
                    for option in range(currLen):
                        newparse.append(''+str(stressOptions[option]) + str(wordStress))
            parses = newparse 

    return list(set(parses)), [len(parse) for parse in list(set(parses))], wordCount


In [26]:
findLineStress(phoneText)

(['1111110111011', '1111010101011', '1111010111011', '1111110101011'],
 [13, 13, 13, 13],
 11)

In [27]:
findLineStress(newText)

(['111111101021101', '110111111021101', '110111101021101', '111111111021101'],
 [15, 15, 15, 15],
 12)

In [28]:
def levenshtein(s1, s2):
    '''calculate levenshtein distance for two input strings
    
    _parameters_
    s1: first input string
    s2: second input string
    
    _returns_
    distance: levenshtein distance between two strings...that is, the lowest number of modifications to turn s1 into s2
    '''
    s1 = str(s1)
    s2 = str(s2)
    
    if len(s1) < len(s2):
        return levenshtein(s2, s1)

    # otherwise len(s1) >= len(s2)
    if len(s2) == 0:
        return len(s1)

    previous_row = range(len(s2) + 1)
    for i, c1 in enumerate(s1):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1 # j+1 instead of j since previous_row and current_row are one character longer
            deletions = current_row[j] + 1       # than s2
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row
    
    return previous_row[-1]
    
def findMeter(text):
    '''finds meter with smallest edit distance
    
    _parameters_
    text: input text, usually a poem of some kind
    
    _returns_
    lowest: lowest edit distance for any standard accentual-syllabic verse
    options: list of potential meters for the lowest edit distance.
    '''
    # define
    meter_dict = {'0101':'Iambic dimeter'
                  ,'010101':'Iambic trimeter'
                  ,'01010101':'Iambic tetrameter'
                  ,'0101010101':'Iambic pentameter'
                  ,'010101010101':'Iambic hexameter'
                  ,'01010101010101':'Iambic heptameter'
                  ,'0101010101010101':'Iambic octameter'
                  ,'1010':'Trochaic dimeter'
                  ,'101010':'Trochaic trimeter'
                  ,'10101010':'Trochaic tetrameter'
                  ,'1010101010':'Trochaic pentameter'
                  ,'101010101010':'Trochaic hexameter'
                  ,'10101010101010':'Trochaic heptameter'
                  ,'1010101010101010':'Trochaic octameter'
                  ,'001001':'Anapestic dimeter'
                  ,'001001001':'Anapestic trimeter'
                  ,'001001001001':'Anapestic tetrameter'
                  ,'001001001001001':'Anapestic pentameter'
                  ,'001001001001001001':'Anapestic hexameter'
                  ,'001001001001001001001':'Anapestic heptameter'
                  ,'100100':'Dactyllic dimeter'
                  ,'100100100':'Dactyllic trimeter'
                  ,'100100100100':'Dactyllic tetrameter'
                  ,'100100100100100':'Dactyllic pentameter'
                  ,'100100100100100100':'Dactyllic hexameter'
                  ,'100100100100100100100':'Dactyllic heptameter'}

    # initialize
    vote_cnt = Counter()
    text = prepString(removeMarkupWords(text))
    lines = text.split('\n')
    line_cnt = len(lines)
    minDist = 999
    
    # update distances
    for line in lines:
        for k,v in meter_dict.items():
            minDist = 999
            for reading in findLineStress(line)[0]:
                dist = levenshtein(k,reading)
                if dist < minDist:
                    minDist = dist    
            vote_cnt[v] += minDist
    
    #options = min(vote_cnt, key=vote_cnt.get)  #chooses one in the event of ties
    lowest = min(vote_cnt.values()) 
    options = [k for k,v in vote_cnt.items() if v==lowest]
    return lowest, options, line_cnt, lowest/float(line_cnt) #, vote_cnt

In [29]:
print(phoneText,'\n',findLineStress(phoneText)[0],'\n',findMeter(phoneText))

as I walk through the valley of the garden of death 
 ['1111110111011', '1111010101011', '1111010111011', '1111110101011'] 
 (3, ['Iambic hexameter', 'Iambic heptameter', 'Trochaic hexameter', 'Trochaic heptameter'], 1, 3.0)


In [30]:
print(hickoryText)
findMeter(hickoryText)

Hickory Dickory Dock,
The mouse ran up the clock.
The clock struck one,
The mouse ran down!
Hickory Dickory Dock.


(6, ['Iambic dimeter'], 5, 1.2)

In [31]:
romeoText = "But, soft! what light through yonder window breaks? \n It is the east, and Juliet is the sun"
print(romeoText,'\n',findMeter(romeoText))

But, soft! what light through yonder window breaks? 
 It is the east, and Juliet is the sun 
 (4, ['Iambic pentameter'], 2, 2.0)


In [32]:
outputText1 = "xbos xbol listen to my tale of woe , xeol \n  xbol it ''s terribly sad but true , xeol \n  xbol all dressed up , no place to go xeol \n  xbol each evening i ''m awfully blue . xeol \n  xbol xeol \n xbol i must win some handsome guy xeol \n xbol can ''t go on like this , xeol"

In [33]:
for line in outputText1.split('\n'):
    print(line, findLineStress(line))

xbos xbol listen to my tale of woe , xeol  (['1011111', '1001111'], [7, 7], 6)
  xbol it ''s terribly sad but true , xeol  (['1100111', '0100111'], [7, 7], 6)
  xbol all dressed up , no place to go xeol  (['1111111', '1111101'], [7, 7], 7)
  xbol each evening i ''m awfully blue . xeol  (['1101101', '11011001'], [7, 8], 6)
  xbol xeol  ([''], [0], 0)
 xbol i must win some handsome guy xeol  (['1111101'], [7], 6)
 xbol can ''t go on like this , xeol (['11110', '01111', '11111', '01110'], [5, 5, 5, 5], 6)


In [34]:
calcRhymeDensity(outputText1,rhymeType='perfect',printExamples=True)

[('UW1', 4), ('AY1', 4), ('OW1', 4), ('IH1 S AH0 N', 1), ('EY1 L', 1)]


(0.25, 9, 37)

In [35]:
a = "people steeple"
b = "poodle stroooudel"
calcRhymeDensity(a,rhymeType='perfect',printExamples=True)

[('IY1 P AH0 L', 2)]


(1.0, 1, 2)

## Output

In [36]:

def scoreDirectory(source_dir='../data/lyrics/current/'
                   ,destination_dir='../data/scores/'
                   ,output_file_name='output.csv'
                   ,reference_dir='../data/lyrics/validation/'):
    
    '''create .csv with scores from files in specified directory'''
    
    songs = glob.glob(source_dir+'/*.txt')  
    refs = glob.glob(reference_dir+'/*.txt')
    ref_list = []
    
    ## add BLEU reference code
    for ref in refs:
        with open(ref) as rf:
            ref_raw_text = rf.read()
            ref_list.append(ref_raw_text)
    
    with open(destination_dir+output_file_name, 'w', newline='') as outf:
        cw = csv.writer(outf,quoting=csv.QUOTE_NONNUMERIC)    
        cw.writerow(['Model_Name'
                    ,'Temperature'
                    ,'Beam_Width'
                    ,'Item_Number'
                    ,'Genre'
                    ,'Title'
                    ,'POSConfirmity'
                    ,'RD_PerfectAll'
                    ,'RD_PerfectEnd'
                    ,'RD_VowelAll'
                    ,'RD_VowelEnd'
                    ,'ClosestMeter'
                    ,'AvgDistanceToMeter'
                    ,'BLEU_1_excl_Unsmoothed'
                    ,'BLEU_2_excl_Unsmoothed'
                    ,'BLEU_3_excl_Unsmoothed'
                    ,'BLEU_4_excl_Unsmoothed'
                    ,'BLEU_3_cumul_Smoothed'
                    ,'BLEU_4_cumul_Smoothed'
                    ,'Text'
                    ])

        for song in songs: 
            with open(song, newline='') as f: 
                model = source_dir.split('/')[-2]
                item,temperature,beamWidth = os.path.splitext(os.path.basename(song))[0].split('-')
                filename = os.path.basename(song)
                rawText = f.read()            
                
                
                ## if exists, extract genre
                genres = re.search('xgenre (.*?) xtitle', rawText)
                if genres.group(1):
                    genre = genres.group(1)
                else:
                    genre = ''
                
                ## if exists, extract title
                titles = re.search('xtitle (.*?) xbol',rawText)
                if titles.group(1):
                    title = titles.group(1)
                else:
                    title = ''
                
                ## ignore (or remove) everything before 'xbol-1'
                ## replace 'xeol' with '\n'
                
                text = removeMarkupWords(re.search('xbol-?\d? (.*)',rawText).group(1).replace(' xeol ','\n'))
                
                cw.writerow([model
                            ,temperature
                            ,beamWidth
                            ,item
                            ,genre
                            ,title
                            ,round(nltkPOS(text),4)
                            ,round(calcRhymeDensity(text,rhymeType='perfect',rhymeLocation='all')[0],4)
                            ,round(calcRhymeDensity(text,rhymeType='perfect',rhymeLocation='end',printExamples=False)[0],4)
                            ,round(calcRhymeDensity(text,rhymeType='vowel',rhymeLocation='all')[0],4)
                            ,round(calcRhymeDensity(text,rhymeType='vowel',rhymeLocation='end')[0],4) 
                            ,findMeter(text)[1][0]
                            ,round(findMeter(text)[3],4)
                            ,round(bleu(ref_list,text,nGram=1,nGramType='exclusive',shouldSmooth=False),4)
                            ,round(bleu(ref_list,text,nGram=2,nGramType='exclusive',shouldSmooth=False),4)
                            ,round(bleu(ref_list,text,nGram=3,nGramType='exclusive',shouldSmooth=False),4)
                            ,round(bleu(ref_list,text,nGram=4,nGramType='exclusive',shouldSmooth=False),4)
                            ,round(bleu(ref_list,text,nGram=3,nGramType='cumulative',shouldSmooth=True),4)
                            ,round(bleu(ref_list,text,nGram=4,nGramType='cumulative',shouldSmooth=True),4)
                            ,text
                            ]
                           )



In [37]:
scoreDirectory(source_dir='../data/lyrics/current/'
                   ,destination_dir='../data/scores/'
                   ,output_file_name='output.csv'
                   ,reference_dir='../data/lyrics/reference/')

ValueError: not enough values to unpack (expected 3, got 1)

In [None]:
model_name = '4.2-LM-108k-lines-genre-song_title'

scoreDirectory(source_dir=f'../../transfer/w210-capstone/lyrics/{model_name}/'
               ,output_file_name=f'scores_{model_name}.csv'
               ,reference_dir='../data/lyrics/validation/'
              )

In [74]:
test_dir = '../../transfer/w210-capstone/lyrics/test/'

In [77]:

songs = glob.glob(test_dir+'*.txt')  
print(songs)
for song in songs: 
    with open(song) as f: 
        text = f.read()
        model = test_dir.split('/')[-2]
        filename = os.path.basename(song)
        item,temperature,beamWidth = os.path.splitext(os.path.basename(song))[0].split('-')
        print(filename,model,item,temperature,beamWidth)
        
    

['../../transfer/w210-capstone/lyrics/test\\1543791069-1.4-5.txt']
1543791069-1.4-5.txt test 1543791069 1.4 5


In [None]:
a=time.time()
scoreDirectory(source_dir='../../transfer/w210-capstone/lyrics/test2/'
               ,output_file_name='test2.csv'
               ,reference_dir='../data/lyrics/validation/')
#scoreDirectory(source_dir=test_dir,output_file_name='test.csv',reference_dir='../data/lyrics/validation/')
b=time.time()
print(b-a)
