In this notebook, we will calculate some metrics on song lyrics and news articles.

In [2]:
import lyrics_analysis

First, let's count unique words in the set of 1000 random songs.

In [4]:
import ijson

# create a generator that will yield all the song lyrics
def generate_lyrics():
    """Lazily yield ``(lyrics, genre)`` for every song in the evaluation set.

    The JSON file is streamed with ``ijson`` rather than loaded in one go, so
    song records are produced one at a time as the caller iterates.

    Yields:
        tuple: the record's ``"lyrics"`` value followed by its ``"genre"`` value.
    """
    # ijson parses bytes. A text-mode handle would be decoded with the
    # platform's default encoding and then re-encoded by ijson (its str-reader
    # path is deprecated), so the file is opened in binary mode instead.
    with open("../data/cleaned/eval_set_1000_lyrics.json", "rb") as file:
        # The "item" prefix selects each element of the top-level JSON array.
        for song in ijson.items(file, "item"):
            yield song["lyrics"], song["genre"]
            
# now count unique words in each song and save the result
# One unique-word count per song; the genre half of each pair is not needed here.
unique_word_counts = [
    lyrics_analysis.evaluation.count_unique_words(song_text)
    for song_text, _ in generate_lyrics()
]
    

Now, let's look at the first few scores.

In [3]:
# Preview the first 30 per-song unique-word counts.
unique_word_counts[:30]

[200,
 51,
 81,
 140,
 110,
 212,
 69,
 52,
 70,
 103,
 151,
 43,
 82,
 44,
 105,
 87,
 136,
 86,
 292,
 102,
 113,
 151,
 124,
 267,
 118,
 105,
 72,
 85,
 30,
 74]

Calculate the average number of unique words.

In [4]:
# Arithmetic mean of the per-song unique-word counts.
sum(unique_word_counts) / len(unique_word_counts)

170.846

Do the same for the proportion of meaningful words (i.e. content words).

In [5]:
# Score every song by its proportion of meaningful words.
# generate_lyrics() yields (lyrics, genre) pairs, so the pair must be unpacked:
# binding the whole tuple to `lyrics` would hand a tuple, not the lyrics text,
# to proportion_parts_of_speech.
meaningful_words = []
for lyrics, _ in generate_lyrics():
    score = lyrics_analysis.evaluation.proportion_parts_of_speech(lyrics)
    meaningful_words.append(score)

# Show the first 30 scores, then the mean over all songs.
print(meaningful_words[:30])
print(sum(meaningful_words) / len(meaningful_words))

[0.5558823529411765, 0.40119760479041916, 0.5488721804511278, 0.5422222222222223, 0.501577287066246, 0.4343891402714932, 0.5433962264150943, 0.5975609756097561, 0.48580441640378547, 0.4808510638297872, 0.5669291338582677, 0.4528301886792453, 0.6090909090909091, 0.6078431372549019, 0.5472636815920398, 0.4980237154150198, 0.4956896551724138, 0.48951048951048953, 0.5440677966101695, 0.5596330275229358, 0.5612745098039216, 0.5298804780876494, 0.5729166666666666, 0.4977843426883309, 0.5307017543859649, 0.5136363636363637, 0.5942622950819673, 0.46779661016949153, 0.44155844155844154, 0.5174418604651163]
0.5310589055152487


Compare the results to random texts.

In [8]:
def generate_texts():
    """Lazily yield the text of every record in the random-text evaluation set.

    The JSON file is streamed with ``ijson`` rather than loaded in one go, so
    records are produced one at a time as the caller iterates.

    Yields:
        The record's ``"lyrics"`` value (the field name is the same one used by
        the song data set).
    """
    # ijson parses bytes. A text-mode handle would be decoded with the
    # platform's default encoding and then re-encoded by ijson (its str-reader
    # path is deprecated), so the file is opened in binary mode instead.
    with open("../data/cleaned/eval_set_1000_random.json", "rb") as file:
        # The "item" prefix selects each element of the top-level JSON array.
        for record in ijson.items(file, "item"):
            yield record["lyrics"]
            
# Compute both metrics for every random text in a single pass over the file.
unique_word_counts_random, meaningful_words_random = [], []
for text in generate_texts():
    n_unique = lyrics_analysis.evaluation.count_unique_words(text)
    meaningful_share = lyrics_analysis.evaluation.proportion_parts_of_speech(text)
    unique_word_counts_random.append(n_unique)
    meaningful_words_random.append(meaningful_share)

# Report the mean of each metric across all random texts.
print(
    "Average unique words: ",
    sum(unique_word_counts_random) / len(unique_word_counts_random),
)
print(
    "Average proportion of meaningful words: ",
    sum(meaningful_words_random) / len(meaningful_words_random),
)


Average unique words:  434.433
Average proportion of meaningful words:  0.5284735824249588


Compare the proportion of content words in pop vs. rap songs.

In [7]:
# Collect the per-song score separately for pop and rap; songs of any other
# genre are scored but not stored.
content_words_pop, content_words_rap = [], []
for song_text, song_genre in generate_lyrics():
    ratio = lyrics_analysis.evaluation.proportion_parts_of_speech(song_text)
    if song_genre == "pop":
        content_words_pop.append(ratio)
        continue
    if song_genre == "rap":
        content_words_rap.append(ratio)
        
    

In [8]:
# Print the mean score for each genre, pop first and then rap.
for label, genre_scores in (("pop:", content_words_pop), ("rap:", content_words_rap)):
    print(label, sum(genre_scores) / len(genre_scores))


pop: 0.5224621089975695
rap: 0.5407799126688126
