In [45]:
import nltk
import pandas as pd
import csv
from collections import Counter

# Fetch every NLTK resource the notebook relies on so it runs on a fresh
# environment:
#   - 'punkt': sentence/word tokenizer models used by nltk.tokenize.word_tokenize
#   - 'averaged_perceptron_tagger': model behind nltk.tag.pos_tag
#   - 'wordnet': dictionary behind WordNetLemmatizer
# NOTE(review): recent NLTK releases look these up under the names 'punkt_tab'
# and 'averaged_perceptron_tagger_eng'; download those as well if a LookupError
# asks for them -- confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\cake\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\cake\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [46]:
# Read the full lyrics table, then keep only the rows whose `language`
# column equals "en".
df = pd.read_csv("./data/lyrics_language.csv")
df_en = df.loc[df["language"] == "en"]

In [47]:
# Lemmatizer used to normalise the word list to WordNet's default (noun) lemmas.
lemm = nltk.stem.WordNetLemmatizer()

# newline='' is what the csv module expects of the file object it parses.
with open('./data/words.csv', 'r', newline='') as f:
    # Keep only the first column of each row. Blank lines come back from
    # csv.reader as empty lists, so skip them instead of failing on row[0].
    birdlist = [lemm.lemmatize(row[0]) for row in csv.reader(f) if row]

In [48]:
class DerivedLemmatizer (nltk.stem.WordNetLemmatizer):
    """WordNet lemmatizer that also understands Penn Treebank POS tags.

    ``nltk.tag.pos_tag`` emits Penn Treebank tags (``"NN"``, ``"VBD"``,
    ``"JJ"``, ...) while ``WordNetLemmatizer.lemmatize`` expects WordNet's
    single-letter tags. This subclass converts the former to the latter
    before delegating to the parent class.
    """

    # WordNet's own POS constants; these are handed to the parent unchanged.
    _WORDNET_TAGS = frozenset((
        nltk.corpus.wordnet.ADJ,
        nltk.corpus.wordnet.ADJ_SAT,
        nltk.corpus.wordnet.ADV,
        nltk.corpus.wordnet.NOUN,
        nltk.corpus.wordnet.VERB,
    ))

    # First letter of a Penn Treebank tag -> WordNet POS constant.
    _PENN_PREFIX_TO_WORDNET = {
        "J": nltk.corpus.wordnet.ADJ,
        "V": nltk.corpus.wordnet.VERB,
        "N": nltk.corpus.wordnet.NOUN,
        "R": nltk.corpus.wordnet.ADV,
    }

    def _convert_tags(self, tag):
        """Translate a POS tag into the WordNet tag used for lemmatization.

        Args:
            tag: A Penn Treebank tag (e.g. ``"NNS"``) or a WordNet tag
                (e.g. ``"v"``).

        Returns:
            The matching WordNet POS constant. Tags that are empty or not
            recognised fall back to ``wordnet.NOUN``.
        """
        if not tag:
            return nltk.corpus.wordnet.NOUN
        # A tag that is already a WordNet constant must not be downgraded to
        # NOUN just because it is lower-case.
        if tag in self._WORDNET_TAGS:
            return tag
        return self._PENN_PREFIX_TO_WORDNET.get(tag[:1], nltk.corpus.wordnet.NOUN)

    def lemmatize(self, word, pos = nltk.corpus.wordnet.NOUN):
        """Lemmatize ``word`` given a Penn Treebank or WordNet POS tag.

        Args:
            word: The token to lemmatize.
            pos: POS tag for ``word``; defaults to the WordNet noun tag.

        Returns:
            Whatever ``WordNetLemmatizer.lemmatize`` returns for ``word``
            under the converted tag.
        """
        wn_tag = self._convert_tags(pos)
        return super().lemmatize(word, wn_tag)
        
def get_nouns(text):
    """Return the lemmatized nouns of ``text`` in order of appearance.

    The text is tokenized and POS-tagged with NLTK; tokens whose tag starts
    with ``"N"`` are kept and lemmatized with ``DerivedLemmatizer``.

    Args:
        text: The text to analyse. Anything that is not a ``str`` (for
            example a NaN produced by an empty CSV cell) yields no nouns.

    Returns:
        A list of lemmatized noun strings; duplicates are preserved.
    """
    if not isinstance(text, str):
        return []

    tokens = nltk.tokenize.word_tokenize(text)
    lem = DerivedLemmatizer()
    # Select nouns first so that only the tokens that are actually returned
    # are lemmatized.
    return [
        lem.lemmatize(token, tag)
        for token, tag in nltk.tag.pos_tag(tokens)
        if tag.startswith("N")
    ]

def count_words(text, wordlist = birdlist):
    """Count how often each word of ``wordlist`` appears as a noun in ``text``.

    Args:
        text: The text to analyse; it is reduced to lemmatized nouns with
            ``get_nouns`` before counting.
        wordlist: Iterable of words to look for. The default is the value
            ``birdlist`` had when this function was defined, so re-run this
            definition after rebuilding ``birdlist``.

    Returns:
        A ``(music_count, word_count)`` pair of ``Counter`` objects that
        contain only the words found at least once: ``music_count`` maps
        each such word to 1, ``word_count`` maps it to its number of
        occurrences among the nouns.
    """
    # Tally the nouns once instead of rescanning the list for every word.
    noun_freq = Counter(get_nouns(text))

    music_count = Counter()
    word_count = Counter()
    for word in wordlist:
        # Looking up a missing key in a Counter gives 0 without inserting it.
        count = noun_freq[word]
        if count > 0:
            music_count[word] = 1
            word_count[word] = count
    return music_count, word_count

In [49]:
# Totals over all rows of df_en: full_music sums the per-text 0/1 presence
# flags returned by count_words, full_word sums the occurrence counts.
full_music = Counter()
full_word = Counter()

# Only the lyrics column is needed, so iterate it directly rather than
# materialising a Series for every row with iterrows().
for lyrics in df_en["lyrics"]:
    music_count, word_count = count_words(lyrics)
    full_music += music_count
    full_word += word_count

print(full_music)
print(full_word)

Counter({'bird': 1169, 'tyrant': 678, 'raven': 641, 'vulture': 541, 'eagle': 535, 'crow': 401, 'dove': 216, 'swan': 158, 'owl': 142, 'swift': 124, 'swallow': 96, 'hawk': 81, 'falcon': 71, 'roller': 68, 'chicken': 46, 'sparrow': 43, 'duck': 39, 'nightingale': 30, 'pigeon': 29, 'crane': 28, 'kite': 27, 'raptor': 24, 'albatross': 22, 'gull': 18, 'blackbird': 17, 'lark': 16, 'peacock': 15, 'goose': 15, 'rooster': 13, 'rook': 13, 'heron': 12, 'condor': 12, 'buzzard': 11, 'magpie': 10, 'turkey': 9, 'parrot': 9, 'quail': 8, 'robin': 6, 'jay': 5, 'thrush': 4, 'cuckoo': 4, 'hummingbird': 4, 'woodpecker': 4, 'mockingbird': 4, 'shrike': 3, 'snipe': 3, 'ibis': 3, 'penguin': 3, 'stork': 3, 'puffin': 2, 'starling': 2, 'cockerel': 2, 'finch': 2, 'ostrich': 2, 'chickadee': 2, 'grouse': 2, 'cormorant': 2, 'macaw': 1, 'petrel': 1, 'kestrel': 1, 'wheatear': 1, 'toucan': 1, 'nightjar': 1, 'parakeet': 1})
Counter({'bird': 1531, 'tyrant': 951, 'eagle': 871, 'raven': 860, 'vulture': 678, 'crow': 546, 'dove':