In [58]:
import numpy as np
import pandas as pd

import os
import re

import nltk
from nltk.corpus import gutenberg, wordnet, stopwords
from nltk.tokenize import sent_tokenize , word_tokenize
from nltk.stem import WordNetLemmatizer
from string import punctuation

nltk.download('omw-1.4')
nltk.download('stopwords')

[nltk_data] Downloading package omw-1.4 to /Users/jacob/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/jacob/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [56]:
# function to lemmatize

lemmatizer = WordNetLemmatizer()

# function to convert nltk tag to wordnet tag
def nltk_to_wordnet(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:          
        return None
    
# list with stopwords and punctuation to remove
stoplist = set(stopwords.words('english') + list(punctuation))

def clean_lyrics(lyrics):
    # change everything to lower case
    lyrics = lyrics.lower()
    # remove numbers
    lyrics_nonum = re.sub(r'\d+', '', lyrics)
    
    #tokenize the lyrics and find the POS tag for each token
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(lyrics_nonum))  
    
    #tuple of (token, wordnet_tag)
    wordnet_tagged = map(lambda x: (x[0], nltk_to_wordnet(x[1])), nltk_tagged)
    lemmatized_lyrics = []
    for word, tag in wordnet_tagged:
        if tag is None:
            #if there is no available tag, append the token as is
            lemmatized_lyrics.append(word)
        else:        
            #else use the tag to lemmatize the token
            lemmatized_lyrics.append(lemmatizer.lemmatize(word, tag))
            
    unique_tokens = unique_tokens = list(set(lemmatized_lyrics))
    
    # remove stopwords
    unique_nostop = [word for word in unique_tokens if word not in stoplist]
    return unique_nostop
    

In [305]:
path_songs = "billnius/songs/"
songlist = [song for song in os.listdir(path_songs) if os.path.isfile(os.path.join(path_songs, song))]
songlist

['A Holly Jolly Christmas by Burl Ives.txt',
 'Circles by Post Malone.txt',
 "Rockin' Around The Christmas Tree by Brenda Lee.txt",
 'All I Want For Christmas Is You by Mariah Carey.txt',
 'Jingle Bell Rock by Bobby Helms.txt']

In [307]:
df_songs = pd.DataFrame(columns = ['title','artist'])
song_lyrics = []

for song in songlist:    
    # getting title and artist from the file name
    title_artist = pd.DataFrame(song[:-4].split('by',1))
    title_artist = title_artist.transpose()
    title_artist.columns = ['title','artist']
        
    # clean lyrics
    source = open(os.path.join(path_songs, song), 'r', encoding='cp1252')
    lyrics = source.read()
    clean_tokens = clean_lyrics(lyrics)
    lyrics_string = ' '.join(clean_tokens)
    song_lyrics.append(lyrics_string)
    
    # add to the dataframe
    df_songs = pd.concat([df_songs, title_artist])

df_songs['lyrics'] = song_lyrics
df_songs['filename'] = songlist

df_songs

Unnamed: 0,title,artist,lyrics,filename
0,A Holly Jolly Christmas,Burl Ives,ives jolly like time best holly kiss might hun...,A Holly Jolly Christmas by Burl Ives.txt
0,Circles,Post Malone,,Circles by Post Malone.txt
0,Rockin' Around The Christmas Tree,Brenda Lee,stop jolly new get holly try hop ahh-ahh-ahh r...,Rockin' Around The Christmas Tree by Brenda Le...
0,All I Want For Christmas Is You,Mariah Carey,bridge even day embed thing north reindeer lik...,All I Want For Christmas Is You by Mariah Care...
0,Jingle Bell Rock,Bobby Helms,bridge ring blowin night away sleigh bushel ji...,Jingle Bell Rock by Bobby Helms.txt
