# Description

The lyrics are stored as a string in a dataframe that contains other information such as the artist and genre. The lyrics in raw format are those that were returned by a Genius API call. The functions below pre-process these raw lyrics and make them suitable to pass into a NLP model. 

In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.util import ngrams
from nltk.stem import WordNetLemmatizer 
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\james\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
def read_lyrics_data(file_name):
    return pd.read_csv(file_name)

# Extract the song lyrics from the dataframe
def extract_song_lyrics(song_lyrics):
    processed_song_lyrics = ""
    phrase = ""
    for letter in song_lyrics:
        if letter == '[': # Look for opening bracket that marks beginning of verse description
            if phrase:
                processed_song_lyrics += phrase
            phrase = ""
        elif letter == ']': # Look for closing bracket to mark end of verse description
            phrase = ""
        else:
            phrase += letter
    processed_song_lyrics += phrase
    return processed_song_lyrics

In [3]:
# Pre-processes lyrics returned from API call so that they can be passed into model
def filter_tokens(lyrics):
    final_tokens = []
    tokens = list(ngrams(nltk.word_tokenize(lyrics), 1))
    tail_idx = 0
    
    lemmatizer = WordNetLemmatizer()
    for token in tokens:
        word = token[0]
        word = word.replace('"', '')
        word = word.replace("'", '')
        
        word = word.replace('\d+', '')
        word = word.replace('[^\w\s]', '')
        
        if word.lower().islower():
            if len(word) == 1 and (word.lower() == 'i' or word.lower() == 'a'):
                final_tokens.append(word)
                tail_idx += 1
            elif word == 're' or len(word) == 1: # Handle contractions
                if tail_idx >= 1:
                    spliced_word = final_tokens[tail_idx - 1] + "'" + word
                    final_tokens[tail_idx - 1] = spliced_word
                else:
                    final_tokens.append(word)
                    tail_idx += 1
            elif word == 'nt':
                if tail_idx >= 1:
                    spliced_word = final_tokens[tail_idx - 1] + "n't"
                    final_tokens[tail_idx - 1] = spliced_word
                else:
                    final_tokens.append(word)
                    tail_idx += 1
            else:
                new_word = lemmatizer.lemmatize(word)
                if tail_idx >= 2: # Try to remove duplicate words over and over
                    if new_word != final_tokens[tail_idx - 1] and new_word != final_tokens[tail_idx - 2]:
                        final_tokens.append(new_word)
                        tail_idx += 1
                else:
                    final_tokens.append(new_word)
                    tail_idx += 1
    return " ".join(final_tokens)

In [4]:
def process_lyrics_df(lyrics_df):
    num_rows = lyrics_df.shape[0]
    final_df = []
    for r in range(num_rows):
        point = {}
        single_row = lyrics_df.iloc[r]
        artist = single_row.Artist
        genre = single_row.Genre
        song = single_row.Song
        
        if artist == 'Cam':
            continue
        lyrics = single_row.Lyrics
        
        lyrics = lyrics[1:len(lyrics) - 1]
        lyrics = extract_song_lyrics(lyrics)
        lyrics = filter_tokens(lyrics)
        point['Artist'] = artist
        point['Genre'] = genre
        point['Song'] = song
        point['Lyrics'] = lyrics
        final_df.append(point)
    final_df = pd.DataFrame(final_df)
    final_df.to_csv("cleaned_lyrics.csv", index = False)
    return final_df

In [6]:
lyrics_data = read_lyrics_data("LyricsData.csv")

In [7]:
processed_data = process_lyrics_df(lyrics_data)
processed_data.to_csv("cleaned_lyrics.csv", index = False)

LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/english.pickle[0m

  Searched in:
    - 'C:\\Users\\james/nltk_data'
    - 'C:\\Users\\james\\Anaconda3\\nltk_data'
    - 'C:\\Users\\james\\Anaconda3\\share\\nltk_data'
    - 'C:\\Users\\james\\Anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\james\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - ''
**********************************************************************
