In [1]:
import pandas as pd
import numpy as np
import os
import re

In [2]:
data = {'artist':[], 'song_name': [], 'file_name': [], 'lyrics': []}

loc = os.listdir('lyrics')
for artist in loc:
    if artist != '.DS_Store':
        for file in os.listdir(f'lyrics/{artist}'):
            if file[0] != '.':
                song_name = file.replace('.txt', '').replace('_', ' ').title()
                data['artist'].append(artist)
                data['song_name'].append(song_name)
                data['file_name'].append(file)
                with open(f'lyrics/{artist}/{file}', 'r') as f:
                    lyrics = f.read()
                data['lyrics'].append(lyrics.replace('\n', ' '))
            
data = pd.DataFrame.from_dict(data)
print(f'Record Count: {len(data)}')
data.head()

Record Count: 2116


Unnamed: 0,artist,song_name,file_name,lyrics
0,Gary Wright,Love Is Alive,love_is_alive.txt,"Well, I think it's time to get ready To realiz..."
1,LMFAO,Party Rock Anthem,party_rock_anthem.txt,Party rock Let's go! Party rock is in the hou...
2,Metallica,2 X 4,2_x_4.txt,"I’m gonna make you, shake you, take you I’m go..."
3,Metallica,Prince Charming,prince_charming.txt,There’s a black cloud overhead That’s me And t...
4,Metallica,Now That We'Re Dead,now_that_we're_dead.txt,When darkness falls May it be That we should s...


### Expand Contractions

In [3]:
from contractions import CONTRACTION_MAP

def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    
    # Cxpand contractions into individual words using pre-defined list of contractions.
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), 
                                      flags=re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        # Find words in text that match with contraction map, and return expanded text
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)\
                                if contraction_mapping.get(match)\
                                else contraction_mapping.get(match.lower())                       
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
        
    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

### Remove Special Characters

In [4]:
def remove_special_characters(text, remove_digits=False):
    # Remove special characters from text
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    text = re.sub(pattern, ' ', text)
    return text

### Lemmatize Text

In [5]:
import spacy
nlp = spacy.load('en_core_web_md', parse=True, tag=True, entity=True)

def lemmatize_text(text):
    # Get base form for word variants (i.e. running ==> run, cats ==> cat, etc.)
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text

### Remove Stopwords and Tokenize

In [6]:
import nltk
from nltk.tokenize.toktok import ToktokTokenizer

tokenizer = ToktokTokenizer()

# Remove words 'no' and 'not' from stopword list as these may change meaning of words/phrases
stopword_list = nltk.corpus.stopwords.words('english')
for word in ['no', 'not', 'he', 'she', 'his', 'her', 'hers']:
    stopword_list.remove(word)

# Add characters not captured in special character removal to stop list
stopword_list += ['[', ']', "\\"]                     

def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

### Apply All Cleaning Functions

In [7]:
import unicodedata

def clean_text(cust_text):
    # Remove accent characters => Expand Contractions => Remove special characters => Lammatize Text => Remove Stop Words
    no_accent_chars = unicodedata.normalize('NFKD', cust_text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    expanded_text = expand_contractions(no_accent_chars)
    no_special_chars = remove_special_characters(expanded_text, remove_digits=False)
    lem_text = lemmatize_text(no_special_chars)
    filtered_text = remove_stopwords(lem_text)

    # Make all text lowercase
    lowercase_text = filtered_text.lower()

    # Split text into list of words
    words = lowercase_text.split()
    return words

In [8]:
data['clean_lyrics'] = data.lyrics.apply(lambda x: clean_text(x))
data.head()

Unnamed: 0,artist,song_name,file_name,lyrics,clean_lyrics
0,Gary Wright,Love Is Alive,love_is_alive.txt,"Well, I think it's time to get ready To realiz...","[well, think, time, get, ready, realize, find,..."
1,LMFAO,Party Rock Anthem,party_rock_anthem.txt,Party rock Let's go! Party rock is in the hou...,"[party, rock, let, us, go, party, rock, house,..."
2,Metallica,2 X 4,2_x_4.txt,"I’m gonna make you, shake you, take you I’m go...","[go, make, shake, take, go, one, break, put, s..."
3,Metallica,Prince Charming,prince_charming.txt,There’s a black cloud overhead That’s me And t...,"[black, cloud, overhead, poison, ivy, choke, t..."
4,Metallica,Now That We'Re Dead,now_that_we're_dead.txt,When darkness falls May it be That we should s...,"[darkness, fall, may, see, light, reaper, call..."


In [11]:
data.to_csv('recommendation_resources/preprocessed_lyrics.csv')

In [57]:
rec_df = pd.read_csv('recommendation_resources/preprocessed_lyrics.csv', index_col=0, usecols=list(range(4)))
rec_df.head()

Unnamed: 0,artist,song_name,file_name
0,Gary Wright,Love Is Alive,love_is_alive.txt
1,LMFAO,Party Rock Anthem,party_rock_anthem.txt
2,Metallica,2 X 4,2_x_4.txt
3,Metallica,Prince Charming,prince_charming.txt
4,Metallica,Now That We'Re Dead,now_that_we're_dead.txt


In [66]:
all_recs = []

for i in rec_df.index:
    rec_ids = [np.random.randint(0, len(data)) for i in range(5)]
    recs = {}
    for j in rec_ids:
        recs[rec_df.artist[j]]={rec_df.song_name[j]:rec_df.file_name[j]}
#         recs.append(rec)
    all_recs.append(recs)

rec_df['recommendations'] = all_recs
rec_df.head()

Unnamed: 0,artist,song_name,file_name,recommendations
0,Gary Wright,Love Is Alive,love_is_alive.txt,{'Temptations': {'Love On My Mind Tonight': 'l...
1,LMFAO,Party Rock Anthem,party_rock_anthem.txt,{'Frank Sinatra': {'My Funny Valentine': 'my_f...
2,Metallica,2 X 4,2_x_4.txt,{'Little Richard': {'Tutti Frutti': 'tutti_fru...
3,Metallica,Prince Charming,prince_charming.txt,{'Frank Sinatra': {'I'Ve Got You Under My Skin...
4,Metallica,Now That We'Re Dead,now_that_we're_dead.txt,{'Hank Williams': {'I Won'T Be Home No More': ...


In [68]:
import json

recs = {}
for artist in rec_df.artist.unique():
    recs[artist] = {}

for i in rec_df.index:
    song_info = {
        'file_name':rec_df.file_name[i],
        'recommendations':rec_df.recommendations[i],
    }
    recs[rec_df.artist[i]][rec_df.song_name[i]]=song_info

with open("recommendations.json", "w") as outfile:  
    json.dump(recs, outfile)

recs

{'Gary Wright': {'Love Is Alive': {'file_name': 'love_is_alive.txt',
   'recommendations': {'Temptations': {'Love On My Mind Tonight': 'love_on_my_mind_tonight.txt'},
    'Beatles': {'You Like Me Too Much': 'you_like_me_too_much.txt'},
    'Simon & Garfunkel': {'Save The Life Of My Child': 'save_the_life_of_my_child.txt'},
    'Frank Sinatra': {'Taking A Chance On Love': 'taking_a_chance_on_love.txt'},
    'Monkees': {"I'M A Believer": "i'm_a_believer.txt"}}}},
 'LMFAO': {'Party Rock Anthem': {'file_name': 'party_rock_anthem.txt',
   'recommendations': {'Frank Sinatra': {'My Funny Valentine': 'my_funny_valentine.txt'},
    'Ray Charles': {'After My Laughter Came Tears': 'after_my_laughter_came_tears.txt'},
    'Jerry Lee Lewis': {'Help Me Make It Through The Night': 'help_me_make_it_through_the_night.txt'},
    'Little Richard': {'Baby Face': 'baby_face.txt'},
    'Who': {'Blue, Red And Grey': 'blue,_red_and_grey.txt'}}}},
 'Metallica': {'2 X 4': {'file_name': '2_x_4.txt',
   'recommen

In [17]:
# chars = '*" / \ [ ] : ; | ,'

artists = os.listdir('lyrics')
# for char in chars:
for artist in artists:
    songs = os.listdir(f'lyrics/{artist}')
    for song in songs:
        os.rename(f'lyrics/{artist}/{song}', f"lyrics/{artist}/{song.replace('txt', '.txt')}")