# Notebook 2 - Lyrics Preprocessing

In [1]:
# Needed Packages
from collections import Counter
import numpy as np
import pandas as pd
import re

In [2]:
# Load the gzip-compressed lyrics table into a DataFrame
raw_data_df = pd.read_csv("data/lyrics.csv.gz", compression="gzip")

# How many songs does each genre contribute?
genre_counts = raw_data_df["genre"].value_counts()
print(genre_counts)

# Preview the first few rows
raw_data_df.head()

genre
rock       1000
rap        1000
r-b         984
pop         979
country     976
Name: count, dtype: int64


Unnamed: 0.1,Unnamed: 0,artist,title,lyrics,genre
0,0,['Lil Nas X'],Old Town Road (Remix),"Oh, oh-oh\nOh\n\n[Chorus: Billy Ray Cyrus]\nYe...",country
1,1,['Taylor Swift'],All Too Well (10 Minute Version) (Taylor's Ver...,"I walked through the door with you, the air wa...",country
2,2,['Taylor Swift'],All Too Well (10 Minute Version) (Taylor's Ver...,"I walked through the door with you, the air wa...",country
3,3,['Lil Nas X'],Old Town Road,"Yeah, I'm gonna take my horse to the old town ...",country
4,4,['Taylor Swift'],Lover,We could leave the Christmas lights up 'til Ja...,country


## Drop Unnecessary Index Column

In [3]:
# Drop the leftover index column by name rather than by position.
# pandas labels a header-less CSV column "Unnamed: <n>"; selecting on that
# prefix keeps this cell idempotent -- a positional drop of columns[0] would
# remove `artist` if the cell were run a second time.
unnamed_cols = [col for col in raw_data_df.columns if str(col).startswith("Unnamed")]
raw_data_df = raw_data_df.drop(columns=unnamed_cols)

## Clean up `artist` names
Currently, artist names are formatted as ['_name_']. Artist names should be strings without brackets or quotation marks.

In [4]:
def _clean_artist(raw_name):
    """Turn a stringified artist list such as "['A', 'B']" into "A, B".

    Parameters
    ----------
    raw_name : str
        Artist value as stored in the raw data.

    Returns
    -------
    str
        The artist name(s) without the list wrapper, with multiple artists
        separated by ", " and with backslashes removed.
    """
    # Remove the preceding [' and ending '] only while the list wrapper is
    # still present, so re-running this cell cannot trim real characters
    # off a name that has already been cleaned
    if raw_name.startswith("[") and raw_name.endswith("]"):
        raw_name = raw_name[2:-2]

    # Songs with multiple artists are now listed as artist_1', 'artist_2
    # Replace the ', ' separator with a plain comma + space
    # NOTE: This keeps apostrophes in place where the apostrophe
    # is part of the artist's name
    raw_name = raw_name.replace("', '", ", ")

    # Some artists have a stray backslash in the name; clean it out
    return raw_name.replace("\\", "")


raw_data_df["artist"] = raw_data_df["artist"].map(_clean_artist)

## Remove multiple versions of the same songs
Some song titles appear more than once because the song is listed under multiple genres. The first step is to consolidate these duplicate titles into one row where the genre value is a list of genres.

In addition, some songs have the same lyrics because they are remixes or alternative versions of the original song. For example, Taylor Swift's All Too Well appears at least twice, once in its original form and again as a "Live Version". To ensure that each record is unique, these duplicate versions are removed. This assumes that remixes only affect the instrumentation, not the lyrics. 

However, some songs are only represented by their remixed versions. We must also ensure that those songs are not removed.

In [5]:
# 1. Consolidating songs with two genres
# Tally how many rows carry each song title
title_counter = Counter(raw_data_df["title"])

# Keep only the titles that show up on more than one row
dupe_songs = [title for title, count in title_counter.items() if count > 1]

In [6]:
# Build one consolidated record per (title, artist) pair among the duplicated
# titles. This expects `genre` to still hold plain strings, i.e. it runs
# before the cell that wraps each genre into a list.
compiled_genre_list = []
for song in dupe_songs:
    # Filter by title once per song instead of once per singer
    same_title_df = raw_data_df.loc[raw_data_df["title"] == song]
    # Get a list of singers - in case multiple singers have the same song title
    singer_list = list(same_title_df["artist"].unique())
    # Compile a list of genres for the singer-song combination
    for singer in singer_list:
        sub_df = same_title_df.loc[same_title_df["artist"] == singer]
        # The same song can be listed several times under one genre;
        # dict.fromkeys drops the repeats while keeping first-seen order
        genre_list = list(dict.fromkeys(sub_df["genre"]))
        # Select the lyrics by column name rather than by position so the
        # lookup does not depend on the column order
        lyrics = sub_df["lyrics"].iloc[0]
        # Compile information into a dictionary
        new_dict = {
            "artist" : singer, "title" : song,
            "lyrics" : lyrics, "genre": genre_list
            }
        compiled_genre_list.append(new_dict)

In [7]:
# For consistency, put all the genres into a list
# NOTE: raw_genre_df is a second name for raw_data_df (no copy is made), so the
# wrapping below is visible through raw_data_df as well.
raw_genre_df = raw_data_df
# Wrap only values that are not lists yet, so re-running this cell does not
# produce nested lists such as [['rock']]
raw_genre_df["genre"] = raw_genre_df["genre"].apply(
    lambda genre: genre if isinstance(genre, list) else [genre]
)

In [8]:
# Using the title Counter from above, collect the titles that occur only once
one_count = [title for title, n_rows in title_counter.items() if n_rows == 1]

# Rows whose title is unique need no consolidation
is_single_title = raw_data_df["title"].isin(one_count)
one_instance_df = raw_data_df[is_single_title]

# One row per consolidated (artist, title) record built from the duplicates
multi_genre_df = pd.DataFrame(compiled_genre_list)

# Stack both parts into a single frame
# NOTE(review): reset_index() without drop=True keeps the previous row labels
# as an `index` column -- confirm whether that column is wanted downstream.
frames_to_stack = [one_instance_df, multi_genre_df]
genre_final_df = pd.concat(frames_to_stack).reset_index()


In [14]:
# Inspect the combined frame: single-occurrence rows followed by the
# consolidated duplicate-title rows
genre_final_df

Unnamed: 0,index,artist,title,lyrics,genre
0,5,Taylor Swift,​betty,"Betty, I won't make assumptions\nAbout why you...",[country]
1,10,John Denver,"Take Me Home, Country Roads","Almost Heaven, West Virginia\nBlue Ridge Mount...",[country]
2,18,Post Malone,Feeling Whitney,"I've been looking for someone...\nOoh, ooh, oo...",[country]
3,20,Cam,Burning House,\n[Verse 1]\nI had a dream about a burning hou...,[country]
4,21,Johnny Cash,Folsom Prison Blues,"I hear the train a-comin', it's rolling 'round...",[country]
...,...,...,...,...,...
4119,962,Rihanna,Close to You,"Nothing but a tear, that's all for breakfast\n...",[r-b]
4120,963,Big Sean,Blessings,"Look\nI feel blessed\nWay up, I feel blessed\n...",[rap]
4121,964,Chance the Rapper,Blessings,"I'm gon' praise Him, praise Him 'til I'm gone\...",[rap]
4122,965,Drake,Sacrifices,"Wrote this shit, January 21\nBaby girl, I had ...",[rap]


## Lyric Preprocessing
Preprocessing Steps:
- Lowercasing
- ...
- ...