# Notebook 2 - Lyrics Preprocessing

In [97]:
# Needed Packages
from collections import Counter
import numpy as np
import pandas as pd
import re

In [6]:
# Load the gzip-compressed lyrics CSV into a DataFrame
raw_data_df = pd.read_csv(
    "data/lyrics.csv.gz",
    compression="gzip",
)

# How many songs does each genre contribute?
songs_per_genre = raw_data_df["genre"].value_counts()
print(songs_per_genre)

# Preview the first five rows (rendered as the cell's output)
raw_data_df.head(5)

genre
rock       1000
rap        1000
r-b         984
pop         979
country     976
Name: count, dtype: int64
(4939, 5)


Unnamed: 0.1,Unnamed: 0,artist,title,lyrics,genre
0,0,['Lil Nas X'],Old Town Road (Remix),"Oh, oh-oh\nOh\n\n[Chorus: Billy Ray Cyrus]\nYe...",country
1,1,['Taylor Swift'],All Too Well (10 Minute Version) (Taylor's Ver...,"I walked through the door with you, the air wa...",country
2,2,['Taylor Swift'],All Too Well (10 Minute Version) (Taylor's Ver...,"I walked through the door with you, the air wa...",country
3,3,['Lil Nas X'],Old Town Road,"Yeah, I'm gonna take my horse to the old town ...",country
4,4,['Taylor Swift'],Lover,We could leave the Christmas lights up 'til Ja...,country


## Drop Unnecessary Index Column

In [21]:
# Drop the leftover index column(s). pandas labels header-less CSV columns
# "Unnamed: N", so they are selected by name instead of by position.
# Unlike `drop(raw_data_df.columns[0])`, this is idempotent: re-running the
# cell is a no-op and can never remove `artist` or another real column.
raw_data_df = raw_data_df.drop(
    columns=[col for col in raw_data_df.columns if str(col).startswith("Unnamed")]
)

## Clean up `artist` names
Currently, artist names are formatted as `['name']`. Artist names should be plain strings without brackets or quotation marks.

In [35]:
# Clean the stringified artist lists in three vectorised passes.
#
# The patterns are anchored / quote-aware instead of slicing with x[2:-2]:
#   * re-running the cell is a no-op (slicing would trim two more real
#     characters from each end on every re-run);
#   * entries wrapped in double quotes (names that contain an apostrophe,
#     e.g. ["Guns N' Roses"]) are handled the same as single-quoted ones;
#   * missing values pass through as NaN instead of raising.
#
# NOTE: Apostrophes that are part of an artist's name are kept in place.
raw_data_df["artist"] = (
    raw_data_df["artist"]
    # 1. Strip the leading [' or [" and the trailing '] or "]
    .str.replace(r"""^\[['"]|['"]\]$""", "", regex=True)
    # 2. Songs with multiple artists read artist_1', 'artist_2 after step 1;
    #    collapse each quote-comma-quote separator into a plain ", "
    .str.replace(r"""['"], ['"]""", ", ", regex=True)
    # 3. Some artists have a stray backslash in the name; remove it
    .str.replace("\\", "", regex=False)
)

## Remove multiple versions of the same songs
Some song titles are represented more than once. This is due to the song having multiple genres. The first step is to consolidate these duplicate titles into one row where the genre value is a list of genres.

In addition, some songs have the same lyrics because they are remixes or alternative versions of the original song. For example, Taylor Swift's "All Too Well" appears at least twice: once in its original form and again as a "Live Version". To ensure that each record is unique, these duplicate versions are removed. This assumes that remixes only affect the instrumentation, not the lyrics. 

However, some songs are only represented by their remixed versions. We must also ensure that those songs are not removed.

In [160]:
# 1. Consolidating songs with two genres
# Tally how many rows each title has in the data
title_counter = Counter(raw_data_df["title"])

# Keep only the titles that appear on more than one row
dupe_songs = [name for name, n_rows in title_counter.items() if n_rows > 1]



In [182]:
# Spot check: every genre recorded for one (title, artist) pair
raw_data_df.loc[
    (raw_data_df["title"] == "Sunflower")
    & (raw_data_df["artist"] == "Rex Orange County"),
    "genre",
].tolist()

['rock', 'pop', 'r-b']

In [None]:
# Final df: one row per (title, artist) pair, with `genre` as a list.
#
# `nondupe_df` was never assigned in this cell although the remix cells
# below read it, so the notebook failed on a fresh kernel. The consolidation
# follows the section description: rows that share a title AND an artist are
# merged, and their genres are collected into a de-duplicated list (order of
# first appearance). Grouping on the artist as well keeps different songs
# that merely share a title apart.
#
# NOTE(review): assumes all rows of one (title, artist) pair carry the same
# lyrics, so the first occurrence is kept - confirm against the data.
nondupe_df = (
    raw_data_df
    # sort=False keeps the original row order; dropna=False keeps rows whose
    # title or artist is missing instead of silently discarding them
    .groupby(["title", "artist"], as_index=False, sort=False, dropna=False)
    .agg(
        lyrics=("lyrics", "first"),
        genre=("genre", lambda genres: list(dict.fromkeys(genres))),
    )
    .loc[:, ["artist", "title", "lyrics", "genre"]]
)

nondupe_df.head()

In [138]:
# Look for remixes to drop - remixes with standard versions included in the dataframe

# Matches one parenthetical that mentions "Remix", together with the
# whitespace in front of it, e.g. " (Remix)" or " (Eminem Remix)".
REMIX_TAG = re.compile(r"\s*\([^()]*Remix[^()]*\)")


def strip_remix_tag(title):
    """Return `title` with every "(... Remix ...)" parenthetical removed.

    Other parentheticals are preserved, so "Dance (A$$) (Remix)" becomes
    "Dance (A$$)". A title without such a tag is returned unchanged apart
    from surrounding whitespace.
    """
    return REMIX_TAG.sub("", title).strip()


# Work on clean strings only; a missing title cannot be a remix
all_titles = nondupe_df["title"].dropna().astype(str).tolist()

# All Titles with the Phrase "Remix" in the title
remix_list = [x for x in all_titles if "Remix" in x]

# Titles of standard (non-remix) versions, normalised for comparison.
# Built once so each remix is checked with a set lookup, not a full scan.
standard_titles = {x.strip().casefold() for x in all_titles if "Remix" not in x}

# A remix is dropped only when its root title exists as a standard version.
# The root is compared for equality, not as a substring: "Bad (Remix)" must
# not be dropped just because "Bad Guy" exists. Remixes are also never
# counted as evidence for each other, so a song that is represented only by
# remixes is kept.
drop_list = [
    title
    for title in remix_list
    if strip_remix_tag(title).casefold() in standard_titles
]

# Show which titles will be removed
drop_list


['Old Town Road (Remix)', 'Old Town Road (Young Thug & Mason Ramsey Remix)', 'Lover (Remix)', 'Cruise (Remix)', 'Old Town Road (Seoul Town Road Remix)', 'Like a Farmer (Remix)', 'Gasoline (Remix)', 'The Bones (Remix)', 'Daddy Issues (Remix)', 'Radioactive (Remix)', 'The Night We Met (Remix)', 'POWER (Remix)', 'Despacito (Remix)', 'Girls Like You (Remix)', 'Finesse (Remix)', 'Say So (Remix)', 'Toda (Remix)', 'Save Your Tears (Remix)', 'Drunk in Love (Remix)', 'Girls Need Love (Remix)', 'Bad (Remix)', 'The Hills (Eminem Remix)', 'Beautiful (Remix)', 'Havana (Remix)', 'Deuces (Remix)', 'No Love (Remix)', 'Diamonds (Remix)', 'Wanna Know (Remix)', 'Die For You (Remix)', 'All I Need (Remix)', 'Plain Jane (Remix)', 'Versace (Remix)', 'My Way (Remix)', 'WHATS POPPIN (Remix)', 'Body (Remix)', 'Dance (A$$) (Remix)', 'Thotiana (Cardi B Remix)', 'Down in the DM (Remix)']


In [145]:
# Keep every row whose title is not flagged in the drop list
is_flagged_remix = nondupe_df["title"].isin(drop_list)
no_remix_df = nondupe_df[~is_flagged_remix]

## Lyric Preprocessing
Preprocessing Steps:
- Lowercasing
- ...
- ...