# Notebook 2 - Lyrics Preprocessing

In [64]:
# Needed Packages
import pandas as pd
import re

In [6]:
# Load the gzip-compressed lyrics dataset
raw_data_df = pd.read_csv("data/lyrics.csv.gz", compression="gzip")

# How many songs does each genre contribute?
genre_counts = raw_data_df["genre"].value_counts()
print(genre_counts)

# Preview the first five rows
raw_data_df.head(n=5)

genre
rock       1000
rap        1000
r-b         984
pop         979
country     976
Name: count, dtype: int64
(4939, 5)


Unnamed: 0.1,Unnamed: 0,artist,title,lyrics,genre
0,0,['Lil Nas X'],Old Town Road (Remix),"Oh, oh-oh\nOh\n\n[Chorus: Billy Ray Cyrus]\nYe...",country
1,1,['Taylor Swift'],All Too Well (10 Minute Version) (Taylor's Ver...,"I walked through the door with you, the air wa...",country
2,2,['Taylor Swift'],All Too Well (10 Minute Version) (Taylor's Ver...,"I walked through the door with you, the air wa...",country
3,3,['Lil Nas X'],Old Town Road,"Yeah, I'm gonna take my horse to the old town ...",country
4,4,['Taylor Swift'],Lover,We could leave the Christmas lights up 'til Ja...,country


## Drop Unnecessary Index Column

In [21]:
# Drop the leftover index column that `read_csv` labelled "Unnamed: ...".
# Selecting it by label rather than by position keeps this cell safe to
# re-run: `raw_data_df.columns[0]` would remove a real column (e.g. `artist`)
# on a second execution.
unnamed_cols = [
    col for col in raw_data_df.columns if str(col).startswith("Unnamed:")
]
raw_data_df = raw_data_df.drop(columns=unnamed_cols)

## Clean up `artist` names
Currently, artist names are formatted as ['_name_']. Artist names should be plain strings without brackets or quotation marks.

In [35]:
# Matches the stringified-list wrapper around an artist entry, e.g.
# "['Lil Nas X']". Group 1 is the text between the outermost quotes; it is
# absent for an empty list ("[]").
_ARTIST_LIST_WRAPPER = re.compile(r"\[(?:['\"](.*)['\"])?\]", re.DOTALL)

# Boundary between two artists once the outer wrapper is removed, e.g. "', '".
# Either quote style is accepted on each side so that lists mixing 'single'
# and "double" quoted names are split as well.
_ARTIST_SEPARATOR = re.compile(r"['\"], ['\"]")


def _clean_artist(raw_name):
    """Turn a stringified artist list into a plain comma-separated string.

    Parameters
    ----------
    raw_name : str
        Value from the `artist` column, e.g. "['name_1', 'name_2']".
        Non-string values (such as NaN) are returned unchanged.

    Returns
    -------
    str
        The artist name(s) without surrounding brackets/quotes, joined by
        ", " and with backslashes removed.

    Notes
    -----
    The wrapper is stripped only when it is actually present, so names that
    were already cleaned are not sliced again if the cell is re-run.
    Apostrophes that are part of an artist's name are kept in place.
    """
    if not isinstance(raw_name, str):
        return raw_name

    # Remove the preceding [' and ending '] only when the value is wrapped
    wrapped = _ARTIST_LIST_WRAPPER.fullmatch(raw_name)
    name = (wrapped.group(1) or "") if wrapped else raw_name

    # Songs with multiple artists now read artist_1', 'artist_2;
    # collapse each quoted separator into a plain ", "
    name = _ARTIST_SEPARATOR.sub(", ", name)

    # Some artists have a stray backslash in their name; remove it
    return name.replace("\\", "")


raw_data_df["artist"] = raw_data_df["artist"].apply(_clean_artist)

## Remove multiple versions of the same songs
In the scraped data, some songs share the same lyrics because they are remixes or alternative versions of the original song. For example, Taylor Swift's "All Too Well" appears at least twice: once in its original form and again as a "Live Version". To ensure that each record is unique, these duplicate versions are removed.

In [91]:
# Collect every song title that contains the word "Remix"
[
    song_title
    for song_title in raw_data_df["title"].tolist()
    if "Remix" in song_title
]

['Old Town Road (Remix)',
 'Old Town Road (Young Thug & Mason Ramsey Remix)',
 'Lover (Remix)',
 'Cruise (Remix)',
 'Old Town Road (Seoul Town Road Remix)',
 'Like a Farmer (Remix)',
 'Old Town Hoe (Old Town Road Remix)',
 'Gasoline (Remix)',
 'Lost in the Middle of Nowhere (Spanish Remix)',
 'The Bones (Remix)',
 'Daddy Issues (Remix)',
 'Radioactive (Remix)',
 'The Night We Met (Remix)',
 'POWER (Remix)',
 'Despacito (Remix)',
 'Old Town Road (Remix)',
 'Girls Like You (Remix)',
 'Finesse (Remix)',
 'Te Boté (Remix)',
 '***Flawless (Remix)',
 'Luis Fonsi & Daddy Yankee - Despacito (Remix) ft. Justin Bieber (English Translation)',
 'MIC Drop (Steve Aoki Remix)',
 'Reggaetón Lento (Remix)',
 'Bad Blood (Remix)',
 'Say So (Remix)',
 'No Me Conoce (Remix)',
 'Daddy Issues (Remix)',
 'Toda (Remix)',
 'Dream - Mask (Official Sus Remix)',
 '34+35 (Remix)',
 'Save Your Tears (Remix)',
 'Drunk in Love (Remix)',
 '***Flawless (Remix)',
 'Girls Need Love (Remix)',
 'Daddy Issues (Remix)',
 'Bad

## Lyric Preprocessing
Preprocessing Steps:
- Lowercasing
- ...
- ...