# Data Wrangling
## Imports

In [None]:
import pandas as pd
from dotenv import load_dotenv
import os
from lyricsgenius import Genius

## Genius API environment

In [None]:
# Load variables from a local .env file into the process environment.
load_dotenv()
# Read the Genius API token from the "API_KEY" environment variable
# (None when the variable is not set).
API_KEY = os.environ.get("API_KEY")
genius = Genius(API_KEY)
# Configure the client to drop section headers from the lyrics it returns.
genius.remove_section_headers = True


## Read in the Billboard Hot 100 song data

In [None]:
# Load the Hot 100 chart data from a CSV file in the working directory.
hot_100_csv = 'Hot 100.csv'
df = pd.read_csv(hot_100_csv)

## Data Cleaning

Make sure dates are being read as datetimes

In [None]:
# Convert both date columns to pandas datetimes.
for date_column in ('chart_date', 'chart_debut'):
    df[date_column] = pd.to_datetime(df[date_column])

In [None]:
# Replace missing values in the two week-counter columns with 0
# (presumably so the integer cast further down does not fail on NaN).
#
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on df[...]: the in-place form modifies an intermediate
# object obtained by selecting the column, which pandas warns about and which
# does not update df when Copy-on-Write is enabled.
df['consecutive_weeks'] = df['consecutive_weeks'].fillna(value=0)
df['previous_week'] = df['previous_week'].fillna(value=0)

Make sure whole numbers are being saved as integers

In [None]:
# Tabulate every column's dtype so numeric columns can be selected by name.
frame = pd.DataFrame(df.dtypes)
dtype_column = frame.iloc[:, 0]
# Collect the float64 column names first, then the int64 column names.
numericals = [
    column
    for wanted in ('float64', 'int64')
    for column in dtype_column[dtype_column == wanted].keys()
]
# Cast every selected column to 32-bit integers.
for column in numericals:
    df[column] = df[column].astype('int32')

In [None]:
# Sort the rows by chart date, modifying df in place. With this ordering the
# drop_duplicates call on song_id below keeps the first-listed row for each
# song.
df.sort_values(by='chart_date',inplace=True)

In [None]:
# Print a summary of df (columns, dtypes, non-null counts) to check the
# cleaning steps above.
df.info()

## Create dataframe with unique songs

In [None]:

# Keep one row per song_id: the first occurrence in df's current row order.
unique_df = df.drop_duplicates(subset=['song_id'], keep='first')

In [None]:
# Print a summary of the de-duplicated frame (one row per song_id).
unique_df.info()

## Define Function for API calls to Genius

In [None]:
def get_lyrics_genius(row):
    """Look up the lyrics for one song row via the Genius client.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``row['song']`` (the title) and ``row['performer']``
        (the artist).

    Returns
    -------
    str or float
        The lyrics text with the leading ``"<title> lyrics"`` header removed,
        or ``float('nan')`` when the title is not a string, the search fails
        or finds nothing, or the returned lyrics do not start with the exact
        title.

    Raises
    ------
    KeyError
        If ``row`` has no ``'song'`` or ``'performer'`` entry.
    """
    missing = float('nan')
    title = row['song']
    artist = row['performer']

    # A non-string title (e.g. NaN from a missing cell) cannot be compared
    # with the lyrics header, so skip the API call entirely.
    if not isinstance(title, str):
        return missing

    try:
        song = genius.search_song(title, artist=artist)
        # search_song may hand back None when nothing matches; the attribute
        # access then raises and is treated like any other failed lookup.
        lyrics = song.lyrics
    except Exception:
        # Best-effort lookup: any API or lookup error means "no lyrics".
        # Catching Exception rather than using a bare except lets
        # KeyboardInterrupt and SystemExit stop a long-running scrape.
        return missing

    # Accept the result only if the text starts with the exact
    # (case-sensitive) title; otherwise treat it as a wrong match.
    if not isinstance(lyrics, str) or not lyrics.startswith(title):
        return missing

    # Drop the title plus the 7 characters that follow it, i.e. a header of
    # the form "<title> lyrics".
    return lyrics[len(title) + len(' lyrics'):]


## Define how much of the dataframe we want to look at

In [None]:
# Positional window [start, stop) of unique_df to process in this run.
start = 1
stop = 2
# Take an explicit copy of the slice. A 'lyrics' column is assigned to
# subset_df afterwards, and writing to a bare .iloc slice of another frame
# triggers SettingWithCopyWarning and has ambiguous view/copy semantics.
subset_df = unique_df.iloc[start:stop].copy()

## Run the function and save the lyrics to the subset of the dataframe

In [None]:
# Call get_lyrics_genius once per row (axis=1 passes each row to the function)
# and store the results in a new 'lyrics' column.
lyrics_by_row = subset_df.apply(get_lyrics_genius, axis=1)
subset_df['lyrics'] = lyrics_by_row

## Save data

In [None]:
# Pickle this chunk under a name that encodes its start/stop positions,
# e.g. './dataframe1-2.pkl'.
chunk_path = f'./dataframe{start}-{stop}.pkl'
subset_df.to_pickle(chunk_path)

In [None]:
# Load a previously saved dataframe from './complete_df.pkl'.
# NOTE(review): no visible cell writes this file — presumably it was assembled
# from the per-chunk pickles elsewhere; confirm.
# NOTE: unpickling can execute arbitrary code, so only load pickle files from
# a trusted source.
first_pass_df = pd.read_pickle('./complete_df.pkl')