# Data Wrangling
## Imports

In [1]:
import pandas as pd
from dotenv import load_dotenv
import os
from lyricsgenius import Genius

## Genius API environment

In [2]:
load_dotenv()
API_KEY = os.getenv("API_KEY")
genius = Genius(API_KEY)
genius.remove_section_headers = True


## Read in the Hot 100 billboard song data

In [4]:
df = pd.read_csv('./data/Hot 100.csv')

## Data Cleaning

Make sure dates are being read as datetimes

In [5]:
df['chart_date'] = pd.to_datetime(df.chart_date)
df['chart_debut'] = pd.to_datetime(df.chart_debut)

In [6]:
df['consecutive_weeks'].fillna(value=0,inplace=True)
df['previous_week'].fillna(value=0,inplace=True)

Make sure whole numbers are being saved as integers

In [7]:
frame = pd.DataFrame(df.dtypes)
numericals = list(frame.loc[frame.iloc[:,0]=='float64'].iloc[:,0].keys())
numericals = numericals + list(frame.loc[frame.iloc[:,0]=='int64'].iloc[:,0].keys())
for x in numericals:
    df[x] = df[x].astype('int32')

In [8]:
df.sort_values(by='chart_date',inplace=True)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 335195 entries, 100546 to 79691
Data columns (total 13 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   chart_position     335195 non-null  int32         
 1   chart_date         335195 non-null  datetime64[ns]
 2   song               335195 non-null  object        
 3   performer          335195 non-null  object        
 4   song_id            335195 non-null  object        
 5   instance           335195 non-null  int32         
 6   time_on_chart      335195 non-null  int32         
 7   consecutive_weeks  335195 non-null  int32         
 8   previous_week      335195 non-null  int32         
 9   peak_position      335195 non-null  int32         
 10  worst_position     335195 non-null  int32         
 11  chart_debut        335195 non-null  datetime64[ns]
 12  chart_url          335195 non-null  object        
dtypes: datetime64[ns](2), int32(7), object(4

## Create dataframe with unique songs

In [10]:

unique_df = df.drop_duplicates(subset=['song_id'])

In [11]:
unique_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30314 entries, 100546 to 305276
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   chart_position     30314 non-null  int32         
 1   chart_date         30314 non-null  datetime64[ns]
 2   song               30314 non-null  object        
 3   performer          30314 non-null  object        
 4   song_id            30314 non-null  object        
 5   instance           30314 non-null  int32         
 6   time_on_chart      30314 non-null  int32         
 7   consecutive_weeks  30314 non-null  int32         
 8   previous_week      30314 non-null  int32         
 9   peak_position      30314 non-null  int32         
 10  worst_position     30314 non-null  int32         
 11  chart_debut        30314 non-null  datetime64[ns]
 12  chart_url          30314 non-null  object        
dtypes: datetime64[ns](2), int32(7), object(4)
memory usage:

## Define Function for API calls to Genius

In [12]:
def get_lyrics_genius(row):
    ans = float('nan')
    title = row['song']
    artist = row['performer']
    try:
        song = genius.search_song(title, artist=artist)
    except:
        ans = float('nan')
    try:
        song.lyrics
    except:
        ans = float('nan')
    else:
        if song.lyrics[:len(title)]==title:
            ans = song.lyrics[len(title)+len(' lyrics'):]
        else:
            ans = float('nan')
    return(ans)


## Define how much of the dataframe we want to look at

In [13]:
start = 1
stop = 2
subset_df = unique_df.iloc[start:stop]

## Run function and save data to the subset of the dataframe

In [14]:
subset_df['lyrics'] = subset_df.apply(get_lyrics_genius,axis=1)

Searching for "The Bird On My Head" by David Seville...
Done.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_df['lyrics'] = subset_df.apply(get_lyrics_genius,axis=1)


## Save data

In [15]:
#subset_df.to_pickle('./dataframe'+str(start)+'-'+str(stop)+'.pkl')

In [16]:
#first_pass_df = pd.read_pickle('./complete_df.pkl')