# Analysis of My Music/Health Data

## Read in Spotify data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import time
from lyricsgenius import Genius
from dotenv import load_dotenv
from tqdm import tqdm

# Notebook-wide setup: read variables from a local .env file, enable
# progress bars on pandas apply calls, and let wide frames show up to
# 200 columns when displayed.
load_dotenv()
tqdm.pandas()
pd.set_option('display.max_columns', 200)

# Load one file of the Spotify streaming-history export and preview it
df = pd.read_json('./spotify_data/Streaming_History_Audio_2024_29.json')
df.head(10)

Unnamed: 0,ts,platform,ms_played,conn_country,ip_addr,master_metadata_track_name,master_metadata_album_artist_name,master_metadata_album_album_name,spotify_track_uri,episode_name,episode_show_name,spotify_episode_uri,audiobook_title,audiobook_uri,audiobook_chapter_uri,audiobook_chapter_title,reason_start,reason_end,shuffle,skipped,offline,offline_timestamp,incognito_mode
0,2024-06-25T11:58:46Z,ios,132060,US,2601:14d:4b84:4a70:713f:c9ce:309c:32b9,Almost Fantasy,Fog Lake,Almost Fantasy,spotify:track:30U7MwsSebniOJyZ8y1hcE,,,,,,,,trackdone,trackdone,True,False,False,1719316594,False
1,2024-06-25T12:01:20Z,ios,152571,US,166.199.139.64,High,Slow Pulp,Big Day,spotify:track:3GaamyFHQYbQi1ykc4nV4o,,,,,,,,trackdone,trackdone,True,False,False,1719316726,False
2,2024-06-25T12:02:04Z,ios,42733,US,166.199.139.64,From the Subway Train,Vansire,Angel Youth,spotify:track:0qOCy2LqvdkMora7eZsJhQ,,,,,,,,trackdone,endplay,True,True,False,1719316880,False
3,2024-06-25T12:05:30Z,ios,205554,US,166.199.139.64,I Hope to Be Around,Men I Trust,I Hope to Be Around,spotify:track:3xnM0WCRJKk4Iz49rkG5OC,,,,,,,,clickrow,trackdone,False,False,False,1719316924,False
4,2024-06-25T12:09:17Z,ios,227426,US,166.199.139.64,Your face,Wisp,Your face,spotify:track:7ne2hzW4jaU5tacaCI4kJH,,,,,,,,trackdone,trackdone,True,False,False,1719317130,False
5,2024-06-25T12:11:07Z,ios,108182,US,166.199.139.64,Sour Switchblade,Elita,Sour Switchblade,spotify:track:4wFeMmJDlgkAxlQ07PbdGZ,,,,,,,,trackdone,fwdbtn,True,True,False,1719317357,False
6,2024-06-25T12:15:00Z,ios,233639,US,166.199.139.64,Money,The Drums,Portamento,spotify:track:3VIJBrMpvimHEw5wtPh2wB,,,,,,,,fwdbtn,trackdone,True,False,False,1719317467,False
7,2024-06-25T12:16:18Z,ios,76450,US,166.199.139.64,Sea Sick,binki,Sea Sick,spotify:track:3qiX7QXv6w6y4wsr5KP6U1,,,,,,,,trackdone,fwdbtn,True,True,False,1719317700,False
8,2024-06-25T12:22:58Z,ios,400334,US,166.199.139.64,Money,The Drums,Portamento,spotify:track:3VIJBrMpvimHEw5wtPh2wB,,,,,,,,fwdbtn,fwdbtn,True,True,False,1719317778,False
9,2024-06-25T12:23:47Z,ios,48785,US,166.199.139.64,anything,Adrianne Lenker,songs,spotify:track:4PwWESSlTwzvw9B7bmtTLS,,,,,,,,fwdbtn,fwdbtn,True,True,False,1719318178,False


## Drop, rename, and change dtypes

In [None]:
# Shorten the column names, then keep only the columns used later on and
# only the plays that lasted longer than one minute (60,000 ms)
df = df.rename(columns={
    'ts': 'date',
    'master_metadata_track_name': 'track',
    'master_metadata_album_artist_name': 'artist',
    'master_metadata_album_album_name': 'album',
    'spotify_track_uri': 'uri',
}).loc[
    lambda frame: frame['ms_played'] > 60000,
    ['date', 'ms_played', 'track', 'artist', 'album', 'uri'],
]

# Drop rows without a track name, parse the timestamps, and renumber the rows
df = (
    df.dropna(subset=["track"])
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .reset_index(drop=True)
)
df.head(10)

Unnamed: 0,date,ms_played,track,artist,album,uri
0,2024-06-25 11:58:46+00:00,132060,Almost Fantasy,Fog Lake,Almost Fantasy,spotify:track:30U7MwsSebniOJyZ8y1hcE
1,2024-06-25 12:01:20+00:00,152571,High,Slow Pulp,Big Day,spotify:track:3GaamyFHQYbQi1ykc4nV4o
2,2024-06-25 12:05:30+00:00,205554,I Hope to Be Around,Men I Trust,I Hope to Be Around,spotify:track:3xnM0WCRJKk4Iz49rkG5OC
3,2024-06-25 12:09:17+00:00,227426,Your face,Wisp,Your face,spotify:track:7ne2hzW4jaU5tacaCI4kJH
4,2024-06-25 12:11:07+00:00,108182,Sour Switchblade,Elita,Sour Switchblade,spotify:track:4wFeMmJDlgkAxlQ07PbdGZ
5,2024-06-25 12:15:00+00:00,233639,Money,The Drums,Portamento,spotify:track:3VIJBrMpvimHEw5wtPh2wB
6,2024-06-25 12:16:18+00:00,76450,Sea Sick,binki,Sea Sick,spotify:track:3qiX7QXv6w6y4wsr5KP6U1
7,2024-06-25 12:22:58+00:00,400334,Money,The Drums,Portamento,spotify:track:3VIJBrMpvimHEw5wtPh2wB
8,2024-06-25 12:26:27+00:00,141920,Lil Thing,Knox Fortune,Paradise,spotify:track:4At1qbqCOJ9WFbvpWTOOY4
9,2024-06-25 15:21:23+00:00,100542,You're Not The Only One I Know,The Sundays,"Reading, Writing And Arithmetic",spotify:track:7BaxYnTazocAOK3istsW1z


## Get song lyrics

In [None]:
# Cache of lyrics lookups, shared across get_lyrics calls so that repeat
# plays of a song do not trigger another Genius request
saved_lyrics = {}

def get_lyrics(row):
    """Return the Genius lyrics for the song described by ``row``.

    Results (including failed lookups) are memoised in the module-level
    ``saved_lyrics`` dict so each distinct song is requested only once.

    Args:
        row: Mapping with ``'track'`` and ``'artist'`` entries, e.g. a
            DataFrame row passed in by ``apply(..., axis=1)``.

    Returns:
        The lyrics text, or ``None`` when Genius has no match or the
        request failed.
    """
    # Key on (track, artist): a title alone is ambiguous, and keying on it
    # would hand one artist's lyrics to another artist's same-titled song.
    cache_key = (row['track'], row['artist'])
    if cache_key in saved_lyrics:
        return saved_lyrics[cache_key]

    try:
        song = genius.search_song(row['track'], row['artist'])
        time.sleep(1)  # brief pause between requests
        lyrics = song.lyrics if song else None
    except Exception:
        # Best effort: treat any lookup failure as "no lyrics" and back off
        # before the next request. Catching Exception rather than using a
        # bare except lets Ctrl-C (KeyboardInterrupt) stop a long run.
        lyrics = None
        time.sleep(10)

    saved_lyrics[cache_key] = lyrics
    return lyrics

# Genius API client; the access token is read from the GENIUS_TOKEN
# environment variable
genius = Genius(os.getenv('GENIUS_TOKEN'), verbose=True, remove_section_headers=True)
# Look up lyrics for every play, one row at a time, with a progress bar
df['lyrics'] = df.progress_apply(get_lyrics, axis=1)

## Clean lyrics

In [106]:
# Snapshot the frame (DataFrame.copy is deep by default) so the slow lyrics
# download does not have to be repeated while experimenting below
df_copy = df.copy()

In [104]:
# Restore from the snapshot. Copy rather than alias it: the cells below
# modify df in place, and a plain `df = df_copy` would let those changes
# leak into the backup as well.
df = df_copy.copy(deep=True)

In [None]:
# Remove (in place) every play whose lyrics lookup returned nothing,
# then renumber the remaining rows from 0
no_lyrics = df['lyrics'].isna()
df.drop(index=df.index[no_lyrics], inplace=True)
df.reset_index(inplace=True, drop=True)
df.head(10)

Unnamed: 0,date,ms_played,track,artist,album,uri,lyrics
0,2024-06-25 11:58:46+00:00,132060,Almost Fantasy,Fog Lake,Almost Fantasy,spotify:track:30U7MwsSebniOJyZ8y1hcE,6 ContributorsAlmost Fantasy LyricsAnd maybe I...
1,2024-06-25 12:01:20+00:00,152571,High,Slow Pulp,Big Day,spotify:track:3GaamyFHQYbQi1ykc4nV4o,12 ContributorsHigh Lyrics\nI have too much in...
2,2024-06-25 12:05:30+00:00,205554,I Hope to Be Around,Men I Trust,I Hope to Be Around,spotify:track:3xnM0WCRJKk4Iz49rkG5OC,14 ContributorsI Hope to Be Around Lyrics\nI h...
3,2024-06-25 12:09:17+00:00,227426,Your face,Wisp,Your face,spotify:track:7ne2hzW4jaU5tacaCI4kJH,25 ContributorsYour face Lyrics\nIs this all r...
4,2024-06-25 12:11:07+00:00,108182,Sour Switchblade,Elita,Sour Switchblade,spotify:track:4wFeMmJDlgkAxlQ07PbdGZ,26 ContributorsTranslationsEspañolSour Switchb...
5,2024-06-25 12:15:00+00:00,233639,Money,The Drums,Portamento,spotify:track:3VIJBrMpvimHEw5wtPh2wB,56 ContributorsTranslationsItalianoMoney Lyric...
6,2024-06-25 12:16:18+00:00,76450,Sea Sick,binki,Sea Sick,spotify:track:3qiX7QXv6w6y4wsr5KP6U1,15 ContributorsSea Sick Lyrics\nI feel like I'...
7,2024-06-25 12:22:58+00:00,400334,Money,The Drums,Portamento,spotify:track:3VIJBrMpvimHEw5wtPh2wB,56 ContributorsTranslationsItalianoMoney Lyric...
8,2024-06-25 12:26:27+00:00,141920,Lil Thing,Knox Fortune,Paradise,spotify:track:4At1qbqCOJ9WFbvpWTOOY4,11 ContributorsLil Thing LyricsCurrently worki...
9,2024-06-25 15:21:23+00:00,100542,You're Not The Only One I Know,The Sundays,"Reading, Writing And Arithmetic",spotify:track:7BaxYnTazocAOK3istsW1z,23 ContributorsYou’re Not the Only One I Know ...


In [None]:
def slice_lyrics(row):
    """Strip the leading Genius page text from a row's lyrics.

    Everything up to and including the first 'Read More' is discarded; if
    that marker is absent, everything up to and including the first
    'Lyrics' is discarded instead. Newlines are then replaced by spaces.

    Args:
        row: Mapping with a string ``'lyrics'`` entry.

    Returns:
        The trimmed lyrics as a single-line string.
    """
    text = row['lyrics']

    # Markers in priority order; only the first one present is applied
    for marker in ('Read More', 'Lyrics'):
        _, found, remainder = text.partition(marker)
        if found:
            text = remainder
            break

    return " ".join(text.split("\n"))

# Replace each row's raw Genius text with its trimmed, single-line version
df['lyrics'] = df.apply(slice_lyrics, axis=1)

## Get Garmin Data

In [119]:
import garminconnect
from datetime import timedelta

# Log in to Garmin with the credentials stored in the environment
garmin = garminconnect.Garmin(os.getenv('GARMIN_EMAIL'), os.getenv('GARMIN_PW'))
garmin.login()

# Date range covered by the Spotify data.
# NOTE(review): these dates come from UTC timestamps, while Garmin's daily
# summaries are presumably keyed by local day - confirm the alignment.
start_date = df['date'].min().to_pydatetime().date()
end_date = df['date'].max().to_pydatetime().date()


def _seconds_to_hours(seconds):
    """Convert a duration in seconds to hours (2 decimals); None stays None."""
    return None if seconds is None else round(seconds / 3600, 2)


# Fetch one daily summary per date in the range. A separate cursor is used
# so start_date still holds the first day after the loop finishes.
rows = []
delta = timedelta(days=1)
current_date = start_date
while current_date <= end_date:
    stats = garmin.get_stats(current_date)
    # Fields may come back as None (presumably on days without device data);
    # keep them as missing values instead of raising on int()/division.
    calories = stats['totalKilocalories']
    rows.append([
        current_date,
        None if calories is None else int(calories),
        stats['totalSteps'],
        _seconds_to_hours(stats['sleepingSeconds']),
        _seconds_to_hours(stats['highlyActiveSeconds']),
        _seconds_to_hours(stats['activeSeconds']),
        _seconds_to_hours(stats['sedentarySeconds']),
        stats['minHeartRate'],
        stats['maxHeartRate'],
        stats['restingHeartRate'],
        stats['averageStressLevel'],
        stats['maxStressLevel'],
    ])
    current_date += delta

# Create Garmin dataframe
garmin_df = pd.DataFrame(rows, columns=[
    'date', 'calories', 'steps', 'sleep_hours', 'highly_active_hours', 'active_hours',
    'sedentary_hours', 'min_hr', 'max_hr', 'resting_hr', 'avg_stress', 'max_stress',
])
garmin_df.head(10)

Unnamed: 0,date,calories,steps,sleep_hours,highly_active_hours,active_hours,sedentary_hours,min_hr,max_hr,resting_hr,avg_stress,max_stress
0,2024-06-25,2861,15188,7.45,1.32,2.25,12.98,33,175,35,24,97
1,2024-06-26,2331,8347,8.8,0.29,2.57,12.34,34,104,36,24,97
2,2024-06-27,2214,5737,8.95,0.15,1.91,12.99,34,89,35,21,90
3,2024-06-28,3366,30441,8.1,1.47,4.64,9.79,36,164,37,29,99
4,2024-06-29,3237,26507,7.65,2.18,3.25,10.93,39,163,40,34,98
5,2024-06-30,2217,6005,10.98,0.31,1.83,10.87,35,96,37,18,93
6,2024-07-01,2824,13606,7.67,0.97,2.18,13.18,32,169,35,20,97
7,2024-07-02,2228,7438,7.5,0.22,2.34,13.94,35,97,36,19,96
8,2024-07-03,2373,10950,8.15,0.44,2.64,12.77,34,108,34,21,96
9,2024-07-04,2935,14321,8.43,0.93,1.83,12.81,34,172,36,20,91


# 