# Analysis of My Music/Health Data

## Imports & configurations

In [202]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import time
import spotipy
import garminconnect
from datetime import timedelta
from lyricsgenius import Genius
from spotipy.oauth2 import SpotifyOAuth
from dotenv import load_dotenv
from tqdm import tqdm
from transformers import pipeline

# Notebook-wide setup: allow up to 200 columns when a DataFrame is displayed,
# enable tqdm's pandas integration (needed for `progress_apply`), and load the
# API credentials that later cells read with os.getenv from a .env file.
pd.set_option('display.max_columns', 200)
tqdm.pandas()

load_dotenv()

True

## Read in Spotify data

In [141]:
# Load one file of the exported Spotify streaming history into a DataFrame
# and preview its first rows.
streaming_history_path = './spotify_data/Streaming_History_Audio_2024_29.json'
df = pd.read_json(streaming_history_path)
df.head()

Unnamed: 0,ts,platform,ms_played,conn_country,ip_addr,master_metadata_track_name,master_metadata_album_artist_name,master_metadata_album_album_name,spotify_track_uri,episode_name,episode_show_name,spotify_episode_uri,audiobook_title,audiobook_uri,audiobook_chapter_uri,audiobook_chapter_title,reason_start,reason_end,shuffle,skipped,offline,offline_timestamp,incognito_mode
0,2024-06-25T11:58:46Z,ios,132060,US,2601:14d:4b84:4a70:713f:c9ce:309c:32b9,Almost Fantasy,Fog Lake,Almost Fantasy,spotify:track:30U7MwsSebniOJyZ8y1hcE,,,,,,,,trackdone,trackdone,True,False,False,1719316594,False
1,2024-06-25T12:01:20Z,ios,152571,US,166.199.139.64,High,Slow Pulp,Big Day,spotify:track:3GaamyFHQYbQi1ykc4nV4o,,,,,,,,trackdone,trackdone,True,False,False,1719316726,False
2,2024-06-25T12:02:04Z,ios,42733,US,166.199.139.64,From the Subway Train,Vansire,Angel Youth,spotify:track:0qOCy2LqvdkMora7eZsJhQ,,,,,,,,trackdone,endplay,True,True,False,1719316880,False
3,2024-06-25T12:05:30Z,ios,205554,US,166.199.139.64,I Hope to Be Around,Men I Trust,I Hope to Be Around,spotify:track:3xnM0WCRJKk4Iz49rkG5OC,,,,,,,,clickrow,trackdone,False,False,False,1719316924,False
4,2024-06-25T12:09:17Z,ios,227426,US,166.199.139.64,Your face,Wisp,Your face,spotify:track:7ne2hzW4jaU5tacaCI4kJH,,,,,,,,trackdone,trackdone,True,False,False,1719317130,False


## Drop, rename, and change dtypes

In [142]:
# Short column names for the handful of fields this analysis keeps.
column_names = {
    'ts': 'date',
    'master_metadata_track_name': 'track',
    'master_metadata_album_artist_name': 'artist',
    'master_metadata_album_album_name': 'album',
    'spotify_track_uri': 'uri',
}

df = (
    df[['ts', 'ms_played', 'master_metadata_track_name', 'master_metadata_album_artist_name',
        'master_metadata_album_album_name', 'spotify_track_uri']]
    .rename(columns=column_names)
    # Keep only plays that lasted longer than 60,000 ms (one minute)
    .loc[lambda frame: frame['ms_played'] > 60000]
    # Rows without a track name are dropped
    .dropna(subset=["track"])
    # Parse the timestamp strings into datetimes
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,date,ms_played,track,artist,album,uri
0,2024-06-25 11:58:46+00:00,132060,Almost Fantasy,Fog Lake,Almost Fantasy,spotify:track:30U7MwsSebniOJyZ8y1hcE
1,2024-06-25 12:01:20+00:00,152571,High,Slow Pulp,Big Day,spotify:track:3GaamyFHQYbQi1ykc4nV4o
2,2024-06-25 12:05:30+00:00,205554,I Hope to Be Around,Men I Trust,I Hope to Be Around,spotify:track:3xnM0WCRJKk4Iz49rkG5OC
3,2024-06-25 12:09:17+00:00,227426,Your face,Wisp,Your face,spotify:track:7ne2hzW4jaU5tacaCI4kJH
4,2024-06-25 12:11:07+00:00,108182,Sour Switchblade,Elita,Sour Switchblade,spotify:track:4wFeMmJDlgkAxlQ07PbdGZ


## Get song lyrics with Genius API

In [143]:
# Cache of lyrics already looked up, keyed by (track, artist) so that two
# different songs sharing a title do not overwrite each other's entry.
saved_lyrics = {}

def get_lyrics(row):
    """
    Look up the lyrics for one row of the listening history via Genius.

    Results (including failed lookups, stored as None) are memoised in the
    module-level `saved_lyrics` dict so each (track, artist) pair is only
    requested once.

    Parameters
    ----------
    row : mapping with 'track' and 'artist' entries (e.g. a DataFrame row).

    Returns
    -------
    str or None
        The lyrics text, or None when Genius returned no song or the
        request raised an error.
    """
    cache_key = (row['track'], row['artist'])
    if cache_key in saved_lyrics:
        return saved_lyrics[cache_key]

    lyrics = None
    try:
        song = genius.search_song(row['track'], row['artist'])
        lyrics = song.lyrics if song else None
        # Short pause between successful requests
        time.sleep(1)
    except Exception:
        # Only ordinary errors are swallowed; KeyboardInterrupt still
        # propagates so a long-running apply can be stopped by the user.
        # Back off for longer before the next request.
        time.sleep(10)

    saved_lyrics[cache_key] = lyrics
    return lyrics

genius = Genius(os.getenv('GENIUS_TOKEN'), verbose=True, remove_section_headers=True)
#df['lyrics'] = df.progress_apply(get_lyrics, axis=1)

## Copy dataframe as backup

In [144]:
# Snapshot the cleaned frame; DataFrame.copy() makes a deep copy by default.
df_copy = df.copy()

## Restore dataframe from copy (if necessary)

In [145]:
#df = df_copy.copy(deep=True)  # copy again so the backup itself is never modified

In [146]:
# Read the lyrics stored in lyrics.csv and add them to df as a 'lyrics' column.
# NOTE(review): the values are matched to df by row index, so this presumably
# relies on lyrics.csv having the same row order as df -- confirm.
lyric_df = pd.read_csv('lyrics.csv')
df = df.assign(lyrics=lyric_df['lyrics'])
df.head()

Unnamed: 0,date,ms_played,track,artist,album,uri,lyrics
0,2024-06-25 11:58:46+00:00,132060,Almost Fantasy,Fog Lake,Almost Fantasy,spotify:track:30U7MwsSebniOJyZ8y1hcE,6 ContributorsAlmost Fantasy LyricsAnd maybe I...
1,2024-06-25 12:01:20+00:00,152571,High,Slow Pulp,Big Day,spotify:track:3GaamyFHQYbQi1ykc4nV4o,12 ContributorsHigh Lyrics\nI have too much in...
2,2024-06-25 12:05:30+00:00,205554,I Hope to Be Around,Men I Trust,I Hope to Be Around,spotify:track:3xnM0WCRJKk4Iz49rkG5OC,14 ContributorsI Hope to Be Around Lyrics\nI h...
3,2024-06-25 12:09:17+00:00,227426,Your face,Wisp,Your face,spotify:track:7ne2hzW4jaU5tacaCI4kJH,25 ContributorsYour face Lyrics\nIs this all r...
4,2024-06-25 12:11:07+00:00,108182,Sour Switchblade,Elita,Sour Switchblade,spotify:track:4wFeMmJDlgkAxlQ07PbdGZ,26 ContributorsTranslationsEspañolSour Switchb...


## Drop missing lyrics and remove extra characters

In [147]:
# Discard rows that have no lyrics, then renumber the index from 0.
df = df.dropna(subset=["lyrics"]).reset_index(drop=True)

def slice_lyrics(row):
    """
    Strips the leading non-lyric text from a row's 'lyrics' value.

    Everything up to and including the first 'Read More' is removed; if that
    marker is absent, everything up to and including the first 'Lyrics' is
    removed instead. If neither marker occurs the text is kept whole.
    Newlines are then replaced with single spaces.
    """
    text = row['lyrics']

    # 'Read More' takes priority over 'Lyrics' when both are present
    _, marker, remainder = text.partition('Read More')
    if not marker:
        _, marker, remainder = text.partition('Lyrics')
    if marker:
        text = remainder

    return ' '.join(text.split('\n'))

# Clean the lyrics of every row, then preview the result.
df = df.assign(lyrics=df.apply(slice_lyrics, axis=1))

df.head()

Unnamed: 0,date,ms_played,track,artist,album,uri,lyrics
0,2024-06-25 11:58:46+00:00,132060,Almost Fantasy,Fog Lake,Almost Fantasy,spotify:track:30U7MwsSebniOJyZ8y1hcE,And maybe I found paradise A little empathy bu...
1,2024-06-25 12:01:20+00:00,152571,High,Slow Pulp,Big Day,spotify:track:3GaamyFHQYbQi1ykc4nV4o,I have too much in my pockets I wish they wer...
2,2024-06-25 12:05:30+00:00,205554,I Hope to Be Around,Men I Trust,I Hope to Be Around,spotify:track:3xnM0WCRJKk4Iz49rkG5OC,I hope to be around The day we grasp in truth...
3,2024-06-25 12:09:17+00:00,227426,Your face,Wisp,Your face,spotify:track:7ne2hzW4jaU5tacaCI4kJH,Is this all real? You're stuck in my head Dro...
4,2024-06-25 12:11:07+00:00,108182,Sour Switchblade,Elita,Sour Switchblade,spotify:track:4wFeMmJDlgkAxlQ07PbdGZ,I wanna talk to you About the things we could...


## Perform sentiment analysis on lyrics

In [None]:
# Models considered for classifying the lyrics:
#   SA model:      distilbert-base-uncased-finetuned-sst-2-english
#   Emotion model: j-hartmann/emotion-english-distilroberta-base  (the one used)
emotion_model_name = "j-hartmann/emotion-english-distilroberta-base"
sa_pipeline = pipeline("sentiment-analysis", model=emotion_model_name)

def sentiment_analysis(row):
    """
    Classifies the text in row['lyrics'] with the module-level `sa_pipeline`.

    Parameters
    ----------
    row : mapping with a string 'lyrics' entry (e.g. a DataFrame row).

    Returns
    -------
    tuple
        (label, score) taken from the first prediction, or (None, None)
        when the pipeline raised an error for this text.
    """
    lyrics = row['lyrics']
    # Only the opening of long lyrics is classified: anything longer than
    # 500 characters is cut down to its first 420 characters.
    if len(lyrics) > 500:
        lyrics = lyrics[:420]

    try:
        # A character limit does not bound the token count, so also let the
        # tokenizer truncate to the model's maximum sequence length instead
        # of failing on over-long inputs.
        sentiment = sa_pipeline(lyrics, truncation=True)
    except Exception:
        # Keep the row, with empty results, when classification fails
        sentiment = [{'label': None, 'score': None}]
    return sentiment[0]['label'], sentiment[0]['score']

# Classify each row; result_type='expand' splits the returned (label, score)
# tuple into two columns.
sentiment_columns = df.apply(sentiment_analysis, axis=1, result_type='expand')
df[['sentiment_label', 'sentiment_score']] = sentiment_columns
df.head()

Device set to use mps:0
Token indices sequence length is longer than the specified maximum sequence length for this model (524 > 512). Running this sequence through the model will result in indexing errors


450
450
450
450
450


Unnamed: 0,date,ms_played,track,artist,album,uri,lyrics,sentiment_label,sentiment_score
0,2024-06-25 11:58:46+00:00,132060,Almost Fantasy,Fog Lake,Almost Fantasy,spotify:track:30U7MwsSebniOJyZ8y1hcE,And maybe I found paradise A little empathy bu...,fear,0.505382
1,2024-06-25 12:01:20+00:00,152571,High,Slow Pulp,Big Day,spotify:track:3GaamyFHQYbQi1ykc4nV4o,I have too much in my pockets I wish they wer...,sadness,0.91173
2,2024-06-25 12:05:30+00:00,205554,I Hope to Be Around,Men I Trust,I Hope to Be Around,spotify:track:3xnM0WCRJKk4Iz49rkG5OC,I hope to be around The day we grasp in truth...,fear,0.508664
3,2024-06-25 12:09:17+00:00,227426,Your face,Wisp,Your face,spotify:track:7ne2hzW4jaU5tacaCI4kJH,Is this all real? You're stuck in my head Dro...,fear,0.313638
4,2024-06-25 12:11:07+00:00,108182,Sour Switchblade,Elita,Sour Switchblade,spotify:track:4wFeMmJDlgkAxlQ07PbdGZ,I wanna talk to you About the things we could...,surprise,0.492737


## Perform audio-based sentiment analysis

In [None]:
# Authenticate against Spotify with the credentials stored in the environment.
# NOTE(review): the recorded run of this cell ended in
# "OSError: [Errno 48] Address already in use" -- presumably the port in
# SPOTIPY_REDIRECT_URI was already taken when the OAuth flow started; confirm.
auth_manager = SpotifyOAuth(
    client_id=os.getenv('SPOTIPY_CLIENT_ID'),
    client_secret=os.getenv('SPOTIPY_CLIENT_SECRET'),
    scope='user-library-read',
    redirect_uri=os.getenv('SPOTIPY_REDIRECT_URI'),
)
sp = spotipy.Spotify(auth_manager=auth_manager)

# Request the audio features of a single, hard-coded track id.
track_id = "30U7MwsSebniOJyZ8y1hcE"
features = sp.audio_features([track_id])

OSError: [Errno 48] Address already in use

## Get Garmin Data

In [150]:
# Log in to Garmin with the credentials stored in the environment
garmin = garminconnect.Garmin(os.getenv('GARMIN_EMAIL'), os.getenv('GARMIN_PW'))
garmin.login()

# The Garmin date range mirrors the first and last day of the Spotify data
start_date = df['date'].min().to_pydatetime().date()
end_date = df['date'].max().to_pydatetime().date()


def _seconds_to_hours(seconds):
    """Convert a duration in seconds to hours (2 decimals); None stays None."""
    return None if seconds is None else round(seconds / 3600, 2)


# Build one record per calendar day. The loop uses its own `day` variable so
# `start_date` keeps its real value, and a metric that is missing (None or an
# absent key) is stored as None instead of raising inside int() or the division.
rows = []
for day_offset in range((end_date - start_date).days + 1):
    day = start_date + timedelta(days=day_offset)
    # The date is passed in ISO 'YYYY-MM-DD' form (identical to str(day))
    stats = garmin.get_stats(day.isoformat())
    calories = stats.get('totalKilocalories')
    rows.append({
        'date': day,
        'calories': None if calories is None else int(calories),
        'steps': stats.get('totalSteps'),
        'sleep_hours': _seconds_to_hours(stats.get('sleepingSeconds')),
        'highly_active_hours': _seconds_to_hours(stats.get('highlyActiveSeconds')),
        'active_hours': _seconds_to_hours(stats.get('activeSeconds')),
        'sedentary_hours': _seconds_to_hours(stats.get('sedentarySeconds')),
        'min_hr': stats.get('minHeartRate'),
        'max_hr': stats.get('maxHeartRate'),
        'resting_hr': stats.get('restingHeartRate'),
        'avg_stress': stats.get('averageStressLevel'),
        'max_stress': stats.get('maxStressLevel'),
    })

# Create Garmin dataframe; naming the columns here fixes their order and keeps
# the frame well-formed even when no days were fetched.
garmin_df = pd.DataFrame(rows, columns=['date', 'calories', 'steps', 'sleep_hours', 'highly_active_hours',
                                        'active_hours', 'sedentary_hours', 'min_hr', 'max_hr', 'resting_hr',
                                        'avg_stress', 'max_stress'])
garmin_df.head()

Unnamed: 0,date,calories,steps,sleep_hours,highly_active_hours,active_hours,sedentary_hours,min_hr,max_hr,resting_hr,avg_stress,max_stress
0,2024-06-25,2861,15188,7.45,1.32,2.25,12.98,33,175,35,24,97
1,2024-06-26,2331,8347,8.8,0.29,2.57,12.34,34,104,36,24,97
2,2024-06-27,2214,5737,8.95,0.15,1.91,12.99,34,89,35,21,90
3,2024-06-28,3366,30441,8.1,1.47,4.64,9.79,36,164,37,29,99
4,2024-06-29,3237,26507,7.65,2.18,3.25,10.93,39,163,40,34,98


# 