CS471 Final Project

In [1]:
# Quick output check, then import pandas (referenced as `pd` by later cells)
print("hello")
import pandas as pd

hello


Read in the data and convert it to CSV

In [2]:
def json_to_csv(json_filename, csv_filename=None):
    """Convert a JSON file to CSV with pandas and report the outcome as text.

    Parameters
    ----------
    json_filename : str or os.PathLike
        Path of the JSON file to read (opened as UTF-8).
    csv_filename : str, optional
        Destination path. When omitted it is derived from ``json_filename``:
        a trailing ``.json`` extension (any letter case) is swapped for
        ``.csv``; any other name gets ``.csv`` appended. The derived
        destination is therefore never the input file itself.

    Returns
    -------
    str
        A success message naming the CSV that was written, or an error
        message. Failures are reported through the return value instead of
        being raised.
    """
    import os  # local import: the notebook's import cells do not provide os

    try:
        if csv_filename is None:
            # Only the real file extension is touched. A plain
            # str.replace('.json', '.csv') would also rewrite '.json' inside
            # directory names and would leave an extension-less name
            # unchanged, so to_csv would overwrite the source file.
            source_path = os.fspath(json_filename)
            root, ext = os.path.splitext(source_path)
            if ext.lower() == '.json':
                csv_filename = root + '.csv'
            else:
                csv_filename = source_path + '.csv'

        with open(json_filename, encoding='utf-8') as inputfile:
            df = pd.read_json(inputfile)

        # index=False keeps the positional row index out of the CSV
        df.to_csv(csv_filename, encoding='utf-8', index=False)

        return f"CSV file '{csv_filename}' created successfully."

    except FileNotFoundError:
        return f"Error: The file '{json_filename}' does not exist."
    except ValueError as e:
        # pandas raises ValueError for JSON it cannot turn into a table
        return f"Error: {e}. Ensure the JSON structure is valid for conversion."
    except Exception as e:
        return f"An unexpected error occurred: {e}"


In [5]:
# Convert one streaming-history export; with no second argument the CSV path
# is derived from the JSON path by json_to_csv.
result = json_to_csv('data/Kovalsky_Streaming_History_Audio_2014-2020.json')
# json_to_csv reports success or failure as text instead of raising.
print(result)

CSV file 'data/Kovalsky_Streaming_History_Audio_2014-2020.csv' created successfully.


Install and set up the Spotify Web API client

In [6]:
pip install pandas spotipy

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [16]:
import os
import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import time

# Spotify API credentials are read from the environment so that secrets never
# live in the notebook or its version history. Set both variables before
# starting Jupyter, e.g.
#   export SPOTIPY_CLIENT_ID=...      export SPOTIPY_CLIENT_SECRET=...
# Any credential pair that was ever committed in plain text should be rotated
# in the Spotify developer dashboard.
CLIENT_ID = os.environ.get('SPOTIPY_CLIENT_ID')
CLIENT_SECRET = os.environ.get('SPOTIPY_CLIENT_SECRET')
if not CLIENT_ID or not CLIENT_SECRET:
    raise RuntimeError(
        "Spotify credentials not found: set the SPOTIPY_CLIENT_ID and "
        "SPOTIPY_CLIENT_SECRET environment variables and restart the kernel."
    )

# Authenticate with Spotify using the client-credentials manager; `sp` is the
# API client used by the fetch cells.
client_credentials_manager = SpotifyClientCredentials(
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET
)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)


In [59]:
# Replace with your JSON file path.
# Forward slashes are accepted on Windows as well as macOS/Linux, whereas the
# backslash form only resolves on Windows; this also matches the 'data/...'
# paths used by the other cells.
json_filename = 'data/Brezenski_Streaming_History_Audio_2023-2024_8.json'

# Load the JSON data into a DataFrame
df = pd.read_json(json_filename)


In [60]:
# Column that holds each play's Spotify track URI (adjust if your export differs)
track_uri_column = 'spotify_track_uri'

# Keep only the rows that actually reference a track
df = df[df[track_uri_column].notna()]

# De-duplicate the URIs, keeping first-appearance order
unique_track_uris = df[track_uri_column].drop_duplicates().tolist()

print(f"Total unique tracks: {len(unique_track_uris)}")


Total unique tracks: 5407


In [61]:
# Lookup tables that the fetch cells fill in
track_info_dict = dict()       # track URI -> selected track metadata
audio_features_dict = dict()   # track URI -> audio-features record
artist_genres_dict = dict()    # artist ID -> list of genres


In [62]:
# Spotify API allows fetching up to 50 tracks per request
BATCH_SIZE = 50

for start in range(0, len(unique_track_uris), BATCH_SIZE):
    uri_chunk = unique_track_uris[start:start + BATCH_SIZE]

    # Step 1: basic metadata for every track in this chunk.
    # A failure is reported and the loop carries on (best effort).
    try:
        for item in sp.tracks(uri_chunk)['tracks']:
            if not item:
                continue  # skip empty (None) entries in the response
            uri_key = item['uri']
            track_info_dict[uri_key] = dict(
                track_name=item['name'],
                album_name=item['album']['name'],
                album_release_date=item['album']['release_date'],
                track_popularity=item['popularity'],
                duration_ms=item['duration_ms'],
                explicit=item['explicit'],
                artist_ids=[performer['id'] for performer in item['artists']],
                artist_names=[performer['name'] for performer in item['artists']],
            )
    except Exception as e:
        print(f"Error fetching track details for batch starting at index {start}: {e}")

    # Step 2: audio features for the same chunk, stored whole under the
    # URI reported inside each record.
    try:
        for record in sp.audio_features(uri_chunk):
            if not record:
                continue  # skip empty (None) entries in the response
            uri_key = record['uri']
            audio_features_dict[uri_key] = record
    except Exception as e:
        print(f"Error fetching audio features for batch starting at index {start}: {e}")

    # Short pause between chunks; adjust as necessary
    time.sleep(0.1)


In [63]:
# Gather every artist ID referenced by the fetched tracks (duplicates collapse)
all_artist_ids = {
    artist_id
    for track_meta in track_info_dict.values()
    for artist_id in track_meta['artist_ids']
}

print(f"Total unique artists: {len(all_artist_ids)}")

# Look up genres chunk by chunk; a failed chunk is reported and skipped
artist_ids_list = list(all_artist_ids)
for offset in range(0, len(artist_ids_list), BATCH_SIZE):
    id_chunk = artist_ids_list[offset:offset + BATCH_SIZE]
    try:
        for profile in sp.artists(id_chunk)['artists']:
            if not profile:
                continue  # skip empty (None) entries in the response
            artist_key = profile['id']
            artist_genres_dict[artist_key] = profile['genres']
    except Exception as e:
        print(f"Error fetching artist genres for batch starting at index {offset}: {e}")

    # Short pause between chunks; adjust as necessary
    time.sleep(0.1)


Total unique artists: 3542


In [64]:
# Build one row per track from each lookup table. The dict keys (track URIs)
# start out as the index and are then exposed as a 'spotify_track_uri' column.
track_info_df = (
    pd.DataFrame.from_dict(track_info_dict, orient='index')
    .reset_index()
    .rename(columns={'index': 'spotify_track_uri'})
)

audio_features_df = (
    pd.DataFrame.from_dict(audio_features_dict, orient='index')
    .reset_index()
    .rename(columns={'index': 'spotify_track_uri'})
)


In [65]:
# Inspect the per-track metadata table built from track_info_dict
print(track_info_df)

                         spotify_track_uri  \
0     spotify:track:7aqfrAY2p9BUSiupwk3svU   
1     spotify:track:7LTCmtMpf3JPfBE8gAl64z   
2     spotify:track:0VJXWgw0GWsprapvlpbuyu   
3     spotify:track:4M68xjcc42oxyphhzpOWXS   
4     spotify:track:5VE6OSfJkMHyMpHaJzuUqn   
...                                    ...   
5402  spotify:track:688ucfDoelAC8sY7wgHCV4   
5403  spotify:track:2wAJTrFhCnQyNSD3oUgTZO   
5404  spotify:track:3mTpegrOwRn0oJjv4TSbEE   
5405  spotify:track:6gbiTbclnHlmSIPfmF2zEc   
5406  spotify:track:5A8xI7PN4WDe9e61xEdt94   

                                track_name                      album_name  \
0     First Person Shooter (feat. J. Cole)                For All The Dogs   
1           Turn Yo Clic Up (feat. Future)  Turn Yo Clic Up (feat. Future)   
2                                     Okay                            Okay   
3                             Great Gatsby                       Nostalgia   
4                            Gimme Da Lite               

In [66]:
# Left-join the audio features onto the track metadata, keyed by track URI.
# Both inputs carry a duration_ms column, so the result exposes them as
# duration_ms_x (track info) and duration_ms_y (audio features).
track_data_df = track_info_df.merge(audio_features_df, how='left', on='spotify_track_uri')


In [67]:
# List the merged frame's columns (overlapping names carry _x/_y suffixes)
print(track_data_df.columns)

Index(['spotify_track_uri', 'track_name', 'album_name', 'album_release_date',
       'track_popularity', 'duration_ms_x', 'explicit', 'artist_ids',
       'artist_names', 'danceability', 'energy', 'key', 'loudness', 'mode',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'type', 'id', 'uri', 'track_href', 'analysis_url',
       'duration_ms_y', 'time_signature'],
      dtype='object')


In [68]:
# Function to get genres for a list of artist IDs
def get_genres_for_artists(artist_ids):
    """Return the distinct genres of the given artists in a stable order.

    Parameters
    ----------
    artist_ids : iterable of str
        Artist IDs to look up in the module-level ``artist_genres_dict``.
        IDs that are not present there contribute no genres.

    Returns
    -------
    list of str
        Each genre once, in first-seen order (artists in the order given,
        genres in the order stored for each artist). Converting a set to a
        list instead would give an order that changes between kernel runs,
        which makes the exported genre strings irreproducible.
    """
    # dict keys are unique and keep insertion order, giving an ordered de-dup
    ordered_genres = dict.fromkeys(
        genre
        for artist_id in artist_ids
        for genre in artist_genres_dict.get(artist_id, [])
    )
    return list(ordered_genres)

# Derive each track's genre list from its list of artist IDs
track_data_df['artist_genres'] = track_data_df['artist_ids'].map(get_genres_for_artists)


In [69]:
# Flatten 'artist_ids', 'artist_names' and 'artist_genres' to comma-separated
# strings. Only list/tuple values are joined, so running this cell a second
# time leaves the already-flattened strings untouched; joining a string
# would otherwise split it into single characters ('ab' -> 'a, b').
for list_column in ('artist_ids', 'artist_names', 'artist_genres'):
    track_data_df[list_column] = track_data_df[list_column].apply(
        lambda value: ', '.join(value) if isinstance(value, (list, tuple)) else value
    )

# Optionally, drop columns you don't need
# track_data_df = track_data_df.drop(columns=['type', 'id', 'track_href', 'analysis_url', 'time_signature'])


In [70]:
# Attach the per-track columns to every play in the listening history.
# how='left' keeps plays whose track has no row in track_data_df.
df_enriched = df.merge(track_data_df, how='left', on='spotify_track_uri')


In [71]:
# Inspect the enriched listening history.
# NOTE(review): the printed frame includes account-level fields such as
# username and ip_addr_decrypted; consider clearing this output before
# sharing the notebook.
print(df_enriched)

                         ts    username platform  ms_played conn_country  \
0      2023-12-22T15:58:39Z  kingzenski      ios     109100           US   
1      2023-12-22T15:58:40Z  kingzenski      ios       1010           US   
2      2023-12-22T15:58:56Z  kingzenski      ios      16620           US   
3      2023-12-22T15:59:00Z  kingzenski      ios       4050           US   
4      2023-12-22T15:59:06Z  kingzenski      ios       5700           US   
...                     ...         ...      ...        ...          ...   
16909  2024-05-31T22:45:15Z  kingzenski      ios      34943           US   
16910  2024-05-31T22:45:19Z  kingzenski      ios       3370           US   
16911  2024-05-31T22:45:22Z  kingzenski      ios       3541           US   
16912  2024-05-31T22:45:25Z  kingzenski      ios       2560           US   
16913  2024-05-31T22:45:29Z  kingzenski      ios       3754           US   

                           ip_addr_decrypted user_agent_decrypted  \
0                 

In [72]:
# Destination of the enriched listening history (replace with your own path)
output_csv_filename = 'data/your_streaming_history_enriched.csv'

# Write the frame out; index=False keeps the positional row index out of the file
df_enriched.to_csv(path_or_buf=output_csv_filename, index=False)

print(f"Enriched data saved to {output_csv_filename}")


Enriched data saved to data/your_streaming_history_enriched.csv
