In [53]:
import spotipy
from spotipy.oauth2 import SpotifyOAuth
import re
import time
import os
from dotenv import load_dotenv
import json
import pandas as pd
from tqdm import tqdm
from datetime import datetime
from tqdm import tqdm
import calendar
import random
from PIL import Image, ImageDraw, ImageFont, ImageFilter
import numpy as np

In [2]:
data_directory = '/Users/kyriakospapadopoulos/Desktop/University/Big Blue Data Academy/Personal/Projects/API_Projects/Spotify/Spotify Account Data'

## Extract information from spotify account data export

### Identify streaming history music files

In [3]:
def find_streaming_history_files(directory):
    try:
        # Get a list of all files in the given directory
        all_files = os.listdir(directory)
        
        # Filter the list to include only those with 'StreamingHistory_music' in the file name
        streaming_history_files = [file for file in all_files if 'StreamingHistory_music' in file]
        
        return streaming_history_files

    except FileNotFoundError:
        print(f"Error: The directory '{directory}' was not found.")
        return []

    except PermissionError:
        print(f"Error: You do not have permission to access the directory '{directory}'.")
        return []

    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return []

In [4]:
file_names = find_streaming_history_files(data_directory)

### Load and join streaming data

In [5]:
def load_and_join_streaming_data(directory, file_names):

    streaming_data = []

    for file_name in file_names:
        file_path = os.path.join(directory, file_name)
        try:
            with open(file_path, 'r') as file:
                data = json.load(file)
                if isinstance(data, list):
                    streaming_data.extend(data)  # Join the data if it's a list
                else:
                    streaming_data.append(data)  # Append the data as it is
                print(f"Appended data from {file_name}")
        except FileNotFoundError:
            print(f"Error: The file '{file_name}' was not found in the directory '{directory}'.")
        except json.JSONDecodeError:
            print(f"Error: Failed to decode JSON in the file '{file_name}'. The file might be corrupted.")
        except Exception as e:
            print(f"An unexpected error occurred while loading '{file_name}': {e}")

    return streaming_data

In [6]:
streaming_data = load_and_join_streaming_data(data_directory,file_names)

Appended data from StreamingHistory_music_1.json
Appended data from StreamingHistory_music_0.json
Appended data from StreamingHistory_music_2.json


### Convert data from list to df

In [7]:
def create_streaming_data_df(streaming_data):

    # Initialize an empty list to store rows
    rows = []

    # Iterate over each file's data in the streaming_data list
    for entry in streaming_data:
            rows.append({
                'endTime': entry['endTime'],
                'artistName': entry['artistName'],
                'trackName': entry['trackName'],
                'msPlayed': entry['msPlayed']
            })

    # Convert the list of dictionaries to a DataFrame
    streaming_data_df = pd.DataFrame(rows)

    # Convert the 'endTime' column to datetime format
    streaming_data_df['endTime'] = pd.to_datetime(streaming_data_df['endTime'], format='%Y-%m-%d %H:%M')

    # Define the output pickle file path in the current working directory
    output_pickle_path = os.path.join(os.getcwd(), 'streaming_data_df.pkl')

    # Save the DataFrame as a pickle file
    streaming_data_df.to_pickle(output_pickle_path)
    print(f"DataFrame saved as pickle file at: {output_pickle_path}")

    return streaming_data_df

In [8]:
create_streaming_data_df(streaming_data)

DataFrame saved as pickle file at: /Users/kyriakospapadopoulos/Desktop/University/Big Blue Data Academy/Personal/Projects/API_Projects/Spotify/Monthly_Playlists/streaming_data_df.pkl


Unnamed: 0,endTime,artistName,trackName,msPlayed
0,2023-11-28 15:12:00,Smif-N-Wessun,Bucktown 360,264750
1,2023-11-28 15:32:00,Fugees,Family Business (feat. John Forté & Omega),90813
2,2023-11-28 17:18:00,Fugees,Family Business (feat. John Forté & Omega),25851
3,2023-11-28 17:18:00,King Geedorah,Anti-Matter,11204
4,2023-11-28 17:18:00,The Beatnuts,Do You Believe,2391
...,...,...,...,...
25694,2024-08-12 22:34:00,Wu-Tang Clan,C.R.E.A.M. (Cash Rules Everything Around Me),252026
25695,2024-08-12 22:38:00,Afu-Ra,Whirlwind Thru Cities,239413
25696,2024-08-12 22:42:00,Gang Starr,I'm The Man,244266
25697,2024-08-12 22:47:00,Jeru The Damaja,How I'm Living,264240


## Get more additional information using the Spotipy API

### Create Spotipy instance

In [9]:
def get_spotify_instance():
    
    load_dotenv()
    
    sp_oauth = SpotifyOAuth(
        client_id=os.getenv('SPOTIPY_CLIENT_ID'),
        client_secret=os.getenv('SPOTIPY_CLIENT_SECRET'),
        redirect_uri=os.getenv('SPOTIPY_REDIRECT_URI'),
        scope='playlist-modify-public'
    )
    
    token_info = sp_oauth.get_cached_token()
    if not token_info:
        token_info = sp_oauth.get_access_token()
    
    return spotipy.Spotify(auth=token_info['access_token'])

### Create a df with the unique songs from streaming_data_df in order to make the spotipy query less time consuming

In [10]:
streaming_data_df = pd.read_pickle('streaming_data_df.pkl')
song_data_df = streaming_data_df.drop_duplicates(subset=['trackName'])
song_data_df.to_pickle(os.path.join(os.getcwd(), 'song_data_df.pkl'))

### Query spotify and update song_data_df with the additional information 

In [25]:
def query_track_info(df, artist_col='artistName', track_col='trackName'):
    sp = get_spotify_instance()
    durations = []
    spotify_ids = []
    
    # Initialize tqdm progress bar
    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Querying Spotify"):
        query = f"track:{row[track_col]} artist:{row[artist_col]}"
        results = sp.search(q=query, type='track', limit=1)
        
        if results['tracks']['items']:
            track_info = results['tracks']['items'][0]
            durations.append(track_info['duration_ms'])
            spotify_ids.append(track_info['id'])
        else:
            durations.append(None)
            spotify_ids.append(None)
    
    # Add the new columns to the DataFrame
    df['duration'] = durations
    df['spotify_id'] = spotify_ids
    
    # Save the updated DataFrame to a pickle file, overwriting the existing one
    df.to_pickle(os.path.join(os.getcwd(), 'final_song_data_df.pkl'))
    
    return df

In [12]:
query_track_info(song_data_df)

Querying Spotify: 100%|█████████████████████| 4608/4608 [15:02<00:00,  5.11it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,endTime,artistName,trackName,msPlayed,duration,spotify_id
0,2023-11-28 15:12:00,Smif-N-Wessun,Bucktown 360,264750,264750.0,7ivZRLotUUwwiahsIfvCw7
1,2023-11-28 15:32:00,Fugees,Family Business (feat. John Forté & Omega),90813,343800.0,4TGdvA9hBvyoOkuvHNje0M
3,2023-11-28 17:18:00,King Geedorah,Anti-Matter,11204,206946.0,4kVxStqN7DeoZje5aidAn3
4,2023-11-28 17:18:00,The Beatnuts,Do You Believe,2391,200066.0,1jNW3TfaKAuW3wnNEKTxdb
5,2023-11-28 17:23:00,Ed O.G. & Da Bulldogs,I'm Different,256266,256266.0,5Ek5hDRLwASGknqLtUqecT
...,...,...,...,...,...,...
25505,2024-08-12 20:00:00,Godfather Don,Takin' It Back,917,266203.0,6eqBYfD3gWX1FaXAoIg5NY
25509,2024-08-12 20:01:00,Talib Kweli,Get By,49295,227426.0,1LM6EReMkAxuDXDF26ekl2
25570,2024-08-12 20:56:00,Naughty By Nature,Yoke the Joker,2635,313640.0,4Aujn3OS6bSqprR2UVGlfy
25576,2024-08-12 21:00:00,Eazy-E,Eazy-Duz-It,270761,259493.0,1ohwrBQrxIlqMYfkoYERiN


## Preparation of final df

In [26]:
final_song_data_df = pd.read_pickle(os.path.join(os.getcwd(), 'final_song_data_df.pkl'))

In [27]:
final_streaming_data_df = pd.read_pickle(os.path.join(os.getcwd(), 'streaming_data_df.pkl'))

In [28]:
# Merge the two DataFrames based on the 'trackName' column
final_streaming_data_df = final_streaming_data_df.merge(
    final_song_data_df[['trackName', 'duration', 'spotify_id']],
    on='trackName',
    how='left'  # Use 'left' join to keep all rows from final_streaming_data_df
)

In [32]:
final_streaming_data_df['msPlayed'] = final_streaming_data_df['msPlayed'].astype(float)
final_streaming_data_df['play_ratio'] = final_streaming_data_df['msPlayed']/final_streaming_data_df['duration']

In [35]:
final_streaming_data_df = final_streaming_data_df.dropna(subset=['spotify_id'])

In [38]:
# Step 1: Add a 'month' and 'year' column to the DataFrame
final_streaming_data_df['month'] = final_streaming_data_df['endTime'].dt.month
final_streaming_data_df['year'] = final_streaming_data_df['endTime'].dt.year

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [39]:
final_streaming_data_df.head()

Unnamed: 0,endTime,artistName,trackName,msPlayed,duration,spotify_id,play_ratio,month,year
0,2023-11-28 15:12:00,Smif-N-Wessun,Bucktown 360,264750.0,264750.0,7ivZRLotUUwwiahsIfvCw7,1.0,11,2023
1,2023-11-28 15:32:00,Fugees,Family Business (feat. John Forté & Omega),90813.0,343800.0,4TGdvA9hBvyoOkuvHNje0M,0.264145,11,2023
2,2023-11-28 17:18:00,Fugees,Family Business (feat. John Forté & Omega),25851.0,343800.0,4TGdvA9hBvyoOkuvHNje0M,0.075192,11,2023
3,2023-11-28 17:18:00,King Geedorah,Anti-Matter,11204.0,206946.0,4kVxStqN7DeoZje5aidAn3,0.05414,11,2023
4,2023-11-28 17:18:00,The Beatnuts,Do You Believe,2391.0,200066.0,1jNW3TfaKAuW3wnNEKTxdb,0.011951,11,2023


## Create playlist covers

In [55]:
# Predefined list of vibrant and light colors
vibrant_colors = [
    (255, 182, 193),  # Light Pink
    (255, 223, 186),  # Light Peach
    (255, 240, 187),  # Light Yellow
    (173, 216, 230),  # Light Blue
    (144, 238, 144),  # Light Green
    (240, 128, 128),  # Light Coral
    (255, 239, 213),  # Papaya Whip
    (255, 218, 185),  # Peach Puff
    (221, 160, 221),  # Plum
    (245, 222, 179),  # Wheat
    (255, 160, 122),  # Light Salmon
    (152, 251, 152),  # Pale Green
    (224, 255, 255),  # Light Cyan
    (230, 230, 250),  # Lavender
]

def add_noise_texture(size, base_color, noise_level):
    img = Image.new("RGB", size, base_color)
    np_image = np.array(img)
    noise = np.random.normal(0, noise_level, np_image.shape).astype(np.uint8)
    noisy_image = Image.fromarray(np.clip(np_image + noise, 0, 255).astype(np.uint8))
    return noisy_image

def create_images_from_df(df, output_dir='covers'):
    # Create a dictionary to hold base colors for each year
    year_colors = {}
    
    # Create output directory if it doesn't exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    # Image size (640x640 for Spotify cover)
    image_size = (640, 640)
    
    # Imaginary box size and margin
    box_size = (560, 560)
    border_margin = 40
    
    # Map month names to numbers
    month_mapping = {index: month for index, month in enumerate(calendar.month_name) if month}
    
    # Aggregate by year and month
    df_grouped = df.groupby(['year', 'month']).size().reset_index(name='count')
    
    # Set to track used colors
    used_colors = set()
    
    # Dictionary to hold the mapping between playlist and image paths
    playlist_image_map = {}
    
    # Iterate through each row in the grouped DataFrame
    for index, row in df_grouped.iterrows():
        # Convert month number to month name
        month_name = month_mapping[row['month']]
        # Abbreviate the month name for the text on the image
        month_name_abbrev = month_name[:3].upper()
        # Format the year as 'YY (e.g., '19)
        formatted_year = f"{str(row['year'])[-2:]}"
        text = f"{month_name_abbrev} {formatted_year}"
        
        # Assign a base color for the year if not already assigned
        if row['year'] not in year_colors:
            available_colors = [color for color in vibrant_colors if color not in used_colors]
            if available_colors:
                chosen_color = random.choice(available_colors)
                used_colors.add(chosen_color)
                year_colors[row['year']] = chosen_color
            else:
                print("Ran out of unique colors!")  # Shouldn't happen with enough colors
        
        # Use the chosen color to create a noise background
        background_img = add_noise_texture(image_size, year_colors[row['year']], noise_level=15)
        
        # Initialize ImageDraw
        d = ImageDraw.Draw(background_img)
        
        # Load a suitable font
        try:
            font_path = "/Users/kyriakospapadopoulos/Desktop/University/Big Blue Data Academy/Personal/Projects/API_Projects/Spotify/Monthly_Playlists/fonts/Paskowy.ttf"
            font = ImageFont.truetype(font_path, 10)  # Start with a small font size
        except IOError:
            font = ImageFont.load_default()  # Fallback to default font if TrueType is not available
        
        # Adjust font size to fit within the imaginary box
        max_font_size = 200  # Starting point for maximum font size
        while True:
            font = ImageFont.truetype(font_path, max_font_size)
            text_width, text_height = d.textsize(text, font=font)
            if text_width <= box_size[0] and text_height <= box_size[1]:
                break
            max_font_size -= 1  # Reduce font size if it doesn't fit
            
        # Calculate the position to center the text within the image
        position = ((image_size[0] - text_width) // 2, (image_size[1] - text_height) // 2)
        
        # Add text to the image using black color
        d.text(position, text, font=font, fill=(0, 0, 0))
        
        # Save the image with a properly ordered filename
        filename = f"{output_dir}/{row['year']}_{str(row['month']).zfill(2)}_{month_name_abbrev}.png"
        background_img.save(filename)
        
        # Add to the playlist-image map
        playlist_name = f"{month_name_abbrev} {row['year']}"
        playlist_image_map[playlist_name] = filename
    
    return playlist_image_map

In [56]:
playlist_image_map = create_images_from_df(final_streaming_data_df)



## Create playlists