# Spotify Streaming History Analysis

## Library Imports

In [None]:
import json # to handle JSON data
import pandas as pd # for data manipulation
import seaborn as sns # for data visualization
import matplotlib.pyplot as plt # also for data visualization

## Data Loading

In [None]:
# Streaming-history export files to combine (paths relative to the notebook)
files = [
    "data/StreamingHistory_music_0.json",
    "data/StreamingHistory_music_1.json",
    "data/StreamingHistory_music_2.json",
    "data/StreamingHistory_music_3.json",
]

# Parse each file with json.load and append its items to one combined list
data = []
for file in files:
    with open(file, 'r', encoding='utf-8') as f:
        data.extend(json.load(f))

# Convert data into a pandas DataFrame
df = pd.DataFrame(data)

# Convert endTime to datetime so it can be filtered and grouped by date parts
df['endTime'] = pd.to_datetime(df['endTime'])

# Keep only plays whose endTime falls in 2025. Matching on the year (instead
# of only a lower bound) also drops anything dated after 2025, and .copy()
# makes df an independent frame so later cells can add columns without
# pandas emitting SettingWithCopyWarning.
df = df[df['endTime'].dt.year == 2025].copy()


# What are the first few songs I listened to in 2025?
df.head()


## Analysis

### Most Listened-to Artists

In [None]:
# Total msPlayed per artist, largest total first; display the top 10
artist_time_played = (
    df.groupby('artistName')['msPlayed']
    .sum()
    .sort_values(ascending=False)
)
artist_time_played.head(10)


### Most Played Tracks

In [None]:

# Count the rows for each (artist, track) pair, most rows first; display the top 10
most_played_tracks = (
    df.groupby(['artistName', 'trackName'])
    .size()
    .sort_values(ascending=False)
)
most_played_tracks.head(10)


### Total Listening Time

In [None]:

# Sum every msPlayed value, convert milliseconds to hours,
# and round the result to 3 decimal places
total_listening_time = round(df['msPlayed'].sum() / (1000 * 60 * 60), 3)

print(total_listening_time, "hours")


### Distribution of Listening by Date

In [None]:

# Total listening time per calendar date, converted from milliseconds to hours
listening_by_date = (
    df.groupby(df['endTime'].dt.date)['msPlayed'].sum()
    / (1000 * 60 * 60)
)

listening_by_date.head()

### Time Series Plot for Listening Time Distribution by Date

In [None]:
# 4. Time series plot for distribution by date
# listening_by_date already holds hours per date (it was converted from
# milliseconds when it was computed), so it is plotted as-is. Dividing by
# 3,600,000 a second time here would contradict the "hours" axis label and
# would shrink the values again on every re-run of this cell.
plt.figure(figsize=(14, 6))
listening_by_date.plot(kind='line', color='green')
plt.title('Distribution of Listening by Date')
plt.ylabel('Total Time Played (hours)')
plt.xlabel('Date')
plt.tight_layout()
plt.show()

### Listening Activity by Day of the Week and Hour

In [None]:
# Tag every play with the weekday name of its endTime
df['day_of_week'] = df['endTime'].dt.day_name()

# Weekdays in calendar order, Monday first (used to order rows for display)
ordered_days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Total msPlayed per weekday, rows arranged Monday through Sunday
listening_by_day = (
    df.groupby('day_of_week')['msPlayed']
    .sum()
    .reindex(ordered_days)
)

# Length of the dataset in weeks: whole days between the earliest and the
# latest endTime, divided by 7 (may be fractional)
num_weeks = (df['endTime'].max() - df['endTime'].min()).days / 7

# Per-week average for each weekday (still in milliseconds)
average_listening_by_day = listening_by_day / num_weeks

average_listening_by_day


In [None]:
# Tag every play with the hour of its endTime
df['hour'] = df['endTime'].dt.hour

# Weekday x hour table of total listening time: sum msPlayed per
# (day, hour) pair, pivot the hours into columns, convert ms to hours
listening_by_day_hour = (
    df.groupby(['day_of_week', 'hour'])['msPlayed']
    .sum()
    .unstack()
    / (1000 * 60 * 60)
)

# Arrange the rows Monday through Sunday
listening_by_day_hour = listening_by_day_hour.reindex(ordered_days)

# Divide by the dataset length in weeks to get a per-week average per cell
average_listening_by_day_hour = listening_by_day_hour / num_weeks

average_listening_by_day_hour


### Convert from Values to Visuals

In [None]:
# Heatmap of the weekday x hour averages, drawn on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(14, 7))
sns.heatmap(average_listening_by_day_hour, cmap="YlGnBu", linewidths=0.5, ax=ax)
ax.set_title('Average Hourly Listening Activity by Day of the Week')
ax.set_xlabel('Hour of the Day')
ax.set_ylabel('Day of the Week')

# The colorbar is the last axes on the figure; give it a label
cbar = fig.axes[-1]
cbar.set_ylabel('Listening Time (hours)', rotation=270, labelpad=15)

fig.tight_layout()
plt.show()


How about filtering for a specific artist?

In [None]:
def listening_hours_for_artist(artist_name, data=None):
    """Return the total listening time, in hours, for one artist.

    Args:
        artist_name: Artist to look up; compared for exact equality against
            the ``artistName`` column.
        data: Optional DataFrame with ``artistName`` and ``msPlayed`` columns.
            When omitted, the notebook-level ``df`` is used, so existing
            one-argument calls behave exactly as before.

    Returns:
        The sum of ``msPlayed`` over the matching rows divided by 3,600,000
        (milliseconds per hour); 0 when no row matches.
    """
    # Resolve the frame at call time so a re-loaded or re-filtered df is used
    frame = df if data is None else data

    # Keep only the rows for the requested artist
    artist_rows = frame[frame['artistName'] == artist_name]

    # Sum the play time in milliseconds, then convert to hours
    total_ms = artist_rows['msPlayed'].sum()
    return total_ms / (1000 * 60 * 60)

# Artist to report on -- change this string to look up a different artist
artist_name = "Linkin Park"
hours = listening_hours_for_artist(artist_name)
print(f"Total listening hours for {artist_name}: {hours:.2f} hours")


## Some More Statistics

In [None]:
# Mean of msPlayed across all rows, expressed in seconds
mean_song_duration_ms = df['msPlayed'].mean()
mean_song_duration_sec = mean_song_duration_ms / 1000
print(f"Mean song duration: {mean_song_duration_sec:.2f} seconds")

# Same value split into whole minutes and leftover whole seconds
mean_minutes, mean_seconds = (
    int(part) for part in divmod(mean_song_duration_sec, 60)
)
print(f"Or: {mean_minutes} minutes and {mean_seconds} seconds")

In [None]:
# Median of msPlayed across all rows, expressed in seconds
median_song_duration_ms = df['msPlayed'].median()
median_song_duration_sec = median_song_duration_ms / 1000
print(f"Median song duration: {median_song_duration_sec:.2f} seconds")

# Same value split into whole minutes and leftover whole seconds
median_minutes, median_seconds = (
    int(part) for part in divmod(median_song_duration_sec, 60)
)
print(f"Or: {median_minutes} minutes and {median_seconds} seconds")

In [None]:
# Quartiles of msPlayed, converted from milliseconds to seconds
quartiles = df['msPlayed'].quantile([0.25, 0.5, 0.75])
quartiles_sec = quartiles / 1000

# Report each quartile as whole minutes and leftover whole seconds
for q, total_seconds in quartiles_sec.items():
    minutes, seconds = (int(part) for part in divmod(total_seconds, 60))
    print(f"{int(q*100)}th percentile: {minutes} minutes and {seconds} seconds")

# 25th percentile: 25% of the values fall below it and 75% above
# 50th percentile: the median
# 75th percentile: 75% of the values fall below it and 25% above


## Some More Visualizations

### Average listening time by day of the week

In [None]:
# Bar chart of the per-week average for each weekday.
# average_listening_by_day is in milliseconds, so scale it to hours first.
average_listening_by_day_bar = average_listening_by_day / (1000 * 60 * 60)

fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(
    x=average_listening_by_day_bar.index,
    y=average_listening_by_day_bar.values,
    ax=ax,
)
ax.set_title('Average Listening Time by Day of the Week in 2025')
ax.set_xlabel('Day of the Week')
ax.set_ylabel('Average Hours Played')
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

### Time played by top 10 artists

In [None]:
# 1. Bar chart for the top 10 most listened-to artists
fig, ax = plt.subplots(figsize=(12, 6))

# artist_time_played holds milliseconds; plot the top 10 in hours
artist_time_plot = artist_time_played.head(10) / (1000 * 60 * 60)
artist_time_plot.plot.bar(color='skyblue', ax=ax)

ax.set_title('Top 10 Most Listened-to Artists')
ax.set_ylabel('Total Time Played (hours)')
ax.set_xlabel('Artist Name')
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
plt.show()

### Play counts for the top 10 most played tracks

In [None]:
# 2. Bar chart for the top 10 most played tracks (play counts per artist/track pair)
fig, ax = plt.subplots(figsize=(12, 6))
most_played_tracks.head(10).plot.bar(color='coral', ax=ax)

ax.set_title('Top 10 Most Played Tracks')
ax.set_ylabel('Number of Plays')
ax.set_xlabel('Artist - Track Name')
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
plt.show()

### Pie chart of listening time by top 10 artists 

In [None]:
# 3. Pie chart for listening time by top 10 artists
# Only the top 10 are plotted, so each slice is a share of the top-10 total
fig, ax = plt.subplots(figsize=(10, 10))
artist_time_played.head(10).plot.pie(
    autopct='%1.1f%%',
    startangle=90,
    colors=plt.cm.Paired.colors,
    ax=ax,
)
ax.set_title('Percentage of Listening Time by Top 10 Artists')
ax.set_ylabel('')
plt.show()