# This notebook reduces the daily chart data to one unique entry per song and adds aggregated metrics; the resulting CSV file(s) are used for further API requests.

In [7]:
import pandas as pd
# Load the daily chart export, parse the 'date' column into datetimes and
# order the rows by date with a fresh 0..n-1 index.
df = (
    pd.read_csv('daily_gb_17-24.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values('date')
    .reset_index(drop=True)
)

In [None]:

# Group by artist and track name to avoid duplicate requests, and compute per-song aggregates.


# Collapse the daily rows into one row per (artist_names, track_name) pair.
# Each entry maps an output column to (source column, aggregation function).
# Note that 'peak_streams' and 'best_day_streams' are both the maximum of
# 'streams'; the 'first' aggregations depend on df's current row order.
per_song_aggregations = {
    'peak_streams': ('streams', 'max'),
    'total_streams': ('streams', 'sum'),
    'max_peak_rank': ('peak_rank', 'max'),
    'min_peak_rank': ('peak_rank', 'min'),
    'max_days_on_chart': ('days_on_chart', 'max'),
    'best_day_streams': ('streams', 'max'),
    'first_appearance': ('date', 'first'),
    'uri': ('uri', 'first'),
    'source': ('source', 'first'),
    'count': ('track_name', 'count'),
}

song_stats = (
    df.groupby(['artist_names', 'track_name'])
    .agg(**per_song_aggregations)
    .reset_index()
)

In [None]:

# Get the date of the day with the most streams for each song, then sort by total streams (descending)


# Look up, for every song, the date of its highest-streaming day.
# A single grouped idxmax pass replaces the former row-wise apply, which
# re-scanned the whole of df (three boolean masks) once per song.
# idxmax yields the label of the first row holding each group's maximum,
# which is the row the previous `... == best_day_streams` / `.values[0]`
# lookup selected (df must carry a unique index for the .loc lookup).
song_keys = ['artist_names', 'track_name']
best_day_rows = df.groupby(song_keys)['streams'].idxmax()

# Re-key the looked-up dates by (artist_names, track_name) so they can be
# aligned to song_stats regardless of its current row order.
best_day_dates = df.loc[best_day_rows, 'date']
best_day_dates.index = best_day_rows.index

song_stats['best_day_date'] = best_day_dates.reindex(
    pd.MultiIndex.from_frame(song_stats[song_keys])
).to_numpy()

# Store the summed streams as integers and list the most-streamed songs first.
song_stats['total_streams'] = song_stats['total_streams'].astype(int)
song_stats = song_stats.sort_values(by='total_streams', ascending=False)


In [10]:
# Write the per-song table to disk; index=False leaves the row index out.
# NOTE(review): the input file is tagged 'gb' while this output is tagged
# 'ger' — confirm the intended region tag in the file name.
output_csv = 'cleaned_ger_17-24.csv'
song_stats.to_csv(output_csv, index=False)