In [45]:
import pandas as pd

In [46]:
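# Disabled alternative input: the IUM_Zad_04_02_v2 dataset. Uncomment these
# lines (and skip the next cell) to prepare that dataset instead.
# artists = pd.read_json('../data/IUM_Zad_04_02_v2/artists.jsonl', lines=True)
# sessions = pd.read_json('../data/IUM_Zad_04_02_v2/sessions.jsonl', lines=True)
# tracks = pd.read_json('../data/IUM_Zad_04_02_v2/tracks.jsonl', lines=True)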

In [47]:
# Directory holding the JSON Lines dump to prepare. Kept in one place so that
# switching to another dataset version is a single-line change.
DATASET_DIR = '../data/IUM_Zad_04_01_v3'

# Each file stores one JSON record per line, hence lines=True.
artists = pd.read_json(f'{DATASET_DIR}/artists.jsonl', lines=True)
sessions = pd.read_json(f'{DATASET_DIR}/sessions.jsonl', lines=True)
tracks = pd.read_json(f'{DATASET_DIR}/tracks.jsonl', lines=True)

In [48]:
# Index the session events by their timestamp so that time windows can be
# selected with plain comparisons against the index.
# The guard keeps this cell safe to re-run: once 'timestamp' has been moved
# into the index it is no longer a column, and converting it a second time
# would raise KeyError.
if 'timestamp' in sessions.columns:
    sessions = sessions.assign(
        timestamp=pd.to_datetime(sessions['timestamp'])
    ).set_index('timestamp')

In [49]:
# Attach artist metadata to every track (inner join on the artist id).
# Columns present in both tables are suffixed by the merge; the artist suffix
# ends with an underscore, presumably so that the artists' `id` column does not
# clash with the `id_artist` key that tracks already carries. That duplicate
# key column is then dropped and the artist name gets its final column name.
data = (
    tracks
    .merge(artists, left_on='id_artist', right_on='id', suffixes=('_track', '_artist_'))
    .drop(columns=['id_artist_'])
    .rename(columns={'name_artist_': 'name_artist'})
)

In [50]:
# Mean track popularity per artist, returned as a two-column frame
# (id_artist, artist_popularity).
artist_popularity = (
    data.groupby('id_artist')['popularity']
    .mean()
    .rename('artist_popularity')
    .reset_index()
)

In [51]:
# Broadcast each artist's mean popularity onto all of that artist's tracks.
# An 'artist_popularity' column left over from a previous run of this cell is
# dropped first; without that, re-running would merge a second copy and pandas
# would emit suffixed duplicates instead of a single 'artist_popularity' column.
data = (
    data
    .drop(columns=['artist_popularity'], errors='ignore')
    .merge(artist_popularity, on='id_artist')
)

In [52]:
# Empty placeholder for the final dataset; it is filled with one block of rows
# per week by the weekly aggregation cell that follows.
prepared_data = pd.DataFrame()

In [53]:
# Build one feature frame per 7-day window and stack them once at the end.
#
# Windows are anchored at the earliest session timestamp and are half-open,
# [week_start, week_start + 7 days), so consecutive windows tile the timeline
# with no gaps and no overlap. (An inclusive 6-day window combined with a
# 7-day stride would leave the last day of every week uncounted.)
WEEK_LENGTH = pd.Timedelta(days=7)

# Loop-invariant inputs: only 'play' events are counted, and only the
# track-id -> artist-id pair is needed to attribute a play to an artist.
play_events = sessions[sessions['event_type'] == 'play']
track_to_artist = tracks[['id', 'id_artist']]

# The window starts are derived from *all* sessions, not just plays, so that
# week numbering is tied to the full observation period.
week_starts = pd.date_range(
    start=sessions.index.min(),
    end=sessions.index.max(),
    freq=WEEK_LENGTH,
)

# Collect the weekly frames in a list and concatenate once; concatenating
# inside the loop copies all previously accumulated rows on every iteration
# and would also append to leftovers from an earlier run of this cell.
weekly_frames = []
for week_number, week_start in enumerate(week_starts):
    week_end = week_start + WEEK_LENGTH
    in_week = (play_events.index >= week_start) & (play_events.index < week_end)
    week_plays = play_events[in_week]

    # Number of plays per track in this window, keyed like `data` (id_track).
    track_plays = (
        week_plays.groupby('track_id')
        .size()
        .rename_axis('id_track')
        .reset_index(name='track_plays')
    )

    # Number of plays per artist in this window, summed over the artist's tracks.
    artist_plays = (
        week_plays.merge(track_to_artist, left_on='track_id', right_on='id')
        .groupby('id_artist')
        .size()
        .reset_index(name='artist_plays')
    )

    # Left joins keep every track; tracks/artists without plays in this window
    # come back as NaN and are turned into an integer count of 0.
    week_data = (
        data.merge(track_plays, on='id_track', how='left')
        .merge(artist_plays, on='id_artist', how='left')
        .fillna({'track_plays': 0, 'artist_plays': 0})
        .astype({'track_plays': int, 'artist_plays': int})
        .assign(week_number=week_number)
    )
    weekly_frames.append(week_data)

prepared_data = pd.concat(weekly_frames, ignore_index=True)

In [54]:
# Preview the first rows of the stacked weekly dataset as a sanity check.
prepared_data.head()

Unnamed: 0,id_track,name_track,popularity,duration_ms,explicit,id_artist,release_date,danceability,energy,key,...,instrumentalness,liveness,valence,tempo,name_artist,genres,artist_popularity,track_plays,artist_plays,week_number
0,0RNxWy0PC3AyH4ThH3aGK6,Mack the Knife,55,201467,0,19eLuQmk9aCobbVDHc6eek,1929,0.673,0.377,0,...,0.0,0.332,0.713,88.973,Louis Armstrong,"[adult standards, dixieland, harlem renaissanc...",61.090909,0,0,0
1,17gxfuiFUrLhbUKdunxUPJ,Moon River,58,179867,0,19eLuQmk9aCobbVDHc6eek,1964-10-25,0.448,0.12,0,...,0.0135,0.1,0.261,86.407,Louis Armstrong,"[adult standards, dixieland, harlem renaissanc...",61.090909,0,0,0
2,63kd4m3VFxcJjPVVtbVNAu,"Hello, Dolly!",53,147000,0,19eLuQmk9aCobbVDHc6eek,1964-10-25,0.0,0.405,0,...,0.00114,0.198,0.0,0.0,Louis Armstrong,"[adult standards, dixieland, harlem renaissanc...",61.090909,0,0,0
3,1qCQTy0fTXerET4x8VHyr9,What A Wonderful World,74,137520,0,19eLuQmk9aCobbVDHc6eek,1968,0.399,0.258,5,...,2e-06,0.128,0.192,108.174,Louis Armstrong,"[adult standards, dixieland, harlem renaissanc...",61.090909,0,0,0
4,1UH4viviUjZnS9aWgPGrk0,La vie en rose - Single Version,66,204400,0,19eLuQmk9aCobbVDHc6eek,1989-01-01,0.507,0.0779,0,...,0.00275,0.108,0.326,70.808,Louis Armstrong,"[adult standards, dixieland, harlem renaissanc...",61.090909,0,0,0


In [55]:
# Write the prepared dataset to disk; the positional row index carries no
# information, so it is left out of the file.
output_csv = '../data/prepared_data_v3.csv'
prepared_data.to_csv(output_csv, index=False)