# Tworzenie nowego popularity i sprzątanie danych

In [275]:
import os
import pandas as pd
import plotly.express as px
from sklearn.preprocessing import MinMaxScaler
import plotly.express as px
import time

In [276]:
folder_path = "../data_v2/data/"

files = os.listdir(folder_path)

raw_data = {}

for file in files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_json(file_path, lines=True)
    raw_data[file] = df

In [277]:
print(raw_data.keys())

dict_keys(['artists.jsonl', 'sessions.jsonl', 'tracks.jsonl', 'users.jsonl'])


In [278]:
sessions_df = raw_data['sessions.jsonl']
tracks_df = raw_data['tracks.jsonl']
artists_df = raw_data['artists.jsonl']

In [279]:
track_appearance_counts = sessions_df['track_id'].value_counts()
track_appearance_counts = track_appearance_counts.reindex(tracks_df['id'], fill_value=0)
track_appearance_counts.describe()

count    22412.000000
mean        64.433875
std         10.758655
min         23.000000
25%         57.000000
50%         64.000000
75%         72.000000
max        111.000000
Name: count, dtype: float64

Sessions w pełni pokrywa wszystkie utwory

In [280]:
event_map = {'play': 1, 'skip': -1, 'like': 2, 'advertisement': 0}

In [281]:
sessions_df['popularity_coefficient'] = sessions_df['event_type'].map(event_map)
popularity_per_track = sessions_df.groupby('track_id')['popularity_coefficient'].sum().reset_index()
popularity_per_track.rename(columns = {'popularity_coefficient':'popularity_from_sessions'}, inplace = True)

In [282]:
tracks_with_popularity = pd.merge(tracks_df, popularity_per_track, left_on='id', right_on='track_id', how='left')
# niepotrzebne ze względu na pełne pokrycie
# tracks_with_popularity['popularity_from_sessions'].fillna(0, inplace=True)

In [283]:
scaler = MinMaxScaler()
tracks_with_popularity[['normalized_popularity','normalized_popularity_from_sessions']] = scaler.fit_transform(tracks_with_popularity[['popularity','popularity_from_sessions']])

In [284]:
tracks_with_popularity[['normalized_popularity','normalized_popularity_from_sessions']].describe()

Unnamed: 0,normalized_popularity,normalized_popularity_from_sessions
count,22412.0,22412.0
mean,0.216158,0.378079
std,0.167672,0.122217
min,0.0,0.0
25%,0.083333,0.292683
50%,0.1875,0.378049
75%,0.333333,0.45122
max,1.0,1.0


In [285]:
fig = px.histogram(tracks_with_popularity, x=['normalized_popularity', 'normalized_popularity_from_sessions'],
                   nbins=50, opacity=0.7, barmode='overlay',
                   labels={'value': 'Normalized Popularity'},
                   title='Distribution of Normalized Popularity Values')
fig.show()

# Przygotowanie danych treningowych

In [286]:
tracks_with_popularity.describe()

Unnamed: 0,popularity,duration_ms,explicit,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,popularity_from_sessions,normalized_popularity,normalized_popularity_from_sessions
count,22412.0,22412.0,22412.0,22412.0,22412.0,22412.0,22412.0,22412.0,22412.0,22412.0,22412.0,22412.0,22412.0,22412.0,22412.0,22412.0
mean,61.375602,230138.3,0.166607,0.599096,0.648017,5.278824,-7.196227,0.082914,0.267758,0.04022,0.193141,0.518,121.911945,49.002454,0.216158,0.378079
std,8.048246,72095.62,0.372633,0.158259,0.209577,3.558813,3.738098,0.089317,0.276111,0.152828,0.167943,0.244477,29.615254,10.021829,0.167672,0.122217
min,51.0,30622.0,0.0,0.0,0.000103,0.0,-44.41,0.0,1e-06,0.0,0.012,0.0,0.0,18.0,0.0,0.0
25%,55.0,191493.0,0.0,0.495,0.508,2.0,-8.6145,0.0339,0.0298,0.0,0.0944,0.325,97.988,42.0,0.083333,0.292683
50%,60.0,220667.0,0.0,0.608,0.672,5.0,-6.393,0.0463,0.165,3e-06,0.125,0.512,120.041,49.0,0.1875,0.378049
75%,67.0,256240.0,0.0,0.715,0.816,8.0,-4.85,0.086,0.455,0.000523,0.24,0.713,140.078,55.0,0.333333,0.45122
max,99.0,4120258.0,1.0,0.98,0.999,11.0,0.642,0.944,0.996,0.998,0.997,0.991,220.099,100.0,1.0,1.0


In [287]:
tracks_with_popularity.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22412 entries, 0 to 22411
Data columns (total 21 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   id                                   22412 non-null  object 
 1   name                                 22412 non-null  object 
 2   popularity                           22412 non-null  int64  
 3   duration_ms                          22412 non-null  int64  
 4   explicit                             22412 non-null  int64  
 5   id_artist                            22412 non-null  object 
 6   release_date                         22412 non-null  object 
 7   danceability                         22412 non-null  float64
 8   energy                               22412 non-null  float64
 9   key                                  22412 non-null  int64  
 10  loudness                             22412 non-null  float64
 11  speechiness                 

In [288]:
tracks_with_popularity_artists = pd.merge(tracks_with_popularity, artists_df, left_on='id_artist', right_on='id', how='left')
tracks_with_popularity_artists.drop(columns=['track_id'], inplace=True)
tracks_with_popularity_artists.rename(columns = {'id_x':'track_id', 'name_x':'track_name', 'id_y':'artist_id', 'name_y':'artist_name'}, inplace = True)

In [289]:
tracks_with_popularity_artists.columns

Index(['track_id', 'track_name', 'popularity', 'duration_ms', 'explicit',
       'id_artist', 'release_date', 'danceability', 'energy', 'key',
       'loudness', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'popularity_from_sessions',
       'normalized_popularity', 'normalized_popularity_from_sessions',
       'artist_id', 'artist_name', 'genres'],
      dtype='object')

In [290]:
tracks_with_popularity_artists.drop(columns=['id_artist', 'normalized_popularity', 'normalized_popularity_from_sessions'], axis=1, inplace=True)

In [291]:
tracks_with_popularity_artists.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22412 entries, 0 to 22411
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   track_id                  22412 non-null  object 
 1   track_name                22412 non-null  object 
 2   popularity                22412 non-null  int64  
 3   duration_ms               22412 non-null  int64  
 4   explicit                  22412 non-null  int64  
 5   release_date              22412 non-null  object 
 6   danceability              22412 non-null  float64
 7   energy                    22412 non-null  float64
 8   key                       22412 non-null  int64  
 9   loudness                  22412 non-null  float64
 10  speechiness               22412 non-null  float64
 11  acousticness              22412 non-null  float64
 12  instrumentalness          22412 non-null  float64
 13  liveness                  22412 non-null  float64
 14  valenc

# Konwersja daty

In [292]:
tracks_with_popularity_artists['release_date'] = pd.to_datetime(tracks_with_popularity_artists['release_date'], format='mixed')
tracks_with_popularity_artists['release_date_numeric'] = tracks_with_popularity_artists['release_date'].apply(lambda x: time.mktime(x.timetuple()) if not pd.isnull(x) else None)

In [293]:
tracks_with_popularity_artists[['release_date', 'release_date_numeric']].describe()

Unnamed: 0,release_date,release_date_numeric
count,22412,22412.0
mean,2006-08-04 16:57:32.864536832,1154705000.0
min,1929-01-01 00:00:00,-1293844000.0
25%,2000-05-16 00:00:00,958428000.0
50%,2011-02-01 00:00:00,1296515000.0
75%,2017-05-12 00:00:00,1494540000.0
max,2021-04-10 00:00:00,1618006000.0
std,,440415900.0


TODO One hot encoding

In [294]:
# one_hot_genres = tracks_with_popularity_artists['genres'].str.get_dummies(', ')
# tracks_with_popularity_artists_one_hot_genres = pd.concat([tracks_with_popularity_artists, one_hot_genres], axis=1)

# Normalizacja

In [295]:
tracks_with_popularity_artists.columns

Index(['track_id', 'track_name', 'popularity', 'duration_ms', 'explicit',
       'release_date', 'danceability', 'energy', 'key', 'loudness',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'popularity_from_sessions', 'artist_id',
       'artist_name', 'genres', 'release_date_numeric'],
      dtype='object')

In [296]:
numerical_columns = ['popularity', 'duration_ms', 'explicit',
                      'danceability', 'energy', 'key', 'loudness',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'popularity_from_sessions', 'release_date_numeric']

In [297]:
tracks_with_popularity_artists[numerical_columns].isna().sum()

popularity                  0
duration_ms                 0
explicit                    0
danceability                0
energy                      0
key                         0
loudness                    0
speechiness                 0
acousticness                0
instrumentalness            0
liveness                    0
valence                     0
tempo                       0
popularity_from_sessions    0
release_date_numeric        0
dtype: int64

In [298]:
scaler = MinMaxScaler()
tracks_with_popularity_artists_scaled = tracks_with_popularity_artists.copy()
tracks_with_popularity_artists_scaled[numerical_columns] = scaler.fit_transform(tracks_with_popularity_artists[numerical_columns])

In [301]:
tracks_with_popularity_artists_scaled.describe()

Unnamed: 0,popularity,duration_ms,explicit,release_date,danceability,energy,key,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,popularity_from_sessions,release_date_numeric
count,22412.0,22412.0,22412.0,22412,22412.0,22412.0,22412.0,22412.0,22412.0,22412.0,22412.0,22412.0,22412.0,22412.0,22412.0,22412.0
mean,0.216158,0.048786,0.166607,2006-08-04 16:57:32.864536832,0.611323,0.64863,0.479893,0.826018,0.087833,0.268832,0.040301,0.1839,0.522705,0.553896,0.378079,0.840891
min,0.0,0.0,0.0,1929-01-01 00:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.083333,0.039336,0.0,2000-05-16 00:00:00,0.505102,0.508458,0.181818,0.794537,0.035911,0.029918,0.0,0.083655,0.327952,0.4452,0.292683,0.773485
50%,0.1875,0.04647,0.0,2011-02-01 00:00:00,0.620408,0.672639,0.454545,0.843847,0.049047,0.165661,3e-06,0.114721,0.51665,0.545395,0.378049,0.889592
75%,0.333333,0.055168,0.0,2017-05-12 00:00:00,0.729592,0.816798,0.727273,0.878096,0.091102,0.456827,0.000524,0.231472,0.719475,0.636432,0.45122,0.957599
max,1.0,1.0,1.0,2021-04-10 00:00:00,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
std,0.167672,0.017629,0.372633,,0.161488,0.209809,0.323528,0.082973,0.094616,0.27722,0.153134,0.170501,0.246697,0.134554,0.122217,0.15125


: 

In [300]:
tracks_with_popularity_artists_scaled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22412 entries, 0 to 22411
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   track_id                  22412 non-null  object        
 1   track_name                22412 non-null  object        
 2   popularity                22412 non-null  float64       
 3   duration_ms               22412 non-null  float64       
 4   explicit                  22412 non-null  float64       
 5   release_date              22412 non-null  datetime64[ns]
 6   danceability              22412 non-null  float64       
 7   energy                    22412 non-null  float64       
 8   key                       22412 non-null  float64       
 9   loudness                  22412 non-null  float64       
 10  speechiness               22412 non-null  float64       
 11  acousticness              22412 non-null  float64       
 12  instrumentalness  