# Importing the libraries

In [2]:
import os
import pandas as pd
import numpy as np
import json
import spotipy
import spotipy.oauth2 as oauth2
from spotipy.oauth2 import SpotifyOAuth,SpotifyClientCredentials
import re
from tqdm import tqdm
import multiprocessing as mp
import time
import random
import datetime

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

In [3]:
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv("../spotify_extraction.env")

# Credentials are read from the environment instead of being embedded in the
# notebook, so they are never committed or shared along with it.
# NOTE(review): assumes the .env file (or the shell) defines SPOTIPY_CLIENT_ID
# and SPOTIPY_CLIENT_SECRET -- confirm the variable names used in that file.
# NOTE(review): any key pair that was ever pasted into this notebook should be
# treated as leaked and rotated in the Spotify developer dashboard.
# Indexing os.environ (rather than .get) fails fast with a KeyError naming the
# missing variable.
SPOTIPY_CLIENT_ID = os.environ["SPOTIPY_CLIENT_ID"]
SPOTIPY_CLIENT_SECRET = os.environ["SPOTIPY_CLIENT_SECRET"]

# Client-credentials auth manager and the Spotify API client built on it.
auth_manager = SpotifyClientCredentials(client_id=SPOTIPY_CLIENT_ID,
                                        client_secret=SPOTIPY_CLIENT_SECRET)
sp = spotipy.client.Spotify(auth_manager=auth_manager)

# Importing the dataset

In [4]:
# Base table; its columns are listed in the next cell.
df = pd.read_csv('V1.csv')

# The three feature files are read with explicit column names.
# NOTE(review): with `names=` and no `header=`, pandas treats the first line of
# each file as data -- confirm these CSVs really have no header row.
artist_features = pd.read_csv(
    'data/artist_features.csv',
    names=['artist_uri', 'artist_pop', 'artist_genres'],
)
audio_features = pd.read_csv(
    'data/audio_features.csv',
    names=[
        'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
        'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
        'type', 'id', 'uri', 'track_href', 'analysis_url', 'duration_ms',
        'time_signature',
    ],
)
track_features = pd.read_csv(
    'data/track_features.csv',
    names=['track_uri', 'year', 'artists', 'explicit', 'name', 'track_pop'],
)

In [5]:
# Show the column labels of the base table loaded from V1.csv.
df.columns

Index(['track_uri', 'artist_uri', 'album_uri'], dtype='object')

In [6]:
# Show the column labels assigned to audio_features via `names=` at load time.
audio_features.columns

Index(['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'type', 'id', 'uri', 'track_href', 'analysis_url', 'duration_ms',
       'time_signature'],
      dtype='object')

# Merging all dataframes

In [7]:
# Attach audio features: df's `track_uri` is matched against audio_features' `id`.
# The outer join keeps unmatched rows from both sides (filled with NaN).
df = df.merge(audio_features, how='outer', left_on='track_uri', right_on='id')

In [8]:
# Attach track-level features on the shared `track_uri` key (outer join, so
# rows without a match on either side are kept with NaN in the missing columns).
df = df.merge(track_features, how='outer', on='track_uri')

In [9]:
# Attach artist-level features on the shared `artist_uri` key (outer join, so
# rows without a match on either side are kept with NaN in the missing columns).
df = df.merge(artist_features, how='outer', on='artist_uri')

# Handling missing data 

In [10]:
# Count missing values per column after the three outer merges.
df.isna().sum()

track_uri                 0
artist_uri                0
album_uri                 0
danceability        3057548
energy              3057548
key                 3057548
loudness            3057548
mode                3057548
speechiness         3057548
acousticness        3057548
instrumentalness    3057548
liveness            3057548
valence             3057548
tempo               3057548
type                3057548
id                  3057548
uri                 3057548
track_href          3057548
analysis_url        3057548
duration_ms         3057548
time_signature      3057548
year                3070081
artists             3070615
explicit            3070081
name                3070635
track_pop           3070081
artist_pop           442640
artist_genres        442640
dtype: int64

## Handling audio features missing from extraction

In [11]:
# Keep only fully populated rows: a row is removed if ANY column is NaN, so this
# filters on track and artist features too, not only on the audio features.
df = df.dropna(axis=0, how='any')

In [12]:
# Column labels of the merged frame after rows with missing values were dropped.
df.columns

Index(['track_uri', 'artist_uri', 'album_uri', 'danceability', 'energy', 'key',
       'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'type', 'id', 'uri', 'track_href',
       'analysis_url', 'duration_ms', 'time_signature', 'year', 'artists',
       'explicit', 'name', 'track_pop', 'artist_pop', 'artist_genres'],
      dtype='object')

In [13]:
# Preview the first row of the merged, NaN-free frame.
df.head(1)

Unnamed: 0,track_uri,artist_uri,album_uri,danceability,energy,key,loudness,mode,speechiness,acousticness,...,analysis_url,duration_ms,time_signature,year,artists,explicit,name,track_pop,artist_pop,artist_genres
54,1FtIVI4TDaPNJlIoJrHKBA,001aJOc7CSQVo3XzoLG4DK,0mxxWnON99ABmmeQdqX8Ds,0.751,0.288,6.0,-17.018,1.0,0.0304,0.324,...,https://api.spotify.com/v1/audio-analysis/1FtI...,327200.0,4.0,1989.0,One Way,0.0,Lady You Are,26.0,44.0,classic_soul disco funk new_jack_swing p_funk ...


In [15]:
# Column labels of df.
# NOTE(review): in this export the output is identical to the column listing two
# cells above; this cell looks redundant -- confirm before removing.
df.columns

Index(['track_uri', 'artist_uri', 'album_uri', 'danceability', 'energy', 'key',
       'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'type', 'id', 'uri', 'track_href',
       'analysis_url', 'duration_ms', 'time_signature', 'year', 'artists',
       'explicit', 'name', 'track_pop', 'artist_pop', 'artist_genres'],
      dtype='object')

In [18]:
# (rows, columns) of df.
# NOTE(review): this cell's execution count (18) is later than the drop cell
# below it (17). The recorded 18 columns equal the 28 columns listed above minus
# the 10 dropped in that cell, so the saved output reflects the frame AFTER the
# drop; on a top-to-bottom run this cell would report 28 columns instead.
df.shape

(189340, 18)

In [17]:
# Trim the merged frame down to the columns kept for export, expose track
# popularity under the name `popularity`, and keep the first row for each
# (name, year, artists) combination.
df = (
    df
    .drop(columns=[
        'type', 'track_uri', 'artist_uri', 'album_uri', 'uri', 'track_href',
        'analysis_url', 'time_signature', 'artist_pop', 'artist_genres',
    ])
    .rename(columns={'track_pop': 'popularity'})
    .drop_duplicates(subset=['name', 'year', 'artists'])
)

In [19]:
# Write the trimmed, de-duplicated frame to CSV; index=False omits the row index.
df.to_csv('data/AWS_data.csv',index=False)

## Data Preprocessing

Create five-point buckets for track and artist popularity,

and 50-year buckets for the track release date.

In [32]:
# Collapse each popularity score into a five-point bucket index (int(x / 5)).
# NOTE(review): earlier in this notebook `track_pop` is renamed to `popularity`
# and `artist_pop` is dropped, so on a top-to-bottom run these lookups would
# raise KeyError. This section appears to expect a frame that still has the
# original columns -- confirm which DataFrame it should operate on.
for pop_col in ('track_pop', 'artist_pop'):
    df[pop_col] = df[pop_col].apply(lambda score: int(score / 5))

In [33]:
# Reduce the release date to a 50-year bucket index:
#   1. keep the text before the first '-' (the year part of the date string),
#   2. cast it to int16,
#   3. integer-divide by 50 via int(year / 50).
# NOTE(review): the frame built earlier in this notebook has a `year` column and
# no `track_release_date`; confirm which DataFrame this cell is meant to run on.
df['track_release_date'] = (
    df['track_release_date']
    .apply(lambda release_date: release_date.split('-')[0])
    .astype('int16')
    .apply(lambda release_year: int(release_year / 50))
)

In [34]:
# Preview the first row after the popularity and release-date bucketing.
df.head(1)

Unnamed: 0,track_uri,artist_uri,album_uri,danceability,energy,key,loudness,mode,speechiness,acousticness,...,id,uri,track_href,analysis_url,duration_ms,time_signature,track_release_date,track_pop,artist_pop,artist_genres
27,2xW3EQxFuuFVgU6RCeZGe9,00190FC20vIUv0wXpeTf8S,05iZ9CNkJp5TURl0ET55hL,0.66,0.788,2.0,-6.066,1.0,0.0788,0.0312,...,2xW3EQxFuuFVgU6RCeZGe9,spotify:track:2xW3EQxFuuFVgU6RCeZGe9,https://api.spotify.com/v1/tracks/2xW3EQxFuuFV...,https://api.spotify.com/v1/audio-analysis/2xW3...,204233.0,4.0,40,4,5,beatboxing


In [35]:
# Write the bucketed frame to CSV; index=False omits the row index.
df.to_csv('data/1M_unique_processed_data.csv',index=False)