## Data Collection from JioSavan for English Language

This notebook helps to collect the mp4 URLs of different songs from JioSavan API

In [1]:
# importing required packaegs
import pandas as pd
import json
import urllib.request as req
import os
from tqdm import tqdm

In [2]:
# Defining url for gener and artist playlists
genre = {
    'Rock': 'https://www.jiosaavn.com/s/playlist/38e6bfd36d1ad1f70db84ef71a7fa88e/Rock/49t5x5-aMliO0eMLZZxqsA__',
    'Workout': 'https://www.jiosaavn.com/s/playlist/38e6bfd36d1ad1f70db84ef71a7fa88e/Workout/CfL9X1gZ2cZuOxiEGmm6lQ__',
    'Romantic': 'https://www.jiosaavn.com/s/playlist/38e6bfd36d1ad1f70db84ef71a7fa88e/Romantic/9ZUJjaXZGjRuOxiEGmm6lQ__',
    'HipHopParty': 'https://www.jiosaavn.com/s/playlist/38e6bfd36d1ad1f70db84ef71a7fa88e/HipHopParty/1fClLeOCHDbufxkxMEIbIw__'
}


artist = {
    'Selena Gomez': 'https://www.jiosaavn.com/s/playlist/38e6bfd36d1ad1f70db84ef71a7fa88e/Selena_Gomez/5Dof731eh2Mwkg5tVhI3fw__',
    'Adam Levine': 'https://www.jiosaavn.com/s/playlist/38e6bfd36d1ad1f70db84ef71a7fa88e/Adam_Levine/xiejqZd7vwxieSJqt9HmOQ__',
    'Justin Bieber': 'https://www.jiosaavn.com/s/playlist/38e6bfd36d1ad1f70db84ef71a7fa88e/Justin_Bieber/IAT5LUjgzvwwkg5tVhI3fw__',
    'Drake': 'https://www.jiosaavn.com/s/playlist/38e6bfd36d1ad1f70db84ef71a7fa88e/Drake/fM-lT4AodABuOxiEGmm6lQ__',
    'The Weeknd': 'https://www.jiosaavn.com/s/playlist/38e6bfd36d1ad1f70db84ef71a7fa88e/The_Weeknd/NSaWZdVxZL4GSw2I1RxdhQ__'
}

In [3]:
# Collecting defined geners and artistis songs related info and loading into a dictionary
genre_songs_collection = dict()
artist_songs_collection = dict()

for genre_name, genre_url in genre.items():
    #savan_api_url = 'http://127.0.0.1:5000/playlist/?query=' + genre_url + '&lyrics=true'
    savan_api_url = 'http://127.0.0.1:5000/playlist/?query=' + genre_url
    with req.urlopen(savan_api_url) as url:
        genre_songs_collection[genre_name] = json.loads(url.read().decode())
        
for artist_name, artist_url in artist.items():
    #savan_api_url = 'http://127.0.0.1:5000/playlist/?query=' + artist_url + '&lyrics=true'
    savan_api_url = 'http://127.0.0.1:5000/playlist/?query=' + artist_url 
    with req.urlopen(savan_api_url) as url:
        artist_songs_collection[artist_name] = json.loads(url.read().decode())

In [4]:
# Checking how many songs present for each genre
for k,v in genre_songs_collection.items():
    try:
        print(k,':' ,len(v['songs']))
    except:
        print(k)

Rock : 20
Workout : 20
Romantic : 20
HipHopParty : 25


In [5]:
# Checking how many songs present for each artist
for k,v in artist_songs_collection.items():
    try:
        print(k,':' ,len(v['songs']))
    except:
        print(k)

Selena Gomez : 28
Adam Levine : 27
Justin Bieber : 32
Drake : 28
The Weeknd : 15


In [7]:
# Setting up the folders

paths ={
    'root_path': 'JioSavan Data/English',
    'genre_mp3_path' : 'JioSavan Data/English/genre/mp4',
    'genre_wav_path' : 'JioSavan Data/English/genre/wav',
    'artist_mp3_path' : 'JioSavan Data/English/artist/mp4',
    'artist_wav_path' : 'JioSavan Data/English/artist/wav',
}

for name, path in paths.items():
    if not os.path.exists(path):
        os.makedirs(path)


In [8]:
# saving the collced info
with open(paths['root_path'] + '/genre/genres_data.json', 'w') as outfile:
    json.dump(genre_songs_collection, outfile)
    
with open(paths['root_path'] + '/artist/artists_data.json', 'w') as outfile:
    json.dump(artist_songs_collection, outfile)

## Genres data collection and pre-processing

In [9]:
#Redaing data from saved files
with open(paths['root_path'] + '/genre/genres_data.json', 'r') as file:
    genre_songs_collection = json.load(file)

In [10]:
# Converting genre_songs_collection into a data frame

appended_data = []
for genre_type, genre_songs_details in genre_songs_collection.items():
    if genre_type != 'Electronic' :
        temp_df = pd.DataFrame.from_dict(genre_songs_details['songs'])
        temp_df.insert(0, 'song_id', genre_type + '_' + temp_df['id'])
        temp_df.insert(1, 'genere', genre_type)
        appended_data.append(temp_df) # store DataFrame in list 
genre_df = pd.concat(appended_data)
genre_df.shape

(85, 42)

In [11]:
genre_df['genere'].value_counts()

HipHopParty    25
Rock           20
Romantic       20
Workout        20
Name: genere, dtype: int64

In [12]:
genre_df.head()

Unnamed: 0,song_id,genere,320kbps,album,album_url,albumid,artistMap,cache_state,copyright_text,duration,...,rights,singers,song,starred,starring,triller_available,type,year,vcode,vlink
0,Rock_PUjl7vs8,Rock,True,Jesus,https://www.jiosaavn.com/album/jesus/GekIXXJHXfU_,2457552,{'Chris Tomlin': '572982'},False,℗ 2016 sixstepsrecords/Sparrow Records,249,...,"{'cacheable': True, 'code': 0, 'delete_cached_...",Chris Tomlin,Jesus,False,,False,,2016,,
1,Rock_IPbuIHKX,Rock,True,The Dark Side of the Moon,https://www.jiosaavn.com/album/the-dark-side-o...,1707513,"{'David Gilmour': '528009', 'Nick Mason': '584...",False,(P) 2016 The copyright in this sound recording...,418,...,"{'cacheable': True, 'code': 0, 'delete_cached_...","Pink Floyd, David Gilmour, Nick Mason, Richard...",Time,False,,False,,1973,,
2,Rock_ifkMQ1jy,Rock,True,We Are The Champions (Raw Sessions Version),https://www.jiosaavn.com/album/we-are-the-cham...,11597696,{'Queen': '527759'},False,"℗ 2017 Queen Productions Ltd, under exclusive ...",273,...,"{'cacheable': True, 'code': 0, 'delete_cached_...",Queen,We Are The Champions (Alternative Version),False,,False,,2017,,
3,Rock_ZKY9_R-U,Rock,True,Unleashed,https://www.jiosaavn.com/album/unleashed/-kwmr...,2502954,{'Skillet': '704751'},False,℗ 2016 Atlantic Recording Corporation,232,...,"{'cacheable': True, 'code': 0, 'delete_cached_...",Skillet,The Resistance,False,,False,,2016,,
4,Rock_nhn4ApPt,Rock,True,Wake Me up When September Ends (Live at Foxbor...,https://www.jiosaavn.com/album/wake-me-up-when...,1169546,{'Green Day': '645985'},False,℗ 2005 Reprise Records,340,...,"{'cacheable': True, 'code': 0, 'delete_cached_...",Green Day,Wake Me up When September Ends (Live at Foxbor...,False,,False,,2005,,


In [13]:
genre_df.columns

Index(['song_id', 'genere', '320kbps', 'album', 'album_url', 'albumid',
       'artistMap', 'cache_state', 'copyright_text', 'duration',
       'encrypted_media_path', 'encrypted_media_url', 'explicit_content',
       'featured_artists', 'featured_artists_id', 'has_lyrics', 'id', 'image',
       'label', 'label_url', 'language', 'lyrics_snippet', 'media_preview_url',
       'media_url', 'music', 'music_id', 'origin', 'perma_url', 'play_count',
       'primary_artists', 'primary_artists_id', 'release_date', 'rights',
       'singers', 'song', 'starred', 'starring', 'triller_available', 'type',
       'year', 'vcode', 'vlink'],
      dtype='object')

In [14]:
# save the data into CSV
genre_df.to_csv(paths['root_path'] + '/genre/genres_data.csv', index = False)

In [15]:
# downloading songs from a url and saving in mp4 format
genre_mp4_mapping = dict(zip(genre_df['song_id'], genre_df['media_url']))
for idx, url in tqdm(genre_mp4_mapping.items()):
    try:
        download_path = paths['root_path'] + '/genre/mp4/' + str(idx) +'.mp4'
        req.urlretrieve(url, download_path)
    except:
        print('Error ocuured for Id:', idx)

100%|██████████| 85/85 [05:36<00:00,  3.96s/it]


## Artists data collection and pre-processing

In [23]:
# Reading artists_data 
with open(paths['root_path'] + '/artist/artists_data.json', 'r') as file:
    artist_songs_collection = json.load(file)

In [24]:
# Converting genre_songs_collection into a data frame
appended_data = []
for artist_name, artist_songs_details in artist_songs_collection.items():
    temp_df = pd.DataFrame.from_dict(artist_songs_details['songs'])
    temp_df.insert(0, 'song_id', artist_name + '_' + temp_df['id'])
    temp_df.insert(1, 'artist', artist_name)
    appended_data.append(temp_df) # store DataFrame in list
    
artist_df = pd.concat(appended_data)
artist_df.shape

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if __name__ == '__main__':


(130, 43)

In [25]:
artist_df.head()

Unnamed: 0,320kbps,album,album_url,albumid,artist,artistMap,cache_state,copyright_text,disabled,disabled_text,...,rights,singers,song,song_id,starred,starring,type,vcode,vlink,year
0,True,Wolves,https://www.jiosaavn.com/album/wolves/8MDTQ3s3...,11686511,Selena Gomez,"{'Marshmello': '862454', 'Selena Gomez': '6038...",False,℗ 2017 Interscope Records,True,Unavailable,...,"{'cacheable': False, 'code': 1, 'delete_cached...","Selena Gomez, Marshmello",Wolves,Selena Gomez_2oWa2CKB,False,,,10912291154903,https://jiotunepreview.jio.com/content/Convert...,2017
1,True,It Ain't Me,https://www.jiosaavn.com/album/it-aint-me/b6tO...,10218145,Selena Gomez,"{'Ali Tamposi': '566701', 'Andrew Watt': '7775...",False,(P) 2017 Kygo AS under exclusive license to So...,True,Unavailable,...,"{'cacheable': False, 'code': 1, 'delete_cached...","Kygo, Selena Gomez, Kygo & Selena Gomez",It Ain't Me,Selena Gomez_sEdO8hht,False,,,10910140587988,https://jiotunepreview.jio.com/content/Convert...,2017
2,True,Fetish,"https://www.jiosaavn.com/album/fetish/XQagTr,l...",11164990,Selena Gomez,"{'Gucci Mane': '634121', 'Selena Gomez': '6038...",False,℗ 2017 Interscope Records,True,Unavailable,...,"{'cacheable': False, 'code': 1, 'delete_cached...","Selena Gomez, Gucci Mane",Fetish,Selena Gomez_hJBMcPcc,False,,,10912291154921,https://jiotunepreview.jio.com/content/Convert...,2017
3,True,13 Reasons Why (Season 2),https://www.jiosaavn.com/album/13-reasons-why-...,12879128,Selena Gomez,{'Selena Gomez': '603812'},False,℗ 2018 Interscope Records,True,Unavailable,...,"{'cacheable': False, 'code': 1, 'delete_cached...",Selena Gomez,Back To You (From 13 Reasons Why â Season 2 ...,Selena Gomez_dXLuqVW9,False,,,10912291154241,https://jiotunepreview.jio.com/content/Convert...,2018
4,True,Bad Liar,https://www.jiosaavn.com/album/bad-liar/VPeCFo...,10842137,Selena Gomez,{'Selena Gomez': '603812'},False,℗ 2017 Interscope Records,True,Unavailable,...,"{'cacheable': False, 'code': 1, 'delete_cached...",Selena Gomez,Bad Liar,Selena Gomez_HWSf8WIh,False,,,10912291154897,https://jiotunepreview.jio.com/content/Convert...,2017


In [26]:
artist_df['artist'].value_counts()

Justin Bieber    32
Drake            28
Selena Gomez     28
Adam Levine      27
The Weeknd       15
Name: artist, dtype: int64

In [28]:
# save the data into CSV
artist_df.to_csv(paths['root_path'] + '/artist/artists_data.csv', index = False)

In [29]:
# downloading songs from a url and saving in mp3 format
artist_mp4_mapping = dict(zip(artist_df['song_id'], artist_df['media_url']))
for idx, url in tqdm(artist_mp4_mapping.items()):
    try:
        download_path = paths['root_path'] +'/artist/mp4/' + str(idx) +'.mp4'
        req.urlretrieve(url, download_path)
    except:
        print('Error ocuured for Id:', idx)

100%|██████████| 130/130 [09:26<00:00,  4.36s/it]
