## Data Collection from JioSaavn for Telugu and Hindi Languages

In [1]:
# importing required packages
import pandas as pd
import json
import urllib.request as req
import os
from tqdm import tqdm

In [2]:
# Playlist URLs for the Telugu language, keyed by a short label.
# One dictionary per grouping: by artist and by genre.
artist_telugu = {
    'armaan': 'https://www.jiosaavn.com/s/playlist/nirupamkumar783/Armaanmalik/-Bk0LScmHanuCJW60TJk1Q__',
    'geetha': 'https://www.jiosaavn.com/s/playlist/nirupamkumar783/Geetha/RVMkfEXHIXBieSJqt9HmOQ__',
    'anurag': 'https://www.jiosaavn.com/s/playlist/nirupamkumar783/Anurag/UEbEVNg4MnDfemJ68FuXsA__',
    'sidsriram': 'https://www.jiosaavn.com/s/playlist/nirupamkumar783/SidSriram/mf6uhpQYx9-femJ68FuXsA__',
}
genre_telugu = {
    'romantic': 'https://www.jiosaavn.com/s/playlist/nirupamkumar783/romantic_telugu/S9XlYCAY9MnuCJW60TJk1Q__',
    'workout': 'https://www.jiosaavn.com/s/playlist/nirupamkumar783/workout_telugu/52dDXu76ew3femJ68FuXsA__',
    'rock': 'https://www.jiosaavn.com/s/playlist/nirupamkumar783/rock_telugu/mdpU-D0Yfz3ufxkxMEIbIw__',
}

# Playlist URLs for the Hindi language, using the same artist/genre split.
artist_hindi = {
    'sonunigam': 'https://www.jiosaavn.com/s/playlist/nirupamkumar783/Sonunigam/XkR36glHnO2O0eMLZZxqsA__',
    'arijit': 'https://www.jiosaavn.com/s/playlist/nirupamkumar783/Arijit/NdU0h57FeuHuCJW60TJk1Q__',
    'atifaslam': 'https://www.jiosaavn.com/s/playlist/nirupamkumar783/Atifaslam/k-d-iYGE1mowkg5tVhI3fw__',
}
genre_hindi = {
    'workout': 'https://www.jiosaavn.com/s/playlist/nirupamkumar783/workout/vksQVeaTrrVuOxiEGmm6lQ__',
    'rock': 'https://www.jiosaavn.com/s/playlist/nirupamkumar783/rock/iiH095xhtlHfemJ68FuXsA__',
    'romantic': 'https://www.jiosaavn.com/s/playlist/nirupamkumar783/romantic_hindi/WLGabCRS6PhieSJqt9HmOQ__',
}

### Collecting audio data for Telugu language

In [3]:
# Fetch the song details of every Telugu genre and artist playlist from the
# service at 127.0.0.1:5000 and keep the decoded JSON, keyed by playlist label.
genre_songs_collection_telugu = {}
artist_songs_collection_telugu = {}

# Genres are requested first, then artists; both go through the same endpoint.
for playlist_urls, collection in (
    (genre_telugu, genre_songs_collection_telugu),
    (artist_telugu, artist_songs_collection_telugu),
):
    for playlist_name, playlist_url in playlist_urls.items():
        savan_api_url = 'http://127.0.0.1:5000/playlist/?query=' + playlist_url
        with req.urlopen(savan_api_url) as response:
            collection[playlist_name] = json.loads(response.read().decode())

In [4]:
# Setting up the output folders for the Telugu data.
# NOTE: the '*_mp3_path' keys point at 'mp4' directories; the key names are
# kept as they are so that any code looking them up keeps working.
paths = {
    'root_path': 'JioSavan Data/Telugu',
    'genre_mp3_path': 'JioSavan Data/Telugu/genre/mp4',
    'genre_wav_path': 'JioSavan Data/Telugu/genre/wav',
    'artist_mp3_path': 'JioSavan Data/Telugu/artist/mp4',
    'artist_wav_path': 'JioSavan Data/Telugu/artist/wav',
}

# exist_ok=True makes the cell safe to re-run and removes the gap between
# checking for a directory and creating it.
for path in paths.values():
    os.makedirs(path, exist_ok=True)

In [5]:
# Checking how many songs are present for each genre
for k, v in genre_songs_collection_telugu.items():
    try:
        print(k, ':', len(v['songs']))
    except (KeyError, TypeError):
        # This playlist's response has no usable 'songs' entry; print only
        # its label so that it stands out in the listing.
        print(k)

romantic : 25
workout : 24
rock : 13


In [6]:
# Checking how many songs are present for each artist
for k, v in artist_songs_collection_telugu.items():
    try:
        print(k, ':', len(v['songs']))
    except (KeyError, TypeError):
        # This playlist's response has no usable 'songs' entry; print only
        # its label so that it stands out in the listing.
        print(k)

armaan : 15
geetha : 23
anurag : 10
sidsriram : 8


In [9]:
# Save the collected genre and artist details as JSON under the root folder.
genres_json_path = paths['root_path'] + '/genre/genres_data.json'
with open(genres_json_path, 'w') as genres_file:
    json.dump(genre_songs_collection_telugu, genres_file)

artists_json_path = paths['root_path'] + '/artist/artists_data.json'
with open(artists_json_path, 'w') as artists_file:
    json.dump(artist_songs_collection_telugu, artists_file)

### Data collection and pre-processing for Telugu genres

In [28]:
# Reading the genre details back from the saved JSON file
genres_json_path = paths['root_path'] + '/genre/genres_data.json'
with open(genres_json_path, 'r') as genres_file:
    genre_songs_collection = json.load(genres_file)

In [29]:
# Build one data frame per genre from its 'songs' list, prepend a 'song_id'
# ("<genre>_<id>") and a 'genere' column, then stack the frames into genre_df.
genre_frames = []
for genre_type, genre_songs_details in genre_songs_collection.items():
    if genre_type == 'Electronic':
        # Entries labelled 'Electronic' are left out of the data frame.
        continue
    songs = pd.DataFrame.from_dict(genre_songs_details['songs'])
    songs.insert(0, 'song_id', genre_type + '_' + songs['id'])
    songs.insert(1, 'genere', genre_type)
    genre_frames.append(songs)

genre_df = pd.concat(genre_frames)
genre_df.shape

(62, 43)

In [30]:
# Number of rows per genre label in genre_df
genre_df['genere'].value_counts()

romantic    25
workout     24
rock        13
Name: genere, dtype: int64

In [31]:
# Write genre_df to CSV; the data frame index is not written.
genres_csv_path = paths['root_path'] + '/genre/genres_data.csv'
genre_df.to_csv(genres_csv_path, index=False)

In [32]:
# Download every genre song from its 'media_url' and save it as
# "<song_id>.mp4" in the genre/mp4 folder.
genre_mp4_mapping = dict(zip(genre_df['song_id'], genre_df['media_url']))
for idx, url in tqdm(genre_mp4_mapping.items()):
    download_path = paths['root_path'] + '/genre/mp4/' + str(idx) + '.mp4'
    try:
        req.urlretrieve(url, download_path)
    except Exception as exc:
        # Best effort: a failed download is reported and the loop moves on.
        # Catching Exception (not a bare except) keeps Ctrl-C / kernel
        # interrupt working during this long-running loop.
        print('Error ocuured for Id:', idx, '-', repr(exc))

100%|██████████| 62/62 [06:11<00:00,  5.99s/it]


### Data collection and pre-processing for Telugu artists

In [33]:
# Reading the artist details back from the saved JSON file
artists_json_path = paths['root_path'] + '/artist/artists_data.json'
with open(artists_json_path, 'r') as artists_file:
    artist_songs_collection = json.load(artists_file)

In [34]:
# Build one data frame per artist from its 'songs' list, prepend a 'song_id'
# ("<artist>_<id>") and an 'artist' column, then stack the frames into artist_df.
artist_frames = []
for artist_name, artist_songs_details in artist_songs_collection.items():
    songs = pd.DataFrame.from_dict(artist_songs_details['songs'])
    songs.insert(0, 'song_id', artist_name + '_' + songs['id'])
    songs.insert(1, 'artist', artist_name)
    artist_frames.append(songs)

artist_df = pd.concat(artist_frames)
artist_df.shape

(56, 41)

In [35]:
# Number of rows per artist label in artist_df
artist_df['artist'].value_counts()

geetha       23
armaan       15
anurag       10
sidsriram     8
Name: artist, dtype: int64

In [36]:
# Write artist_df to CSV; the data frame index is not written.
artists_csv_path = paths['root_path'] + '/artist/artists_data.csv'
artist_df.to_csv(artists_csv_path, index=False)

In [37]:
# Download every artist song from its 'media_url' and save it as
# "<song_id>.mp4" in the artist/mp4 folder.
artist_mp4_mapping = dict(zip(artist_df['song_id'], artist_df['media_url']))
for idx, url in tqdm(artist_mp4_mapping.items()):
    download_path = paths['root_path'] + '/artist/mp4/' + str(idx) + '.mp4'
    try:
        req.urlretrieve(url, download_path)
    except Exception as exc:
        # Best effort: a failed download is reported and the loop moves on.
        # Catching Exception (not a bare except) keeps Ctrl-C / kernel
        # interrupt working during this long-running loop.
        print('Error ocuured for Id:', idx, '-', repr(exc))

100%|██████████| 56/56 [05:28<00:00,  5.87s/it]


### Collecting audio data for Hindi language

In [10]:
# Fetch the song details of every Hindi genre and artist playlist from the
# service at 127.0.0.1:5000 and keep the decoded JSON, keyed by playlist label.
genre_songs_collection_hindi = {}
artist_songs_collection_hindi = {}

# Genres are requested first, then artists; both go through the same endpoint.
# A variant of the query with '&lyrics=true' appended was left disabled in
# this cell and is not used.
for playlist_urls, collection in (
    (genre_hindi, genre_songs_collection_hindi),
    (artist_hindi, artist_songs_collection_hindi),
):
    for playlist_name, playlist_url in playlist_urls.items():
        savan_api_url = 'http://127.0.0.1:5000/playlist/?query=' + playlist_url
        with req.urlopen(savan_api_url) as response:
            collection[playlist_name] = json.loads(response.read().decode())

In [11]:
# Setting up the output folders for the Hindi data.
# NOTE: the '*_mp3_path' keys point at 'mp4' directories; the key names are
# kept as they are so that any code looking them up keeps working.
paths = {
    'root_path': 'JioSavan Data/Hindi',
    'genre_mp3_path': 'JioSavan Data/Hindi/genre/mp4',
    'genre_wav_path': 'JioSavan Data/Hindi/genre/wav',
    'artist_mp3_path': 'JioSavan Data/Hindi/artist/mp4',
    'artist_wav_path': 'JioSavan Data/Hindi/artist/wav',
}

# exist_ok=True makes the cell safe to re-run and removes the gap between
# checking for a directory and creating it.
for path in paths.values():
    os.makedirs(path, exist_ok=True)

In [12]:
# Checking how many songs are present for each genre
for k, v in genre_songs_collection_hindi.items():
    try:
        print(k, ':', len(v['songs']))
    except (KeyError, TypeError):
        # This playlist's response has no usable 'songs' entry; print only
        # its label so that it stands out in the listing.
        print(k)

workout : 20
rock : 16
romantic : 16


In [13]:
# Checking how many songs are present for each artist
for k, v in artist_songs_collection_hindi.items():
    try:
        print(k, ':', len(v['songs']))
    except (KeyError, TypeError):
        # This playlist's response has no usable 'songs' entry; print only
        # its label so that it stands out in the listing.
        print(k)

sonunigam : 20
arijit : 20
atifaslam : 20


In [14]:
# Save the collected genre and artist details as JSON under the root folder.
genres_json_path = paths['root_path'] + '/genre/genres_data.json'
with open(genres_json_path, 'w') as genres_file:
    json.dump(genre_songs_collection_hindi, genres_file)

artists_json_path = paths['root_path'] + '/artist/artists_data.json'
with open(artists_json_path, 'w') as artists_file:
    json.dump(artist_songs_collection_hindi, artists_file)

### Data collection and pre-processing for Hindi genres

In [15]:
# Reading the genre details back from the saved JSON file
genres_json_path = paths['root_path'] + '/genre/genres_data.json'
with open(genres_json_path, 'r') as genres_file:
    genre_songs_collection = json.load(genres_file)

In [16]:
# Build one data frame per genre from its 'songs' list, prepend a 'song_id'
# ("<genre>_<id>") and a 'genere' column, then stack the frames into genre_df.
genre_frames = []
for genre_type, genre_songs_details in genre_songs_collection.items():
    if genre_type == 'Electronic':
        # Entries labelled 'Electronic' are left out of the data frame.
        continue
    songs = pd.DataFrame.from_dict(genre_songs_details['songs'])
    songs.insert(0, 'song_id', genre_type + '_' + songs['id'])
    songs.insert(1, 'genere', genre_type)
    genre_frames.append(songs)

genre_df = pd.concat(genre_frames)
genre_df.shape

(52, 43)

In [17]:
# Number of rows per genre label in genre_df
genre_df['genere'].value_counts()

workout     20
romantic    16
rock        16
Name: genere, dtype: int64

In [18]:
# Preview the first rows of genre_df
genre_df.head()

Unnamed: 0,song_id,genere,320kbps,album,album_url,albumid,artistMap,cache_state,copyright_text,duration,...,singers,song,starred,starring,type,vcode,vlink,year,disabled,disabled_text
0,workout_ptVsuhLj,workout,True,Garmi (From 'Street Dancer 3D'),https://www.jiosaavn.com/album/garmi-from-stre...,18434002,"{'Badshah': '456863', 'Neha Kakkar': '464932',...",False,℗ 2019 Super Cassettes Industries Private Limited,182,...,"Neha Kakkar, Badshah",Garmi,False,"Varun Dhawan, Prabhu Deva, Shraddha Kapoor, No...",,10910091082929,https://jiotunepreview.jio.com/content/Convert...,2019,,
1,workout_9xnr1G5u,workout,True,Baar Baar Dekho,https://www.jiosaavn.com/album/baar-baar-dekho...,2480284,"{'Amar Arshi': '457728', 'Amrik Singh': '54578...",False,Zee Music Company,187,...,"Amar Arshi, Badshah, Neha Kakkar",Kala Chashma,False,"Sidharth Malhotra, Katrina Kaif, Sarika, Ram K...",,10910440266684,https://jiotunepreview.jio.com/content/Convert...,2016,,
2,workout_yshGBwsU,workout,True,Befikre,https://www.jiosaavn.com/album/befikre/0J-8H55...,2652003,"{'Arijit Singh': '459320', 'Caralisa Monteiro'...",False,© 2016 YRF Music,237,...,"Arijit Singh, Caralisa Monteiro",Nashe Si Chadh Gayi,False,"Ranveer Singh, Vaani Kapoor",,10910390528186,https://jiotunepreview.jio.com/content/Convert...,2016,,
3,workout_Xo1Z1OwW,workout,True,Raabta,https://www.jiosaavn.com/album/raabta/CKUSdZyu...,10660301,"{'Amitabh Bhattacharya': '458681', 'Arijit Sin...",False,© 2017 T-Series,275,...,Arijit Singh,Ik Vaari Aa,False,"Sushant Singh Rajput, Kriti Sanon, Jim Sarbh, ...",,10910090619276,https://jiotunepreview.jio.com/content/Convert...,2017,,
4,workout_3ERwm_MU,workout,True,Shivaay,https://www.jiosaavn.com/album/shivaay/BNW4dXY...,2659075,"{'Abigail Eames': '2134917', 'Ajay Devgn': '46...",False,© 2016 T-Series,374,...,"Mithoon, Mohit Chauhan, Sukhwinder Singh, Bads...",Bolo Har Har Har,False,"Ajay Devgn, Sayyeshaa, Erika Kaar, Abigail Eam...",,10910090345218,https://jiotunepreview.jio.com/content/Convert...,2016,,


In [20]:
# Write genre_df to CSV; the data frame index is not written.
genres_csv_path = paths['root_path'] + '/genre/genres_data.csv'
genre_df.to_csv(genres_csv_path, index=False)

In [21]:
# Download every genre song from its 'media_url' and save it as
# "<song_id>.mp4" in the genre/mp4 folder.
genre_mp4_mapping = dict(zip(genre_df['song_id'], genre_df['media_url']))
for idx, url in tqdm(genre_mp4_mapping.items()):
    download_path = paths['root_path'] + '/genre/mp4/' + str(idx) + '.mp4'
    try:
        req.urlretrieve(url, download_path)
    except Exception as exc:
        # Best effort: a failed download is reported and the loop moves on.
        # Catching Exception (not a bare except) keeps Ctrl-C / kernel
        # interrupt working during this long-running loop.
        print('Error ocuured for Id:', idx, '-', repr(exc))

100%|██████████| 52/52 [03:08<00:00,  3.63s/it]


### Data collection and pre-processing for Hindi artists

In [22]:
# Reading the artist details back from the saved JSON file
artists_json_path = paths['root_path'] + '/artist/artists_data.json'
with open(artists_json_path, 'r') as artists_file:
    artist_songs_collection = json.load(artists_file)

In [23]:
# Build one data frame per artist from its 'songs' list, prepend a 'song_id'
# ("<artist>_<id>") and an 'artist' column, then stack the frames into artist_df.
artist_frames = []
for artist_name, artist_songs_details in artist_songs_collection.items():
    songs = pd.DataFrame.from_dict(artist_songs_details['songs'])
    songs.insert(0, 'song_id', artist_name + '_' + songs['id'])
    songs.insert(1, 'artist', artist_name)
    artist_frames.append(songs)

artist_df = pd.concat(artist_frames)
artist_df.shape

(60, 41)

In [24]:
# Number of rows per artist label in artist_df
artist_df['artist'].value_counts()

arijit       20
sonunigam    20
atifaslam    20
Name: artist, dtype: int64

In [25]:
# Write artist_df to CSV; the data frame index is not written.
artists_csv_path = paths['root_path'] + '/artist/artists_data.csv'
artist_df.to_csv(artists_csv_path, index=False)

In [26]:
# Download every artist song from its 'media_url' and save it as
# "<song_id>.mp4" in the artist/mp4 folder.
artist_mp4_mapping = dict(zip(artist_df['song_id'], artist_df['media_url']))
for idx, url in tqdm(artist_mp4_mapping.items()):
    download_path = paths['root_path'] + '/artist/mp4/' + str(idx) + '.mp4'
    try:
        req.urlretrieve(url, download_path)
    except Exception as exc:
        # Best effort: a failed download is reported and the loop moves on.
        # Catching Exception (not a bare except) keeps Ctrl-C / kernel
        # interrupt working during this long-running loop.
        print('Error ocuured for Id:', idx, '-', repr(exc))

100%|██████████| 60/60 [04:14<00:00,  4.25s/it]
