In [1]:
import psycopg2
import pandas as pd
from config import config
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime
from sklearn.decomposition import PCA
import numpy as np
import csv

# Set up the SQL connection.
# `config()` comes from the local `config` module; whatever it returns is
# unpacked as keyword arguments to psycopg2.connect().
params = config()
conn = psycopg2.connect(**params)
# Module-level cursor opened on the shared connection.
cur = conn.cursor()

def main_db():
    """Load the joined YouTube/Spotify dataset into a DataFrame.

    Inner-joins ``youtube_videos`` with ``channel_names``, ``spotify_tracks``,
    ``tracks_artists``, ``artist_names`` and ``audio_features`` and returns
    every row the joins produce, so a track can appear more than once.

    Uses the module-level ``conn`` connection.

    Returns:
        pandas.DataFrame: one column per selected field (track metadata,
        audio features, channel name, publish date and view count).
    """
    query = ("""SELECT sp.s_id, 
    sp.s_release_date, 
    sp.s_track_name, 
    artistname, 
    sp.featuresid, 
    popularity, 
    danceability, 
    energy, 
    music_key, 
    loudness, 
    music_mode, 
    speechiness, 
    acousticness, 
    instrumentalness, 
    liveness, 
    valence, 
    tempo, 
    time_signature,
    channelname,
    datepublished,
    view_count
    FROM youtube_videos yt
    INNER JOIN channel_names cn
    ON yt.channelid = cn.channelid
    INNER JOIN spotify_tracks sp
    ON yt.s_id = sp.s_id 
    INNER JOIN tracks_artists ta
    ON sp.s_id = ta.s_id
    INNER JOIN artist_names an
    ON ta.artistid = an.artistid
    INNER JOIN audio_features af
    ON sp.featuresid = af.featuresid
    """)
    # pandas runs the statement itself. A separate cursor.execute(query)
    # beforehand would only run the same query a second time and throw the
    # result away, so the query is executed exactly once here.
    return pd.read_sql(query, con=conn)

In [2]:
# Load the joined dataset from the database and preview the first rows.
df = main_db()
df.head()

Unnamed: 0,s_id,s_release_date,s_track_name,artistname,featuresid,popularity,danceability,energy,music_key,loudness,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,channelname,datepublished,view_count
0,1HNC0b3AN2jbMqN3FElzfX,2017-08-25,Mantra,TroyBoi,0,38.0,0.738,0.868,2,-4.988,...,0.0459,0.0115,0.832,0.0831,0,94,4,BassNation,2016-11-21,1450567
1,1HNC0b3AN2jbMqN3FElzfX,2017-08-25,Mantra,TroyBoi,0,38.0,0.738,0.868,2,-4.988,...,0.0459,0.0115,0.832,0.0831,0,94,4,TrapNation,2021-01-25,110164
2,3zSwpAhbPIF4TonAOO35Pk,2021-01-07,NFS,Juche,1,33.0,0.472,0.981,2,-0.797,...,0.0547,0.0246,0.000275,0.271,0,150,4,TrapNation,2021-01-23,145696
3,1F6VtyBAeSSh6FCBzT251f,2020-08-01,Akita,Skeler,2,41.0,0.466,0.688,10,-9.102,...,0.0375,0.265,0.395,0.116,0,128,3,TrapNation,2021-01-21,191266
4,1OAaiLYD5BbWTjTMPymaV5,2019-10-04,Unity,Axel Thesleff,3,35.0,0.658,0.534,9,-7.907,...,0.0426,0.0807,0.0103,0.0781,0,120,4,TrapNation,2021-01-17,314356


In [3]:
# Let pandas pick the best-fitting dtypes, then parse both date columns
# into real datetimes.
df = df.convert_dtypes()
for date_col in ('datepublished', 's_release_date'):
    df[date_col] = pd.to_datetime(df[date_col])

In [13]:
# Popularity summary (mean / min / max) for two selected artists,
# transposed so that each artist becomes a column.
selected_rows = df[df['artistname'].isin(['TroyBoi', 'RL Grime'])]
popularity_stats = selected_rows.groupby('artistname').agg({'popularity': ['mean', 'min', 'max']})
test_sum = pd.DataFrame(popularity_stats.T)
test_sum.head()

Unnamed: 0,artistname,RL Grime,TroyBoi
popularity,mean,41.357143,42.071429
popularity,min,15.0,8.0
popularity,max,59.0,58.0


In [18]:
# One label/value option dict per distinct artist name in the dataset.
artists = df['artistname'].unique()
artist_options = [{'label': name, 'value': name} for name in artists]

print(artist_options)


[{'label': 'TroyBoi', 'value': 'TroyBoi'}, {'label': 'Juche', 'value': 'Juche'}, {'label': 'Skeler', 'value': 'Skeler'}, {'label': 'Axel Thesleff', 'value': 'Axel Thesleff'}, {'label': 'Ezra', 'value': 'Ezra'}, {'label': 'DROELOE', 'value': 'DROELOE'}, {'label': 'Jordan Comolli', 'value': 'Jordan Comolli'}, {'label': 'KLOUD', 'value': 'KLOUD'}, {'label': 'Fabian Mazur', 'value': 'Fabian Mazur'}, {'label': 'Fytch', 'value': 'Fytch'}, {'label': 'barnacle boi', 'value': 'barnacle boi'}, {'label': 'CloZee', 'value': 'CloZee'}, {'label': 'Alison Wonderland', 'value': 'Alison Wonderland'}, {'label': 'LEXIM', 'value': 'LEXIM'}, {'label': 'Hopex', 'value': 'Hopex'}, {'label': 'San Holo', 'value': 'San Holo'}, {'label': 'Arrested Youth', 'value': 'Arrested Youth'}, {'label': 'Bishu', 'value': 'Bishu'}, {'label': 'JAWNS', 'value': 'JAWNS'}, {'label': 'TIGER DROOL', 'value': 'TIGER DROOL'}, {'label': 'QUIX', 'value': 'QUIX'}, {'label': 'Vincent', 'value': 'Vincent'}, {'label': 'Sidewalks and Skel

In [21]:
# Filter selections: artists (an empty list means "all artists"), YouTube
# channels, Spotify release-date range and YouTube publish-date range.
artists = ['RL Grime', 'TroyBoi', 'Kaivon', 'Illenium']
channels = ['TrapNation','TrapCity','BassNation','UKFDubstep','BassMusicMovement','DubRebellion']
# Default to the full span of release dates present in the data
# (use e.g. ['2009-01-01', '2020-12-01'] for a fixed window instead).
# Series.min()/.max() skip missing dates, unlike the builtin min()/max().
s_dates = [df['s_release_date'].min(), df['s_release_date'].max()]
y_dates = ['2009-01-01', '2020-12-01']

# Build the channel/date mask once so both cases apply the same inclusive
# date test; the artist restriction is added only when artists were chosen.
row_mask = (df['channelname'].isin(channels) &
            df['s_release_date'].between(s_dates[0], s_dates[1]) &
            df['datepublished'].between(y_dates[0], y_dates[1]))
if artists:
    row_mask &= df['artistname'].isin(artists)
df_filtered = df[row_mask]
df_filtered

Unnamed: 0,s_id,s_release_date,s_track_name,artistname,featuresid,popularity,danceability,energy,music_key,loudness,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,channelname,datepublished,view_count
0,1HNC0b3AN2jbMqN3FElzfX,2017-08-25,Mantra,TroyBoi,0,38,0.738,0.868,2,-4.988,...,0.0459,0.0115,0.832,0.0831,0,94,4,BassNation,2016-11-21,1450567
6,4Qgr3o07k8NuGRRno8z9rO,2017-12-04,X2C,TroyBoi,5,36,0.675,0.563,0,-7.072,...,0.033,0.000765,0.0733,0.0749,0,136,4,BassNation,2017-12-02,141351
7,4Qgr3o07k8NuGRRno8z9rO,2017-12-04,X2C,TroyBoi,5,36,0.675,0.563,0,-7.072,...,0.033,0.000765,0.0733,0.0749,0,136,4,TrapCity,2017-12-21,919311
67,0f30zxuzCZ3ayoKY88nnAo,2020-11-13,Free Falling,Kaivon,34,50,0.546,0.623,6,-8.334,...,0.0561,0.00839,0.0271,0.231,0,120,4,TrapNation,2020-11-14,177271
68,02uwXnLbfT68JKAShQ0VTS,2020-11-13,Mother Africa,TroyBoi,35,52,0.739,0.69,10,-7.448,...,0.0314,0.142,0.61,0.134,1,132,4,TrapCity,2020-11-12,79550
69,02uwXnLbfT68JKAShQ0VTS,2020-11-13,Mother Africa,TroyBoi,35,52,0.739,0.69,10,-7.448,...,0.0314,0.142,0.61,0.134,1,132,4,TrapNation,2020-11-13,857575
369,5sl2TZpcQJAoiBK5ncSTk2,2020-03-06,RUSYNTH (feat. RL Grime),RL Grime,189,40,0.455,0.808,8,-5.831,...,0.0395,0.0334,0.921,0.151,0,160,4,TrapCity,2020-03-09,225941
370,5sl2TZpcQJAoiBK5ncSTk2,2020-03-06,RUSYNTH (feat. RL Grime),RL Grime,189,40,0.455,0.808,8,-5.831,...,0.0395,0.0334,0.921,0.151,0,160,4,TrapNation,2020-03-21,275019
403,1coBBKWnPLu1lsDbuzUVIf,2020-02-20,AJA AJA,TroyBoi,211,44,0.808,0.62,8,-3.468,...,0.103,0.0327,0.000933,0.0418,0,87,4,TrapCity,2020-02-20,484801
404,1coBBKWnPLu1lsDbuzUVIf,2020-02-20,AJA AJA,TroyBoi,211,44,0.808,0.62,8,-3.468,...,0.103,0.0327,0.000933,0.0418,0,87,4,TrapNation,2020-02-20,379624


In [39]:
# Scatter of YouTube view count (x) against Spotify popularity (y) for the
# filtered rows; hovering a point shows the track name.
fig = px.scatter(df_filtered, x="view_count", y="popularity", hover_name='s_track_name')
fig.show()

In [61]:
# Overlaid, semi-transparent popularity histograms: one trace per channel.
# Values are taken from the full `df`, not from `df_filtered`.
fig = go.Figure()
for channel in channels:
    channel_popularity = df.loc[df['channelname'] == channel, 'popularity']
    fig.add_trace(go.Histogram(x=channel_popularity, name=channel, opacity=0.75))

fig.update_layout(
    barmode='overlay',
    title={'text': 'Popularity', 'y': 0.9, 'x': 0.5, 'xanchor': 'center', 'yanchor': 'top'},
    xaxis_title_text='Popularity',
    yaxis_title_text='Count',
)
fig.show()

In [66]:
# Overlaid, semi-transparent popularity histograms: one trace per artist.
# Values are taken from the full `df`, not from `df_filtered`.
fig = go.Figure()
for artist in artists:
    artist_popularity = df.loc[df['artistname'] == artist, 'popularity']
    fig.add_trace(go.Histogram(x=artist_popularity, name=artist, opacity=0.75))

fig.update_layout(
    barmode='overlay',
    title={'text': 'Popularity', 'y': 0.9, 'x': 0.5, 'xanchor': 'center', 'yanchor': 'top'},
    xaxis_title_text='Popularity',
    yaxis_title_text='Count',
)
fig.show()

In [73]:
# Feature columns (named as in the database query) that feed the PCA.
audio_features = ['danceability', 'energy', 'music_key', 'loudness', 'music_mode', 'speechiness', 
                  'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 
                  'time_signature']
X = df_filtered[audio_features].to_numpy(dtype='float')
# Track identifiers next to their features. A plain column selection keeps
# exactly one row per row of X; merging the frame with itself on the index
# is unnecessary and would multiply rows if the index ever had duplicates.
X_id = df_filtered[['s_track_name', 's_id'] + audio_features]

# NOTE(review): the features are not standardised before the PCA, so columns
# with large numeric ranges (e.g. tempo) likely dominate the components.
# Confirm this is intended.
pca = PCA(n_components=2)
components = pca.fit_transform(X)

# `components` is a positional array, while df_filtered keeps the row labels
# of the unfiltered frame. Passing the names as a plain array pairs them with
# the points by position and rules out any index-based misalignment.
fig = px.scatter(components, x=0, y=1, hover_name=X_id['s_track_name'].to_numpy())

fig.show()

In [84]:
# Re-query the database, repeat the dtype/date normalisation and collect the
# distinct Spotify track IDs.
df = main_db().convert_dtypes()
for date_col in ('datepublished', 's_release_date'):
    df[date_col] = pd.to_datetime(df[date_col])
sids = df['s_id'].unique()


In [25]:
import json
import os
import time

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# Credentials are read from the environment rather than stored in the
# notebook. The key pair that used to be hardcoded in this cell has been
# exposed and should be rotated in the Spotify developer dashboard.
try:
    client_id = os.environ['SPOTIPY_CLIENT_ID']
    client_secret = os.environ['SPOTIPY_CLIENT_SECRET']
except KeyError as missing:
    raise RuntimeError(
        'Set the SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET environment '
        'variables before running this cell'
    ) from missing

client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
# Spotify client object used for every API call below.
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [141]:
# Fetch a single track from Spotify and pretty-print the raw response to
# inspect its structure.
result = sp.track('4bKqAcgrla5I0xYiAuYSBC')
print(json.dumps(result, indent=4))


{
    "album": {
        "album_type": "single",
        "artists": [
            {
                "external_urls": {
                    "spotify": "https://open.spotify.com/artist/0rSRhW3EmJTsqAM3hTefwA"
                },
                "href": "https://api.spotify.com/v1/artists/0rSRhW3EmJTsqAM3hTefwA",
                "id": "0rSRhW3EmJTsqAM3hTefwA",
                "name": "TIGER DROOL",
                "type": "artist",
                "uri": "spotify:artist:0rSRhW3EmJTsqAM3hTefwA"
            },
            {
                "external_urls": {
                    "spotify": "https://open.spotify.com/artist/19EW4WBhl0fvZUQgi7wV5M"
                },
                "href": "https://api.spotify.com/v1/artists/19EW4WBhl0fvZUQgi7wV5M",
                "id": "19EW4WBhl0fvZUQgi7wV5M",
                "name": "QUIX",
                "type": "artist",
                "uri": "spotify:artist:19EW4WBhl0fvZUQgi7wV5M"
            },
            {
                "external_urls": {
        

In [105]:
# Map every artist name credited on the known tracks to its Spotify artist ID.
# Tracks are looked up in batches: sp.tracks() accepts up to 50 IDs per call,
# which replaces one HTTP request per track.
TRACKS_PER_REQUEST = 50

artist_ids = {}
for start in range(0, len(sids), TRACKS_PER_REQUEST):
    batch = list(sids[start:start + TRACKS_PER_REQUEST])
    for result in sp.tracks(batch)['tracks']:
        if result is None:
            # Spotify returns null for an ID it does not recognise.
            continue
        # A single loop covers tracks with one, several or no credited
        # artists; the first ID seen for a name is kept.
        for artist in result['artists']:
            artist_ids.setdefault(artist['name'], artist['id'])
        

In [106]:
# Count the distinct artist names present in the database-backed frame.
unique_artists = df['artistname'].unique()
print(len(unique_artists))

3697


In [107]:
# Number of distinct artist names collected from the Spotify track lookups.
print(len(artist_ids))

3698


In [108]:
# Artist names returned by Spotify that do not appear (with identical
# spelling) among the database artist names.
# A set gives constant-time lookups instead of rescanning the array per name.
known_names = set(unique_artists)
for name in artist_ids:
    if name not in known_names:
        print(name)

XIE
Chad
Wyd!
Trilo
Goja
Masta Don
Ghosh


In [109]:
# Database artist names that have no identically spelled key in artist_ids.
missing_from_spotify = [name for name in unique_artists if name not in artist_ids]
for name in missing_from_spotify:
    print(name)

Xie
Plowman
Born I Music
WYD!
TRILO
Mastadon


In [110]:
# Spot-check the value stored for a single artist.
artist_ids['Eptic']

'4dvZ0abeUaiHm7Fu9Gj0PQ'

In [113]:
# Fetch one artist's Spotify profile, pretty-print the raw response, then
# show the two fields of interest: follower total and genre list.
result = sp.artist(artist_ids['Eptic'])
print(json.dumps(result, indent=4))
print(result['followers']['total'])
print(result['genres'])

{
    "external_urls": {
        "spotify": "https://open.spotify.com/artist/4dvZ0abeUaiHm7Fu9Gj0PQ"
    },
    "followers": {
        "href": null,
        "total": 163735
    },
    "genres": [
        "brostep",
        "complextro",
        "dubstep",
        "edm",
        "electro house",
        "electronic trap",
        "filthstep"
    ],
    "href": "https://api.spotify.com/v1/artists/4dvZ0abeUaiHm7Fu9Gj0PQ",
    "id": "4dvZ0abeUaiHm7Fu9Gj0PQ",
    "images": [
        {
            "height": 640,
            "url": "https://i.scdn.co/image/f7365f22a97874385b1131b2d3690727d7e24e5f",
            "width": 640
        },
        {
            "height": 320,
            "url": "https://i.scdn.co/image/0943004528fab5e39cd5c1687adc6ef4d533b0e8",
            "width": 320
        },
        {
            "height": 160,
            "url": "https://i.scdn.co/image/92a00857a37f386f6c8f35b36d8df75fd4af744e",
            "width": 160
        }
    ],
    "name": "Eptic",
    "popularity": 

In [117]:
# Enrich every artist entry with the follower count and genres from Spotify.
for artist, entry in artist_ids.items():
    # Entries start out as a bare ID string and become dicts once this cell
    # has run. Accept both forms so that re-running the cell does not pass a
    # dict to sp.artist().
    artist_id = entry['id'] if isinstance(entry, dict) else entry
    result = sp.artist(artist_id)
    artist_ids[artist] = {'id': artist_id, 'followers': result['followers']['total'], 'genres': result['genres']}


In [118]:
# Spot-check the entry stored for a single artist.
artist_ids['Eptic']

{'id': '4dvZ0abeUaiHm7Fu9Gj0PQ',
 'followers': 163735,
 'genres': ['brostep',
  'complextro',
  'dubstep',
  'edm',
  'electro house',
  'electronic trap',
  'filthstep']}

In [123]:
# Look up one artist's top tracks; show the first track's ID and name and
# how many tracks the response contains.
result = sp.artist_top_tracks(artist_ids['Eptic']['id'])
#print(json.dumps(result, indent=4))
print(result['tracks'][0]['id'])
print(result['tracks'][0]['name'])
print(len(result['tracks']))

3qgtGhoCutbBX73cd6ArCW
SouthSide (feat. Eptic)
10


In [124]:
# For every artist, record the IDs of the tracks returned by the top-tracks
# lookup, keyed by Spotify artist ID.
artist_tops = {}
for artist_info in artist_ids.values():
    spotify_artist_id = artist_info['id']
    artist_tops[spotify_artist_id] = top_track_ids = []
    result = sp.artist_top_tracks(spotify_artist_id)
    top_track_ids.extend(track['id'] for track in result['tracks'])
        

In [125]:
# Display the per-artist lists of top-track IDs (the output is large).
artist_tops.values()

dict_values([['0tAFy8LVdMQgW6A6Nq7olS', '1T3E8z6VMXSsACrBSQHTkN', '3srQQxcIZO3PxtOJ83BmDM', '1gV0hgMNdpSWeW7ZjSUKnX', '59QmjU5K8XQTfj9JjFNb4Z', '753DZsGGXlasRCp48FmOhH', '7vBqDxT6CcJhnOCRfM9Wvi', '1aQ8HdSmvofR0bjGkpvaKf', '02uwXnLbfT68JKAShQ0VTS', '1dfdPmrD0O3oiRl8uvwbKS'], ['4AnstjlAxgNRjym4uQQsju', '7M14W9QJ43hPh6gaVCcsbH', '6BWjy4BEMpbEzgwJm1637h', '3vuXoJvzhhDDQbh1BUsPIK', '4czy2hfLO3xBa85Q3Qkucb', '43JltL1Sno0XJu97qmnUDi', '4XPH8tZKpHcapMo9cxtLz4', '7tN7aVXhFiGrJn24N9faRf', '3zSwpAhbPIF4TonAOO35Pk', '1s9RYYdrGhhQTgMs4FQwC1'], ['45KsTd0ZqKlgw0fNc8oyC5', '1SMRiuk5Yt1IJztGGw7LvG', '7bgm7BtKzJTW2ViBkX4abh', '562If4OJFL1ptRIfMCEyT7', '04FXtfbY8KFILgf7P3mEul', '0bKIXko2IX0y0yOG0dzlrk', '7obCERKA5H8m7zpEVxndmM', '02lPNSgehiFIo7BcdCnkt0', '64F0tid5vwapfuC4ERAHyA', '7M14W9QJ43hPh6gaVCcsbH'], ['5ZVujHtwnsqFJvLnqRSdch', '7jzqNM05a0watKpcN0kqyk', '2Ene2KtroHMwyI4oxwEbl9', '63tlZzersJQ4qqMuFtsnXl', '1OAaiLYD5BbWTjTMPymaV5', '02UCSfDqzFyVXqnNfuD8RV', '3DsCw4OuoUVid7emUZ5xyq', '0EAPR9IXTwWN6fQf5

In [127]:
# Flatten the per-artist lists into a single list of track IDs.
# Despite the name, duplicates are still present at this point.
artist_tops_unique = []
for top_track_ids in artist_tops.values():
    artist_tops_unique.extend(top_track_ids)

In [130]:
# Number of distinct track IDs across all artists' top-track lists.
len(set(artist_tops_unique))

29791

In [132]:
# Top-track IDs that are not already known from the database.
# The known IDs go into a set so each membership test is constant time, and
# the result is sorted so the list order is the same on every run.
known_sids = set(sids)
sids_to_query = sorted(set(artist_tops_unique) - known_sids)

In [133]:
# Number of track IDs that still have to be fetched.
len(sids_to_query)

26632

In [168]:
# Fetch the audio features of a single track and pretty-print the raw
# response to see which fields are available.
features = sp.audio_features('6V5720ymG48XTzxyvz6Diu')
print(json.dumps(features, indent=4))


[
    {
        "danceability": 0.629,
        "energy": 0.963,
        "key": 2,
        "loudness": -2.138,
        "mode": 1,
        "speechiness": 0.247,
        "acousticness": 0.13,
        "instrumentalness": 0,
        "liveness": 0.14,
        "valence": 0.345,
        "tempo": 75.0,
        "type": "audio_features",
        "id": "6V5720ymG48XTzxyvz6Diu",
        "uri": "spotify:track:6V5720ymG48XTzxyvz6Diu",
        "track_href": "https://api.spotify.com/v1/tracks/6V5720ymG48XTzxyvz6Diu",
        "analysis_url": "https://api.spotify.com/v1/audio-analysis/6V5720ymG48XTzxyvz6Diu",
        "duration_ms": 198400,
        "time_signature": 4
    }
]


In [150]:
# Audio-feature field names as they appear in the Spotify API response.
audio_features = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
                  'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
                  'duration_ms', 'time_signature']
# Row template: the track metadata fields first, then one field per audio
# feature, every value starting out as an empty string.
new_tracks = dict.fromkeys(
    ['s_id', 's_track', 's_artists', 's_release_date', 'popularity'] + audio_features,
    '',
)
new_tracks

{'s_id': '',
 's_track': '',
 's_artists': '',
 's_release_date': '',
 'popularity': '',
 'danceability': '',
 'energy': '',
 'key': '',
 'loudness': '',
 'mode': '',
 'speechiness': '',
 'acousticness': '',
 'instrumentalness': '',
 'liveness': '',
 'valence': '',
 'tempo': '',
 'duration_ms': '',
 'time_signature': ''}

In [1]:
# Download metadata and audio features for every ID in `sids_to_query` and
# write one CSV row per track.
# This cell needs the import and setup cells to have run in the current
# kernel (csv, time, np, sp, sids_to_query); running it in a fresh kernel
# fails with a NameError.
with open('new_tracks.csv', 'w', encoding='utf-8', newline='') as csv_file:
    # The field lists are defined here so the cell does not depend on
    # `audio_features` / `new_tracks` left behind by other cells.
    track_fields = ['s_id', 's_track', 's_artists', 's_release_date', 'popularity']
    feature_fields = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 
                      'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 
                      'duration_ms', 'time_signature']
    fieldnames = track_fields + feature_fields
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()

    # Random pause (seconds) between batches, to stay polite to the API.
    sleep_min = 2
    sleep_max = 5
    # sp.tracks() takes at most 50 IDs per call; sp.audio_features() accepts
    # the same batch, so each batch costs two requests instead of 100.
    batch_size = 50
    start_time = time.time()
    request_count = 0  # number of tracks processed so far

    for batch_start in range(0, len(sids_to_query), batch_size):
        print(str(request_count) + " tracks searched")
        time.sleep(np.random.uniform(sleep_min, sleep_max))
        print('Elapsed Time: {} seconds'.format(time.time() - start_time))  

        batch = list(sids_to_query[batch_start:batch_start + batch_size])
        results = sp.tracks(batch)['tracks']
        features = sp.audio_features(batch)

        for sid, result, track_features in zip(batch, results, features):
            request_count += 1
            if result is None:
                # Spotify returns null for an ID it does not recognise.
                continue

            # A fresh dict per track: no state is carried between rows.
            new_tracks = {
                's_id': sid,
                's_track': result['name'],
                's_artists': [artist['name'] for artist in result['artists']],
                's_release_date': result['album']['release_date'],
                'popularity': result['popularity'],
            }
            # Tracks without audio features come back as None; write empty
            # feature cells for them rather than relying on a TypeError.
            for f in feature_fields:
                new_tracks[f] = track_features[f] if track_features is not None else None

            writer.writerow(new_tracks)


NameError: name 'csv' is not defined