In [1]:
import psycopg2
import pandas as pd
from config import config
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime
from sklearn.decomposition import PCA


# Set up the SQL connection.
# config() comes from the project-local `config` module; whatever it returns is
# unpacked as keyword arguments to psycopg2.connect().
params = config()
conn = psycopg2.connect(**params)
# Cursor opened on the shared connection.
# NOTE(review): nothing in the visible cells closes `cur` or `conn` — confirm
# they are released elsewhere (or close them at the end of the notebook).
cur = conn.cursor()

def main_db(connection=None):
    """Load the joined YouTube / Spotify dataset into a DataFrame.

    The query starts from ``youtube_videos`` and inner-joins channel names,
    Spotify tracks, track artists, artist names and audio features, so only
    videos with a match in every joined table are returned.

    Parameters
    ----------
    connection : DB-API connection, optional
        Connection to run the query on. When omitted, the notebook's
        module-level ``conn`` is used (the original behaviour).

    Returns
    -------
    pandas.DataFrame
        One row per joined record with the track metadata, audio-feature
        columns, channel name, publish date and view count listed in the
        SELECT clause below.
    """
    query = ("""SELECT sp.s_id, 
    sp.s_release_date, 
    sp.s_track_name, 
    artistname, 
    sp.featuresid, 
    popularity, 
    danceability, 
    energy, 
    music_key, 
    loudness, 
    music_mode, 
    speechiness, 
    acousticness, 
    instrumentalness, 
    liveness, 
    valence, 
    tempo, 
    time_signature,
    channelname,
    datepublished,
    view_count
    FROM youtube_videos yt
    INNER JOIN channel_names cn
    ON yt.channelid = cn.channelid
    INNER JOIN spotify_tracks sp
    ON yt.s_id = sp.s_id 
    INNER JOIN tracks_artists ta
    ON sp.s_id = ta.s_id
    INNER JOIN artist_names an
    ON ta.artistid = an.artistid
    INNER JOIN audio_features af
    ON sp.featuresid = af.featuresid
    """)
    if connection is None:
        # Fall back to the connection created in the setup cell.
        connection = conn
    # pd.read_sql executes the query itself; running it on a cursor first
    # would only execute the same SELECT twice and throw the first result away.
    return pd.read_sql(query, con=connection)

In [2]:
# Run the joined query once and keep the result as the notebook-wide frame.
df = main_db()
# Preview the first rows to sanity-check the columns returned by the query.
df.head()

Unnamed: 0,s_id,s_release_date,s_track_name,artistname,featuresid,popularity,danceability,energy,music_key,loudness,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,channelname,datepublished,view_count
0,1HNC0b3AN2jbMqN3FElzfX,2017-08-25,Mantra,TroyBoi,0,38.0,0.738,0.868,2,-4.988,...,0.0459,0.0115,0.832,0.0831,0,94,4,BassNation,2016-11-21,1450567
1,1HNC0b3AN2jbMqN3FElzfX,2017-08-25,Mantra,TroyBoi,0,38.0,0.738,0.868,2,-4.988,...,0.0459,0.0115,0.832,0.0831,0,94,4,TrapNation,2021-01-25,110164
2,3zSwpAhbPIF4TonAOO35Pk,2021-01-07,NFS,Juche,1,33.0,0.472,0.981,2,-0.797,...,0.0547,0.0246,0.000275,0.271,0,150,4,TrapNation,2021-01-23,145696
3,1F6VtyBAeSSh6FCBzT251f,2020-08-01,Akita,Skeler,2,41.0,0.466,0.688,10,-9.102,...,0.0375,0.265,0.395,0.116,0,128,3,TrapNation,2021-01-21,191266
4,1OAaiLYD5BbWTjTMPymaV5,2019-10-04,Unity,Axel Thesleff,3,35.0,0.658,0.534,9,-7.907,...,0.0426,0.0807,0.0103,0.0781,0,120,4,TrapNation,2021-01-17,314356


In [17]:
# Let pandas pick the best-fitting dtypes, then parse both date columns
# into datetimes so they can be compared against date ranges later on.
df = df.convert_dtypes().assign(
    datepublished=lambda frame: pd.to_datetime(frame['datepublished']),
    s_release_date=lambda frame: pd.to_datetime(frame['s_release_date']),
)

In [18]:
# Distinct artist names, in order of first appearance in the frame.
artists = df['artistname'].unique()

# One {'label', 'value'} entry per artist (presumably for a dropdown widget —
# the consumer is not visible in this notebook).
artist_options = [{'label': name, 'value': name} for name in artists]

print(artist_options)


[{'label': 'TroyBoi', 'value': 'TroyBoi'}, {'label': 'Juche', 'value': 'Juche'}, {'label': 'Skeler', 'value': 'Skeler'}, {'label': 'Axel Thesleff', 'value': 'Axel Thesleff'}, {'label': 'Ezra', 'value': 'Ezra'}, {'label': 'DROELOE', 'value': 'DROELOE'}, {'label': 'Jordan Comolli', 'value': 'Jordan Comolli'}, {'label': 'KLOUD', 'value': 'KLOUD'}, {'label': 'Fabian Mazur', 'value': 'Fabian Mazur'}, {'label': 'Fytch', 'value': 'Fytch'}, {'label': 'barnacle boi', 'value': 'barnacle boi'}, {'label': 'CloZee', 'value': 'CloZee'}, {'label': 'Alison Wonderland', 'value': 'Alison Wonderland'}, {'label': 'LEXIM', 'value': 'LEXIM'}, {'label': 'Hopex', 'value': 'Hopex'}, {'label': 'San Holo', 'value': 'San Holo'}, {'label': 'Arrested Youth', 'value': 'Arrested Youth'}, {'label': 'Bishu', 'value': 'Bishu'}, {'label': 'JAWNS', 'value': 'JAWNS'}, {'label': 'TIGER DROOL', 'value': 'TIGER DROOL'}, {'label': 'QUIX', 'value': 'QUIX'}, {'label': 'Vincent', 'value': 'Vincent'}, {'label': 'Sidewalks and Skel

In [21]:
# --- Filter selections -------------------------------------------------------
# Artists to keep; an empty list means "do not filter by artist".
artists = ['RL Grime', 'TroyBoi', 'Kaivon', 'Illenium']
# Channels to keep.
channels = ['TrapNation','TrapCity','BassNation','UKFDubstep','BassMusicMovement','DubRebellion']
# Spotify release-date window: defaults to the full span present in the data.
# Series.min()/max() skip NaT, unlike the builtin min()/max(), whose result
# depends on row order when a NaT is present.
# (A fixed window such as ['2009-01-01', '2020-12-01'] works here as well.)
s_dates = [df['s_release_date'].min(), df['s_release_date'].max()]
# YouTube publish-date window (inclusive on both ends).
y_dates = ['2009-01-01', '2020-12-01']

# Conditions that apply whether or not an artist filter is active.
# Both date columns use the same inclusive `between` test, so the artist and
# no-artist cases can no longer drift apart in how they treat dates.
row_mask = (df['channelname'].isin(channels) &
            df['s_release_date'].between(s_dates[0], s_dates[1]) &
            df['datepublished'].between(y_dates[0], y_dates[1]))

# Narrow to the chosen artists only when a selection was made.
if artists:
    row_mask = row_mask & df['artistname'].isin(artists)

df_filtered = df[row_mask]
df_filtered

Unnamed: 0,s_id,s_release_date,s_track_name,artistname,featuresid,popularity,danceability,energy,music_key,loudness,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,channelname,datepublished,view_count
0,1HNC0b3AN2jbMqN3FElzfX,2017-08-25,Mantra,TroyBoi,0,38,0.738,0.868,2,-4.988,...,0.0459,0.0115,0.832,0.0831,0,94,4,BassNation,2016-11-21,1450567
6,4Qgr3o07k8NuGRRno8z9rO,2017-12-04,X2C,TroyBoi,5,36,0.675,0.563,0,-7.072,...,0.033,0.000765,0.0733,0.0749,0,136,4,BassNation,2017-12-02,141351
7,4Qgr3o07k8NuGRRno8z9rO,2017-12-04,X2C,TroyBoi,5,36,0.675,0.563,0,-7.072,...,0.033,0.000765,0.0733,0.0749,0,136,4,TrapCity,2017-12-21,919311
67,0f30zxuzCZ3ayoKY88nnAo,2020-11-13,Free Falling,Kaivon,34,50,0.546,0.623,6,-8.334,...,0.0561,0.00839,0.0271,0.231,0,120,4,TrapNation,2020-11-14,177271
68,02uwXnLbfT68JKAShQ0VTS,2020-11-13,Mother Africa,TroyBoi,35,52,0.739,0.69,10,-7.448,...,0.0314,0.142,0.61,0.134,1,132,4,TrapCity,2020-11-12,79550
69,02uwXnLbfT68JKAShQ0VTS,2020-11-13,Mother Africa,TroyBoi,35,52,0.739,0.69,10,-7.448,...,0.0314,0.142,0.61,0.134,1,132,4,TrapNation,2020-11-13,857575
369,5sl2TZpcQJAoiBK5ncSTk2,2020-03-06,RUSYNTH (feat. RL Grime),RL Grime,189,40,0.455,0.808,8,-5.831,...,0.0395,0.0334,0.921,0.151,0,160,4,TrapCity,2020-03-09,225941
370,5sl2TZpcQJAoiBK5ncSTk2,2020-03-06,RUSYNTH (feat. RL Grime),RL Grime,189,40,0.455,0.808,8,-5.831,...,0.0395,0.0334,0.921,0.151,0,160,4,TrapNation,2020-03-21,275019
403,1coBBKWnPLu1lsDbuzUVIf,2020-02-20,AJA AJA,TroyBoi,211,44,0.808,0.62,8,-3.468,...,0.103,0.0327,0.000933,0.0418,0,87,4,TrapCity,2020-02-20,484801
404,1coBBKWnPLu1lsDbuzUVIf,2020-02-20,AJA AJA,TroyBoi,211,44,0.808,0.62,8,-3.468,...,0.103,0.0327,0.000933,0.0418,0,87,4,TrapNation,2020-02-20,379624


In [39]:
# Scatter the filtered rows: view count on x, popularity on y,
# with the track name shown as the hover title.
fig = px.scatter(
    data_frame=df_filtered,
    x="view_count",
    y="popularity",
    hover_name='s_track_name',
)
fig.show()

In [61]:
# Overlaid popularity histograms: one semi-transparent trace per entry in
# `channels`, drawn from the full (unfiltered) frame.
fig = go.Figure()
for channel in channels:
    fig.add_trace(
        go.Histogram(
            x=df.loc[df['channelname'] == channel, 'popularity'],
            name=channel,
        )
    )

# Make the bars see-through so overlapping distributions stay readable.
fig.update_traces(opacity=0.75)
fig.update_layout(
    barmode='overlay',
    title={
        'text': 'Popularity',
        'x': 0.5,
        'y': 0.9,
        'xanchor': 'center',
        'yanchor': 'top',
    },
    xaxis_title_text='Popularity',
    yaxis_title_text='Count',
)
fig.show()

In [66]:
# Overlaid popularity histograms: one semi-transparent trace per entry in
# `artists`, drawn from the full (unfiltered) frame.
fig = go.Figure()
for artist in artists:
    fig.add_trace(
        go.Histogram(
            x=df.loc[df['artistname'] == artist, 'popularity'],
            name=artist,
        )
    )

# Make the bars see-through so overlapping distributions stay readable.
fig.update_traces(opacity=0.75)
fig.update_layout(
    barmode='overlay',
    title={
        'text': 'Popularity',
        'x': 0.5,
        'y': 0.9,
        'xanchor': 'center',
        'yanchor': 'top',
    },
    xaxis_title_text='Popularity',
    yaxis_title_text='Count',
)
fig.show()

In [73]:
# Columns used as PCA inputs.
audio_features = [
    'danceability',
    'energy',
    'music_key',
    'loudness',
    'music_mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'time_signature',
]

# Plain float matrix of the selected features for the filtered rows.
# NOTE(review): the columns are passed to PCA unscaled, and the preview rows
# show very different ranges (e.g. tempo ~100 vs danceability < 1) — confirm
# that skipping standardisation is intended.
X = df_filtered[audio_features].to_numpy(dtype='float')

# Track name / id joined back to the same feature columns on the row index.
X_id = df_filtered[['s_track_name', 's_id']].merge(
    df_filtered[audio_features],
    left_index=True,
    right_index=True,
)

# Project the feature matrix onto its first two principal components.
pca = PCA(n_components=2)
components = pca.fit_transform(X)

# Plot component 0 against component 1, labelling points with the track name.
fig = px.scatter(components, x=0, y=1, hover_name=X_id['s_track_name'])

fig.show()