In [1]:
# need to run the test.ipynb file before this code so that you can retrieve the preprocessing csv file 

In [2]:
# import dependencies
from pathlib import Path
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
from datetime import datetime
from sqlalchemy.orm import sessionmaker
import csv

file_path = Path("../main_dataset.csv")
df = pd.read_csv("../main_dataset.csv") # for the dataset without preprocessing
preprocessing_df = pd.read_csv("../preprocessing.csv", low_memory=False) # dataset from the test.ipynb file with preprocessing


In [3]:
preprocessing_df = preprocessing_df[preprocessing_df['release_date'].notna() & preprocessing_df['release_date'].str.match(r'\d{4}-\d{2}-\d{2}')]
# 15,173 removed as they do not include a release date, only the year. 


preprocessing_df.insert(0, 'index', range(1, len(preprocessing_df) + 1))
preprocessing_df = preprocessing_df.drop(columns=['Unnamed: 0'])

column_mapping = {
    'C♯/D♭': 'CD',
    'D♯/E♭': 'DE',
    'F♯/G♭': 'FG',
    'G♯/A♭': 'GA',
    'A♯/B♭': 'AB', 
    'instrumentalness_above_0.5': 'instrumentalness_above_5',
    'instrumentalness_below_0.5': 'instrumentalness_below_5',
    'speechiness_non-speech': 'speechiness_non_speech'
}

# Rename columns
preprocessing_df.rename(columns=column_mapping, inplace=True)

print(preprocessing_df.columns.tolist())
print(preprocessing_df.shape[0])

['index', 'track_uri', 'name', 'artists_names', 'popularity', 'is_playable', 'release_date', 'artists_uris', 'playlist_uris', 'danceability', 'energy', 'key', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'analysis_url', 'duration_ms', 'artists_popularities', 'artists_genres', 'artists_followers', 'album', 'compilation', 'single', 'release_dayofweek', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday', 'release_month', 'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December', 'time_since_release', 'pitch_names', 'C', 'CD', 'D', 'DE', 'E', 'F', 'FG', 'G', 'GA', 'A', 'AB', 'B', 'mode_minor', 'mode_major', 'speechiness_mixed', 'speechiness_non_speech', 'speechiness_speech', 'instrumentalness_above_5', 'instrumentalness_below_5', 'is_live', 'is_not_live', 'time_signature_4_4', 'time_signature_other', 'broader_genre_category']
211728


In [4]:
genre_mapping = {
    'Miscellaneous': 0,
    'Pop': 1,
    'Rock': 2,
    'Hip Hop/Rap': 3,
    'Electronic/Dance': 4,
    'Jazz/Blues': 5,
    'Classical/Instrumental': 6,
    'Ambient/Chill': 7,
    'R&B/Soul/Funk': 8,
    'Country/Folk': 9,
    'Soundtrack/Theme': 10,
    'Christian/Religious': 11,
    'World/International': 12,
    'Alternative/Indie': 13,
    'Metal': 14,
    'Latin': 15,
    'Dance/Club': 16,
    "Children's Music": 17,
    'Experimental': 18,
    'Seasonal/Holiday': 19
}

# Create a new column 'genre_numeric' by mapping the 'broader_genre_category' column
preprocessing_df['genre_numeric'] = preprocessing_df['broader_genre_category'].map(genre_mapping)

preprocessing_df.head(5)


Unnamed: 0,index,track_uri,name,artists_names,popularity,is_playable,release_date,artists_uris,playlist_uris,danceability,...,speechiness_non_speech,speechiness_speech,instrumentalness_above_5,instrumentalness_below_5,is_live,is_not_live,time_signature_4_4,time_signature_other,broader_genre_category,genre_numeric
0,1,spotify:track:0GYpisgsrUSd8B4UqksmfY,新緑,['上野大樹'],25,True,2023-04-05,['spotify:artist:5YPkOSqagMwjOWf7PLjHNF'],['spotify:playlist:37i9dQZF1DWSt89CX9de4L'],0.595,...,1,0,0,1,0,1,1,0,Pop,1
1,2,spotify:track:3b9eglykqfxtTaSpSidP9u,またね,['Lucky Kilimanjaro'],47,True,2023-04-05,['spotify:artist:2V8UZPMR1EbkXhzvEGBTrV'],['spotify:playlist:37i9dQZF1DXahYFr91pFvG'],0.807,...,1,0,0,1,0,1,1,0,R&B/Soul/Funk,8
2,3,spotify:track:2iOn50LwBLQxhL5pMoIMpW,Sueño Flamenco - Original Mix,['Marksman'],5,True,2023-04-04,['spotify:artist:59ggRM2BowvTe9nlnwGlBT'],['spotify:playlist:1ILlyqabDcTv0mbnTxE7Od'],0.597,...,1,0,1,0,0,1,1,0,Miscellaneous,0
3,4,spotify:track:68aueb4O4xxqwsBPiP7dLS,Nightshift,['Khainz'],27,True,2023-03-31,['spotify:artist:71yD5VENn9Wy1IECnpYWvX'],['spotify:playlist:37i9dQZF1DX8AliSIsGeKd'],0.757,...,1,0,1,0,0,1,1,0,Electronic/Dance,4
4,5,spotify:track:2lRBoCWxkUcEicwVQjQugG,The Spider's thread,['Penthouse'],15,True,2023-03-29,['spotify:artist:50QaWH5OLY3Pkt1XNCGk6L'],['spotify:playlist:37i9dQZF1DWSt89CX9de4L'],0.641,...,1,0,0,1,0,1,1,0,Pop,1


In [5]:
song_attribute = ['"index"', 'is_playable', 'danceability', 'energy', 'key', 'loudness', 'speechiness', 'acousticness', 
                  'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'Monday', 'Tuesday', 
                  'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday', 'January', 'February', 'March', 'April', 
                  'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December', 'time_since_release', 
                  'C', 'CD', 'D', 'DE', 'E', 'F', 'FG', 'G', 'GA', 'A', 'AB', 'B', 'mode_minor', 'mode_major', 
                  'speechiness_mixed', 'speechiness_non_speech', 'speechiness_speech', 'instrumentalness_above_5', 
                  'instrumentalness_below_5', 'is_live', 'is_not_live', 'time_signature_4_4', 'time_signature_other']


other_attribute = ['"index"', 'track_uri', 'name', 'artists_names', 'popularity', 'release_date', 
                   'artists_uris', 'playlist_uris', 'artists_popularities', 'artists_genres', 
                   'artists_followers', 'album', 'compilation', 'single', 'release_dayofweek', 
                   'release_month', 'pitch_names', 'broader_genre_category']

In [6]:
# create engine and new sqlite file 
engine = create_engine('sqlite:///converted_file.sqlite')
preprocessing_df.to_sql('dataset', engine, if_exists = 'replace', index=False)

# start session 
Session = sessionmaker(bind = engine)
session = Session()

# print first 10 rows of dataset 
result = session.execute('select * from dataset limit 5')
for row in result:
    print(row) 

(1, 'spotify:track:0GYpisgsrUSd8B4UqksmfY', '新緑', "['上野大樹']", 25, 1, '2023-04-05', "['spotify:artist:5YPkOSqagMwjOWf7PLjHNF']", "['spotify:playlist:37i9dQZF1DWSt89CX9de4L']", 0.595, 0.432, 2.0, -9.488, 0.0422, 0.601, 0.0, 0.0899, 0.379, 145.707, 'https://api.spotify.com/v1/audio-analysis/0GYpisgsrUSd8B4UqksmfY', 260033.0, '[48]', "[['j-acoustic', 'j-pop']]", '[69413]', 1, 0, 0, 2.0, 0, 0, 1, 0, 0, 0, 0, 4.0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 425.0, 'D', 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 'Pop', 1)
(2, 'spotify:track:3b9eglykqfxtTaSpSidP9u', 'またね', "['Lucky Kilimanjaro']", 47, 1, '2023-04-05', "['spotify:artist:2V8UZPMR1EbkXhzvEGBTrV']", "['spotify:playlist:37i9dQZF1DXahYFr91pFvG']", 0.807, 0.707, 5.0, -4.759, 0.0319, 0.0696, 0.0033, 0.31, 0.607, 123.023, 'https://api.spotify.com/v1/audio-analysis/3b9eglykqfxtTaSpSidP9u', 216730.0, '[49]', "[['city pop', 'j-indie', 'japanese electropop', 'japanese r&b']]", '[89318]', 0, 0, 1, 2.0, 0, 0, 1, 0, 0, 0, 0,

In [7]:
other_attributes_query = f"select {', '.join(other_attribute)} from dataset"
result = session.execute(other_attributes_query)
for i, row in enumerate(result):
    if i >= 5:
        break
    print(row)

(1, 'spotify:track:0GYpisgsrUSd8B4UqksmfY', '新緑', "['上野大樹']", 25, '2023-04-05', "['spotify:artist:5YPkOSqagMwjOWf7PLjHNF']", "['spotify:playlist:37i9dQZF1DWSt89CX9de4L']", '[48]', "[['j-acoustic', 'j-pop']]", '[69413]', 1, 0, 0, 2.0, 4.0, 'D', 'Pop')
(2, 'spotify:track:3b9eglykqfxtTaSpSidP9u', 'またね', "['Lucky Kilimanjaro']", 47, '2023-04-05', "['spotify:artist:2V8UZPMR1EbkXhzvEGBTrV']", "['spotify:playlist:37i9dQZF1DXahYFr91pFvG']", '[49]', "[['city pop', 'j-indie', 'japanese electropop', 'japanese r&b']]", '[89318]', 0, 0, 1, 2.0, 4.0, 'F', 'R&B/Soul/Funk')
(3, 'spotify:track:2iOn50LwBLQxhL5pMoIMpW', 'Sueño Flamenco - Original Mix', "['Marksman']", 5, '2023-04-04', "['spotify:artist:59ggRM2BowvTe9nlnwGlBT']", "['spotify:playlist:1ILlyqabDcTv0mbnTxE7Od']", '[26]', '[[]]', '[6409]', 0, 0, 1, 1.0, 4.0, 'F♯/G♭', 'Miscellaneous')
(4, 'spotify:track:68aueb4O4xxqwsBPiP7dLS', 'Nightshift', "['Khainz']", 27, '2023-03-31', "['spotify:artist:71yD5VENn9Wy1IECnpYWvX']", "['spotify:playlist:37i9dQZ

In [8]:
# song_attributes_query = f"select {', '.join(song_attribute)} from dataset limit 5"

song_attributes_query = """
select
    "index", is_playable, danceability, energy, key, loudness, speechiness, acousticness, 
    instrumentalness, liveness, valence, tempo, duration_ms, Monday, Tuesday, 
    Wednesday, Thursday, Friday, Saturday, Sunday, January, February, March, April, 
    May, June, July, August, September, October, November, December, time_since_release, 
    C, CD, D, DE, E, F, FG, G, GA, A, AB, B, mode_minor, mode_major, 
    speechiness_mixed, speechiness_non_speech, speechiness_speech, instrumentalness_above_5, 
    instrumentalness_below_5, is_live, is_not_live, time_signature_4_4, time_signature_other
from dataset 
"""

result = session.execute(song_attributes_query)
for i, row in enumerate(result):
    if i >= 5:
        break
    print(row)


(1, 1, 0.595, 0.432, 2.0, -9.488, 0.0422, 0.601, 0.0, 0.0899, 0.379, 145.707, 260033.0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 425.0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0)
(2, 1, 0.807, 0.707, 5.0, -4.759, 0.0319, 0.0696, 0.0033, 0.31, 0.607, 123.023, 216730.0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 425.0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0)
(3, 1, 0.597, 0.814, 6.0, -7.531, 0.0475, 0.000751, 0.857, 0.105, 0.0917, 124.01, 491613.0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 426.0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0)
(4, 1, 0.757, 0.913, 7.0, -7.542, 0.0483, 0.00333, 0.853, 0.133, 0.181, 123.993, 390968.0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 430.0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0)
(5, 1, 0.641, 0.758, 1.0, -5.04, 0.046, 0.102, 0.0, 0.337, 0.933, 97.969, 211747.0, 0, 0, 1, 0, 0, 0, 0, 0,

In [9]:
song_attributes = ['is_playable', 'danceability', 'energy', 'key', 'loudness', 'speechiness', 'acousticness',
                   'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'Monday', 'Tuesday',
                   'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday', 'January', 'February', 'March', 'April',
                   'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December', 'time_since_release',
                   'C', 'CD', 'D', 'DE', 'E', 'F', 'FG', 'G', 'GA', 'A', 'AB', 'B', 'mode_minor', 'mode_major',
                   'speechiness_mixed', 'speechiness_non_speech', 'speechiness_speech', 'instrumentalness_above_5',
                   'instrumentalness_below_5', 'is_live', 'is_not_live', 'time_signature_4_4', 'time_signature_other']

additional_attributes = ['track_uri', 'name', 'artists_names', 'popularity', 'release_date', 
                         'artists_uris', 'playlist_uris', 'artists_popularities', 'artists_genres', 
                         'artists_followers', 'album', 'compilation', 'single', 'release_dayofweek', 
                         'release_month', 'pitch_names', 'broader_genre_category']

In [10]:
# # can use this section to create CSV files for song and additional attributes. 

# # Define file paths for CSV files
# song_attributes_csv = "song_attributes.csv"
# additional_attributes_csv = "additional_attributes.csv"

# # Execute and write results for song_attributes_query
# with open(song_attributes_csv, 'w', newline='') as file:
#     writer = csv.writer(file)
#     writer.writerow(["index"] + song_attributes[1:])
#     result = session.execute(song_attributes_query)
#     for row in result:
#         writer.writerow(row)

# # Execute and write results for addl_attributes_query
# with open(additional_attributes_csv, 'w', newline='') as file:
#     writer = csv.writer(file)
#     writer.writerow(["index"] + additional_attributes[1:])
#     result = session.execute(other_attributes_query)
#     for row in result:
#         writer.writerow(row)

In [11]:
# base audio features model:
model_one = ['popularity', 'danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms','C', 'CD', 'D', 'DE', 'E', 'F', 'FG', 'G', 'GA', 'A', 'AB', 'B', 'mode_minor', 'mode_major', 'time_signature_4_4', 'time_signature_other' ]

model_one_query = f"select {', '.join(model_one)} from dataset"
result = session.execute(model_one_query)
for i, row in enumerate(result):
    if i >= 5:
        break
    print(row)

# create model dataframe 
model_one_rows = result.fetchall()
model_one_df = pd.DataFrame(model_one_rows, columns = model_one)

(25, 0.595, 0.432, -9.488, 0.0422, 0.601, 0.0, 0.0899, 0.379, 145.707, 260033.0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0)
(47, 0.807, 0.707, -4.759, 0.0319, 0.0696, 0.0033, 0.31, 0.607, 123.023, 216730.0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0)
(5, 0.597, 0.814, -7.531, 0.0475, 0.000751, 0.857, 0.105, 0.0917, 124.01, 491613.0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0)
(27, 0.757, 0.913, -7.542, 0.0483, 0.00333, 0.853, 0.133, 0.181, 123.993, 390968.0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0)
(15, 0.641, 0.758, -5.04, 0.046, 0.102, 0.0, 0.337, 0.933, 97.969, 211747.0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0)


In [12]:
model_one_df.head()

Unnamed: 0,popularity,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,...,FG,G,GA,A,AB,B,mode_minor,mode_major,time_signature_4_4,time_signature_other
0,2,0.7,0.153,-18.967,0.0555,0.986,0.934,0.111,0.545,122.046,...,0,0,0,0,0,0,0,1,0,1
1,2,0.769,0.137,-17.694,0.0478,0.989,0.957,0.116,0.259,107.034,...,0,0,0,0,0,0,0,1,1,0
2,5,0.375,0.0532,-27.67,0.0321,0.963,0.868,0.11,0.0688,130.137,...,0,0,0,0,0,0,0,1,1,0
3,37,0.63,0.374,-9.007,0.227,0.861,7.5e-05,0.294,0.373,104.955,...,1,0,0,0,0,0,0,1,1,0
4,3,0.556,0.113,-19.275,0.0391,0.992,0.965,0.111,0.267,140.256,...,0,0,0,0,0,0,0,1,0,1


In [13]:
# # audio features model without time_signature
model_two = ['popularity', 'danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms','C', 'CD', 'D', 'DE', 'E', 'F', 'FG', 'G', 'GA', 'A', 'AB', 'B', 'mode_minor', 'mode_major']

model_two_query = f"select {', '.join(model_two)} from dataset"
result = session.execute(model_two_query)
for i, row in enumerate(result):
    if i >= 5:
        break
    print(row)

# create model dataframe 
model_two_rows = result.fetchall()
model_two_df = pd.DataFrame(model_two_rows, columns = model_two)

(25, 0.595, 0.432, -9.488, 0.0422, 0.601, 0.0, 0.0899, 0.379, 145.707, 260033.0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1)
(47, 0.807, 0.707, -4.759, 0.0319, 0.0696, 0.0033, 0.31, 0.607, 123.023, 216730.0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1)
(5, 0.597, 0.814, -7.531, 0.0475, 0.000751, 0.857, 0.105, 0.0917, 124.01, 491613.0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0)
(27, 0.757, 0.913, -7.542, 0.0483, 0.00333, 0.853, 0.133, 0.181, 123.993, 390968.0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1)
(15, 0.641, 0.758, -5.04, 0.046, 0.102, 0.0, 0.337, 0.933, 97.969, 211747.0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1)


In [14]:
model_two_df.head()

Unnamed: 0,popularity,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,...,E,F,FG,G,GA,A,AB,B,mode_minor,mode_major
0,2,0.7,0.153,-18.967,0.0555,0.986,0.934,0.111,0.545,122.046,...,1,0,0,0,0,0,0,0,0,1
1,2,0.769,0.137,-17.694,0.0478,0.989,0.957,0.116,0.259,107.034,...,0,0,0,0,0,0,0,0,0,1
2,5,0.375,0.0532,-27.67,0.0321,0.963,0.868,0.11,0.0688,130.137,...,0,1,0,0,0,0,0,0,0,1
3,37,0.63,0.374,-9.007,0.227,0.861,7.5e-05,0.294,0.373,104.955,...,0,0,1,0,0,0,0,0,0,1
4,3,0.556,0.113,-19.275,0.0391,0.992,0.965,0.111,0.267,140.256,...,0,0,0,0,0,0,0,0,0,1


In [15]:
# audio features model with thresholds
model_three = ['popularity', 'danceability', 'energy', 'loudness', 'valence', 'tempo', 'duration_ms','C', 'CD', 'D', 'DE', 'E', 'F', 'FG', 'G', 'GA', 'A', 'AB', 'B', 'mode_minor', 'mode_major', 'speechiness_mixed', 'speechiness_non_speech', 'speechiness_speech', 'instrumentalness_above_5', 'instrumentalness_below_5', 'is_live', 'is_not_live', 'time_signature_4_4', 'time_signature_other']

model_three_query = f"select {', '.join(model_three)} from dataset"
result = session.execute(model_three_query)
for i, row in enumerate(result):
    if i >= 5:
        break
    print(row)

# create model dataframe 
model_three_rows = result.fetchall()
model_three_df = pd.DataFrame(model_three_rows, columns = model_three)

(25, 0.595, 0.432, -9.488, 0.379, 145.707, 260033.0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0)
(47, 0.807, 0.707, -4.759, 0.607, 123.023, 216730.0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0)
(5, 0.597, 0.814, -7.531, 0.0917, 124.01, 491613.0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0)
(27, 0.757, 0.913, -7.542, 0.181, 123.993, 390968.0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0)
(15, 0.641, 0.758, -5.04, 0.933, 97.969, 211747.0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0)


In [16]:
model_three_df.head()

Unnamed: 0,popularity,danceability,energy,loudness,valence,tempo,duration_ms,C,CD,D,...,mode_major,speechiness_mixed,speechiness_non_speech,speechiness_speech,instrumentalness_above_5,instrumentalness_below_5,is_live,is_not_live,time_signature_4_4,time_signature_other
0,2,0.7,0.153,-18.967,0.545,122.046,128339.0,0,0,0,...,1,0,1,0,1,0,0,1,0,1
1,2,0.769,0.137,-17.694,0.259,107.034,127009.0,0,0,0,...,1,0,1,0,1,0,0,1,1,0
2,5,0.375,0.0532,-27.67,0.0688,130.137,107076.0,0,0,0,...,1,0,1,0,1,0,0,1,1,0
3,37,0.63,0.374,-9.007,0.373,104.955,196427.0,0,0,0,...,1,0,1,0,0,1,0,1,1,0
4,3,0.556,0.113,-19.275,0.267,140.256,125667.0,1,0,0,...,1,0,1,0,1,0,0,1,0,1


In [17]:
# audio features model with thresholds without time signature
model_four = ['popularity', 'danceability', 'energy', 'loudness', 'valence', 'tempo', 'duration_ms','C', 'CD', 'D', 'DE', 'E', 'F', 'FG', 'G', 'GA', 'A', 'AB', 'B', 'mode_minor', 'mode_major', 'speechiness_mixed', 'speechiness_non_speech', 'speechiness_speech', 'instrumentalness_above_5', 'instrumentalness_below_5', 'is_live', 'is_not_live']

model_four_query = f"select {', '.join(model_four)} from dataset"
result = session.execute(model_four_query)
for i, row in enumerate(result):
    if i >= 5:
        break
    print(row)

# create model dataframe 
model_four_rows = result.fetchall()
model_four_df = pd.DataFrame(model_four_rows, columns = model_four)

(25, 0.595, 0.432, -9.488, 0.379, 145.707, 260033.0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1)
(47, 0.807, 0.707, -4.759, 0.607, 123.023, 216730.0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1)
(5, 0.597, 0.814, -7.531, 0.0917, 124.01, 491613.0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1)
(27, 0.757, 0.913, -7.542, 0.181, 123.993, 390968.0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1)
(15, 0.641, 0.758, -5.04, 0.933, 97.969, 211747.0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1)


In [18]:
model_four_df.head()

Unnamed: 0,popularity,danceability,energy,loudness,valence,tempo,duration_ms,C,CD,D,...,B,mode_minor,mode_major,speechiness_mixed,speechiness_non_speech,speechiness_speech,instrumentalness_above_5,instrumentalness_below_5,is_live,is_not_live
0,2,0.7,0.153,-18.967,0.545,122.046,128339.0,0,0,0,...,0,0,1,0,1,0,1,0,0,1
1,2,0.769,0.137,-17.694,0.259,107.034,127009.0,0,0,0,...,0,0,1,0,1,0,1,0,0,1
2,5,0.375,0.0532,-27.67,0.0688,130.137,107076.0,0,0,0,...,0,0,1,0,1,0,1,0,0,1
3,37,0.63,0.374,-9.007,0.373,104.955,196427.0,0,0,0,...,0,0,1,0,1,0,0,1,0,1
4,3,0.556,0.113,-19.275,0.267,140.256,125667.0,1,0,0,...,0,0,1,0,1,0,1,0,0,1


In [19]:
# audio features with other attributes
model_five = ['popularity', 'is_playable', 'danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday', 'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December', 'time_since_release', 'C', 'CD', 'D', 'DE', 'E', 'F', 'FG', 'G', 'GA', 'A', 'AB', 'B', 'mode_minor', 'mode_major', 'time_signature_4_4', 'time_signature_other', 'artists_popularities','artists_followers', 'artists_followers', 'album', 'compilation', 'single', 'genre_numeric']

model_five_query = f"select {', '.join(model_five)} from dataset"
result = session.execute(model_five_query)
for i, row in enumerate(result):
    if i >= 5:
        break
    print(row)

# create model dataframe 
model_five_rows = result.fetchall()
model_five_df = pd.DataFrame(model_five_rows, columns = model_five)

(25, 1, 0.595, 0.432, -9.488, 0.0422, 0.601, 0.0, 0.0899, 0.379, 145.707, 260033.0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 425.0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, '[48]', '[69413]', '[69413]', 1, 0, 0, 1)
(47, 1, 0.807, 0.707, -4.759, 0.0319, 0.0696, 0.0033, 0.31, 0.607, 123.023, 216730.0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 425.0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, '[49]', '[89318]', '[89318]', 0, 0, 1, 8)
(5, 1, 0.597, 0.814, -7.531, 0.0475, 0.000751, 0.857, 0.105, 0.0917, 124.01, 491613.0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 426.0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, '[26]', '[6409]', '[6409]', 0, 0, 1, 0)
(27, 1, 0.757, 0.913, -7.542, 0.0483, 0.00333, 0.853, 0.133, 0.181, 123.993, 390968.0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 430.0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, '[28]', '[12983]', '[12983]', 0, 0, 1, 4)
(15, 1, 0.641, 0.758, -5.04, 0.046, 0.102,

In [20]:
model_five_df.head()

Unnamed: 0,popularity,is_playable,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,...,mode_major,time_signature_4_4,time_signature_other,artists_popularities,artists_followers,artists_followers.1,album,compilation,single,genre_numeric
0,2,1,0.7,0.153,-18.967,0.0555,0.986,0.934,0.111,0.545,...,1,0,1,[33],[127],[127],0,0,1,0
1,2,1,0.769,0.137,-17.694,0.0478,0.989,0.957,0.116,0.259,...,1,1,0,"[50, 40]","[579, 563]","[579, 563]",0,0,1,0
2,5,1,0.375,0.0532,-27.67,0.0321,0.963,0.868,0.11,0.0688,...,1,1,0,[76],[190995],[190995],0,0,1,0
3,37,1,0.63,0.374,-9.007,0.227,0.861,7.5e-05,0.294,0.373,...,1,1,0,"[87, 91]","[7007008, 16261027]","[7007008, 16261027]",0,0,1,15
4,3,1,0.556,0.113,-19.275,0.0391,0.992,0.965,0.111,0.267,...,1,0,1,[50],[1957],[1957],0,0,1,6


In [21]:
# audio features with other attributes and no time signature
model_six = ['popularity', 'is_playable', 'danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday', 'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December', 'time_since_release', 'C', 'CD', 'D', 'DE', 'E', 'F', 'FG', 'G', 'GA', 'A', 'AB', 'B', 'mode_minor', 'mode_major', 'artists_popularities','artists_followers', 'artists_followers', 'album', 'compilation', 'single', 'genre_numeric']

model_six_query = f"select {', '.join(model_six)} from dataset"
result = session.execute(model_six_query)
for i, row in enumerate(result):
    if i >= 5:
        break
    print(row)

# create model dataframe 
model_six_rows = result.fetchall()
model_six_df = pd.DataFrame(model_six_rows, columns = model_six)

(25, 1, 0.595, 0.432, -9.488, 0.0422, 0.601, 0.0, 0.0899, 0.379, 145.707, 260033.0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 425.0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, '[48]', '[69413]', '[69413]', 1, 0, 0, 1)
(47, 1, 0.807, 0.707, -4.759, 0.0319, 0.0696, 0.0033, 0.31, 0.607, 123.023, 216730.0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 425.0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, '[49]', '[89318]', '[89318]', 0, 0, 1, 8)
(5, 1, 0.597, 0.814, -7.531, 0.0475, 0.000751, 0.857, 0.105, 0.0917, 124.01, 491613.0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 426.0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, '[26]', '[6409]', '[6409]', 0, 0, 1, 0)
(27, 1, 0.757, 0.913, -7.542, 0.0483, 0.00333, 0.853, 0.133, 0.181, 123.993, 390968.0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 430.0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, '[28]', '[12983]', '[12983]', 0, 0, 1, 4)
(15, 1, 0.641, 0.758, -5.04, 0.046, 0.102, 0.0, 0.337, 0.933, 97.9

In [22]:
model_six_df.head()

Unnamed: 0,popularity,is_playable,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,...,B,mode_minor,mode_major,artists_popularities,artists_followers,artists_followers.1,album,compilation,single,genre_numeric
0,2,1,0.7,0.153,-18.967,0.0555,0.986,0.934,0.111,0.545,...,0,0,1,[33],[127],[127],0,0,1,0
1,2,1,0.769,0.137,-17.694,0.0478,0.989,0.957,0.116,0.259,...,0,0,1,"[50, 40]","[579, 563]","[579, 563]",0,0,1,0
2,5,1,0.375,0.0532,-27.67,0.0321,0.963,0.868,0.11,0.0688,...,0,0,1,[76],[190995],[190995],0,0,1,0
3,37,1,0.63,0.374,-9.007,0.227,0.861,7.5e-05,0.294,0.373,...,0,0,1,"[87, 91]","[7007008, 16261027]","[7007008, 16261027]",0,0,1,15
4,3,1,0.556,0.113,-19.275,0.0391,0.992,0.965,0.111,0.267,...,0,0,1,[50],[1957],[1957],0,0,1,6


In [23]:
# audio features with thresholds and other attributes
model_seven = ['popularity', 'is_playable', 'danceability', 'energy', 'loudness', 'acousticness', 'valence', 'tempo', 'duration_ms', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday', 'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December', 'time_since_release', 'C', 'CD', 'D', 'DE', 'E', 'F', 'FG', 'G', 'GA', 'A', 'AB', 'B', 'mode_minor', 'mode_major','speechiness_mixed', 'speechiness_non_speech', 'speechiness_speech', 'instrumentalness_above_5', 'instrumentalness_below_5', 'is_live', 'is_not_live', 'time_signature_4_4', 'time_signature_other', 'artists_popularities','artists_followers', 'artists_followers', 'album', 'compilation', 'single', 'genre_numeric']

model_seven_query = f"select {', '.join(model_seven)} from dataset"
result = session.execute(model_seven_query)
for i, row in enumerate(result):
    if i >= 5:
        break
    print(row)

# create model dataframe 
model_seven_rows = result.fetchall()
model_seven_df = pd.DataFrame(model_seven_rows, columns = model_seven)

(25, 1, 0.595, 0.432, -9.488, 0.601, 0.379, 145.707, 260033.0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 425.0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, '[48]', '[69413]', '[69413]', 1, 0, 0, 1)
(47, 1, 0.807, 0.707, -4.759, 0.0696, 0.607, 123.023, 216730.0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 425.0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, '[49]', '[89318]', '[89318]', 0, 0, 1, 8)
(5, 1, 0.597, 0.814, -7.531, 0.000751, 0.0917, 124.01, 491613.0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 426.0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, '[26]', '[6409]', '[6409]', 0, 0, 1, 0)
(27, 1, 0.757, 0.913, -7.542, 0.00333, 0.181, 123.993, 390968.0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 430.0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, '[28]', '[12983]', '[12983]', 0, 0, 1, 4)
(15, 1, 0.641, 0.758, -5.04, 0.102, 0.933, 97

In [24]:
model_seven_df.head()

Unnamed: 0,popularity,is_playable,danceability,energy,loudness,acousticness,valence,tempo,duration_ms,Monday,...,is_not_live,time_signature_4_4,time_signature_other,artists_popularities,artists_followers,artists_followers.1,album,compilation,single,genre_numeric
0,2,1,0.7,0.153,-18.967,0.986,0.545,122.046,128339.0,0,...,1,0,1,[33],[127],[127],0,0,1,0
1,2,1,0.769,0.137,-17.694,0.989,0.259,107.034,127009.0,0,...,1,1,0,"[50, 40]","[579, 563]","[579, 563]",0,0,1,0
2,5,1,0.375,0.0532,-27.67,0.963,0.0688,130.137,107076.0,0,...,1,1,0,[76],[190995],[190995],0,0,1,0
3,37,1,0.63,0.374,-9.007,0.861,0.373,104.955,196427.0,0,...,1,1,0,"[87, 91]","[7007008, 16261027]","[7007008, 16261027]",0,0,1,15
4,3,1,0.556,0.113,-19.275,0.992,0.267,140.256,125667.0,0,...,1,0,1,[50],[1957],[1957],0,0,1,6


In [25]:
# audio features with thresholds and other attributes and no time signature
model_eight = ['popularity', 'is_playable', 'danceability', 'energy', 'loudness', 'acousticness', 'valence', 'tempo', 'duration_ms', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday', 'January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December', 'time_since_release', 'C', 'CD', 'D', 'DE', 'E', 'F', 'FG', 'G', 'GA', 'A', 'AB', 'B', 'mode_minor', 'mode_major','speechiness_mixed', 'speechiness_non_speech', 'speechiness_speech', 'instrumentalness_above_5', 'instrumentalness_below_5', 'is_live', 'is_not_live', 'artists_popularities','artists_followers', 'artists_followers', 'album', 'compilation', 'single', 'genre_numeric']

model_eight_query = f"select {', '.join(model_eight)} from dataset"
result = session.execute(model_eight_query)
for i, row in enumerate(result):
    if i >= 5:
        break
    print(row)

# create model dataframe 
model_eight_rows = result.fetchall()
model_eight_df = pd.DataFrame(model_eight_rows, columns = model_eight)

(25, 1, 0.595, 0.432, -9.488, 0.601, 0.379, 145.707, 260033.0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 425.0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, '[48]', '[69413]', '[69413]', 1, 0, 0, 1)
(47, 1, 0.807, 0.707, -4.759, 0.0696, 0.607, 123.023, 216730.0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 425.0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, '[49]', '[89318]', '[89318]', 0, 0, 1, 8)
(5, 1, 0.597, 0.814, -7.531, 0.000751, 0.0917, 124.01, 491613.0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 426.0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, '[26]', '[6409]', '[6409]', 0, 0, 1, 0)
(27, 1, 0.757, 0.913, -7.542, 0.00333, 0.181, 123.993, 390968.0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 430.0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, '[28]', '[12983]', '[12983]', 0, 0, 1, 4)
(15, 1, 0.641, 0.758, -5.04, 0.102, 0.933, 97.969, 211747.0, 0, 0, 1,

In [26]:
model_eight_df.head()

Unnamed: 0,popularity,is_playable,danceability,energy,loudness,acousticness,valence,tempo,duration_ms,Monday,...,instrumentalness_below_5,is_live,is_not_live,artists_popularities,artists_followers,artists_followers.1,album,compilation,single,genre_numeric
0,2,1,0.7,0.153,-18.967,0.986,0.545,122.046,128339.0,0,...,0,0,1,[33],[127],[127],0,0,1,0
1,2,1,0.769,0.137,-17.694,0.989,0.259,107.034,127009.0,0,...,0,0,1,"[50, 40]","[579, 563]","[579, 563]",0,0,1,0
2,5,1,0.375,0.0532,-27.67,0.963,0.0688,130.137,107076.0,0,...,0,0,1,[76],[190995],[190995],0,0,1,0
3,37,1,0.63,0.374,-9.007,0.861,0.373,104.955,196427.0,0,...,1,0,1,"[87, 91]","[7007008, 16261027]","[7007008, 16261027]",0,0,1,15
4,3,1,0.556,0.113,-19.275,0.992,0.267,140.256,125667.0,0,...,0,0,1,[50],[1957],[1957],0,0,1,6


In [27]:
# audio features with thresholds and other attributes and no time signature
model_nine = ['popularity', 'valence', 'tempo', 'time_since_release', 'genre_numeric']

model_nine_query = f"select {', '.join(model_nine)} from dataset"
result = session.execute(model_nine_query)
for i, row in enumerate(result):
    if i >= 5:
        break
    print(row)

# create model dataframe 
model_nine_rows = result.fetchall()
model_nine_df = pd.DataFrame(model_nine_rows, columns = model_nine)

(25, 0.379, 145.707, 425.0, 1)
(47, 0.607, 123.023, 425.0, 8)
(5, 0.0917, 124.01, 426.0, 0)
(27, 0.181, 123.993, 430.0, 4)
(15, 0.933, 97.969, 432.0, 1)


In [28]:
model_nine_df.head()

Unnamed: 0,popularity,valence,tempo,time_since_release,genre_numeric
0,2,0.545,122.046,437.0,0
1,2,0.259,107.034,437.0,0
2,5,0.0688,130.137,437.0,0
3,37,0.373,104.955,437.0,15
4,3,0.267,140.256,437.0,6


In [29]:
session.close()
