In [1]:
# import dependencies
import csv
from datetime import datetime, timezone
from pathlib import Path

import numpy as np
import pandas as pd
from sqlalchemy import create_engine, text
from sqlalchemy.orm import sessionmaker


# File path and DataFrame set up.
# The main resources folder was too large to upload to GitHub, so it lives on the local machine.
# A link to the resource is in the README; set `file_path` below to run locally.
file_path = Path("../main_dataset.csv")
# Read through `file_path` so that editing the path above is the only change needed.
df = pd.read_csv(file_path)

# Add a 1-based 'index' column as the first column, for uniformity with the dataset.
df.insert(0, 'index', range(1, len(df) + 1))


In [2]:
def convert_to_unix(date_str):
    """Convert a release-date value to integer Unix seconds (UTC).

    Resolution order:
      1. Anything pandas can parse is converted directly. pandas treats a
         timezone-naive timestamp as UTC, and parses a bare year such as
         '1982' as January 1 of that year.
      2. Otherwise, if the value can be read as an integer year, a random
         month (1-12) and day (1-28, valid in every month) are drawn and the
         resulting date is converted as UTC, consistent with step 1.
      3. Anything else (unparseable text, NaN, None, out-of-range years)
         yields 0, i.e. 1970-01-01T00:00:00Z.

    Parameters
    ----------
    date_str : str or scalar
        The raw date value, e.g. '2015-11-20' or '1982'.

    Returns
    -------
    int
        Seconds since the Unix epoch.
    """
    try:
        parsed = pd.to_datetime(date_str)
        # pd.to_datetime returns NaT for NaN and None for None; neither has a
        # usable timestamp, so fall through to the fallbacks below.
        if not pd.isna(parsed):
            return int(parsed.timestamp())
    except (ValueError, TypeError):
        pass

    try:
        year = int(date_str)
        # np.random.randint excludes the upper bound, so 13 and 29 are needed
        # to make December and the 28th reachable.
        month = np.random.randint(1, 13)
        day = np.random.randint(1, 29)
        # An aware UTC datetime keeps the result independent of the machine's
        # local timezone (a naive datetime.timestamp() uses local time).
        return int(datetime(year, month, day, tzinfo=timezone.utc).timestamp())
    except (ValueError, TypeError, OverflowError):
        # Sentinel for values that cannot be interpreted at all: the Unix epoch.
        return 0
        


In [3]:
# Derive the 'unix_time' column by passing every release_date value through convert_to_unix.
df['unix_time'] = df['release_date'].map(convert_to_unix)

In [4]:
# Create the engine for the SQLite file and write the DataFrame to the
# 'dataset' table, replacing the table if it already exists.
engine = create_engine('sqlite:///converted_file.sqlite')
df.to_sql('dataset', engine, if_exists='replace', index=False)

# Start a session bound to that engine.
Session = sessionmaker(bind=engine)
session = Session()

# Print the first 5 rows of dataset.
# Raw SQL is wrapped in text(): SQLAlchemy 2.0 no longer accepts plain strings in execute().
result = session.execute(text('select * from dataset limit 5'))
for row in result:
    print(row)

(1, 'spotify:track:3v6sBj3swihU8pXQQHhDZo', 'Way Up (feat. Ava Re)', "['Floduxe']", 0, 'single', 1, '2015-11-20', "['spotify:artist:4B7SI5PsnEgeMQd6gSIrbn']", "['spotify:playlist:5JJZvA3VR9RZ5XIr0reWiM']", 0.611, 0.614, 5.0, -8.815, 0.0, 0.0672, 0.0169, 0.000794, 0.753, 0.52, 128.05, 'https://api.spotify.com/v1/audio-analysis/3v6sBj3swihU8pXQQHhDZo', 195000.0, 4.0, '[9]', '[[]]', '[765]', 1447977600)
(2, 'spotify:track:7KCWmFdw0TzoJbKtqRRzJO', 'Fantasy - Felix Jaehn Remix', "['Alina Baraz', 'Galimatias']", 33, 'album', 1, '2015-09-04', "['spotify:artist:6hfwwpXqZPRC9CsKI7qtv1', 'spotify:artist:0tOrKkXIn3VYyVHFEPG6Xd']", "['spotify:playlist:5JJZvA3VR9RZ5XIr0reWiM']", 0.638, 0.781, 4.0, -6.848, 0.0, 0.0285, 0.0118, 0.00953, 0.349, 0.25, 122.985, 'https://api.spotify.com/v1/audio-analysis/7KCWmFdw0TzoJbKtqRRzJO', 194641.0, 4.0, '[68, 56]', "[['etherpop', 'pop', 'r&b'], ['chillstep', 'future bass']]", '[1283665, 138396]', 1441324800)
(3, 'spotify:track:2CY92qejUrhyPUASawNVRr', 'Try Me', "[

In [5]:
# Sample rows whose release_date is not a full YYYY-MM-DD string
# (in a LIKE pattern each '_' matches exactly one character).
# Raw SQL is wrapped in text(): SQLAlchemy 2.0 no longer accepts plain strings in execute().
result = session.execute(
    text("select * from dataset where release_date not like '____-__-__' limit 5")
)
for row in result:
    print(row)

# 19864 rows are not in the right format (looks like only the year is included).
# NOTE: the random month/day assignment is not taking effect for these rows: in the
# output below, '1982' maps to 378691200, which is 1982-01-01 00:00:00 UTC.

(6, 'spotify:track:41MOCUNOgWtaYBFUsGnpZ5', 'The Safety Dance - Video Version', "['Men Without Hats']", 37, 'album', 1, '1982', "['spotify:artist:34PLzyi7CdXUekiLHYyqXq']", "['spotify:playlist:3o1jJdcl3fz0nxnEvmpozM', 'spotify:playlist:2MRwwqPvBdkihfVVkbdyyc', 'spotify:playlist:1QGAGnozUEKcW3YHAY0OJP', 'spotify:playlist:519N2pOOQrNuVt3hJ7GF7S', 'spotify:playlist:12qiUmQsX8f8ZRBayEGhyn', 'spotify:playlist:5JJZvA3VR9RZ5XIr0reWiM']", 0.572, 0.837, 5.0, -7.876, 1.0, 0.0367, 0.0197, 0.0, 0.163, 0.627, 100.343, 'https://api.spotify.com/v1/audio-analysis/41MOCUNOgWtaYBFUsGnpZ5', 166920.0, 4.0, '[55]', "[['classic canadian rock', 'new romantic', 'new wave', 'new wave pop', 'synthpop']]", '[208619]', 378691200)
(103, 'spotify:track:7bNX2Fgb4MjPrYbkHaCNwV', 'Roll With It', "['Easton Corbin']", 57, 'album', 1, '2010', "['spotify:artist:070kGpqtESdDsLb3gdMIyx']", "['spotify:playlist:49QHtigsIlJ7DhsBElxQjC', 'spotify:playlist:2VxF0Ope2So8d0PQQv9iol', 'spotify:playlist:0J74JRyDCMotTzAEKMfwYN']", 0.7

In [6]:
# Column lists used to build the SELECT statements for the two CSV exports.
# '"index"' carries its own double quotes so SQLite reads it as a column name
# rather than as the INDEX keyword.
song_attributes = [
    '"index"', 'track_uri',
    'danceability', 'energy', 'key', 'loudness', 'mode',
    'speechiness', 'acousticness', 'instrumentalness', 'liveness',
    'valence', 'tempo', 'duration_ms', 'time_signature',
    'popularity',
]

additional_attributes = [
    '"index"', 'track_uri',
    'name', 'artists_names',
    'popularity', 'unix_time', 'artists_popularities',
]

In [7]:
# Build the SELECT for the song-attribute columns. The query is kept as a plain
# string so it can be reused; text() wraps it at the call site because
# SQLAlchemy 2.0 no longer accepts plain strings in execute().
song_attributes_query = f"select {', '.join(song_attributes)} from dataset"
result = session.execute(text(song_attributes_query))

In [8]:
# Build the SELECT for the additional-attribute columns. The query is kept as a
# plain string so it can be reused; text() wraps it at the call site because
# SQLAlchemy 2.0 no longer accepts plain strings in execute().
addl_attributes_query = f"select {', '.join(additional_attributes)} from dataset"
result = session.execute(text(addl_attributes_query))

In [9]:
# Define file paths for CSV files
song_attributes_csv = "song_attributes.csv"
additional_attributes_csv = "additional_attributes.csv"


def _write_query_to_csv(query, header, csv_path):
    """Run the SQL string `query` on the notebook session and write `header`
    followed by every result row to `csv_path`, overwriting any existing file.
    """
    # newline='' lets the csv module control line endings. UTF-8 is set
    # explicitly so non-ASCII text is written the same way on every platform
    # instead of depending on the locale's default encoding.
    with open(csv_path, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(header)
        # text() is required for raw SQL strings under SQLAlchemy 2.0.
        writer.writerows(session.execute(text(query)))


# The header uses the bare name index in place of the SQL-quoted '"index"' entry.
_write_query_to_csv(song_attributes_query,
                    ["index"] + song_attributes[1:],
                    song_attributes_csv)
_write_query_to_csv(addl_attributes_query,
                    ["index"] + additional_attributes[1:],
                    additional_attributes_csv)