# Integration of Downloaded Data

Given the data downloaded from the Spotify API, transform and process it into CSV files.

## Track Integration

In [None]:
import os, json, csv
import pandas as pd

# Input folder scanned for track JSON files, and the folder the generated
# CSV files are written to.
folder_path = "download/tracks"
output_path = "output"

# Destination CSV files, one per entity type.
artists_csv_path = os.path.join(output_path, "artists.csv")
albums_csv_path = os.path.join(output_path, "albums.csv")
tracks_csv_path = os.path.join(output_path, "tracks.csv")

# os.listdir returns entries in arbitrary order; sort so the files are
# processed in the same order on every run and platform.
json_files = sorted(
    os.path.join(folder_path, file)
    for file in os.listdir(folder_path)
    if file.endswith(".json")
)

In [None]:
from dataclasses import dataclass

@dataclass
class Artist:
    """Artist reference built from the artist entries of a track or album.

    Identity is defined by ``uri`` alone: two instances with the same URI
    compare equal and hash alike, regardless of ``name``.
    """

    uri: str
    name: str

    def __hash__(self):
        return hash(self.uri)

    def __eq__(self, other):
        # Return NotImplemented for foreign types instead of raising
        # AttributeError on ``other.uri``. ``type(self)`` is used rather than
        # the class name because ``Artist`` is rebound further down the file.
        if not isinstance(other, type(self)):
            return NotImplemented
        return self.uri == other.uri

@dataclass
class Album:
    """Album record; equality and hashing are based on ``uri`` only."""

    uri: str
    type: str  # album type (album, single, compilation)
    artists: str
    available_markets: str
    name: str
    release_year: str
    total_tracks: int

    def __hash__(self):
        return hash(self.uri)

    def __eq__(self, other):
        # Return NotImplemented for foreign types instead of raising
        # AttributeError on ``other.uri``.
        if not isinstance(other, type(self)):
            return NotImplemented
        return self.uri == other.uri

@dataclass()
class Track:
    """Track record; equality and hashing are based on ``uri`` only."""

    uri: str
    name: str
    duration: int
    explicit: bool
    artists: str
    available_markets: str
    album_uri: str
    popularity: int

    def __hash__(self):
        return hash(self.uri)

    def __eq__(self, other):
        # Return NotImplemented for foreign types instead of raising
        # AttributeError on ``other.uri``.
        if not isinstance(other, type(self)):
            return NotImplemented
        return self.uri == other.uri


In [None]:
from typing import MutableSet

# Entities are collected in sets. Each class hashes and compares by URI, so
# adding an already-seen URI is a no-op and the first occurrence is kept.
tracks: MutableSet[Track] = set()
artists: MutableSet[Artist] = set()
albums: MutableSet[Album] = set()

tracks_loaded = 0
none_tracks = 0

for file in json_files:
    # Decode explicitly as UTF-8 rather than with the platform's locale
    # codec, and close the file as soon as it has been parsed.
    with open(file, "r", encoding="utf-8") as f:
        data = json.load(f)

    data_tracks = data['tracks']
    for track in data_tracks:
        # Entries can be None; count and skip them.
        if track is None:
            none_tracks += 1
            continue
        tracks_loaded += 1

        print(f"\rCount: {tracks_loaded}/2.262.292, none_count: {none_tracks}", end="")

        album_data = track['album']

        # Register the track's artists first, then the album's artists,
        # preserving the original insertion order.
        for artist in [*track['artists'], *album_data['artists']]:
            parsed_artist = Artist(uri=artist['uri'], name=artist['name'])
            artists.add(parsed_artist)

        parsed_album = Album(
            uri=album_data['uri'],
            type=album_data['album_type'],
            # Multi-valued fields are flattened into "|"-separated strings.
            artists="|".join([artist["uri"] for artist in album_data['artists']]),
            available_markets="|".join(album_data['available_markets']),
            name=album_data['name'],
            # Keep only the leading component of the "-"-separated release date.
            release_year=album_data['release_date'].split("-")[0],
            total_tracks=album_data['total_tracks'],
        )
        albums.add(parsed_album)

        parsed_track = Track(
            uri=track['uri'],
            name=track['name'],
            duration=track['duration_ms'],
            explicit=track['explicit'],
            artists="|".join([artist['uri'] for artist in track['artists']]),
            available_markets="|".join(track['available_markets']),
            album_uri=album_data['uri'],
            popularity=track['popularity'],
        )
        tracks.add(parsed_track)


In [None]:
# Create the destination folder up front so that a missing directory does not
# fail the export after all the data has been loaded.
os.makedirs(output_path, exist_ok=True)

# All files are written as UTF-8 explicitly: the default text encoding is
# platform dependent and may be unable to represent non-ASCII names.
with open(tracks_csv_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f, delimiter=",", lineterminator="\n")
    writer.writerow(["uri", "name", "duration", "explicit", "artists", "available_markets", "album_uri", "popularity"])
    writer.writerows(
        [track.uri, track.name, track.duration, track.explicit, track.artists, track.available_markets, track.album_uri, track.popularity]
        for track in tracks
    )

with open(artists_csv_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f, delimiter=",", lineterminator="\n")
    writer.writerow(["uri", "name"])
    writer.writerows([artist.uri, artist.name] for artist in artists)

with open(albums_csv_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f, delimiter=",", lineterminator="\n")
    writer.writerow(["uri", "type", "artists", "available_markets", "name", "release_year", "total_tracks"])
    writer.writerows(
        [album.uri, album.type, album.artists, album.available_markets, album.name, album.release_year, album.total_tracks]
        for album in albums
    )

## Artist Integration

In [None]:
from dataclasses import dataclass

@dataclass(frozen=True, eq=True, order=True)
class Artist:
    """Immutable artist record with follower, genre and popularity data.

    Equality and hashing are based on ``uri`` only; the ordering methods are
    the ones generated by ``dataclass(order=True)``.
    """

    uri: str
    name: str
    followers: int
    genres: str
    popularity: int

    def __hash__(self):
        return hash(self.uri)

    def __eq__(self, other):
        # Return NotImplemented for foreign types instead of raising
        # AttributeError on ``other.uri``.
        if not isinstance(other, type(self)):
            return NotImplemented
        return self.uri == other.uri

In [None]:
folder_path = "download/artists_full"
# os.listdir returns entries in arbitrary order; sort for a reproducible
# processing (and therefore output row) order.
json_files = sorted(
    os.path.join(folder_path, file)
    for file in os.listdir(folder_path)
    if file.endswith(".json")
)

artists = []
count = 0  # stays 0 when there are no files to process
total = len(json_files)

for count, file in enumerate(json_files, start=1):
    print(f"\rCount: {count}/{total}", end="")
    # Decode explicitly as UTF-8 rather than with the platform's locale
    # codec, and close the file as soon as it has been parsed.
    with open(file, "r", encoding="utf-8") as f:
        data = json.load(f)

    for entry in data['artists']:
        # Skip missing entries, mirroring how None tracks are skipped by the
        # track loader. NOTE(review): presumably unknown IDs come back as
        # null here as well -- confirm against the downloaded data.
        if entry is None:
            continue
        artist = Artist(
            uri=entry['uri'],
            name=entry['name'],
            followers=entry['followers']['total'],
            # Genres are flattened into a "|"-separated string.
            genres="|".join(entry['genres']),
            popularity=entry['popularity'],
        )
        artists.append(artist)

# Write next to the other generated CSV files, creating the folder if needed.
os.makedirs(output_path, exist_ok=True)
artists_df = pd.DataFrame(artists)
artists_df.to_csv(os.path.join(output_path, "downloaded_artists.csv"), index=False)