# Big Data Exam Preprocessing Notebook

In [None]:
import os
import json

# Folder scanned for JSON input files, and folder used for the exported CSVs.
folder_path = "data"
output_path = "dataset"

# Collect the joined path of every directory entry whose name ends in ".json".
json_files = []
for entry_name in os.listdir(folder_path):
    if not entry_name.endswith(".json"):
        continue
    json_files.append(os.path.join(folder_path, entry_name))

In [None]:
from dataclasses import dataclass

@dataclass
class Playlist:
    """Playlist-level record; one instance becomes one row of playlists.csv."""

    # Playlist identifier, read from the "pid" key of the source JSON.
    pid: int
    # Playlist name, read from the "name" key.
    name: str
    # Follower count, read from the "num_followers" key.
    num_followers: int

@dataclass
class Track:
    """Track record whose identity is its URI.

    Equality and hashing look only at ``uri`` so that a set of tracks keeps
    a single entry per URI, regardless of the other fields. Because both
    ``__eq__`` and ``__hash__`` are defined explicitly, ``@dataclass`` leaves
    them untouched and the class stays hashable.
    """

    uri: str
    name: str
    duration: int
    artist_uri: str
    album_uri: str
    album_name: str

    def __hash__(self):
        """Hash on the URI only, consistent with ``__eq__``."""
        return hash(self.uri)

    def __eq__(self, other):
        """Return True when *other* is a Track with the same URI.

        Returns ``NotImplemented`` for non-Track operands so that comparing
        against ``None`` or an unrelated object yields ``False`` instead of
        raising ``AttributeError``.
        """
        if not isinstance(other, Track):
            return NotImplemented
        return self.uri == other.uri

# eq=True together with frozen=True makes dataclass generate __hash__ from all
# fields, which is what allows instances to be stored in a set.
@dataclass(eq=True, frozen=True)
class TrackInPlaylist:
    """Immutable link between a playlist and a track at a given position."""

    # Identifier of the playlist containing the track.
    pid: int
    # URI of the referenced track.
    track_uri: str
    # Value of the track's "pos" key in the source JSON.
    pos: int

@dataclass
class Artist:
    """Artist record whose identity is its URI.

    Equality and hashing look only at ``uri`` so that a set of artists keeps
    a single entry per URI even if the name differs. Because both ``__eq__``
    and ``__hash__`` are defined explicitly, ``@dataclass`` leaves them
    untouched and the class stays hashable.
    """

    uri: str
    name: str

    def __hash__(self):
        """Hash on the URI only, consistent with ``__eq__``."""
        return hash(self.uri)

    def __eq__(self, other):
        """Return True when *other* is an Artist with the same URI.

        Returns ``NotImplemented`` for non-Artist operands so that comparing
        against ``None`` or an unrelated object yields ``False`` instead of
        raising ``AttributeError``.
        """
        if not isinstance(other, Artist):
            return NotImplemented
        return self.uri == other.uri

## Loading Playlists, Tracks and Artists Data

In [None]:
from typing import List, MutableSet

# Accumulators for the normalised tables. Tracks and artists are kept in sets:
# their equality/hash are keyed on the URI, so each URI is stored once and the
# first occurrence encountered is the one that is kept.
playlists: List[Playlist] = list()
tracks: MutableSet[Track] = set()
artists: MutableSet[Artist] = set()
tracks_in_playlists: MutableSet[TrackInPlaylist] = set()

playlists_loaded = 0

for file in json_files:
    # Decode explicitly as UTF-8: without it open() falls back to the locale
    # encoding, which can corrupt or reject non-ASCII names on some platforms.
    with open(file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # The file is fully parsed at this point, so it is closed before the
    # (much longer) processing of its playlists starts.
    data_playlists = data['playlists']
    for playlist in data_playlists:
        playlists_loaded += 1
        # "\r" with end="" rewrites the same console line as a progress counter.
        print(f"\rCount: {playlists_loaded}/1.000.000", end="")
        parsed_playlist = Playlist(pid=playlist['pid'], name=playlist['name'], num_followers=playlist['num_followers'])
        playlists.append(parsed_playlist)
        playlist_tracks = playlist['tracks']

        for track in playlist_tracks:
            parsed_track = Track(
                uri=track['track_uri'],
                duration=track['duration_ms'],
                name=track['track_name'],
                artist_uri=track['artist_uri'],
                album_uri=track['album_uri'],
                album_name=track['album_name'],
            )
            tracks.add(parsed_track)

            parsed_artist = Artist(uri=track['artist_uri'], name=track['artist_name'])
            artists.add(parsed_artist)

            # One membership row per (playlist, track, position) triple.
            parsed_track_in_playlist = TrackInPlaylist(pid=playlist['pid'], track_uri=track['track_uri'], pos=track['pos'])
            tracks_in_playlists.add(parsed_track_in_playlist)


In [None]:
import csv

# Export the playlists, ordered by pid, to <output_path>/playlists.csv.
playlist_csv_path = os.path.join(output_path, "playlists.csv")
sorted_playlists = sorted(playlists, key=lambda x: x.pid)

# Make sure the destination folder exists; open() does not create it.
os.makedirs(output_path, exist_ok=True)

# newline="" lets the csv module control line endings; the explicit UTF-8
# encoding keeps non-ASCII playlist names from failing under other locales.
with open(playlist_csv_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f, delimiter=",", lineterminator="\n")
    writer.writerow(["pid", "name", "num_followers"])
    writer.writerows(
        [playlist.pid, playlist.name, playlist.num_followers]
        for playlist in sorted_playlists
    )

## Exporting Tracks

In [None]:
# Export the deduplicated tracks to <output_path>/tracks.csv.
# Rows follow the iteration order of the `tracks` set.
tracks_csv_path = os.path.join(output_path, "tracks.csv")

# Make sure the destination folder exists; open() does not create it.
os.makedirs(output_path, exist_ok=True)

# newline="" lets the csv module control line endings; the explicit UTF-8
# encoding keeps non-ASCII track/album names from failing under other locales.
with open(tracks_csv_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f, delimiter=",", lineterminator="\n")
    writer.writerow(["uri", "name", "duration", "artist_uri", "album_uri", "album_name"])
    writer.writerows(
        [track.uri, track.name, track.duration, track.artist_uri, track.album_uri, track.album_name]
        for track in tracks
    )


## Exporting Artists

In [None]:
# Export the deduplicated artists to <output_path>/artists.csv.
# Rows follow the iteration order of the `artists` set.
artists_csv_path = os.path.join(output_path, "artists.csv")

# Make sure the destination folder exists; open() does not create it.
os.makedirs(output_path, exist_ok=True)

# newline="" lets the csv module control line endings; the explicit UTF-8
# encoding keeps non-ASCII artist names from failing under other locales.
with open(artists_csv_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f, delimiter=",", lineterminator="\n")
    writer.writerow(["uri", "name"])
    writer.writerows([artist.uri, artist.name] for artist in artists)

## Exporting Tracks in Playlists

In [None]:
# Export the playlist/track membership rows to
# <output_path>/tracks_in_playlists.csv.
# Rows follow the iteration order of the `tracks_in_playlists` set.
tracks_in_playlists_csv_path = os.path.join(output_path, "tracks_in_playlists.csv")

# Make sure the destination folder exists; open() does not create it.
os.makedirs(output_path, exist_ok=True)

# newline="" lets the csv module control line endings; UTF-8 is set explicitly
# so the file encoding matches the other exports regardless of locale.
with open(tracks_in_playlists_csv_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f, delimiter=",", lineterminator="\n")
    writer.writerow(["pid", "track_uri", "pos"])
    writer.writerows(
        [track_in_playlist.pid, track_in_playlist.track_uri, track_in_playlist.pos]
        for track_in_playlist in tracks_in_playlists
    )