#### Some background reading and source for our data: 
https://towardsdatascience.com/how-i-built-a-lo-fi-music-web-player-with-ai-generated-tracks-36f3915e39f8/

> "We have a couple of options when it comes to the music data format we are training the model on: raw audio, audio features (e.g. time frequency representations like Mel spectrogram), or symbolic music representation (e.g. midi files). Our goal is to generate a solo track (i.e. a sequence of notes, chords, and rests) to layer on other components like drum loops, so midi files are the easiest and most effective format to achieve our goal. Raw audio is very computationally expensive to train on."

##### data source: https://www.kaggle.com/datasets/zakarii/lofi-hip-hop-midi/data

In [1]:
# I've created a DataCleaning class to handle the data loading and cleaning process.
# You can call on this class to load the data and clean it up as needed.
# for example: 

import kagglehub
from data.data_cleaning import DataCleaning, logger

# Step 1: fetch the Kaggle dataset; kagglehub returns the local folder that holds it.
KAGGLE_DATASET = "zakarii/lofi-hip-hop-midi"
path = kagglehub.dataset_download(KAGGLE_DATASET)
logger.info(f"Dataset lives at: {path}")

# Step 2: build the cleaner directly on that folder (no __main__ entry point required).
cleaner = DataCleaning(midi_dir=path)

# Step 3: run the pipeline and keep its three return values in memory.
# With save=True the results are also written to disk.
(encoded_seqs, sym2int, dur2int) = cleaner.run(save=True)

  from .autonotebook import tqdm as notebook_tqdm
INFO:data.data_cleaning:Dataset lives at: /Users/arielazria/.cache/kagglehub/datasets/zakarii/lofi-hip-hop-midi/versions/1
INFO:data.data_cleaning:Found 93 MIDI files in /Users/arielazria/.cache/kagglehub/datasets/zakarii/lofi-hip-hop-midi/versions/1.
Parsing MIDI files: 100%|██████████| 93/93 [00:00<00:00, 497.36it/s]
INFO:data.data_cleaning:Parsed 93 sequences from MIDI files.
INFO:data.data_cleaning:Built symbol vocab (258) and duration vocab (34).
INFO:data.data_cleaning:Encoded 93 sequences.
INFO:data.data_cleaning:Saved encoded sequences to: processed_lofi_data/encoded_sequences.json
INFO:data.data_cleaning:Saved symbol vocabulary to: processed_lofi_data/symbol_to_int.json
INFO:data.data_cleaning:Saved duration vocabulary to: processed_lofi_data/duration_to_int.json


In [None]:
# The returned objects (encoded_seqs, sym2int, dur2int) are now available for modeling.
# Pass save=False to cleaner.run() to skip writing the JSON files to disk.

In [1]:
import os

# 1) (Optional) Route *all* Hugging Face artifacts into the OneDrive project folder.
# The path must not carry embedded quote characters: with them the string no longer
# starts with "/", so os.makedirs would build a relative directory tree (whose first
# component is literally "'") under the current working directory, and the
# environment variables below would point at that bogus location.
BASE = "/Users/arielazria/Library/CloudStorage/OneDrive-TheUniversityofChicago/School/Bayesian - Spring 2025/Final Project/hf_data"

# Create every cache sub-folder up front so the libraries find an existing directory.
os.makedirs(BASE + "/datasets", exist_ok=True)
os.makedirs(BASE + "/models", exist_ok=True)
os.makedirs(BASE + "/modules", exist_ok=True)

os.environ["HF_HOME"]            = BASE         # root for everything
os.environ["HF_DATASETS_CACHE"]  = BASE + "/datasets"
os.environ["TRANSFORMERS_CACHE"] = BASE + "/models"
os.environ["HF_MODULES_CACHE"]   = BASE + "/modules"

# Import only after the environment is configured: huggingface_hub resolves its
# cache locations from these variables when the package is first imported, so
# setting them after the import has no effect on an already-loaded library.
# (On a kernel where huggingface_hub was imported earlier, restart before running.)
from huggingface_hub import snapshot_download

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#!pip install huggingface_hub[hf_transfer]
# Set HF_HUB_ENABLE_HF_TRANSFER=1 as an environment variable for faster downloads

Collecting hf-transfer>=0.1.4 (from huggingface_hub[hf_transfer])
  Downloading hf_transfer-0.1.9-cp38-abi3-macosx_11_0_arm64.whl.metadata (1.7 kB)
Downloading hf_transfer-0.1.9-cp38-abi3-macosx_11_0_arm64.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
[0mInstalling collected packages: hf-transfer
[0mSuccessfully installed hf-transfer-0.1.9

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
# Opt in to hf_transfer-backed downloads by exporting the flag as the string "1".
# NOTE(review): huggingface_hub is already imported by this point; if the library
# reads this flag at import time, the assignment will not affect the current
# session. Confirm, or set the flag before the import.
os.environ.update({"HF_HUB_ENABLE_HF_TRANSFER": "1"})

In [4]:
# Mirror the "vikhyatk/lofi" dataset repository into the project's hf_data folder.
#
# `resume_download=True` has been dropped: recent huggingface_hub releases deprecate
# the argument (passing it raises a FutureWarning) because interrupted downloads are
# always resumed when possible. Re-running this cell therefore continues from the
# files that are already on disk.
#
# NOTE: the recorded run of this cell aborted with
# "OSError: [Errno 28] No space left on device" after roughly 16% of the 1032 files.
# Make sure enough disk space is available before retrying.
snapshot_download(
    repo_id="vikhyatk/lofi",
    repo_type="dataset",
    local_dir="/Users/arielazria/Library/CloudStorage/OneDrive-TheUniversityofChicago/School/Bayesian - Spring 2025/Final Project/hf_data/datasets/vikhyatk_lofi",
)

Fetching 1032 files:  16%|█▋        | 169/1032 [16:40<1:25:13,  5.93s/it]Error while downloading from https://cdn-lfs-us-1.hf.co/repos/d8/ac/d8ac448d35744659d32d26c3b4614fce52cf0492010551d1217ba29fff3c36d3/72ff416afed3feca0a81c2407fb4fbdf5b77d30afc90523239316bedffea640f?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27dynamic-prompts-2bd28898-8214-4c15-b94e-049a18156239.parquet%3B+filename%3D%22dynamic-prompts-2bd28898-8214-4c15-b94e-049a18156239.parquet%22%3B&Expires=1747102296&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0NzEwMjI5Nn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zL2Q4L2FjL2Q4YWM0NDhkMzU3NDQ2NTlkMzJkMjZjM2I0NjE0ZmNlNTJjZjA0OTIwMTA1NTFkMTIxN2JhMjlmZmYzYzM2ZDMvNzJmZjQxNmFmZWQzZmVjYTBhODFjMjQwN2ZiNGZiZGY1Yjc3ZDMwYWZjOTA1MjMyMzkzMTZiZWRmZmVhNjQwZj9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=bEG8aqsgeM70avEccUJU4FpjoE6sKAQd4Dtyg1Psk4vCaBaeqvVaLKcevCjNOmcd8-5T1%7EHfcvoF1o8qXz54spYInx7cP2sgTHO

OSError: [Errno 28] No space left on device