# Initialization

This notebook writes the configuration files used by the Analysis notebook in the same directory.

In [2]:
# Create the config directory in pure Python so the cell also works where no POSIX shell is available.
import os

os.makedirs("config", exist_ok=True)  # no error if the directory already exists

In [4]:
# Ask for the S3 bucket name and persist it to ./config/config.conf
# (per the notebook introduction, this file is consumed by the Analysis notebook).
bucket_name = input("Please enter the S3 Storage Bucket name: ").strip()
if not bucket_name:
    # Fail here rather than writing an empty config file that would only break later.
    raise ValueError("S3 Storage Bucket name must not be empty")

# The context manager closes the file even if the write raises.
with open("./config/config.conf", "w") as f:
    f.write(bucket_name)

Please enter the S3 Storage Bucket name:  samplebucket


### Data Preparation
Compress each audio file to a randomly chosen bitrate and sample rate.

The following cells read the audio files from "./songs" in the same directory and write the compressed files to "./songs/compressed".

In [None]:
# run this block if packages not installed
!conda install -y -c conda-forge ffmpeg
!pip install pydub
!pip install audio_metadata

In [None]:
from pydub import AudioSegment
import os
import random as rd
import audio_metadata

# Candidate encoder settings; one bitrate and one sample rate are picked at random for each file.
bitrate_list = ["16k", "32k", "96k", "128k", "160k", "192k", "256k", "320k"]
samplerate_list = ["8000", "11025", "16000", "22050", "44100"]
supported_format = ["mp3", "flac"]  # add on if needed
list_no = []  # audio files whose metadata could not be loaded or lacks a required field
decode_err = []  # audio files that failed to compress
directory = os.fsencode("./songs")

# The export below writes straight to its target path and does not create missing directories,
# so create the output directory up front; otherwise every export fails and every file
# ends up in decode_err.
os.makedirs("./songs/compressed", exist_ok=True)

for file in os.listdir(directory):
    filename = os.fsdecode(file)
    # Compare the extension case-insensitively so e.g. "track.MP3" is picked up as well.
    extension = os.path.splitext(filename)[1].lstrip('.').lower()
    if extension not in supported_format:
        continue

    source_path = "./songs/{}".format(filename)

    # Loading happens inside the try so that a single unreadable file is recorded in list_no
    # instead of aborting the whole run.
    try:
        metadata = audio_metadata.load(source_path)
        # '/' would be interpreted as a path separator in the output file name.
        title = metadata['tags']['title'][0].replace('/', '&')  # Modify this line if not processing songs
        artist = metadata['tags']['artist'][0].replace('/', '&')  # Modify this line if not processing songs
        # bitrate / samplerate are not used below; reading them means a file without these
        # stream-info fields is treated as having corrupted metadata.
        bitrate = metadata['streaminfo']['bitrate']
        samplerate = metadata['streaminfo']['sample_rate']
        genre = metadata['tags']['genre'][0]
    except Exception:  # not a bare except: Ctrl-C must still interrupt the loop
        print("Error proccessing {}, corrupted metadata".format(filename))
        list_no.append(filename)
        continue

    # Lyrics are optional; fall back to the "null" placeholder when they are absent.
    try:
        lyric_test = metadata['tags']['lyrics'][0]['text']
    except Exception:
        lyric_test = "null"

    try:
        AudioSegment.from_file(source_path).export(
            "./songs/compressed/{} - {}.mp3".format(artist, title),
            format="mp3",
            bitrate=rd.choice(bitrate_list),
            tags={"title": title, "artist": artist, "lyrics": lyric_test, "genre": genre},
            parameters=["-ar", rd.choice(samplerate_list)],  # "-ar" sets the output sample rate
        )
    except Exception as err:
        # Report the failure instead of swallowing it silently, then continue with the next file.
        print("Error compressing {}: {}".format(filename, err))
        decode_err.append(filename)
