In [None]:
#################### OUTLIER REMOVAL AND AVERAGING ######################
# plt.boxplot(y)
# plt.show()

# skc = 1
# att = sorted(sorted([(y[i],i) for i in range(len(y))], reverse=True)[skc:-skc], key=lambda x: x[1])
# y_cleaned = [x[0] for x in att]

# plt.plot(normalized_tone[:5000])
# new_tone = []
# sz = 2
# avg = 0
# for i in range(sz):
#     avg += y[i]
# avg /= sz
# new_tone.append(avg)
# for j in range(sz, len(y)):
#     avg += y[j]/sz - y[j-sz]/sz
#     new_tone.append(avg)
#################### OUTLIER REMOVAL AND AVERAGING ######################

#################### MIXED SIGNAL ######################
# import numpy as np
# from matplotlib import pyplot as plt
# from scipy.io.wavfile import write

# Remember SAMPLE_RATE = 44100 Hz is our playback rate
# SAMPLE_RATE = 44100  # Hertz
# write("mysinewave.wav", SAMPLE_RATE, normalized_tone)

# DURATION = 5  # Seconds

# def generate_sine_wave(freq, sample_rate, duration):
#     x = np.linspace(0, duration, sample_rate * duration, endpoint=False)
#     frequencies = x * freq
#     # 2pi because np.sin takes radians
#     y = np.sin((2 * np.pi) * frequencies)
#     return x, y

# _, nice_tone = generate_sine_wave(400, SAMPLE_RATE, DURATION)
# _, noise_tone = generate_sine_wave(4000, SAMPLE_RATE, DURATION)
# noise_tone = noise_tone * 0.3

# mixed_tone = nice_tone + noise_tone
# normalized_tone = np.int16((mixed_tone / mixed_tone.max()) * 32767)
#################### MIXED SIGNAL ######################


#################### FFT ######################
# from scipy.fft import fft, fftfreq

# # Number of samples in normalized_tone
# N = SAMPLE_RATE * DURATION

# yf = fft(normalized_tone)
# xf = fftfreq(N, 1 / SAMPLE_RATE)

# print(xf, yf)

# plt.scatter(xf, np.abs(yf))
# plt.show()
#################### FFT ######################

In [110]:
import itertools
import os

import IPython.display as ipd
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
# import soundfile as sf
# sf.write(save_path, audio, sr)

class AudioManipulator:
    """Helpers for slicing, mixing, transforming, comparing and plotting audio.

    Audio is passed around as a 1-D NumPy array of samples together with its
    sample rate ``sr`` (samples per second) where a method needs it.
    """

    def __init__(self):
        # hop_length handed to librosa's chroma computation and to its plot.
        self.chroma_hop_length = 12
        # Peak amplitude used when rescaling audio for the normalized spectrum
        # plot (32767 is the largest int16 value).
        self.normalizationValue = 32767.00
        # Number of mel bands requested from librosa.feature.melspectrogram.
        self.n_mels = 128 * 2

    def joinDiffAudios(self, audios, binDurationAudioLength):
        """Mix several clips by zero-padding them to a common length and summing.

        Args:
            audios: Sequence of 1-D sample arrays. Neither the sequence nor its
                arrays are modified.
            binDurationAudioLength: Minimum output length in samples; a float
                is truncated to an integer.

        Returns:
            1-D array holding the sample-wise sum. Its length is the larger of
            ``int(binDurationAudioLength)`` and the longest clip, so clips that
            exceed the bin are kept whole instead of producing ragged input.
            An empty ``audios`` yields silence of the bin length.
        """
        target_len = max(int(binDurationAudioLength), 0)
        clips = [np.asarray(clip) for clip in audios]
        if not clips:
            return np.zeros(target_len)

        target_len = max(target_len, max(len(clip) for clip in clips))
        # Pad into a fresh list so the caller's list is left untouched.
        padded = [
            clip if len(clip) == target_len
            else np.concatenate((clip, np.zeros(target_len - len(clip))))
            for clip in clips
        ]
        return np.sum(padded, axis=0)

    def getAudioInterface(self, audio, sr=None):
        """Return an IPython audio player for ``audio``.

        Args:
            audio: Anything ``IPython.display.Audio`` accepts (sample array,
                file name, URL, bytes).
            sr: Sample rate in Hz. IPython requires it when ``audio`` is a raw
                array or list of samples; it may be omitted for files/URLs.
        """
        if sr is None:
            return ipd.Audio(audio)
        return ipd.Audio(audio, rate=sr)

    def splitAudio(self, audio, sr, start_time, end_time):
        """Return the samples of ``audio`` between two times given in seconds.

        Times are converted to sample indices by truncating ``sr * time``.
        """
        first_sample = int(sr * start_time)
        last_sample = int(sr * end_time)
        return audio[first_sample:last_sample]

    def shiftPitchOfAudio(self, audio, sr, pitch_shift):
        """Return ``audio`` pitch-shifted by ``pitch_shift`` steps via librosa."""
        return librosa.effects.pitch_shift(audio, sr=sr, n_steps=pitch_shift)

    def getStft(self, audio):
        """Return ``(stft, stft_db)``: the complex STFT and its magnitude in dB."""
        stft = librosa.stft(audio)
        stft_db = librosa.amplitude_to_db(np.abs(stft))
        return stft, stft_db

    def getMelSpectogram(self, audio, sr):
        """Return ``(mel_spec, mel_spec_db)`` using ``self.n_mels`` mel bands.

        ``melspectrogram`` is called with its default ``power=2.0`` and thus
        returns a power spectrogram, so the dB conversion must be
        ``power_to_db``; ``amplitude_to_db`` would double every dB value.
        """
        mel_spec = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=self.n_mels)
        mel_spec_db = librosa.power_to_db(mel_spec)  # ref = np.max
        return mel_spec, mel_spec_db

    def getChromaGram(self, audio, sr):
        """Return the STFT chromagram computed with ``self.chroma_hop_length``."""
        return librosa.feature.chroma_stft(y=audio, sr=sr, hop_length=self.chroma_hop_length)

    def compareTwoAudios(self, audio1, audio2):
        """Return the cosine similarity of the two clips' dB spectrograms.

        When the clips differ in length their spectrograms have different frame
        counts; only the leading frames both share are compared. Returns 0.0
        if either spectrogram has zero norm.
        """
        _, stft1_db = self.getStft(audio1)
        _, stft2_db = self.getStft(audio2)

        # Spectrograms are (frequency bins, frames); align on the frame axis.
        shared_frames = min(stft1_db.shape[1], stft2_db.shape[1])
        vec1 = stft1_db[:, :shared_frames].ravel()
        vec2 = stft2_db[:, :shared_frames].ravel()

        denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
        if denominator == 0:
            return 0.0
        return np.dot(vec1, vec2) / denominator

    def drawAudio(self, audio, sr):
        """Plot the waveform against time in seconds."""
        # Sample k (1-based) is drawn at k / sr seconds.
        seconds = np.arange(1, len(audio) + 1) / sr
        plt.figure(figsize=(8.8, 3))
        plt.plot(seconds, audio)
        plt.title('Raw Audio Example')
        plt.xlabel('Time (s)')
        plt.ylabel('Amplitude')
        plt.show()

    def _showSpectrogram(self, spec_db, sr):
        """Render a dB spectrogram on a log-frequency axis with a colorbar."""
        plt.figure(figsize=(14, 5))
        librosa.display.specshow(spec_db, sr=sr, x_axis='time', y_axis='log')
        plt.colorbar()
        plt.show()

    def drawAudioSpectrum(self, audio, sr):
        """Plot the dB-scaled STFT spectrogram of ``audio``."""
        _, spec_db = self.getStft(audio)
        self._showSpectrogram(spec_db, sr)

    def drawAudioSpectrumNormalized(self, audio, sr):
        """Plot the spectrogram after rescaling the peak to ``normalizationValue``.

        The peak is the largest absolute sample; silent audio (peak 0) is
        plotted unscaled rather than divided by zero.
        """
        peak = np.max(np.abs(audio))
        scaled = audio / peak * self.normalizationValue if peak > 0 else audio
        _, spec_db = self.getStft(scaled)
        self._showSpectrogram(spec_db, sr)

    def drawMelSpectrogram(self, audio, sr):
        """Plot the dB-scaled mel spectrogram of ``audio``."""
        _, mel_db = self.getMelSpectogram(audio, sr)

        fig, ax = plt.subplots(figsize=(10, 3))
        # Rows are mel bands, so the frequency axis must be 'mel'; sr is needed
        # for correct time and frequency tick values.
        img = librosa.display.specshow(mel_db,
                                       sr=sr,
                                       x_axis='time',
                                       y_axis='mel',
                                       ax=ax)
        ax.set_title('Mel Spectogram Example', fontsize=20)
        fig.colorbar(img, ax=ax, format='%0.2f')
        plt.show()

    def drawChromaGram(self, audio, sr):
        """Plot the chromagram of ``audio``."""
        chromagram = self.getChromaGram(audio, sr)
        plt.figure(figsize=(15, 5))
        # sr and hop_length together determine the time axis of the plot.
        librosa.display.specshow(chromagram, sr=sr, x_axis='time', y_axis='chroma',
                                 hop_length=self.chroma_hop_length, cmap='coolwarm')
        plt.show()

class MinecraftAudioMatcher:
    """Approximates a window of a recording with a mix of pitch-shifted base sounds.

    Candidate mixes are every combination of up to N base-sound variants; the
    one whose spectrogram is most similar to the recording window wins.
    """

    def __init__(self, baseSoundsPath, binSize):
        """
        Args:
            baseSoundsPath: Directory containing the base sound files.
            binSize: Window length in seconds; base sounds are cut to it.
        """
        # n_steps values each base sound is pitch-shifted by.
        self.pitch_shifts = [6,12,18]
        self.bin_size = binSize
        self.baseSoundsPath = baseSoundsPath
        self.manipulator = AudioManipulator()
        # (settings key, variants) from the last load; see getAllBaseSoundsNotes.
        self._base_sounds_cache = None

    def getAllBaseSoundsNotes(self, refresh=False):
        """Return every base sound at every pitch shift.

        Loading and pitch-shifting is expensive, so the result is cached per
        (path, bin size, pitch shifts). Files added to the directory later are
        only picked up with ``refresh=True``.

        Args:
            refresh: Force a reload from disk even if a cached result exists.

        Returns:
            List of ``(shifted_audio, sr, file_name, pitch_shift)`` tuples.
        """
        cache_key = (self.baseSoundsPath, self.bin_size, tuple(self.pitch_shifts))
        cached = getattr(self, "_base_sounds_cache", None)
        if not refresh and cached is not None and cached[0] == cache_key:
            return list(cached[1])

        sounds = []
        # Sorted so the candidate order (and tie-breaking) does not depend on
        # the unspecified order of os.listdir.
        for sound_file_name in sorted(os.listdir(self.baseSoundsPath)):
            sound, sr = librosa.load(os.path.join(self.baseSoundsPath, sound_file_name))
            sound = self.manipulator.splitAudio(sound, sr, 0, self.bin_size)
            sounds.append((sound, sr, sound_file_name))

        allBaseSounds = []
        for sound, pitch_shift in itertools.product(sounds, self.pitch_shifts):
            shifted_sound = self.manipulator.shiftPitchOfAudio(sound[0], sound[1], pitch_shift)
            allBaseSounds.append((shifted_sound, sound[1], sound[2], pitch_shift))

        self._base_sounds_cache = (cache_key, allBaseSounds)
        return list(allBaseSounds)

    def findBestNotes(self, mainAudio, startTime, sr, maxNotesCount):
        """Find the mix of base sounds most similar to one window of ``mainAudio``.

        Args:
            mainAudio: 1-D sample array of the recording to approximate.
            startTime: Window start in seconds.
            sr: Sample rate of ``mainAudio`` in Hz.
            maxNotesCount: Largest number of base sounds mixed together.

        Returns:
            ``(similarity, combination, combined_audio, split_audio)`` for the
            best candidate, where ``combination`` is a tuple of
            ``(audio, sr, file_name, pitch_shift)`` entries. On equal
            similarity the candidate tried first is kept.

        Raises:
            ValueError: If there is no candidate to evaluate (no base sounds,
                or ``maxNotesCount`` < 1).
        """
        all_base_sounds = self.getAllBaseSoundsNotes()

        # Keep only the running best instead of storing every candidate's audio.
        best_match = None
        for r in range(1, maxNotesCount + 1):
            print("Trying combination of", r, "audios")
            for combination in itertools.combinations(all_base_sounds, r):
                combined_audio = self.manipulator.joinDiffAudios(
                    [entry[0] for entry in combination], self.bin_size * sr)
                split_audio = self.manipulator.splitAudio(
                    mainAudio, sr, startTime, startTime + len(combined_audio)/sr)
                similarity = self.manipulator.compareTwoAudios(combined_audio, split_audio)
                if best_match is None or similarity > best_match[0]:
                    best_match = (similarity, combination, combined_audio, split_audio)

        if best_match is None:
            raise ValueError("No base sound combinations available to match against")
        return best_match
    

# Window length in seconds: used both as the matcher's bin size and as the
# step between consecutive windows, so the two can never drift apart.
BIN_SECONDS = 0.1
# Number of consecutive windows to reconstruct.
WINDOW_COUNT = 6
# Largest number of base sounds mixed together for one window.
MAX_NOTES_PER_WINDOW = 3

matcher = MinecraftAudioMatcher('Sounds/', BIN_SECONDS)
mainAudio, sr = librosa.load('RecordedSounds/lofiBeats.m4a')

# Window start times in seconds. Note that the first window begins at
# startTime + BIN_SECONDS, not at startTime itself.
startTime = 1
times = []
window_start = startTime
for _ in range(WINDOW_COUNT):
    window_start = window_start + BIN_SECONDS
    times.append(window_start)

# Per window: the (file name, pitch shift) pairs of the chosen base sounds.
all_combination = []
reconstructed_segments = []
original_segments = []
# A dedicated loop variable keeps `startTime` at its configured value.
for window_start in times:
    similarity, matched_sounds, reconstructed, original = matcher.findBestNotes(
        mainAudio, window_start, sr, MAX_NOTES_PER_WINDOW)
    print("Similarity", similarity)
    combination_details = [(sound[2], sound[3]) for sound in matched_sounds]
    all_combination.append(combination_details)
    print(combination_details)
    reconstructed_segments.append(reconstructed)
    original_segments.append(original)

# Concatenate once at the end instead of re-copying the arrays every iteration.
combined_reconstructed_audio = np.concatenate(reconstructed_segments) if reconstructed_segments else np.array([])
combined_original_audio = np.concatenate(original_segments) if original_segments else np.array([])


  mainAudio, sr = librosa.load('RecordedSounds/lofiBeats.m4a')


Trying combination of 1 audios
Trying combination of 2 audios
Similarity 0.9571571
[('bass.ogg', 6)]
Trying combination of 1 audios
Trying combination of 2 audios
Similarity 0.96479094
[('bass.ogg', 6)]
Trying combination of 1 audios
Trying combination of 2 audios
Similarity 0.9612
[('bass.ogg', 6)]
Trying combination of 1 audios
Trying combination of 2 audios
Similarity 0.95733535
[('bass.ogg', 6)]
Trying combination of 1 audios
Trying combination of 2 audios
Similarity 0.9645781
[('bass.ogg', 6)]
Trying combination of 1 audios
Trying combination of 2 audios
Similarity 0.9616429
[('bass.ogg', 6)]


In [111]:
# Inline player for the concatenated best-match (reconstructed) segments, played at sample rate `sr`.
ipd.Audio(combined_reconstructed_audio, rate=sr)

In [107]:
# Inline player for the matching slices of the original recording, for comparison by ear.
ipd.Audio(combined_original_audio, rate=sr)