In [None]:
#################### OUTLIER REMOVAL AND AVERAGING ######################
# plt.boxplot(y)
# plt.show()

# skc = 1
# att = sorted(sorted([(y[i],i) for i in range(len(y))], reverse=True)[skc:-skc], key=lambda x: x[1])
# y_cleaned = [x[0] for x in att]

# plt.plot(normalized_tone[:5000])
# new_tone = []
# sz = 2
# avg = 0
# for i in range(sz):
#     avg += y[i]
# avg /= sz
# new_tone.append(avg)
# for j in range(sz, len(y)):
#     avg += y[j]/sz - y[j-sz]/sz
#     new_tone.append(avg)
#################### OUTLIER REMOVAL AND AVERAGING ######################

#################### MIXED SIGNAL ######################
# import numpy as np
# from matplotlib import pyplot as plt
# from scipy.io.wavfile import write

# Remember SAMPLE_RATE = 44100 Hz is our playback rate
# SAMPLE_RATE = 44100  # Hertz
# write("mysinewave.wav", SAMPLE_RATE, normalized_tone)

# DURATION = 5  # Seconds

# def generate_sine_wave(freq, sample_rate, duration):
#     x = np.linspace(0, duration, sample_rate * duration, endpoint=False)
#     frequencies = x * freq
#     # 2pi because np.sin takes radians
#     y = np.sin((2 * np.pi) * frequencies)
#     return x, y

# _, nice_tone = generate_sine_wave(400, SAMPLE_RATE, DURATION)
# _, noise_tone = generate_sine_wave(4000, SAMPLE_RATE, DURATION)
# noise_tone = noise_tone * 0.3

# mixed_tone = nice_tone + noise_tone
# normalized_tone = np.int16((mixed_tone / mixed_tone.max()) * 32767)
#################### MIXED SIGNAL ######################


#################### FFT ######################
# from scipy.fft import fft, fftfreq

# # Number of samples in normalized_tone
# N = SAMPLE_RATE * DURATION

# yf = fft(normalized_tone)
# xf = fftfreq(N, 1 / SAMPLE_RATE)

# print(xf, yf)

# plt.scatter(xf, np.abs(yf))
# plt.show()
#################### FFT ######################

In [2]:
import heapq
import itertools
import os

import IPython.display as ipd
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import soundfile as sf

class AudioManipulator:
    """Helpers for slicing, transforming, comparing and plotting audio signals.

    Audio arguments are handed straight to librosa / numpy. They are presumably
    1-D float sample arrays such as those returned by ``librosa.load`` --
    TODO confirm against callers.
    """

    def __init__(self):
        # Hop length (in samples) used for the chromagram and for its plot.
        self.chroma_hop_length = 12
        # Peak value that drawAudioSpectrumNormalized scales the signal to.
        self.normalizationValue = 32767.00
        # Number of mel bands requested from librosa in getMelSpectogram.
        self.n_mels = 128 * 2

    def getAudioInterface(self, audio, sr=None):
        """Return an IPython audio player widget for ``audio``.

        Args:
            audio: Anything ``IPython.display.Audio`` accepts as ``data``.
            sr: Optional sample rate forwarded as ``rate``. IPython needs it
                when ``audio`` is an array of raw samples.
        """
        return ipd.Audio(audio, rate=sr)

    def splitAudio(self, audio, sr, save_path, start_time, end_time):
        """Write the slice ``[start_time, end_time)`` of ``audio`` to ``save_path``.

        The times are multiplied by ``sr`` to obtain sample indices, so they are
        in seconds when ``sr`` is in samples per second.
        """
        first_sample = int(sr * start_time)
        last_sample = int(sr * end_time)
        sf.write(save_path, audio[first_sample:last_sample], sr)

    def shiftPitchOfAudio(self, audio, sr, pitch_shift):
        """Return ``audio`` pitch-shifted; ``pitch_shift`` is passed to librosa as ``n_steps``."""
        return librosa.effects.pitch_shift(audio, sr=sr, n_steps=pitch_shift)

    def getStft(self, audio):
        """Return ``(stft, stft_db)``: the complex STFT and its magnitude in dB."""
        stft = librosa.stft(audio)
        stft_db = librosa.amplitude_to_db(np.abs(stft))
        return stft, stft_db

    def getMelSpectogram(self, audio, sr):
        """Return ``(mel_spec, mel_spec_db)`` computed with ``self.n_mels`` mel bands."""
        mel_spec = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=self.n_mels)
        # NOTE(review): librosa's melspectrogram is a power spectrogram by default,
        # for which librosa.power_to_db is the matching conversion. Kept as
        # amplitude_to_db so the returned dB values do not change -- confirm intent.
        mel_spec_db = librosa.amplitude_to_db(mel_spec)  # ref = np.max
        return mel_spec, mel_spec_db

    def getChromaGram(self, audio, sr):
        """Return the STFT chromagram of ``audio`` using ``self.chroma_hop_length``."""
        return librosa.feature.chroma_stft(y=audio, sr=sr, hop_length=self.chroma_hop_length)

    @staticmethod
    def _padFrames(spec, frames):
        """Zero-pad ``spec`` along its last axis up to ``frames`` columns."""
        missing = frames - spec.shape[-1]
        if missing <= 0:
            return spec
        pad_widths = [(0, 0)] * (spec.ndim - 1) + [(0, missing)]
        return np.pad(spec, pad_widths, mode="constant")

    def compareTwoAudios(self, audio1, audio2, sr):
        """Return the cosine similarity of the two audios' mel spectrograms.

        The shorter spectrogram is zero-padded along the last (frame) axis so
        signals of different length can be compared. Returns ``0.0`` when
        either spectrogram is all zeros, instead of NaN.
        """
        # getMelSpectogram returns (linear, dB); only the linear one is compared.
        mel1, _ = self.getMelSpectogram(audio1, sr)
        mel2, _ = self.getMelSpectogram(audio2, sr)
        mel1 = np.asarray(mel1, dtype=float)
        mel2 = np.asarray(mel2, dtype=float)

        frames = max(mel1.shape[-1], mel2.shape[-1])
        mel1 = self._padFrames(mel1, frames)
        mel2 = self._padFrames(mel2, frames)

        norm_product = np.linalg.norm(mel1) * np.linalg.norm(mel2)
        if norm_product == 0:
            return 0.0
        return float(np.dot(mel1.ravel(), mel2.ravel()) / norm_product)

    def drawAudio(self, audio, sr):
        """Plot the raw waveform of ``audio`` against time."""
        # Sample i sits at time i / sr, so the axis starts at 0.
        times = np.arange(len(audio)) / sr
        plt.figure(figsize=(8.8, 3))
        plt.plot(times, audio)
        plt.title('Raw Audio Example')
        plt.xlabel('Time (s)')
        plt.ylabel('Amplitude')
        plt.show()

    def _showSpectrum(self, audio, sr):
        """Plot the dB-scaled STFT of ``audio`` on a log-frequency axis."""
        _, stft_db = self.getStft(audio)
        plt.figure(figsize=(14, 5))
        librosa.display.specshow(stft_db, sr=sr, x_axis='time', y_axis='log')
        plt.colorbar()
        plt.show()

    def drawAudioSpectrum(self, audio, sr):
        """Plot the STFT spectrogram (dB) of ``audio``."""
        self._showSpectrum(audio, sr)

    def drawAudioSpectrumNormalized(self, audio, sr):
        """Plot the STFT spectrogram of ``audio`` after peak-normalising it.

        The signal is scaled so its largest absolute sample equals
        ``self.normalizationValue``. An all-zero signal is plotted unscaled.
        """
        # Use the absolute peak: a plain max() is wrong when the largest
        # excursion is negative and divides by zero for silence.
        peak = np.max(np.abs(audio))
        scaled = audio / peak * self.normalizationValue if peak > 0 else audio
        self._showSpectrum(scaled, sr)

    def drawMelSpectrogram(self, audio, sr):
        """Plot the dB-scaled mel spectrogram of ``audio``."""
        _, mel_db = self.getMelSpectogram(audio, sr)

        fig, ax = plt.subplots(figsize=(10, 3))
        # The rows are mel bands, so label the axis with the mel scale and pass
        # sr so both axes are computed from the real sample rate.
        img = librosa.display.specshow(mel_db,
                                       sr=sr,
                                       x_axis='time',
                                       y_axis='mel',
                                       ax=ax)
        ax.set_title('Mel Spectogram Example', fontsize=20)
        fig.colorbar(img, ax=ax, format='%0.2f')
        plt.show()

    def drawChromaGram(self, audio, sr):
        """Plot the chromagram of ``audio``."""
        chromagram = self.getChromaGram(audio, sr)
        plt.figure(figsize=(15, 5))
        librosa.display.specshow(chromagram, sr=sr, x_axis='time', y_axis='chroma',
                                 hop_length=self.chroma_hop_length, cmap='coolwarm')
        plt.colorbar()
        plt.show()

class MinecraftAudioMatcher:
    """Brute-force search for the mix of pitch-shifted base sounds closest to a target.

    Every file in ``baseSoundsPath`` is loaded and pitch-shifted by each value
    in ``self.pitch_shifts``. ``findBestNotes`` then mixes every combination of
    those candidates and ranks the mixes by similarity to ``mainAudio``.
    """

    def __init__(self, mainAudio, baseSoundsPath, sr=22050):
        """
        Args:
            mainAudio: Target audio samples that the mixes are compared against.
            baseSoundsPath: Directory containing the base sound files.
            sr: Sample rate of ``mainAudio``; base sounds are resampled to it on
                load so all signals are comparable. Defaults to 22050, the
                default rate of ``librosa.load``.
        """
        # Pitch shifts applied to each base sound (passed to librosa as n_steps).
        self.pitch_shifts = [6, 12, 18]
        self.mainAudio = mainAudio
        self.baseSoundsPath = baseSoundsPath
        self.sr = sr
        self.manipulator = AudioManipulator()

    def getAllBaseSoundsNotes(self):
        """Load every base sound and return all its pitch-shifted variants.

        Returns:
            List of ``(shifted_audio, sr, file_name, pitch_shift)`` tuples, one
            per (file, pitch shift) pair.
        """
        sounds = []
        # sorted() makes the candidate order, and thus tie-breaking, reproducible.
        for sound_file_name in sorted(os.listdir(self.baseSoundsPath)):
            # os.path.join works whether or not baseSoundsPath ends with a separator.
            sound_path = os.path.join(self.baseSoundsPath, sound_file_name)
            if not os.path.isfile(sound_path):
                continue
            sound, sr = librosa.load(sound_path, sr=self.sr)
            sounds.append((sound, sr, sound_file_name))

        return [
            (self.manipulator.shiftPitchOfAudio(sound, sr, pitch_shift), sr, name, pitch_shift)
            for (sound, sr, name), pitch_shift in itertools.product(sounds, self.pitch_shifts)
        ]

    @staticmethod
    def _mixAudios(audios):
        """Sum sample arrays of possibly different lengths, zero-padding the short ones."""
        mixed = np.zeros(max(len(audio) for audio in audios), dtype=float)
        for audio in audios:
            mixed[:len(audio)] += audio
        return mixed

    def findBestNotes(self, maxNotesCount, topK=None):
        """Rank mixes of up to ``maxNotesCount`` candidates by similarity to the target.

        Args:
            maxNotesCount: Largest number of candidates mixed together.
            topK: How many best matches to return. Defaults to ``maxNotesCount``.

        Returns:
            List of ``(similarity, combination)`` tuples, best first, where
            ``combination`` is a tuple of entries from ``getAllBaseSoundsNotes``.

        Note:
            Every combination is evaluated, so the cost grows combinatorially
            with the number of candidates and with ``maxNotesCount``.
        """
        if topK is None:
            topK = maxNotesCount
        if maxNotesCount < 1 or topK < 1:
            return []

        all_base_sounds = self.getAllBaseSoundsNotes()

        def scoredCombinations():
            for notes_count in range(1, maxNotesCount + 1):
                for combination in itertools.combinations(all_base_sounds, notes_count):
                    # Each entry is (audio, sr, file_name, pitch_shift).
                    mixed = self._mixAudios([note[0] for note in combination])
                    similarity = self.manipulator.compareTwoAudios(mixed, self.mainAudio, self.sr)
                    yield similarity, combination

        # nlargest keeps only topK results in memory instead of every combination.
        return heapq.nlargest(topK, scoredCombinations(), key=lambda match: match[0])

In [None]:
# Example usage.
# NOTE: `mainAudio` (target samples) and `baseSoundsPath` (directory of base
# sound files) are not defined in the cells visible here; define them in an
# earlier cell before running this one.
matcher = MinecraftAudioMatcher(mainAudio, baseSoundsPath)
# The search evaluates every combination of up to maxNotesCount candidates, so
# large values grow combinatorially with the number of base sounds.
best_notes = matcher.findBestNotes(maxNotesCount=9)
for similarity, combination in best_notes:
    print("Similarity:", similarity)
    # Each entry is (audio, sr, file_name, pitch_shift); show only the name and
    # shift, since printing the tuples would dump the raw sample arrays.
    print("Combination:", [(name, pitch_shift) for _, _, name, pitch_shift in combination])