In [12]:
import numpy as np
import pandas as pd
import torch
import torchvision
import torch.nn as nn
from torch.nn.functional import interpolate 
import matplotlib.pyplot as plt
import pydub
from pydub import AudioSegment
from pathlib import Path

# generation imports
from pippi.soundbuffer import SoundBuffer
from pippi import dsp,fx
import param_generation as pg
import _pickle as pickle
from IPython.display import Audio
from feature_extraction.mir_utils import *
###
import torch.utils.data as utils
import torchvision.transforms as transforms
from PIL import Image
###
import scipy.stats as ss
import common_vars as comv
import imp
import librosa
import librosa.display
import sys
import csv
sys.path.append("..")

import helpers
imp.reload(helpers)
imp.reload(comv)
imp.reload(pg)
from feature_extraction import pytorch_models as tm
imp.reload(tm)
from helpers import *

# from common_vars import SR
# Prefer the second GPU when the machine has one, otherwise use the first GPU,
# and use the CPU when CUDA is unavailable. A hard-coded "cuda:1" names a
# device that does not exist on single-GPU hosts.
PREFERRED_GPU_INDEX = 1
if torch.cuda.is_available():
    _gpu_index = PREFERRED_GPU_INDEX if torch.cuda.device_count() > PREFERRED_GPU_INDEX else 0
    device = torch.device("cuda", _gpu_index)
else:
    device = torch.device("cpu")

# Drum class labels; later cells index this list with the argmax of cnet_dvd's scores.
drum_types = ['hat', 'kick', 'shake', 'snare']
# save_path="./generated_sample_pack/"
# device = "cpu"
# stack_size=3
# BATCH_SIZE=1
# NUM_BINS=100

# classes=comv.classes
# classes_ranked=comv.classes_ranked
# cDict={v:i for i,v in enumerate(classes)}

# drum_groups=['tom_low','snare','hihat_closed','rim','synth_noise','clap','kick','hihat_open','tom_mid']


# drum_df=pd.DataFrame(columns=["name","fc","cnnlstm","env+freq","consensus","stack_size"])

In [13]:
# Build one sound with a stack size of 2, convert the generator output into an
# audio buffer, and audition it at 44100 Hz.
out, params = StackMaker(2)
a = memToAud(out)
Audio(a, rate=44100, autoplay=True)

In [14]:
# Generated audio has a 44100 Hz sample rate by default, while the model was
# trained on 5512 Hz audio, so the waveform is scaled to 1/8 of its length.
SCALE_FACTOR = 1 / 8

# interpolate() receives a (1, 1, samples) tensor; the result is flattened back
# to (1, samples).
signal = interpolate(
    torch.tensor(a).reshape(1, 1, -1),
    scale_factor=SCALE_FACTOR,
    recompute_scale_factor=False,
).reshape(1, -1)
Audio(signal.reshape(-1), rate=5512, autoplay=True)

In [15]:
import torchaudio
spec = torchaudio.functional.spectrogram

# Spectrogram geometry handed to specTrans when spec_tf is built.
FREQ_BINS = 30   # passed as num_mels
TIME_STEPS = 10  # passed as time_steps
class specTrans(object):
    """Callable transform that turns a waveform into a scaled mel spectrogram.

    Args:
        num_mels: number of mel bands given to ``MelScale``.
        sr: sample rate; it also sets the STFT window length (``sr // 17``)
            and the hop (``sr // (time_steps - 1)``).
        time_steps: stored minus one as ``hop_step``, the divisor of the hop.
        amp_to_power: when true, ``AmplitudeToDB`` (power, top_db=60) is
            applied before the final scaling.
    """

    def __init__(self, num_mels=50, sr=SR, time_steps=20, amp_to_power=True):
        self.num_mels = num_mels
        self.sr = sr
        self.amp_to_power = amp_to_power
        self.hop_step = time_steps - 1
        # NOTE(review): n_stft=None leaves the STFT bin count unspecified at
        # construction time — confirm the installed torchaudio accepts that.
        self.melP = torchaudio.transforms.MelScale(n_mels=self.num_mels, sample_rate=sr, n_stft=None)
        self.ampP = torchaudio.transforms.AmplitudeToDB(stype='power', top_db=60)

    def __call__(self, sample):
        """Return the detached, min-shifted and max-scaled mel spectrogram of ``sample``."""
        # Reshape to (-1, len(sample)); a 1-D input becomes a single row.
        waveform = sample.reshape(-1, len(sample))

        # The FFT size is the full row length, so each frame spans the whole row.
        n_fft = waveform[0].shape[0]
        win_length = self.sr // 17
        hop_length = self.sr // self.hop_step
        # All-ones window of win_length samples.
        window = torch.tensor([1] * win_length)

        # Positional arguments after the waveform: pad, window, n_fft,
        # hop_length, win_length, power.
        mel = spec(waveform, 0, window, n_fft, hop_length, win_length, 2, normalized=True)
        mel = self.melP(mel)
        if self.amp_to_power:
            mel = self.ampP(mel)

        # Shift so the minimum is zero, then divide by the largest magnitude.
        mel = mel - mel.min()
        mel = mel / mel.abs().max()

        # NaNs (for example 0/0 when the spectrogram is constant) become zero.
        mel[torch.isnan(mel)] = 0
        return mel.detach()
# Transform used for the networks' spectrogram input: FREQ_BINS mel bands at a
# quarter of the base sample rate.
spec_tf = specTrans(num_mels=FREQ_BINS, sr=SR // 4, time_steps=TIME_STEPS, amp_to_power=True)

In [17]:
# load model here
# NOTE: torch.load unpickles the checkpoint file, so only load checkpoints that
# come from a trusted source.

# Drop a model left over from an earlier run of this cell before building a
# new one (presumably to release its memory). The only expected failure is the
# name not existing yet, so only NameError is ignored.
try:
    del cnet
except NameError:
    pass
cnet = tm.Conv_Spec_DVN(embed_only=False,device=device).to(device)
# map_location remaps tensors that were saved on another device (for example a
# GPU that is absent on this machine) onto the device chosen for this session.
checkpoint = torch.load("feature_extraction/models/1d_conv/0.149_0.0702_.checkpoint", map_location=device)
cnet.load_state_dict(checkpoint['model_state_dict'])
# The model was already moved to `device` at construction, so no second .to() is needed.
cnet.eval()

try:
    del cnet_dvd
except NameError:
    pass
cnet_dvd = tm.Conv_Spec_DVD(embed_only=False,device=device).to(device)
checkpoint = torch.load("feature_extraction/models/1d_conv_dvd/0.751_0.5051_.checkpoint", map_location=device)
cnet_dvd.load_state_dict(checkpoint['model_state_dict'])
cnet_dvd.eval()
# Trailing statement so the cell does not echo the value returned by eval().
pass

In [18]:
# Quarter-length copy of the waveform for the spectrogram input. spec_tf gets
# its single row as float32, and the result is moved to the model device.
spec_signal = interpolate(
    torch.tensor(a).reshape(1, 1, -1),
    scale_factor=0.25,
    recompute_scale_factor=False,
).reshape(1, -1)
spectrogram = spec_tf(spec_signal[0].float()).to(device)

In [27]:
# make sound and evaluate
def make_and_eval(stack_size = 1):
    """Generate one sound and score it with both loaded networks.

    Args:
        stack_size: forwarded unchanged to ``StackMaker``.

    Returns:
        A 5-tuple, in order:
        the audio returned by ``memToAud``;
        the waveform scaled by ``SCALE_FACTOR``, shaped (1, samples), on ``device``;
        element 0 of ``cnet``'s raw output as a NumPy array;
        element 0 of the sigmoid of that output;
        element 0 of the sigmoid of ``cnet_dvd``'s output.
    """
    out, params = StackMaker(stack_size)
    a = memToAud(out)

    # Generated audio has a 44100 Hz sample rate by default. Both resampled
    # views start from the same (1, 1, samples) tensor.
    base = torch.tensor(a).reshape(1, 1, -1)

    signal = interpolate(base, scale_factor=SCALE_FACTOR, recompute_scale_factor=False)
    signal = signal.reshape(1, -1).to(device)

    spec_signal = interpolate(base, scale_factor=0.25, recompute_scale_factor=False)
    spec_signal = spec_signal.reshape(1, -1)
    spectrogram = spec_tf(spec_signal[0].float()).to(device)

    with torch.no_grad():
        ps = cnet(signal, spectrogram)
        ps_dvd = cnet_dvd(signal, spectrogram)

    raw_scores = ps.cpu().numpy()[0]
    probs = torch.sigmoid(ps).cpu().numpy()[0]
    probs_dvd = torch.sigmoid(ps_dvd).cpu().numpy()[0]
    return a, signal, raw_scores, probs, probs_dvd
# One draw with the default stack size: print the label with the highest
# cnet_dvd score and play the downsampled waveform that was fed to the networks.
a, a_downsampled, ps, pss, ps_dvd = make_and_eval()
print(drum_types[np.argmax(ps_dvd)])
Audio(a_downsampled.cpu().reshape(-1), rate=5512, autoplay=True)

hat


In [28]:
# Play the same draw flattened to 1-D at 44100 Hz.
Audio(a.reshape(-1), rate=44100, autoplay=True)

In [68]:
# loop until found
# Draw stack-size-3 sounds, at most 1000 times, until the first cnet score
# exceeds the second one. The raw first score of every rejected draw is kept
# in biggest_p.
biggest_p = []
best_sounds = {}
for i in range(1000):
    a, a_ds, ps, pss, pss_dvd = make_and_eval(3)
    if not (pss[0] > pss[1]):
        biggest_p.append(ps[0])
        continue
    print(drum_types[np.argmax(pss_dvd)])
    print(pss_dvd)
    break
# Plays the first SR // 2 samples of the last draw, whether or not it was accepted.
Audio(a[:SR // 2], rate=SR, autoplay=True)

snare
[0.60457534 0.22367331 0.18656485 0.77972597]


In [69]:
# Keep, for every drum type, the highest-scoring accepted sound out of 1000 draws.
# Each entry is (score, audio); (0, [0]) marks a type with nothing accepted yet.
best_sounds = {}
for k in drum_types:
    best_sounds[k] = (0, [0])

for i in range(1000):
    a, a_ds, ps, pss, pss_dvd = make_and_eval(1)
    if pss[0] > pss[1]:
        t = drum_types[np.argmax(pss_dvd)]
        p = pss_dvd[np.argmax(pss_dvd)]
        if p > best_sounds[t][0]:
            best_sounds[t] = (p, a)
# Plays the most recent draw, not one of the stored best sounds.
Audio(a, rate=SR, autoplay=True)

In [71]:
# import numpy
# import IPython.display as ipd
# from IPython.display import clear_output
# import time 
# sr = 22050# sample rate
# T = 2.0# seconds
# t = numpy.linspace(0, T, int(T*sr), endpoint=False)# time variable
# x = 0.5*numpy.sin(2*numpy.pi*440*t)
# ipd.Audio(x, rate=sr)

# for i in range(3):
#     x = 0.1*numpy.sin(i*numpy.pi*440*t)
#     ipd.set_matplotlib_close(close=True)
#     ipd.display(ipd.Audio(x, rate=sr,autoplay=True))
#     time.sleep(1)
#     clear_output()

In [90]:
#save kit to file
from scipy.io.wavfile import write
from pippi import dsp

# best_sounds entries are (score, audio) and start as (0, [0]). A score that is
# still 0 means no kick was accepted, and the placeholder list is not an array
# that wavfile.write can serialise, so stop with a clear message instead.
kick_score, kick_audio = best_sounds["kick"]
if kick_score <= 0:
    raise RuntimeError("no kick sample was accepted; rerun the search cell before saving")
write("feature_extraction/kits/kick.wav",SR,kick_audio)

In [94]:
from pydub import AudioSegment

# Read the saved kick back twice: once through pydub and once through pippi.
kick_wav_path = "feature_extraction/kits/kick.wav"
song = AudioSegment.from_wav(kick_wav_path)
k = dsp.read(kick_wav_path)

In [95]:
# Display the buffer read back from disk; its repr reports sample rate,
# channel count and frames. The previous `k.pla` was a truncated attribute
# access that does not match the recorded output of this cell.
k

SoundBuffer(samplerate=48000, channels=1, frames=<MemoryView of 'ndarray' object>)