<a href="https://colab.research.google.com/github/its-rajesh/Music-Speech-Separation/blob/main/MusicSpeechDNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Neural Network for Music Speech Separation

In [3]:
import tensorflow as tf

In [4]:
from tensorflow import keras

In [5]:
import soundfile as sf
import librosa as lb
import numpy as np
from matplotlib import pyplot as plt

In [6]:
import os

## Reading DataSet

In [7]:
from google.colab import drive
drive.mount('/drive')

Drive already mounted at /drive; to attempt to forcibly remount, call drive.mount("/drive", force_remount=True).


**For Running In Google Colab**

In [6]:
path = "/drive/My Drive/Projects/Music-Speech-Separation/Dataset"

folders = os.listdir(path)
folders

['Mixture', 'Overlay', 'speech', 'music']

**For Running In Local Machine**

In [7]:
'''
path = "/home/rajesh/MusicSpeechSeparartion/Dataset/"

folders = os.listdir(path)
folders
'''

'\npath = "/home/rajesh/MusicSpeechSeparartion/Dataset/"\n\nfolders = os.listdir(path)\nfolders\n'

In [7]:
audiofiles, samplerates = [], []
for folder in folders:
    files = os.listdir(path+'/'+folder)
    files = sorted(files)
    audio, sr = [], []
    for file in files:
        data, samplerate = sf.read(path+'/'+folder+'/'+file)
        audio.append(data)
        sr.append(samplerate)
    audiofiles.append(audio)
    samplerates.append(sr)

In [8]:
overlay = np.array(audiofiles[0])
mixture = np.array(audiofiles[1])
speech = np.array(audiofiles[2])
music = np.array(audiofiles[3])

dataset = [overlay, mixture, speech, music]

### Short Time Fourier Transform

In [9]:
datasetstft = []
for data in dataset:
    spect = []
    for audio in data:
        stft = np.abs(lb.stft(audio))
        spect.append(stft)
    datasetstft.append(spect)

In [10]:
overlay_stft = np.array(datasetstft[0])
mixture_stft = np.array(datasetstft[1])
speech_stft = np.array(datasetstft[2])
music_stft = np.array(datasetstft[3])

dataset_stft = [overlay_stft, mixture_stft, speech_stft, music_stft]

In [11]:
overlay_stft.shape, mixture_stft.shape, speech_stft.shape, music_stft.shape

((50, 1025, 1292), (50, 1025, 646), (50, 1025, 646), (50, 1025, 646))

In [12]:
groundtruth_stft = []
for i in range(50):
    groundtruth_stft.append(np.hstack((speech_stft[i], music_stft[i])))

    
groundtruth_stft = np.array(groundtruth_stft)
groundtruth_stft.shape

(50, 1025, 1292)

## DNN Model

In [14]:
from keras import layers

In [None]:
input_dim = (1025, 1292, 1)
output_dim = (1025, 1292)

inputs = layers.Input(shape=input_dim)
x = layers.Flatten()(inputs)

x = layers.Dense(units=128, activation='relu')(x)
x = layers.BatchNormalization()(x)

x = layers.Dense(units=128, activation='relu')(x)
x = layers.BatchNormalization()(x)

outputs = layers.Dense(units=output_dim, activation='softmax')(x)
model = keras.Model(inputs=inputs, outputs=outputs)

In [16]:
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1, 1025, 1292)]   0         
                                                                 
 flatten (Flatten)           (None, 1324300)           0         
                                                                 
 dense (Dense)               (None, 128)               169510528 
                                                                 
 batch_normalization (BatchN  (None, 128)              512       
 ormalization)                                                   
                                                                 
 dense_1 (Dense)             (None, 128)               16512     
                                                                 
 batch_normalization_1 (Batc  (None, 128)              512       
 hNormalization)                                             

In [21]:
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss=keras.losses.SparseCategoricalCrossentropy(),
    metrics=['acc']
)

In [22]:
X_train = overlay_stft
y_train = groundtruth_stft

X_train.shape, y_train.shape

((50, 1025, 1292), (50, 1025, 1292))

In [23]:
model.fit(
    X_train, y_train,
    epochs=5,
    batch_size=2,
    validation_split=0.1,
    verbose=1)

Epoch 1/5


InvalidArgumentError: ignored

# Read CSV

In [8]:
import pandas as pd

In [9]:
csv_path = '/drive/My Drive/Projects/Music-Speech-Separation/CSVFilesTraining/'

files = os.listdir(csv_path+'/GroundTruth/')
files = sorted(files)

y_train = []
for file in files:
  y_train.append(np.array(pd.read_csv(csv_path+'/GroundTruth/'+file)))



files = os.listdir(csv_path+'/Mixture/')
files = sorted(files)

X_train = []
for file in files:
  X_train.append(np.array(pd.read_csv(csv_path+'/Mixture/'+file)))

## CNN Model

In [10]:
#X_train = overlay_stft
#y_train = groundtruth_stft

X_train = np.array(X_train)
y_train = np.array(y_train)

X_train.shape, y_train.shape

((50, 1025, 1293), (50, 1025, 1293))

In [11]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import time
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input,Dense,Flatten,Dropout,Conv2D,MaxPool2D
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers

In [12]:
batch_size = 128
epochs = 50
inp = Input(shape =(1025, 1293, 1))
out_dim = 1025*1293

In [13]:
start = time.time()

x = Conv2D(32, 3, activation='relu')(inp)
x = BatchNormalization()(x)
x = MaxPooling2D(pool_size=(2, 2))(x)
x = Dropout(0.30)(x)

x = Conv2D(64, 3, activation='relu')(x)
x = BatchNormalization()(x)
x = MaxPooling2D(pool_size=(4, 100))(x)
x = Dropout(0.30)(x)

x = Flatten()(x)
x = Dense(100, activation='relu')(x)
x = Dropout(0.30)(x)

x = Dense(out_dim, activation='relu')(x)

In [None]:
classify = Model(inputs = inp, outputs = x)
classify.compile(loss='SparseCategoricalCrossentropy', optimizer = 'adam', metrics='accuracy') 
classify.summary()
classify_train = classify.fit(X_train,y_train, epochs = epochs ,batch_size=batch_size)
end = time.time()
print("time taken",time)

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1025, 1293, 1)]   0         
                                                                 
 conv2d (Conv2D)             (None, 1023, 1291, 32)    320       
                                                                 
 batch_normalization (BatchN  (None, 1023, 1291, 32)   128       
 ormalization)                                                   
                                                                 
 max_pooling2d (MaxPooling2D  (None, 511, 645, 32)     0         
 )                                                               
                                                                 
 dropout (Dropout)           (None, 511, 645, 32)      0         
                                                                 
 conv2d_1 (Conv2D)           (None, 509, 643, 64)      18496 