First, mount your google drive here to access the dataset folder later.

In [1]:
from google.colab import drive
# drive.flush_and_unmount()
drive.mount('/content/drive/')

Mounted at /content/drive/


The following packages do not come pre-installed. So we will install them here.

In [2]:
!pip install pydub mutagen

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Collecting mutagen
  Downloading mutagen-1.47.0-py3-none-any.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.4/194.4 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydub, mutagen
Successfully installed mutagen-1.47.0 pydub-0.25.1


Import all the necessary packages.

In [3]:
from pydub import AudioSegment
import librosa
import numpy as np
import matplotlib.pyplot as plt
import math
import mutagen
import os
import pandas as pd
# from pytube import YouTube
# from youtubesearchpython import VideosSearch
from moviepy.editor import *
import random
from PIL import Image
import PIL.ImageOps

Now load some pytorch functions.

In [4]:
import torchvision
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Dataset
import torchvision.utils
import torch
from torch.autograd import Variable
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import torch.utils.model_zoo as model_zoo
from torch.nn import init

Check if gpu is available. If the result is False, change runtime type in Runtime-->Change runtime type-->Hardware accelerator "T4 GPU"

In [5]:
torch.cuda.is_available()

True

This is the xception model that we will use. It takes inputted images in size 299x299, and outputs a vector of size 1000, which will be the feature map that encodes information about the inputted diagrams.

One important thing is to change the path to the pretrained model based on your mount location. Find the line:


```
if pretrained:
        model.load_state_dict(torch.load('/content/drive/MyDrive/Music_Plagiarism/xception-43020ad28.pth'))
```
Change the path inside ```torch.load()``` to the path pointing to ```xception-43020ad28.pth``` in our shared folder. This is the pretrained parameters trained on imagenet data.


In [6]:
"""
Creates an Xception Model as defined in:

Francois Chollet
Xception: Deep Learning with Depthwise Separable Convolutions
https://arxiv.org/pdf/1610.02357.pdf

This weights ported from the Keras implementation. Achieves the following performance on the validation set:

Loss:0.9173 Prec@1:78.892 Prec@5:94.292

REMEMBER to set your image size to 3x299x299 for both test and validation

normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5],
                                  std=[0.5, 0.5, 0.5])

The resize parameter of the validation transform should be 333, and make sure to center crop at 299x299
"""
import math
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.model_zoo as model_zoo
from torch.nn import init
import torch

__all__ = ['xception']

model_urls = {
#     'xception':'https://www.dropbox.com/s/1hplpzet9d7dv29/xception-c0a72b38.pth.tar?dl=1'
    'xception':'http://data.lip6.fr/cadene/pretrainedmodels/xception-43020ad28.pth'
}


class SeparableConv2d(nn.Module):
    def __init__(self,in_channels,out_channels,kernel_size=1,stride=1,padding=0,dilation=1,bias=False):
        super(SeparableConv2d,self).__init__()

        self.conv1 = nn.Conv2d(in_channels,in_channels,kernel_size,stride,padding,dilation,groups=in_channels,bias=bias)
        self.pointwise = nn.Conv2d(in_channels,out_channels,1,1,0,1,1,bias=bias)

    def forward(self,x):
        x = self.conv1(x)
        x = self.pointwise(x)
        return x


class Block(nn.Module):
    def __init__(self,in_filters,out_filters,reps,strides=1,start_with_relu=True,grow_first=True):
        super(Block, self).__init__()

        if out_filters != in_filters or strides!=1:
            self.skip = nn.Conv2d(in_filters,out_filters,1,stride=strides, bias=False)
            self.skipbn = nn.BatchNorm2d(out_filters)
        else:
            self.skip=None

        self.relu = nn.ReLU(inplace=True)
        rep=[]

        filters=in_filters
        if grow_first:
            rep.append(self.relu)
            rep.append(SeparableConv2d(in_filters,out_filters,3,stride=1,padding=1,bias=False))
            rep.append(nn.BatchNorm2d(out_filters))
            filters = out_filters

        for i in range(reps-1):
            rep.append(self.relu)
            rep.append(SeparableConv2d(filters,filters,3,stride=1,padding=1,bias=False))
            rep.append(nn.BatchNorm2d(filters))

        if not grow_first:
            rep.append(self.relu)
            rep.append(SeparableConv2d(in_filters,out_filters,3,stride=1,padding=1,bias=False))
            rep.append(nn.BatchNorm2d(out_filters))

        if not start_with_relu:
            rep = rep[1:]
        else:
            rep[0] = nn.ReLU(inplace=False)

        if strides != 1:
            rep.append(nn.MaxPool2d(3,strides,1))
        self.rep = nn.Sequential(*rep)

    def forward(self,inp):
        x = self.rep(inp)

        if self.skip is not None:
            skip = self.skip(inp)
            skip = self.skipbn(skip)
        else:
            skip = inp

        x+=skip
        return x



class Xception(nn.Module):
    """
    Xception optimized for the ImageNet dataset, as specified in
    https://arxiv.org/pdf/1610.02357.pdf
    """
    def __init__(self, num_classes=1000):
        """ Constructor
        Args:
            num_classes: number of classes
        """
        super(Xception, self).__init__()


        self.num_classes = num_classes

        self.conv1 = nn.Conv2d(3, 32, 3,2, 0, bias=False)
        self.bn1 = nn.BatchNorm2d(32)
        self.relu = nn.ReLU(inplace=True)

        self.conv2 = nn.Conv2d(32,64,3,bias=False)
        self.bn2 = nn.BatchNorm2d(64)
        #do relu here

        self.block1=Block(64,128,2,2,start_with_relu=False,grow_first=True)
        self.block2=Block(128,256,2,2,start_with_relu=True,grow_first=True)
        self.block3=Block(256,728,2,2,start_with_relu=True,grow_first=True)

        self.block4=Block(728,728,3,1,start_with_relu=True,grow_first=True)
        self.block5=Block(728,728,3,1,start_with_relu=True,grow_first=True)
        self.block6=Block(728,728,3,1,start_with_relu=True,grow_first=True)
        self.block7=Block(728,728,3,1,start_with_relu=True,grow_first=True)

        self.block8=Block(728,728,3,1,start_with_relu=True,grow_first=True)
        self.block9=Block(728,728,3,1,start_with_relu=True,grow_first=True)
        self.block10=Block(728,728,3,1,start_with_relu=True,grow_first=True)
        self.block11=Block(728,728,3,1,start_with_relu=True,grow_first=True)

        self.block12=Block(728,1024,2,2,start_with_relu=True,grow_first=False)

        self.conv3 = SeparableConv2d(1024,1536,3,1,1)
        self.bn3 = nn.BatchNorm2d(1536)

        #do relu here
        self.conv4 = SeparableConv2d(1536,2048,3,1,1)
        self.bn4 = nn.BatchNorm2d(2048)

        self.fc = nn.Linear(2048, num_classes)



        #------- init weights --------
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
        #-----------------------------





    def forward_once(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)

        x = self.conv2(x)
        x = self.bn2(x)
        x = self.relu(x)

        x = self.block1(x)
        x = self.block2(x)
        x = self.block3(x)
        x = self.block4(x)
        x = self.block5(x)
        x = self.block6(x)
        x = self.block7(x)
        x = self.block8(x)
        x = self.block9(x)
        x = self.block10(x)
        x = self.block11(x)
        x = self.block12(x)

        x = self.conv3(x)
        x = self.bn3(x)
        x = self.relu(x)

        x = self.conv4(x)
        x = self.bn4(x)
        x = self.relu(x)

        x = F.adaptive_avg_pool2d(x, (1, 1))
        x = x.view(x.size(0), -1)
        x = self.fc(x)

        return x

    def forward(self, input1, input2, input3):
        # In this function we pass in both images and obtain both vectors
        # which are returned
        output1 = self.forward_once(input1)
        output2 = self.forward_once(input2)
        output3 = self.forward_once(input3)

        return output1, output2, output3

# Define the triplet Loss Function
class TripletLoss(torch.nn.Module):
    def __init__(self, margin=2.0):
        super(TripletLoss, self).__init__()
        self.margin = margin

    # output 1 for anchor, output2 for positive, output3 for negative
    def forward(self, output1, output2, output3):
      # Calculate the euclidean distance and calculate the contrastive loss
      euclidean_distance_p = F.pairwise_distance(output1, output2, keepdim = True)
      euclidean_distance_n = F.pairwise_distance(output1, output3, keepdim = True)

      loss_triple = torch.mean((torch.clamp(self.margin - euclidean_distance_n + euclidean_distance_p, min=0.0)))

      return loss_triple

def xception(pretrained=True, to_cuda = False, reload_previous = False, **kwargs):
    """
    Construct Xception.
    """

    if to_cuda:
        model = Xception().cuda()
    else:
        model = Xception()
    if pretrained:
        model.load_state_dict(torch.load('/content/drive/MyDrive/Music_Plagiarism/xception-43020ad28.pth'))
        # model.load_state_dict(model_zoo.load_url(model_urls['xception']))
    if reload_previous:
        load_model(model, path = kwargs['RELOAD_PATH'])
    return model

def load_model(model, path = './xception.pth'):
    model.load_state_dict(torch.load(path))

Dataset loader for training

In [7]:
class NetworkDataset(Dataset):
    def __init__(self,imageFolderDataset,transform=None):
        self.imageFolderDataset = imageFolderDataset
        self.transform = transform

    def __getitem__(self,index):
        img0_tuple = random.choice(self.imageFolderDataset.imgs)

        #We need to get a image in the same class and in a different class
        while True:
            #Look untill the same class image is found
            img1_tuple = random.choice(self.imageFolderDataset.imgs)
            if img0_tuple[1] == img1_tuple[1]:
                break
        while True:
            #Look untill a different class image is found
            img2_tuple = random.choice(self.imageFolderDataset.imgs)
            if img0_tuple[1] != img2_tuple[1]:
                break

        # open image file
        img0 = Image.open(img0_tuple[0])
        img1 = Image.open(img1_tuple[0])
        img2 = Image.open(img2_tuple[0])

        # to grayscale
        # img0 = img0.convert("L")
        # img1 = img1.convert("L")
        # img2 = img2.convert("L")

        # to RBG
        img0 = img0.convert("RGB")
        img1 = img1.convert("RGB")
        img2 = img2.convert("RGB")


        if self.transform is not None:
            img0 = self.transform(img0)
            img1 = self.transform(img1)
            img2 = self.transform(img2)

        return img0, img1, img2

    def __len__(self):
        return len(self.imageFolderDataset.imgs)

Some helper functions needed for audio processing and etc..



In [8]:
# ------------ string related
def convert_time_to_sec(time_str):
    # convert time string like ['1:27'] to seconds
    if "," in time_str:
        return [convert_time_to_sec(i) for i in time_str.split(",")]

    # if "[" in time_str:
    time_str = time_str.replace("[", "")
    time_str = time_str.replace("]", "")
    time_str = time_str.replace("'", "")

    if ":" in time_str:
        time_str = time_str.split(":")
    elif "_" in time_str:
        time_str = time_str.split("_")
    return int(time_str[0]) * 60 + int(time_str[1])

def replace_invalid_char(string):
	invalid_char = ['/',':','*','?','"','<','>','|']
	new_string = ''
	for s in string:
		if s in invalid_char:
			s = '_'
		new_string += s
	return new_string

def remove_special_char(string):
	test_str = ''.join(letter for letter in string if letter.isalnum())
	return test_str
#--------------------------------------------


# ------------------------- audio related
def stretch_audio(audio, sr, rate):
	new_audio = librosa.effects.time_stretch(audio, rate=rate)
	return new_audio

def shift_audio(audio, sr, semitone):
	y_shifted = librosa.effects.pitch_shift(audio, sr=sr, n_steps=semitone)
	return y_shifted

def bpm_estimation(audio, sr):
	bpm, _ = librosa.beat.beat_track(y=audio, sr=sr)
	return bpm

def calculate_eightbars_duration(audio, sr):
	bpm = bpm_estimation(audio, sr)
	# assuming a 44 time signature
	secs = 60/bpm*4*8
	return secs

# --------------------------------------

# ---------------------for dataset conversion
def list_of_existing_sampletime(dir_list):
    return np.array(dir_list, dtype = np.int32)

def find_closest_number(array, target):
  # Find the minimum distance between the target value and each number in the array.
  distances = [np.abs(target - number) for number in array]
  # Return the index of the number with the minimum distance.
  return distances.index(min(distances))
#--------------------------------------------

# ------------------------ for images
# Showing images
def imshow(img, text=None):
	npimg = img.numpy()
	plt.axis("off")
	if text:
		plt.text(75, 8, text, style='italic',fontweight='bold',
			bbox={'facecolor':'white', 'alpha':0.8, 'pad':10})

	plt.imshow(np.transpose(npimg, (1, 2, 0)))
	plt.show()

# Plotting data
def show_plot(iteration,loss):
	plt.plot(iteration,loss)
	plt.show()
#-------------------------------------------

# -------------- similarity score functions
def cos_sim_score(output1, output2):
	score = F.cosine_similarity(output1,output2, dim = 1)
	return score

def pearson_corr_score(output1, output2):
	xmean = torch.mean(output1)
	ymean = torch.mean(output2)
	p_score = torch.sum((output1-xmean)*(output2-ymean))/torch.sqrt(torch.sum((output1-xmean)**2)*torch.sum((output2-ymean)**2))
	return p_score

def weighted_score(output1,output2):
	w_score = 0.2*F.pairwise_distance(output1, output2, keepdim = True)**2+0.4*cos_sim_score(output1, output2)+0.4*pearson_corr_score(output1,output2)
	return w_score
#------------------------------------------

```main()``` function for training and testing



In [11]:
def main():
    if MODE == 'train':

        # Load the training dataset
        folder_dataset = datasets.ImageFolder(root=DATASET_PATH)

        # Resize the images and transform to tensors
        transformation = transforms.Compose([transforms.Resize((299,299)),
                                             transforms.ToTensor()
                                            ])

        # Initialize the network
        dataset = NetworkDataset(imageFolderDataset=folder_dataset,
                                                transform=transformation)

        # Load the training dataset
        train_dataloader = DataLoader(dataset,
                                shuffle=True,
                                num_workers=8,
                                batch_size=16)


        net = xception(pretrained = True, to_cuda = True, reload_previous = RELOAD_PREVIOUS, RELOAD_PATH = RELOAD_PATH)
        criterion = TripletLoss()
        optimizer = optim.Adam(net.parameters(), lr = 0.001 )

        # base_optimizer = torch.optim.Adam  # define an optimizer for the "sharpness-aware" update
        # optimizer = SAM(net.parameters(), base_optimizer, lr=0.1)

        counter = []
        loss_history = []
        loss_epoch_history = np.zeros([TRAINING_EPOCH, 2])
        iteration_number= 0
        ending_epoch = STARTING_EPOCH+TRAINING_EPOCH
        # Iterate throught the epochs
        for epoch in range(STARTING_EPOCH, ending_epoch):
            if epoch %10 == 0 or epoch ==  ending_epoch-1:
                torch.save(net.state_dict(), "{0}_{1}.pth".format(MODEL_PATH, epoch))
            np.savetxt("{0}_loss.csv".format(MODEL_PATH, epoch), loss_epoch_history, delimiter=',')
            epoch_loss_ave = 0.0
            # Iterate over batches
            for i, (img0, img1, img2) in enumerate(train_dataloader, 0):
                # def closure():
                    # loss = criterion(output1, output2, output3)
                    # loss.backward()
                    # return loss
                if torch.cuda.is_available():
                    # Send the images and labels to CUDA
                    img0, img1, img2 = img0.cuda(), img1.cuda(), img2.cuda()

                # Zero the gradients
                optimizer.zero_grad()

                # Pass in the two images into the network and obtain two outputs
                output1, output2, output3 = net(img0, img1, img2)

                # Pass the outputs of the networks and label into the loss function
                loss_triplet = criterion(output1, output2, output3)

                # Calculate the backpropagation
                loss_triplet.backward()

                # Optimize
                optimizer.step()
                # optimizer.step(closure)

                epoch_loss_ave += loss_triplet.item()
                print(epoch_loss_ave)
                # Every 10 batches print out the loss
                if i % 10 == 0 :
                    print(f"Epoch number {epoch}\n Current loss {loss_triplet.item()}\n")
                    iteration_number += 10
                    counter.append(iteration_number)
                    loss_history.append(loss_triplet.item())
            epoch_loss_ave = epoch_loss_ave/i
            loss_epoch_history[epoch,0] = epoch
            loss_epoch_history[epoch,1] = epoch_loss_ave
        show_plot(counter, loss_history)

    elif MODE == 'test':
        # Locate the test dataset and load it into the NetworkDataset
        folder_dataset_test = datasets.ImageFolder(root="./data/testing/")
        test_dataset = NetworkDataset(imageFolderDataset=folder_dataset_test,
                                                transform=transformation)
        test_dataloader = DataLoader(test_dataset, num_workers=2, batch_size=1, shuffle=True)

        # Grab one image that we are going to test
        dataiter = iter(test_dataloader)
        x0, _, _ = next(dataiter)

        for i in range(5):
            # Iterate over 5 images and test them with the first image (x0)
            _, x1, x2 = next(dataiter)

            # Concatenate the two images together
            concatenated = torch.cat((x0, x1, x2), 0)

            output1, output2, output3 = net(x0.cuda(), x1.cuda(), x2.cuda())
            euclidean_distance1 = F.pairwise_distance(output1, output2)
            euclidean_distance2 = F.pairwise_distance(output1, output3)
            imshow(torchvision.utils.make_grid(concatenated), f'Similarity: {euclidean_distance1.item():.2f}')

One caveat is that if you let the notebook run in the background without interaction, google colab will kill the session after 12 hours. To avoid the issue, we need to set a clicker in the background that will click on the connect button in a set time interval. To do that, do Ctrl-Shift-i to access the web console, the copy the following code to the console:

```
function ClickConnect(){
console.log("Working");
document.querySelector("#top-toolbar > colab-connect-button").shadowRoot.querySelector("#connect").click();
}
var clicker = setInterval(ClickConnect,60000);
```

To clear the clicker, do:
```
clearInterval(clicker);
```

Now we run the ```main()``` function, which will run and save models based on the path you set below. Remember to change ```DATASET_PATH``` to the path pointing to the dataset folder, change ```MODEL_PATH``` to your desired folder for saving the intermittent model parameters.

In [None]:
DATASET_PATH = "/content/drive/MyDrive/Music_Plagiarism/melspectrogram_preprocessed_dataset/training"
MODEL_PATH = "/content/drive/MyDrive/Music_Plagiarism/MelspectrogramPreprocessed_savepoint/model_spectrogram_preprocessed"
TRAINING_EPOCH = 200
RELOAD_PREVIOUS = False
RELOAD_PATH = "/content/drive/MyDrive/Music_Plagiarism/Melspectrogram10s_savepoint/model_spectrogram10s_140.pth"
STARTING_EPOCH = 0
MODE = 'train'
main()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
157.94802270829678
158.73690603673458
159.88437746465206
160.89504145085812
161.55395703017712
Epoch number 1
 Current loss 0.6589155793190002

162.43517760932446
163.34291510283947
164.0763793438673
164.6004232019186
165.1377004235983
165.53840796649456
166.40814004838467
167.51457510888577
167.98248521983624
168.38130955398083
Epoch number 1
 Current loss 0.3988243341445923

168.56099359691143
169.64736215770245
170.71800319850445
171.57707600295544
172.13558368384838
172.64325980842113
173.39258401095867
173.5155497044325
174.87965182960033
175.3120127171278
Epoch number 1
 Current loss 0.4323608875274658

176.58072333037853
177.62998954951763
178.08321173489094
178.73865695297718
179.28226126730442
179.91417695581913
180.16356943547726
181.17191825807095
182.8653248399496
183.57838319242
Epoch number 1
 Current loss 0.713058352470398

184.97544859349728
185.45107220113277
185.9151781052351
186.60522447526455
188.18996