First, mount your google drive here to access the dataset folder later.

In [2]:
from google.colab import drive
# Uncomment the next line to unmount first (e.g. before remounting the drive).
# drive.flush_and_unmount()
# Mount Google Drive at /content/drive/; later cells read files under this path.
drive.mount('/content/drive/')

Mounted at /content/drive/


Import all the necessary packages.

In [3]:
import librosa
import numpy as np
import matplotlib.pyplot as plt
import math
import os
import pandas as pd
from moviepy.editor import *
import random
from PIL import Image
import PIL.ImageOps

Now load some pytorch functions.

In [4]:
import torchvision
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Dataset
import torchvision.utils
import torch
from torch.autograd import Variable
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import torch.utils.model_zoo as model_zoo
from torch.nn import init

Check if gpu is available. If the result is False, change runtime type in Runtime-->Change runtime type-->Hardware accelerator "T4 GPU"

In [5]:
torch.cuda.is_available()

True

This is the Xception model that we will use. It takes input images of size 299x299 and outputs a vector of size 1000, which serves as the feature vector encoding the information in the input diagrams.

One important thing is to change the path to the pretrained model based on your mount location. Find the line:


```
if pretrained:
        model.load_state_dict(torch.load('/content/drive/MyDrive/Music_Plagiarism/xception-43020ad28.pth'))
```
Change the path inside ```torch.load()``` to the path pointing to ```xception-43020ad28.pth``` in our shared folder. This is the pretrained parameters trained on imagenet data.


In [6]:
"""
Creates an Xception Model as defined in:

Francois Chollet
Xception: Deep Learning with Depthwise Separable Convolutions
https://arxiv.org/pdf/1610.02357.pdf

This weights ported from the Keras implementation. Achieves the following performance on the validation set:

Loss:0.9173 Prec@1:78.892 Prec@5:94.292

REMEMBER to set your image size to 3x299x299 for both test and validation

normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5],
                                  std=[0.5, 0.5, 0.5])

The resize parameter of the validation transform should be 333, and make sure to center crop at 299x299
"""
import math
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.model_zoo as model_zoo
from torch.nn import init
import torch

# Names exported when this cell's code is used as a module.
__all__ = ['xception']

# Download URL of the pretrained Xception weights. Within this cell it is only
# referenced by the commented-out model_zoo call inside xception().
model_urls = {
#     'xception':'https://www.dropbox.com/s/1hplpzet9d7dv29/xception-c0a72b38.pth.tar?dl=1'
    'xception':'http://data.lip6.fr/cadene/pretrainedmodels/xception-43020ad28.pth'
}


class SeparableConv2d(nn.Module):
    """Depthwise-separable 2-D convolution.

    A per-channel ("depthwise") spatial convolution is followed by a 1x1
    ("pointwise") convolution that mixes channels. The attribute names
    ``conv1`` and ``pointwise`` become state-dict keys, so they are kept
    exactly as they are.

    Args:
        in_channels: number of input channels.
        out_channels: number of output channels.
        kernel_size, stride, padding, dilation: forwarded to the depthwise conv.
        bias: whether both convolutions carry a bias term.
    """

    def __init__(self,in_channels,out_channels,kernel_size=1,stride=1,padding=0,dilation=1,bias=False):
        super().__init__()

        # Depthwise stage: groups == in_channels gives each channel its own filter.
        self.conv1 = nn.Conv2d(
            in_channels,
            in_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            dilation=dilation,
            groups=in_channels,
            bias=bias,
        )
        # Pointwise stage: plain 1x1 convolution from in_channels to out_channels.
        self.pointwise = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=1,
            stride=1,
            padding=0,
            dilation=1,
            groups=1,
            bias=bias,
        )

    def forward(self,x):
        """Apply the depthwise convolution, then the pointwise convolution."""
        return self.pointwise(self.conv1(x))


class Block(nn.Module):
    """Residual Xception block.

    The main path is a stack of (ReLU, SeparableConv2d 3x3, BatchNorm2d) units,
    optionally followed by a strided max-pool. The skip path is the identity,
    or a strided 1x1 convolution + batch norm when the channel count or the
    stride changes. The two paths are summed.

    Args:
        in_filters: channels of the input tensor.
        out_filters: channels of the output tensor.
        reps: total number of separable-conv units in the main path.
        strides: stride of the final max-pool and of the skip projection.
        start_with_relu: if False, the leading ReLU of the first unit is dropped.
        grow_first: if True the channel change happens in the first unit,
            otherwise in the last one.

    Module creation order and the indices inside ``self.rep`` determine the
    state-dict keys, so they follow the original layout exactly.
    """

    def __init__(self,in_filters,out_filters,reps,strides=1,start_with_relu=True,grow_first=True):
        super().__init__()

        # A projection on the skip path is only needed when the shape changes.
        shape_changes = (out_filters != in_filters) or (strides != 1)
        if shape_changes:
            self.skip = nn.Conv2d(in_filters, out_filters, 1, stride=strides, bias=False)
            self.skipbn = nn.BatchNorm2d(out_filters)
        else:
            self.skip = None

        # One shared in-place ReLU instance is reused by every unit.
        self.relu = nn.ReLU(inplace=True)

        def unit(channels_in, channels_out):
            # ReLU -> separable 3x3 conv -> batch norm
            return [
                self.relu,
                SeparableConv2d(channels_in, channels_out, 3, stride=1, padding=1, bias=False),
                nn.BatchNorm2d(channels_out),
            ]

        layers = []
        width = in_filters
        if grow_first:
            layers.extend(unit(in_filters, out_filters))
            width = out_filters

        for _ in range(reps - 1):
            layers.extend(unit(width, width))

        if not grow_first:
            layers.extend(unit(in_filters, out_filters))

        if start_with_relu:
            # The first ReLU must not be in-place: the block input is still
            # needed untouched for the skip path.
            layers[0] = nn.ReLU(inplace=False)
        else:
            layers = layers[1:]

        if strides != 1:
            layers.append(nn.MaxPool2d(3, strides, 1))
        self.rep = nn.Sequential(*layers)

    def forward(self,inp):
        """Return main_path(inp) + skip_path(inp)."""
        out = self.rep(inp)

        if self.skip is None:
            shortcut = inp
        else:
            shortcut = self.skipbn(self.skip(inp))

        out += shortcut
        return out



class Xception(nn.Module):
    """
    Xception optimized for the ImageNet dataset, as specified in
    https://arxiv.org/pdf/1610.02357.pdf

    In this notebook the network is used as a shared-weight embedding model:
    ``forward`` runs three inputs (anchor, positive, negative) through the same
    layers via ``forward_once`` and returns one output vector per input.
    Attribute names below become state-dict keys, so renaming any of them would
    break loading of existing checkpoints.
    """
    def __init__(self, num_classes=1000):
        """ Constructor
        Args:
            num_classes: number of classes (size of the vector returned by
                forward_once)
        """
        super(Xception, self).__init__()


        self.num_classes = num_classes

        # Entry: two plain 3x3 convolutions (the first one with stride 2).
        self.conv1 = nn.Conv2d(3, 32, 3,2, 0, bias=False)
        self.bn1 = nn.BatchNorm2d(32)
        self.relu = nn.ReLU(inplace=True)

        self.conv2 = nn.Conv2d(32,64,3,bias=False)
        self.bn2 = nn.BatchNorm2d(64)
        # ReLU is applied after bn2 in forward_once

        # Three strided blocks that widen the channels 64 -> 128 -> 256 -> 728.
        self.block1=Block(64,128,2,2,start_with_relu=False,grow_first=True)
        self.block2=Block(128,256,2,2,start_with_relu=True,grow_first=True)
        self.block3=Block(256,728,2,2,start_with_relu=True,grow_first=True)

        # Eight stride-1 blocks at a constant 728 channels.
        self.block4=Block(728,728,3,1,start_with_relu=True,grow_first=True)
        self.block5=Block(728,728,3,1,start_with_relu=True,grow_first=True)
        self.block6=Block(728,728,3,1,start_with_relu=True,grow_first=True)
        self.block7=Block(728,728,3,1,start_with_relu=True,grow_first=True)

        self.block8=Block(728,728,3,1,start_with_relu=True,grow_first=True)
        self.block9=Block(728,728,3,1,start_with_relu=True,grow_first=True)
        self.block10=Block(728,728,3,1,start_with_relu=True,grow_first=True)
        self.block11=Block(728,728,3,1,start_with_relu=True,grow_first=True)

        # Final strided block: 728 -> 1024 channels, channel growth in the last unit.
        self.block12=Block(728,1024,2,2,start_with_relu=True,grow_first=False)

        self.conv3 = SeparableConv2d(1024,1536,3,1,1)
        self.bn3 = nn.BatchNorm2d(1536)

        # ReLU is applied after bn3 in forward_once
        self.conv4 = SeparableConv2d(1536,2048,3,1,1)
        self.bn4 = nn.BatchNorm2d(2048)

        # Linear head mapping the pooled 2048-d features to num_classes outputs.
        self.fc = nn.Linear(2048, num_classes)

        #------- init weights --------
        # Conv weights ~ N(0, sqrt(2 / (kh * kw * out_channels)));
        # BatchNorm starts as identity (weight 1, bias 0).
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
        #-----------------------------

    def forward_once(self, x):
        """Run one batch of images through the network.

        Args:
            x: image batch; the callers in this notebook pass tensors of shape
                (N, 3, 299, 299).
        Returns:
            Tensor of shape (N, num_classes).
        """
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)

        x = self.conv2(x)
        x = self.bn2(x)
        x = self.relu(x)

        x = self.block1(x)
        x = self.block2(x)
        x = self.block3(x)
        x = self.block4(x)
        x = self.block5(x)
        x = self.block6(x)
        x = self.block7(x)
        x = self.block8(x)
        x = self.block9(x)
        x = self.block10(x)
        x = self.block11(x)
        x = self.block12(x)

        x = self.conv3(x)
        x = self.bn3(x)
        x = self.relu(x)

        x = self.conv4(x)
        x = self.bn4(x)
        x = self.relu(x)

        # Global average pool to 1x1, flatten to (N, 2048), then the linear head.
        x = F.adaptive_avg_pool2d(x, (1, 1))
        x = x.view(x.size(0), -1)
        x = self.fc(x)

        return x

    def forward(self, input1, input2, input3):
        """Embed three image batches with the same weights.

        The training loop passes (anchor, positive, negative) in that order.

        Returns:
            Tuple (output1, output2, output3) of forward_once results.
        """
        # All three inputs go through the same layers (shared weights) and the
        # three resulting vectors are returned
        output1 = self.forward_once(input1)
        output2 = self.forward_once(input2)
        output3 = self.forward_once(input3)

        return output1, output2, output3

# Define the triplet Loss Function
class TripletLoss(torch.nn.Module):
    """Margin-based triplet loss on Euclidean distances.

    For every triplet the loss is ``max(0, margin - d(anchor, negative)
    + d(anchor, positive))``; the batch mean is returned.

    Args:
        margin: required gap between the negative and the positive distance.
    """

    def __init__(self, margin=2.0):
        super().__init__()
        self.margin = margin

    def forward(self, output1, output2, output3):
        """Compute the loss.

        Args:
            output1: anchor embeddings.
            output2: positive embeddings (same class as the anchor).
            output3: negative embeddings (different class).
        Returns:
            Scalar tensor holding the mean hinge value over the batch.
        """
        dist_positive = F.pairwise_distance(output1, output2, keepdim=True)
        dist_negative = F.pairwise_distance(output1, output3, keepdim=True)

        # Positive only while the negative is not yet `margin` farther away
        # than the positive.
        violation = self.margin - dist_negative + dist_positive
        hinge = torch.clamp(violation, min=0.0)
        return torch.mean(hinge)

def xception(pretrained=True, to_cuda = False, reload_previous = False, **kwargs):
    """
    Construct Xception.

    Args:
        pretrained: load the ImageNet-pretrained weights from the hard-coded
            Drive path below. Skipped when ``reload_previous`` is True, because
            the savepoint is loaded strictly and overwrites every parameter
            and buffer anyway.
        to_cuda: move the model to the GPU before loading weights.
        reload_previous: load a training savepoint into the model.
        **kwargs: must contain ``RELOAD_PATH`` (path to the savepoint) when
            ``reload_previous`` is True.

    Returns:
        The constructed Xception model.

    Raises:
        KeyError: if ``reload_previous`` is True but ``RELOAD_PATH`` is missing.
    """
    # Fail early, and with a readable message, before building the model.
    if reload_previous and 'RELOAD_PATH' not in kwargs:
        raise KeyError("RELOAD_PATH keyword argument is required when reload_previous=True")

    model = Xception()
    if to_cuda:
        model = model.cuda()

    if pretrained and not reload_previous:
        # CHANGE HERE (path to the pretrained xception-43020ad28.pth file)
        # map_location='cpu' deserialises on the CPU; load_state_dict then
        # copies the values onto whatever device the model lives on.
        model.load_state_dict(torch.load('/content/drive/MyDrive/Music_Plagiarism/xception-43020ad28.pth', map_location='cpu'))
        # model.load_state_dict(model_zoo.load_url(model_urls['xception']))
    if reload_previous:
        load_model(model, path = kwargs['RELOAD_PATH'])
    return model

def load_model(model, path = './xception.pth'):
    """Load a saved state dict from ``path`` into ``model`` in place.

    The checkpoint is deserialised onto the CPU, so savepoints written from a
    GPU model can also be loaded on a CPU-only runtime; ``load_state_dict``
    then copies the values to the device the model's parameters are on.

    Args:
        model: module whose parameters and buffers are overwritten.
        path: file written by ``torch.save(model.state_dict(), path)``.
    """
    state_dict = torch.load(path, map_location='cpu')
    model.load_state_dict(state_dict)

Dataset loader for training

In [7]:
class NetworkDataset(Dataset):
    """Dataset that yields random (anchor, positive, negative) image triplets.

    Args:
        imageFolderDataset: object exposing ``imgs``, a list of
            ``(file_path, class_index)`` tuples (e.g. torchvision ImageFolder).
        transform: optional callable applied to each of the three RGB images.

    Note:
        ``__getitem__`` ignores its ``index`` argument: every call draws a fresh
        random triplet, so iterating the dataset twice gives different samples.
    """

    def __init__(self,imageFolderDataset,transform=None):
        self.imageFolderDataset = imageFolderDataset
        self.transform = transform
        # Needed to detect datasets for which no negative sample can exist.
        self._num_classes = len({label for _, label in imageFolderDataset.imgs})

    @staticmethod
    def _load_rgb(path):
        """Open an image file, convert it to RGB and close the file handle."""
        with Image.open(path) as handle:
            return handle.convert("RGB")

    def __getitem__(self,index):
        """Return (anchor, positive, negative).

        The positive shares the anchor's class (it may be the anchor itself);
        the negative comes from a different class.

        Raises:
            ValueError: if the dataset has fewer than two classes. The search
                for a negative sample would otherwise never terminate.
        """
        if self._num_classes < 2:
            raise ValueError(
                "NetworkDataset needs images from at least two classes to build a triplet"
            )

        samples = self.imageFolderDataset.imgs
        img0_tuple = random.choice(samples)

        # Rejection sampling, in the same order as before (positive first, then
        # negative), so the random stream is consumed identically.
        while True:
            # Look until a same-class image is found
            img1_tuple = random.choice(samples)
            if img0_tuple[1] == img1_tuple[1]:
                break
        while True:
            # Look until a different-class image is found
            img2_tuple = random.choice(samples)
            if img0_tuple[1] != img2_tuple[1]:
                break

        # Open the image files and convert to RGB
        img0 = self._load_rgb(img0_tuple[0])
        img1 = self._load_rgb(img1_tuple[0])
        img2 = self._load_rgb(img2_tuple[0])

        if self.transform is not None:
            img0 = self.transform(img0)
            img1 = self.transform(img1)
            img2 = self.transform(img2)

        return img0, img1, img2

    def __len__(self):
        """Number of images in the underlying folder dataset."""
        return len(self.imageFolderDataset.imgs)

Helper functions for string handling, audio processing, dataset conversion, plotting, and similarity scoring.



In [8]:
# ------------ string related
def convert_time_to_sec(time_str):
    """Convert a time string to seconds.

    Accepted forms (brackets and single quotes are ignored, so the string form
    of a list such as ``"['1:27']"`` works):

    * ``"m:s"`` or ``"m_s"``   -> minutes * 60 + seconds
    * ``"h:m:s"``              -> hours * 3600 + minutes * 60 + seconds
    * ``"s"``                  -> plain seconds
    * comma-separated values   -> list with one result per value

    Args:
        time_str: the time string.
    Returns:
        int, or a list of ints when ``time_str`` contains commas.
    Raises:
        ValueError: if a field is not an integer.
    """
    if "," in time_str:
        return [convert_time_to_sec(i) for i in time_str.split(",")]

    cleaned = time_str
    for junk in ("[", "]", "'"):
        cleaned = cleaned.replace(junk, "")
    cleaned = cleaned.strip()

    # ":" takes precedence over "_", as before. A string without any separator
    # is a single field (plain seconds) instead of being indexed per character.
    separator = ":" if ":" in cleaned else "_"
    seconds = 0
    for field in cleaned.split(separator):
        # Base-60 accumulation handles one, two or three fields uniformly.
        seconds = seconds * 60 + int(field)
    return seconds

def replace_invalid_char(string):
    """Return ``string`` with each of ``/ : * ? " < > |`` replaced by ``_``."""
    invalid_char = ('/', ':', '*', '?', '"', '<', '>', '|')
    return ''.join('_' if ch in invalid_char else ch for ch in string)

def remove_special_char(string):
    """Return only the alphanumeric characters of ``string``, in order."""
    kept = [letter for letter in string if letter.isalnum()]
    return ''.join(kept)
#--------------------------------------------


# ------------------------- audio related
def stretch_audio(audio, sr, rate):
    """Time-stretch ``audio`` by ``rate`` via librosa.effects.time_stretch.

    ``sr`` is accepted for a uniform helper signature but is not used.
    """
    return librosa.effects.time_stretch(audio, rate=rate)

def shift_audio(audio, sr, semitone):
    """Pitch-shift ``audio`` (sampled at ``sr``) by ``semitone`` steps via librosa."""
    return librosa.effects.pitch_shift(audio, sr=sr, n_steps=semitone)

def bpm_estimation(audio, sr):
    """Estimate the tempo of ``audio`` in beats per minute.

    Args:
        audio: audio samples passed to librosa.beat.beat_track as ``y``.
        sr: sample rate of ``audio``.
    Returns:
        The tempo as a NumPy float64 scalar.
    """
    tempo, _ = librosa.beat.beat_track(y=audio, sr=sr)
    # NOTE(review): recent librosa releases appear to return the tempo as a
    # 1-element array rather than a scalar - confirm against the installed
    # version. Normalising to a scalar keeps the derived segment length a
    # plain number in either case.
    return np.asarray(tempo, dtype=np.float64).ravel()[0]

def calculate_eightbars_duration(audio, sr):
    """Return the duration in seconds of eight bars at the estimated tempo."""
    # assuming a 4/4 time signature: four beats per bar
    beats_per_bar = 4
    bars = 8
    tempo = bpm_estimation(audio, sr)
    return 60/tempo*beats_per_bar*bars
# --------------------------------------

# ---------------------for dataset conversion
def list_of_existing_sampletime(dir_list):
    """Convert ``dir_list`` to a new NumPy int32 array."""
    sample_times = np.array(dir_list, dtype=np.int32)
    return sample_times

def find_closest_number(array, target):
    """Return the index of the element of ``array`` closest to ``target``.

    On ties the first such index is returned.
    """
    # Absolute gap between the target and every candidate.
    gaps = []
    for number in array:
        gaps.append(np.abs(target - number))
    smallest = min(gaps)
    return gaps.index(smallest)

def save_mel_spectrogram(audio_clip, file_path):
    """Render the dB-scaled mel-spectrogram of ``audio_clip`` and save it to ``file_path``.

    The spectrogram is computed and drawn at the global SAMPLE_RATE on a
    10x10 inch figure, saved without padding on a transparent background.
    """
    # Mel power spectrogram, converted to dB relative to its maximum
    mel_power = librosa.feature.melspectrogram(y=audio_clip, sr=SAMPLE_RATE)
    mel_db = librosa.power_to_db(mel_power, ref=np.max)

    # Draw and write the figure
    plt.figure(figsize=(10, 10))
    librosa.display.specshow(mel_db, sr=SAMPLE_RATE)
    plt.tight_layout()
    plt.savefig(file_path, bbox_inches='tight', pad_inches=0, transparent=True)
    plt.close()

def save_chroma_feature(audio_clip, file_path):
    """Render the chroma feature of ``audio_clip`` and save it to ``file_path``.

    The chromagram is computed by librosa.feature.chroma_stft directly from
    the waveform at the global SAMPLE_RATE, passed through
    librosa.amplitude_to_db, and drawn on a 10x10 inch figure saved without
    padding on a transparent background.
    """
    # The former 4096-point STFT was overwritten before it was ever used, so
    # it only cost time; dropping it leaves the saved image unchanged.
    chroma_raw = librosa.feature.chroma_stft(y=audio_clip, sr=SAMPLE_RATE)
    chroma = librosa.amplitude_to_db(chroma_raw, ref=np.max)

    # Save the chroma feature
    plt.figure(figsize=(10, 10))
    librosa.display.specshow(chroma, sr=SAMPLE_RATE)
    plt.tight_layout()
    plt.savefig(file_path, bbox_inches='tight', pad_inches = 0, transparent = True)
    plt.close()

def get_mel_spectrogram(audio_clip):
    """Return the mel-spectrogram of ``audio_clip`` as an RGB PIL image.

    The figure is rendered exactly as in save_mel_spectrogram, but written to
    an in-memory PNG buffer instead of a temporary file. That removes the
    dependency on the never-imported ``time`` module and on a hard-coded
    Google Drive folder.
    """
    import io  # local import: `io` is not imported at the top of the notebook

    S = librosa.feature.melspectrogram(y=audio_clip, sr=SAMPLE_RATE)
    S_DB = librosa.power_to_db(S, ref=np.max)

    plt.figure(figsize=(10, 10))
    try:
        librosa.display.specshow(S_DB, sr=SAMPLE_RATE)
        plt.tight_layout()
        buffer = io.BytesIO()
        plt.savefig(buffer, format='png', bbox_inches='tight', pad_inches = 0, transparent = True)
    finally:
        # Always release the figure, even if rendering fails.
        plt.close()

    buffer.seek(0)
    # convert() forces the lazily opened PNG to be decoded before returning.
    return Image.open(buffer).convert("RGB")

def get_chroma_feature(audio_clip):
    """Return the chroma feature of ``audio_clip`` as an RGB PIL image.

    The figure is rendered exactly as in save_chroma_feature, but written to
    an in-memory PNG buffer instead of a temporary file (the old file name
    relied on the never-imported ``time`` module). The discarded STFT and the
    leftover ``img.show()`` debugging call are gone.
    """
    import io  # local import: `io` is not imported at the top of the notebook

    chroma_raw = librosa.feature.chroma_stft(y=audio_clip, sr=SAMPLE_RATE)
    chroma = librosa.amplitude_to_db(chroma_raw, ref=np.max)

    plt.figure(figsize=(10, 10))
    try:
        librosa.display.specshow(chroma, sr=SAMPLE_RATE)
        plt.tight_layout()
        buffer = io.BytesIO()
        plt.savefig(buffer, format='png', bbox_inches='tight', pad_inches = 0, transparent = True)
    finally:
        # Always release the figure, even if rendering fails.
        plt.close()

    buffer.seek(0)
    # convert() forces the lazily opened PNG to be decoded before returning.
    return Image.open(buffer).convert("RGB")
#--------------------------------------------

# ------------------------ for images
# Showing images
def imshow(img, text=None):
    """Display a channel-first image tensor with matplotlib.

    Args:
        img: tensor whose axes are reordered (1, 2, 0) before plotting.
        text: optional caption drawn in a white box at position (75, 8).
    """
    npimg = img.numpy()
    plt.axis("off")
    if text:
        caption_box = {'facecolor':'white', 'alpha':0.8, 'pad':10}
        plt.text(75, 8, text, style='italic', fontweight='bold', bbox=caption_box)

    # matplotlib expects the channel axis last
    channels_last = np.transpose(npimg, (1, 2, 0))
    plt.imshow(channels_last)
    plt.show()

# Plotting data
def show_plot(iteration,loss):
    """Plot ``loss`` against ``iteration`` and show the figure."""
    plt.plot(iteration, loss)
    plt.show()

#-------------------------------------------

# -------------- similarity score functions
def cos_sim_score(output1, output2):
    """Return the cosine similarity of the two inputs along dimension 1."""
    return F.cosine_similarity(output1, output2, dim=1)

def pearson_corr_score(output1, output2):
    """Return the Pearson correlation between the two tensors.

    Means and sums are taken over all elements, so the result is a scalar tensor.
    """
    dx = output1 - torch.mean(output1)
    dy = output2 - torch.mean(output2)
    covariance = torch.sum(dx * dy)
    spread = torch.sqrt(torch.sum(dx ** 2) * torch.sum(dy ** 2))
    return covariance / spread

def weighted_score(output1,output2):
    """Compute three similarity measures and their weighted combination.

    Returns:
        ``[w_score, edist, cos_sim, p_corr]`` where ``edist`` is the squared
        Euclidean distance, ``cos_sim`` the cosine similarity, ``p_corr`` the
        Pearson correlation and
        ``w_score = 0.2*edist + 0.4*cos_sim + 0.4*p_corr``.
    """
    # NOTE(review): edist shrinks as inputs get more similar while cos_sim and
    # p_corr grow, so the three terms pull w_score in opposite directions -
    # confirm the intended sign/weights.
    edist = torch.dist(output1, output2) ** 2
    cos_sim = cos_sim_score(output1, output2)
    p_corr = pearson_corr_score(output1, output2)

    w_score = 0.2*edist + 0.4*cos_sim + 0.4*p_corr
    return [w_score, edist, cos_sim, p_corr]

def num_segment(duration, seg):
    """Number of half-overlapping windows of length ``seg`` needed to cover ``duration``.

    Windows start every ``seg/2`` seconds (see seg_interval). The count
    includes the final window, which seg_interval clips to ``duration`` when
    it would run past the end. Previously that last window was never counted:
    up to half a segment at the end of a song was skipped, and a song exactly
    one segment long produced zero segments.

    Args:
        duration: length of the audio in seconds.
        seg: window length in seconds.
    Returns:
        The window count as a NumPy float (callers convert it with ``int``);
        0 when the audio is no longer than half a window.
    """
    interval = seg/2
    # ceil(...) is the index of the last window start; +1 turns it into a count.
    num = np.ceil((duration-seg)/interval) + 1
    # Very short audio would give a negative count; clamp to "no windows".
    return np.maximum(num, 0)

def seg_interval(ind, duration, seg):
    """Return ``(start_time, end_time)`` of window number ``ind``.

    Windows are ``seg`` long and start every ``seg/2``; the end time is
    capped at ``duration``.
    """
    hop = seg/2
    start_time = ind*hop
    end_time = min(start_time + seg, duration)
    return start_time, end_time
# ---------------------------------------

```main()``` function for training and testing



In [9]:
# Hyperparameters
# Sample rate (Hz) passed as `sr` when songs are loaded and when the
# spectrogram / chroma images are computed and drawn.
SAMPLE_RATE = 48000


def _triplet_dataloader(root, shuffle, batch_size):
    """Build a DataLoader of (anchor, positive, negative) batches from the image folder ``root``."""
    folder_dataset = datasets.ImageFolder(root=root)
    # Resize the images and transform to tensors
    transformation = transforms.Compose([transforms.Resize((299, 299)),
                                         transforms.ToTensor()])
    dataset = NetworkDataset(imageFolderDataset=folder_dataset,
                             transform=transformation)
    return DataLoader(dataset, shuffle=shuffle, num_workers=8, batch_size=batch_size)


def _build_network(inference):
    """Create the network from the notebook-level settings and set its mode.

    In inference mode BatchNorm must use its stored running statistics, so the
    network is switched to eval(); otherwise it is put in train().
    """
    net = xception(pretrained = True, to_cuda = CUDA, reload_previous = RELOAD_PREVIOUS, RELOAD_PATH = RELOAD_PATH)
    if inference:
        net.eval()
    else:
        net.train()
    return net


def _move_to_device(*tensors):
    """Move the tensors to the GPU when CUDA is enabled; otherwise return them unchanged."""
    if CUDA:
        return tuple(tensor.cuda() for tensor in tensors)
    return tensors


def _require_batches(num_batches, what):
    """Raise a clear error when a dataloader produced no batch at all."""
    if num_batches == 0:
        raise RuntimeError("The {0} dataloader produced no batches".format(what))


def _mean_pair_distances(output1, output2, output3):
    """Mean anchor-positive and anchor-negative Euclidean distances of a batch.

    Fallback metric for the 'mean_distance' mode, used when the notebook
    defines no ``MeanDistance`` class. It uses the same distance as TripletLoss.
    """
    distance_p = F.pairwise_distance(output1, output2, keepdim=True)
    distance_n = F.pairwise_distance(output1, output3, keepdim=True)
    return torch.mean(distance_p), torch.mean(distance_n)


def _run_training():
    """'train' mode: optimise the network with Adam on the triplet loss, saving after every epoch."""
    train_dataloader = _triplet_dataloader(DATASET_PATH, shuffle=True, batch_size=16)

    net = _build_network(inference=False)
    # Define the loss metric as Triplet Loss
    criterion = TripletLoss()
    # Define the optimizer used as Adam
    optimizer = optim.Adam(net.parameters(), lr = 0.001 )

    # Ending index of epoch
    ending_epoch = STARTING_EPOCH+TRAINING_EPOCH

    # Per-epoch loss record: column 0 = epoch index, column 1 = mean loss
    loss_epoch_history = np.zeros([TRAINING_EPOCH, 2])
    # If training is restarted from a savepoint, load previous loss history
    if RELOAD_PREVIOUS:
        # atleast_2d: a one-row CSV is read back as a 1-D array
        data = np.atleast_2d(np.loadtxt("{0}_loss.csv".format(MODEL_PATH), delimiter=','))
        nonzerorow = np.nonzero(data)[0]
        last_recorded = int(nonzerorow.max()) if nonzerorow.size else -1
        cutrow = min(last_recorded, STARTING_EPOCH)
        loss_epoch_history = np.vstack([data[:cutrow+1, :], loss_epoch_history])
    # The array is indexed by absolute epoch number below; make sure every
    # epoch of this run has a row even if the previous history was shorter.
    missing_rows = ending_epoch - loss_epoch_history.shape[0]
    if missing_rows > 0:
        loss_epoch_history = np.vstack([loss_epoch_history, np.zeros([missing_rows, 2])])

    # Iterate through the epochs
    for epoch in range(STARTING_EPOCH, ending_epoch):
        epoch_loss_sum = 0.0
        num_batches = 0
        # Iterate over batches
        for i, (img0, img1, img2) in enumerate(train_dataloader, 0):
            img0, img1, img2 = _move_to_device(img0, img1, img2)

            # Zero the gradients
            optimizer.zero_grad()

            # Embed anchor, positive and negative with the shared network
            output1, output2, output3 = net(img0, img1, img2)

            # Triplet loss, backpropagation and optimizer step
            loss_triplet = criterion(output1, output2, output3)
            loss_triplet.backward()
            optimizer.step()

            # Accumulate loss value
            epoch_loss_sum += loss_triplet.item()
            num_batches += 1

            # Every 10 batches print out the loss
            if i % 10 == 0 :
                print(f"Epoch number {epoch} batch {i}\n Current loss {loss_triplet.item()}\n")

        # Average over the number of batches (not the last batch index)
        _require_batches(num_batches, "training")
        epoch_loss_ave = epoch_loss_sum/num_batches
        print(f"Epoch number {epoch}\n Current loss {epoch_loss_ave}\n")
        # Record loss value
        loss_epoch_history[epoch,0] = epoch
        loss_epoch_history[epoch,1] = epoch_loss_ave
        # Save current savepoint
        torch.save(net.state_dict(), "{0}_{1}.pth".format(MODEL_PATH, epoch))
        # Save current loss value
        np.savetxt("{0}_loss.csv".format(MODEL_PATH), loss_epoch_history, delimiter=',')


def _run_mean_distance():
    """'mean_distance' mode: mean positive/negative distance on the training set for every savepoint."""
    validate_dataloader = _triplet_dataloader(DATASET_PATH, shuffle=False, batch_size=128)

    net = _build_network(inference=True)
    # No `MeanDistance` class is defined in this notebook; use one if some
    # other cell provides it, otherwise fall back to the local implementation.
    metric_cls = globals().get('MeanDistance')
    criterion = metric_cls() if metric_cls is not None else _mean_pair_distances

    ending_epoch = STARTING_EPOCH+TRAINING_EPOCH
    # Columns: epoch, mean positive distance, mean negative distance
    distance_history = np.zeros([ending_epoch,3])
    # Iterate through the epochs
    for epoch in range(STARTING_EPOCH, ending_epoch):
        # Load savepoint at each epoch
        LOAD_PATH = "{1}_{0}.pth".format(epoch, MODEL_PATH)
        load_model(net, path = LOAD_PATH)
        epoch_distance_n = 0.0
        epoch_distance_p = 0.0
        num_batches = 0
        # Iterate over batches
        for img0, img1, img2 in validate_dataloader:
            img0, img1, img2 = _move_to_device(img0, img1, img2)
            with torch.no_grad():
                output1, output2, output3 = net(img0, img1, img2)
                mean_distance_p, mean_distance_n = criterion(output1, output2, output3)
            # Accumulate mean distance
            epoch_distance_n += mean_distance_n.item()
            epoch_distance_p += mean_distance_p.item()
            num_batches += 1
        # Average over the number of batches (not the last batch index)
        _require_batches(num_batches, "mean-distance")
        epoch_distance_n = epoch_distance_n/num_batches
        epoch_distance_p = epoch_distance_p/num_batches
        print(f"Epoch number {epoch}\n Mean positive distance {epoch_distance_p}\n Mean negative distance {epoch_distance_n}\n")
        # Record the mean distances
        distance_history[epoch,1] = epoch_distance_p
        distance_history[epoch,2] = epoch_distance_n
        distance_history[epoch,0] = epoch
        np.savetxt("{0}_distance.csv".format(MODEL_PATH), distance_history, delimiter=',')


def _run_validation():
    """'validate' mode: triplet loss on the validation set for every savepoint."""
    validate_dataloader = _triplet_dataloader(VALIDATE_PATH, shuffle=False, batch_size=128)

    net = _build_network(inference=True)
    # Define the loss metric as Triplet Loss
    criterion = TripletLoss()

    ending_epoch = STARTING_EPOCH+TRAINING_EPOCH
    # Columns: epoch, mean validation loss
    validate_history = np.zeros([ending_epoch,2])
    # Iterate through the epochs
    for epoch in range(STARTING_EPOCH, ending_epoch):
        LOAD_PATH = "{1}_{0}.pth".format(epoch, MODEL_PATH)
        load_model(net, path = LOAD_PATH)
        epoch_loss_sum = 0.0
        num_batches = 0
        # Iterate over batches
        for img0, img1, img2 in validate_dataloader:
            img0, img1, img2 = _move_to_device(img0, img1, img2)
            with torch.no_grad():
                output1, output2, output3 = net(img0, img1, img2)
                loss_triplet = criterion(output1, output2, output3)
            epoch_loss_sum += loss_triplet.item()
            num_batches += 1
        # Average over the number of batches (not the last batch index)
        _require_batches(num_batches, "validation")
        epoch_loss_ave = epoch_loss_sum/num_batches
        print(f"Epoch number {epoch}\n Validation loss {epoch_loss_ave}\n")
        validate_history[epoch,1] = epoch_loss_ave
        validate_history[epoch,0] = epoch
        np.savetxt("{0}_loss_val.csv".format(MODEL_PATH), validate_history, delimiter=',')


def _embed_segments(net, audio, duration, seg, numseg, use_mel, transformation, run_batch):
    """Slice one song into overlapping segments and return their feature maps.

    Returns a CPU tensor of shape (numseg, 1000): one row per segment, computed
    in batches of at most ``run_batch`` segments.
    """
    # Render every segment to an image and store the transformed tensors
    segarrs = torch.empty((numseg, 3, 299, 299))
    for j in range(numseg):
        # Define the slice starting and ending time
        start_time, end_time = seg_interval(j, duration, seg)
        start_sample = librosa.time_to_samples(start_time, sr=SAMPLE_RATE)
        end_sample = librosa.time_to_samples(end_time, sr=SAMPLE_RATE)
        # Slice the audio and generate the appropriate diagram
        audio_clip = audio[start_sample:end_sample]
        if use_mel:
            img = get_mel_spectrogram(audio_clip)
        else:
            img = get_chroma_feature(audio_clip)
        segarrs[j,:,:,:] = transformation(img)

    # Compute feature maps over consecutive batches [0, run_batch), [run_batch, ...), ...
    featuremap = torch.empty((numseg, 1000))
    for start_ind in range(0, numseg, run_batch):
        end_ind = min(start_ind + run_batch, numseg)
        segbatch = segarrs[start_ind:end_ind,:,:,:]
        if CUDA:
            segbatch = segbatch.cuda()
        with torch.no_grad():
            featuremap_batch = net.forward_once(segbatch)
        featuremap[start_ind:end_ind,:] = featuremap_batch.cpu()
    return featuremap


def _run_test():
    """'test' mode: compare every segment pair of the two mp3 files in TEST_PATH.

    Prints the pair with the minimum squared Euclidean distance and writes all
    pair scores to a CSV file inside TEST_PATH.
    """
    net = _build_network(inference=True)
    # Resize the images and transform to tensors
    transformation = transforms.Compose([transforms.Resize((299,299)),
                                         transforms.ToTensor()])

    # Infer the input type and segment length from MODEL_PATH. Matching is
    # case-insensitive (e.g. "chroma_feature10s_savepoint" is recognised), and
    # an unrecognised path is reported instead of failing later on an unbound
    # variable.
    model_tag = MODEL_PATH.lower()
    if 'melspectrogram' in model_tag:
        mel = True
        inputname = 'melspectrogram'
    elif 'chroma' in model_tag:
        mel = False
        inputname = 'chroma'
    else:
        raise ValueError("MODEL_PATH must contain 'Melspectrogram' or 'Chroma' to select the input type")
    if '10s' in model_tag:
        seg10s = True
        segname = '10s'
    elif 'preprocess' in model_tag:
        seg10s = False
        segname = 'preprocessed'
    else:
        raise ValueError("MODEL_PATH must contain '10s' or 'preprocess' to select the segment length")

    # Grab audios; sorted so that "song 1" / "song 2" are assigned deterministically
    mp3_files = [name for name in sorted(os.listdir(TEST_PATH)) if name.lower().endswith('.mp3')]
    if len(mp3_files) < 2:
        raise ValueError("TEST_PATH must contain two mp3 files, found {0}".format(len(mp3_files)))
    if len(mp3_files) > 2:
        print("Warning: TEST_PATH contains {0} mp3 files; only the first two are compared.".format(len(mp3_files)))
        mp3_files = mp3_files[:2]

    # Initialize lists
    song_list = []
    duration_list = []
    seg_list = []
    num_seg_list = []
    feature_list = []

    # Define batch size to run at once
    run_batch = 48

    for mp3_name in mp3_files:
        # Get the song name
        song_list.append(mp3_name.split('.')[0])
        # Load the song
        y, sr = librosa.load(os.path.join(TEST_PATH, mp3_name), sr=SAMPLE_RATE)
        # Select the slicing length
        if seg10s:
            seg = 10
        else:
            # The tempo-based length may come back as a 1-element array; use a plain float
            seg = float(np.asarray(calculate_eightbars_duration(y, SAMPLE_RATE)).ravel()[0])
        seg_list.append(seg)

        # Get the duration of the song
        duration = librosa.get_duration(y=y, sr=SAMPLE_RATE)
        duration_list.append(duration)

        # Compute how many segments could be sliced out
        numseg = int(num_segment(duration, seg))
        if numseg <= 0:
            raise ValueError("'{0}' is too short to be sliced into segments of {1} seconds".format(mp3_name, seg))
        num_seg_list.append(numseg)

        # Compute and save the feature maps of all segments
        feature_list.append(
            _embed_segments(net, y, duration, seg, numseg, mel, transformation, run_batch)
        )

    # Get the duration, segment length and computed feature maps of the pair
    duration1, duration2 = duration_list[0], duration_list[1]
    seg1, seg2 = seg_list[0], seg_list[1]
    featuremap1, featuremap2 = feature_list[0], feature_list[1]

    # Columns: start1, end1, start2, end2, weighted score, squared distance,
    # cosine similarity, Pearson correlation
    record = np.zeros([num_seg_list[0]*num_seg_list[1],8])
    counter = 0
    # Iterate over feature maps of all segments
    for i in range(num_seg_list[0]):
        out1 = featuremap1[[i],:]
        # Compute the corresponding segment starting and ending time
        start_time1, end_time1 = seg_interval(i, duration1, seg1)
        for j in range(num_seg_list[1]):
            out2 = featuremap2[[j],:]
            # Compute the corresponding segment starting and ending time
            start_time2, end_time2 = seg_interval(j, duration2, seg2)
            # Compute similarity scores of pairs
            [w_score, edist, cos_sim, p_corr] = weighted_score(out1, out2)
            # Record the computed results (the scores are one-element tensors)
            record[counter, 0] = start_time1
            record[counter, 1] = end_time1
            record[counter, 2] = start_time2
            record[counter, 3] = end_time2
            record[counter, 4] = float(w_score)
            record[counter, 5] = float(edist)
            record[counter, 6] = float(cos_sim)
            record[counter, 7] = float(p_corr)
            counter += 1

    # Find the minimum euclidean distance and corresponding index
    min_score = np.min(record[:,5])
    min_index = np.argmin(record[:,5])

    print("For Case {9}, using {0} and {1} segments, the min euclidean distance is {2}. \n Occurs at {3} ({4}:{5}) and {6} ({7}:{8}). \n".format(inputname,segname, min_score, song_list[0], record[min_index,0], record[min_index,1], song_list[1], record[min_index,2], record[min_index,3],TEST_PATH.split(' ')[-1]))
    # Save records
    recordname = "{0}_{1}_{2}_{3}".format(inputname, segname, song_list[0], song_list[1])
    np.savetxt("{1}/{0}.csv".format(recordname,TEST_PATH),np.array(record), delimiter=',')


def main():
    """Run the notebook in the mode selected by the global ``MODE``.

    Reads the notebook-level settings MODE, DATASET_PATH, VALIDATE_PATH,
    TEST_PATH, MODEL_PATH, CUDA, RELOAD_PREVIOUS, RELOAD_PATH, STARTING_EPOCH
    and TRAINING_EPOCH.

    Modes:
        'train'          train and save ``<MODEL_PATH>_<epoch>.pth`` plus a loss CSV
        'mean_distance'  mean positive/negative distance on the training set per savepoint
        'validate'       triplet loss on the validation set per savepoint
        'test'           compare the two mp3 files in TEST_PATH segment by segment

    Raises:
        ValueError: if MODE is not one of the modes above.
    """
    handlers = {
        'train': _run_training,
        'mean_distance': _run_mean_distance,
        'validate': _run_validation,
        'test': _run_test,
    }
    if MODE not in handlers:
        raise ValueError("Unknown MODE {0!r}; expected one of {1}".format(MODE, sorted(handlers)))
    handlers[MODE]()

One caveat: if you let the notebook run in the background without any interaction, Google Colab will kill the session after 12 hours. To avoid this, set up a clicker that clicks the connect button at a fixed time interval. Press Ctrl-Shift-I to open the browser's web console, then copy the following code into the console:

```
function ClickConnect(){
console.log("Working");
document.querySelector("#top-toolbar > colab-connect-button").shadowRoot.querySelector("#connect").click();
}
var clicker = setInterval(ClickConnect,60000);
```

To clear the clicker, do:
```
clearInterval(clicker);
```

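Now we run the ```main()``` function, which will run and save models based on the paths you set below. Remember to change ```DATASET_PATH``` to the path pointing to the dataset folder, and ```MODEL_PATH``` to the location where the intermediate model parameters should be saved. Note that ```MODEL_PATH``` is used as a file-name prefix: the savepoint of each epoch is written to ```<MODEL_PATH>_<epoch>.pth``` and the loss history to ```<MODEL_PATH>_loss.csv```.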

There are four different modes to choose from:

1.   ```'train'```: start training the model using the Adam optimizer and the triplet loss. If ```RELOAD_PREVIOUS``` is set to ```False```, ```STARTING_EPOCH``` will be set to 0 and training starts from the ImageNet-pretrained Xception weights.
If the user wants to resume training from an existing savepoint, set ```RELOAD_PREVIOUS``` to ```True```; a prompt will then ask for the starting epoch of the training. The user should enter the epoch corresponding to the savepoint.
2.   ```'validate'```: the script will reload savepoints in ```MODEL_PATH``` and compute the loss value of the validation dataset for each savepoint.
3.   ```'mean_distance'```: the script will reload savepoints in ```MODEL_PATH``` and compute the mean distance value of the training dataset for each savepoint.
4.   ```'test'```: the script loads the pair of mp3 files in ```TEST_PATH```. After reloading the savepoint in ```RELOAD_PATH```, it slices both mp3 files into segments, using the input type (mel-spectrogram or chroma) and segment length (10 s or eight bars) inferred from ```MODEL_PATH```, and computes the feature maps of the segments. The minimum Euclidean distance between the feature maps of the two songs and the corresponding segments are printed.

In [None]:
MODE = 'train'
if MODE == 'test' or MODE == 'validate' or MODE == 'mean_distance':
    # These modes always evaluate an existing savepoint.
    RELOAD_PREVIOUS = True
else:
    response = input("Do you want to reload a savepoint for retraining? (Yes or No) ")
    # Accept "Yes", "yes", "y", ... instead of only the exact string "Yes".
    RELOAD_PREVIOUS = response.strip().lower() in ('yes', 'y')

# path pointing to the training dataset
DATASET_PATH = "/content/drive/MyDrive/Music_Plagiarism/chroma_feature10s_dataset/training"
# path pointing to the validation dataset
VALIDATE_PATH = "/content/drive/MyDrive/Music_Plagiarism/chroma_feature10s_dataset/testing"
# path pointing to the folder containing a pair of mp3 files to be compared
# the folder should ONLY contain two mp3 files
TEST_PATH = "/content/drive/MyDrive/Music_Plagiarism/Dataset/Audio/Case 1"
# path prefix for training savepoints: main() writes "<MODEL_PATH>_<epoch>.pth"
# and "<MODEL_PATH>_loss.csv"
MODEL_PATH = "/content/drive/MyDrive/Music_Plagiarism/chroma_feature10s_savepoint"
# name for saving your model
model_name = "model_chroma10s"
# controls the script whether or not to use GPU resources.
CUDA = True
# epoch duration of the training
TRAINING_EPOCH = 150
if RELOAD_PREVIOUS:
    if MODE == 'train':
        STARTING_EPOCH = int(input("Enter the starting epoch for retraining: "))
        # Must match the file name main() uses when saving a savepoint
        # ("<MODEL_PATH>_<epoch>.pth"); the former "<MODEL_PATH>/<model_name>_<epoch>.pth"
        # pointed at a file that training never writes.
        RELOAD_PATH = "{0}_{1}.pth".format(MODEL_PATH, STARTING_EPOCH)
    else:
        STARTING_EPOCH = 0
        RELOAD_PATH = input("Enter the path to the savepoint: ")
else:
    STARTING_EPOCH = 0
    RELOAD_PATH = ""
main()