## TRIPLET LOSS CNN IMPLEMENTATION

In [1]:
from Model import triplet_model, embed_model
from utils import genre_count_dataset, img_from_ID, images_from_ids
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
import operator
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


### Loading data.csv
Contains
* Id
* Genre
* Song Name
* Spectrogram(path)

In [2]:
data = pd.read_csv("data.csv")

In [3]:
print("DATASET SHAPE",data.shape)
data.head()

DATASET SHAPE (3208, 4)


Unnamed: 0,Id,Genre,Song Name,Spectrogram
0,1,Classical,Beethoven-Symphony 9,Spectrograms/Classical/Beethoven-Symphony 9/Be...
1,2,Classical,Beethoven-Symphony 9,Spectrograms/Classical/Beethoven-Symphony 9/Be...
2,3,Classical,Beethoven-Symphony 9,Spectrograms/Classical/Beethoven-Symphony 9/Be...
3,4,Classical,Beethoven-Symphony 9,Spectrograms/Classical/Beethoven-Symphony 9/Be...
4,5,Classical,Beethoven-Symphony 9,Spectrograms/Classical/Beethoven-Symphony 9/Be...


In [4]:
print("DATASET GENRE COMPOSITION\n")
# Per-genre counts over the whole dataset (the helper is given the row count).
cl, h, m, r, p, co = genre_count_dataset(data, len(data))

DATASET GENRE COMPOSITION

Classical 595
Hip-Hop 573
Metal 643
Rock 469
Pop 462
Country 466


### CALLING EMBEDDING MODEL
* embed_model maps a spectrogram image of shape (128, 1402, 1) to a 128-dimensional embedding

In [5]:
submodel = embed_model()

In [6]:
submodel.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 128, 1402, 1)      0         
_________________________________________________________________
zero_padding2d_1 (ZeroPaddin (None, 134, 1408, 1)      0         
_________________________________________________________________
conv1 (Conv2D)               (None, 64, 701, 64)       3200      
_________________________________________________________________
activation_1 (Activation)    (None, 64, 701, 64)       0         
_________________________________________________________________
zero_padding2d_2 (ZeroPaddin (None, 66, 703, 64)       0         
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 32, 351, 64)       0         
_________________________________________________________________
lambda_1 (Lambda)            (None, 32, 351, 64)       0         
__________

### CALL TRIPLET MODEL
* Gets you triplet model
* implements triplet loss
* metric accuracy for custom accuracy check

In [7]:
# This cell rebinds the name `triplet_model` from the imported factory to the
# built model. Importing the factory under an alias first keeps the cell
# re-runnable: without it, a second run would try to call the model object.
from Model import triplet_model as build_triplet_model
triplet_model = build_triplet_model()

  name='loss', output_shape=(1, ))
  name=name)
  model = Model(input=[anchor_input, positive_input, negative_input], output=loss)


In [8]:
triplet_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
anchor_input (InputLayer)       (None, 128, 1402, 1) 0                                            
__________________________________________________________________________________________________
positive_input (InputLayer)     (None, 128, 1402, 1) 0                                            
__________________________________________________________________________________________________
negative_input (InputLayer)     (None, 128, 1402, 1) 0                                            
__________________________________________________________________________________________________
model_2 (Model)                 (None, 128)          496976      anchor_input[0][0]               
                                                                 positive_input[0][0]             
          

### GENERATING TRIPLETS
Concept:
* Pick n (32) random anchor points (ids from database)
* Make a remaining list containing ids not included by anchors
* For each anchor, pick the hardest positive and the hardest negative among the remaining ids (criteria below).


Concept for choosing triplets:
* Choose a positive from the same genre but farthest in distance (embedding distance, np.linalg.norm) from the anchor.
* Choose a negative from a different genre but closest in distance to the anchor.

### GET IMAGES FROM ID (SPECTROGRAM PATH)

In [9]:
data = images_from_ids(data)

Images added to dataframe


In [10]:
data.Images[1].shape

(1, 128, 1402, 1)

### PREDICTING EMBEDDINGS FOR EACH SONG ID 
* Predicting embeddings for each song ID takes a lot of time
* Storing it in temp_embed dictionary saves time 

In [11]:
# Cache one embedding per song id (ids run 1..number of rows) so the triplet
# mining step can look embeddings up instead of predicting them again.
song_ids = range(1, data.shape[0] + 1)
temp_embed = {
    song_id: submodel.predict_on_batch(img_from_ID(data, song_id))
    for song_id in song_ids
}
    

In [12]:
len(temp_embed)

3208

### GENERATING TRIPLETS
* Randomly picking anchor_batch (32) id's. 
* Marking list of positive distances (in same genre) and negative distances (not of same genre).
* Looking for farthest positive distance for Positive and closest negative distance for Negative for each anchor point.

In [13]:
anchor_batch = 32
def generate_triplets():
    """Mine one batch of (anchor, positive, negative) spectrogram triplets.

    ``anchor_batch`` song ids are sampled at random as anchors. The candidate
    pool for every anchor is the set of ids that were NOT sampled as anchors.
    From that pool:

    * the positive is the same-genre id whose cached embedding is farthest
      from the anchor's embedding (largest Euclidean distance);
    * the negative is the different-genre id whose cached embedding is
      closest to the anchor's embedding (smallest Euclidean distance).

    Reads the notebook globals ``data`` (song id ``i`` is looked up at row
    label ``i - 1``), ``temp_embed`` (id -> embedding) and ``anchor_batch``.

    NOTE: distances come from the ``temp_embed`` cache; this function does not
    refresh it, so the mined triplets reflect whatever network state the cache
    was computed with.

    Returns:
        trip_ids: integer array of shape (anchor_batch, 3) holding the
            [anchor, positive, negative] ids of each triplet.
        triplets: array of shape (anchor_batch, 3, H, W, C) holding the images
            returned by ``img_from_ID`` for those ids.

    Raises:
        ValueError: if an anchor has no same-genre or no different-genre
            candidate among the non-anchor ids.
    """
    import time

    n_songs = data.shape[0]

    # id's
    anchors = random.sample(list(range(1, n_songs + 1)), anchor_batch)
    anchor_set = set(anchors)  # O(1) membership instead of scanning a list
    remaining = [i for i in range(1, n_songs + 1) if i not in anchor_set]
    if not remaining:
        raise ValueError("no candidate ids left after sampling the anchors")

    start_time = time.time()

    # Genre and embedding of every candidate are identical for all anchors,
    # so build them once instead of once per (anchor, candidate) pair.
    rem_ids = np.array(remaining)
    rem_genres = data.loc[[i - 1 for i in remaining], 'Genre'].values
    rem_embeds = np.stack([np.ravel(temp_embed[i]) for i in remaining])

    trip_ids = []
    triplets = []
    for a in tqdm(anchors):
        # embedding and genre of the current anchor
        a_embed = np.ravel(temp_embed[a])
        a_genre = data.loc[a - 1]['Genre']

        # Euclidean distance from the anchor to every candidate at once
        dists = np.linalg.norm(rem_embeds - a_embed, axis=1)
        same_genre = np.asarray(rem_genres == a_genre)
        if not same_genre.any():
            raise ValueError("anchor %s has no same-genre candidate" % a)
        if same_genre.all():
            raise ValueError("anchor %s has no different-genre candidate" % a)

        # Mask out the wrong class with -inf / +inf so argmax / argmin only
        # ever select a same-genre positive / different-genre negative.
        # Ties resolve to the lowest id, as with max()/min() over the
        # id-ordered candidates.
        pos_index = int(rem_ids[np.argmax(np.where(same_genre, dists, -np.inf))])
        neg_index = int(rem_ids[np.argmin(np.where(same_genre, np.inf, dists))])

        # add images corresponding to indices
        anchor_img = img_from_ID(data, a)
        positive_img = img_from_ID(data, pos_index)
        negative_img = img_from_ID(data, neg_index)

        trip_ids.append([a, pos_index, neg_index])
        triplets.append([anchor_img, positive_img, negative_img])

    trip_ids = np.array(trip_ids)
    triplets = np.array(triplets)
    # Drop the per-image batch axis; keep the trailing (H, W, C) image dims
    # rather than hard-coding (128, 1402, 1).
    triplets = triplets.reshape((len(anchors), 3) + triplets.shape[-3:])

    print("Execution Time: ", time.time()-start_time)
    return trip_ids, triplets


In [14]:
# Smoke-test the triplet miner once. `np.vectorize` called with no arguments
# simply invokes the wrapped function a single time, so call it directly.
# `trip` is the (trip_ids, triplets) tuple returned by generate_triplets().
trip = generate_triplets()

100%|██████████████████████████████████████████████████████████████████████████████████| 32/32 [00:13<00:00,  2.33it/s]


Execution Time:  13.764098882675171


In [15]:
# trip = np.array(trip)
# treip_ids = np.array(treip_ids)

In [16]:
# treip_ids.shape

In [17]:
# trip.shape

#### Model fit and evaluate
* Model.fit - fits the triplets into the model using accuracy (custom metric) and triplet loss

* Model.evaluate - evaluates on the metrics the model was compiled in

In [52]:
# Number of outer training rounds; the loop below mines a fresh batch of
# triplets and calls `triplet_model.fit` once per round.
epochs = 5
# Evaluation runs on every round whose index is a multiple of `split`.
split = 2
# increase to 32
# Mini-batch size passed to `triplet_model.fit`.
batch_size = 6
# alpha = 0.1

In [53]:
# Each round mines a new batch of triplets, fits the triplet model on it once,
# and every `split`-th round evaluates on that same batch.
# NOTE: generate_triplets() reads the `temp_embed` cache, which this loop never
# refreshes, so mining keeps using the embeddings computed before training.
for epoch in range(epochs):
    print('Epoch %s \n' % epoch)

    print("GENERATING TRIPLETS")
    # Direct call: np.vectorize with no arguments only invoked this once anyway.
    trip_ids, trip = generate_triplets()

    anchors = trip[:, 0]
    positives = trip[:, 1]
    negatives = trip[:, 2]

    # One all-zero target per triplet, sized from the batch instead of a
    # hard-coded 32. Presumably ignored by the triplet loss -- confirm in Model.py.
    dummy_targets = np.zeros(len(anchors))

    print("MODEL FITTING")
    triplet_model.fit([anchors, positives, negatives], y=dummy_targets,
                      batch_size=batch_size, verbose=1)

    if epoch % split == 0:
        print("EVALUATION:")
        # Named eval_result so the builtin `eval` is not shadowed.
        eval_result = triplet_model.evaluate([anchors, positives, negatives],
                                             y=dummy_targets, verbose=1)
        print("EVAL", eval_result)

        # Spot-check the model output on one fixed triplet (index 9) of the batch.
        # Slicing keeps the leading batch axis, so no hard-coded reshape is needed.
        sample = slice(9, 10)
        pred = triplet_model.predict([anchors[sample], positives[sample], negatives[sample]],
                                     verbose=1)
        print("PRED", pred)

    print("\n\n")

Epoch 0 

GENERATING TRIPLETS


100%|██████████████████████████████████████████████████████████████████████████████████| 32/32 [00:13<00:00,  2.29it/s]


Execution Time:  14.032296180725098
MODEL FITTING
Epoch 1/1
EVALUATION:
EVAL 3.19988155365
PRED [ 0.10000217]



Epoch 1 

GENERATING TRIPLETS


100%|██████████████████████████████████████████████████████████████████████████████████| 32/32 [00:14<00:00,  2.20it/s]


Execution Time:  14.604769229888916
MODEL FITTING
Epoch 1/1



Epoch 2 

GENERATING TRIPLETS


100%|██████████████████████████████████████████████████████████████████████████████████| 32/32 [00:13<00:00,  2.31it/s]


Execution Time:  13.868413925170898
MODEL FITTING
Epoch 1/1
EVALUATION:
EVAL 3.19997692108
PRED [ 0.1000018]



Epoch 3 

GENERATING TRIPLETS


100%|██████████████████████████████████████████████████████████████████████████████████| 32/32 [00:14<00:00,  2.24it/s]


Execution Time:  14.345121145248413
MODEL FITTING
Epoch 1/1



Epoch 4 

GENERATING TRIPLETS


100%|██████████████████████████████████████████████████████████████████████████████████| 32/32 [00:14<00:00,  2.29it/s]


Execution Time:  14.03913927078247
MODEL FITTING
Epoch 1/1
EVALUATION:
EVAL 3.19995260239
PRED [ 0.10000558]





# RUN TILL HERE FIRST!

## CLASSIFICATION USING KNN  -- SUPERVISED (On Genre)

### DATA PREPARATION
* Copy data into new dataframe
* Get me embeddings of each song from embed_model (Embedding column)
* Save to csv (embed_dataa.csv)

In [20]:
# Build a copy of the dataset with one extra 'Embeddings' column holding the
# embedding that `submodel` predicts for each song, then dump it to CSV.
# NOTE(review): `submodel` is created separately from `triplet_model`, whose
# summary lists its own inner `model_2`. Confirm in Model.py that the two
# share weights; otherwise these embeddings come from an untrained network.
embed_data = data.copy(deep=True)

embeddings = []
for curr_id in data.Id:
    test_point = img_from_ID(data, curr_id)
    curr_embedding = submodel.predict_on_batch(test_point)
    # predict_on_batch returns one row per input image; keep that row as a
    # float64 vector (same values the previous list round-trip produced).
    embeddings.append(np.asarray(curr_embedding[0], dtype=np.float64))

embed_data['Embeddings'] = embeddings
# NOTE(review): array-valued cells (embeddings, and any image column) are
# written to CSV as their string representation, so they do not round-trip
# through read_csv as numbers.
embed_data.to_csv('embed_dataa.csv', na_rep=None)

In [21]:
embeddings = np.array(embeddings)

In [22]:
embeddings[0].dtype

dtype('float64')

In [23]:
embed_data.Embeddings[0].dtype

dtype('float64')

### USER INPUT CASE (Imagining its not in database rn)
CONTEXT-
User inputs a value or new song

FOR NOW-
Picking random point from database

In [24]:
curr_id = data.Id [0] 
test_point = img_from_ID(data,curr_id)

In [25]:
test_point.shape

(1, 128, 1402, 1)

#### Get me embeddings!

In [26]:
e = submodel.predict_on_batch(test_point)

### TESTING KNN APPROACH: SHOW ME WHAT YOU'VE GOT KNN?

In [27]:
edata = pd.read_csv("embed_dataa.csv")

# Features: everything in the in-memory embedding table except the metadata
# and image columns. The ' Spectrogram' label (leading space) is reproduced
# exactly as the original cell used it. Labels: the genre of each song.
non_feature_columns = ['Id', 'Genre', ' Spectrogram', 'Song Name', 'Images']
X = embed_data.drop(non_feature_columns, axis=1)
y = embed_data['Genre']

In [28]:
X.Embeddings[9].shape

(128,)

In [29]:
X.head()

Unnamed: 0,Embeddings
0,"[-0.150376558304, 0.0313371270895, -0.02312531..."
1,"[-0.132028907537, 0.0377219542861, -0.02441185..."
2,"[-0.134034276009, 0.0244921408594, -0.01880592..."
3,"[-0.125250160694, 0.0297113582492, -0.01148473..."
4,"[-0.13614282012, 0.0398959517479, -0.024466186..."


In [30]:
X.Embeddings[0].dtype

dtype('float64')

In [31]:
X.dtypes

Embeddings    object
dtype: object

In [32]:
X = X.values.T.tolist()

In [33]:
# Collapse the nested list of embedding vectors into a 2-D
# (n_songs, embedding_dim) matrix. The row count is inferred and the embedding
# size is read from the data, instead of hard-coding the dataset size.
X = np.array(X)
X = X.reshape(-1, X.shape[-1])

In [34]:
y[0]

'Classical'

# ERROR - Can't convert string to float32
* Embeddings (Python lists) are saved as strings in the CSV
* Strings can't be fitted in KNN, so the in-memory `embed_data` embeddings are used instead of the CSV

In [35]:
# X.Embeddings = X.Embeddings.convert_object(convert_numeric=True)

In [36]:
# Hold out 20% of the songs for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [37]:
X[0].shape

(128,)

In [38]:
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

# Genre classifier that votes among the 8 nearest training embeddings.
knn = KNeighborsClassifier(n_neighbors=8)


In [39]:
# for i in range(embed_data.shape[0]):
#     print(embed_data.Embeddings[i].shape)

In [40]:
# Fit on the training embeddings, predict the held-out genres and report accuracy.
knn.fit(X_train, y_train)
pred = knn.predict(np.array(X_test))
test_accuracy = accuracy_score(y_test, pred)
print(test_accuracy)

0.426791277259


#### OUTPUT
* Gets me class of genre for test data point (pred)
* Now I know the accuracy too.

### KNN NEXT STEP 
#### AIM - Suggest me top k (5) similar songs please!

* I've predicted the genre class my input song belongs to. Now I need to pick the top 5 Song Names from this genre with the least distant spectrogram embeddings.

In [41]:
## new random point 
## call predict on it
pred = knn.predict([X_test[0]])
## class of genre
# print(pred)

In [42]:
pred[0]

'Metal'

In [43]:
# # Suggestions of top 5 songs
# curr_embed = (X_test[0])
# predicted_Genre = pred
# dist = []
# dict = {}
# for i in range(edata.shape[0]):
#     if edata.Genre[i] == predicted_Genre:
#         dist = np.linalg.norm(np.array(edata.Embeddings[i]),curr_embed)
# #         dict.append([i])
#         dist = np.sort(dist, axis = None)
        
# print(dist[5:])
        
        
        
    

## CLUSTERING USING K Means  -- UNSUPERVISED 

In [44]:
from sklearn.cluster import KMeans

In [45]:
# Unsupervised grouping of the training embeddings into 6 clusters
# (presumably one per genre in the dataset); seeded for repeatability.
kmeans = KMeans(n_clusters=6, random_state=32)
kmeans.fit(X_train)

In [46]:
kmeans.labels_

array([4, 1, 1, ..., 1, 4, 3])

In [47]:
kmeans.predict(X_test)
# kmeans.cluster_centers_

array([2, 5, 2, 2, 2, 1, 2, 5, 4, 5, 4, 2, 1, 4, 1, 4, 2, 2, 1, 5, 2, 0, 4,
       2, 1, 2, 5, 1, 1, 1, 2, 1, 5, 1, 1, 4, 4, 4, 4, 4, 2, 1, 5, 2, 5, 4,
       5, 3, 5, 4, 2, 1, 1, 4, 4, 2, 5, 4, 2, 3, 4, 2, 1, 1, 2, 4, 2, 0, 2,
       5, 4, 5, 5, 1, 4, 5, 5, 1, 2, 4, 2, 4, 1, 2, 2, 1, 1, 4, 1, 2, 0, 1,
       4, 5, 0, 4, 5, 4, 1, 5, 2, 1, 2, 5, 2, 1, 3, 4, 1, 5, 4, 5, 2, 1, 1,
       1, 1, 1, 0, 1, 2, 2, 4, 1, 2, 1, 5, 2, 1, 1, 4, 0, 0, 3, 4, 2, 1, 5,
       1, 4, 4, 1, 4, 1, 4, 4, 5, 4, 2, 5, 0, 4, 0, 5, 2, 1, 4, 1, 4, 4, 2,
       2, 2, 1, 1, 4, 5, 3, 3, 4, 2, 1, 4, 3, 1, 1, 3, 1, 1, 2, 0, 4, 5, 3,
       1, 0, 5, 1, 1, 1, 1, 2, 5, 0, 1, 1, 4, 4, 1, 0, 5, 1, 5, 5, 4, 5, 0,
       2, 2, 2, 2, 0, 1, 4, 0, 5, 5, 5, 1, 2, 3, 1, 1, 2, 5, 2, 3, 5, 4, 1,
       5, 2, 4, 0, 4, 1, 4, 2, 0, 4, 1, 1, 0, 1, 0, 1, 0, 2, 0, 1, 1, 1, 1,
       1, 5, 5, 4, 2, 5, 5, 1, 1, 2, 5, 1, 2, 4, 4, 1, 4, 4, 1, 4, 1, 2, 2,
       4, 5, 1, 1, 2, 5, 2, 4, 1, 5, 4, 2, 4, 1, 4, 1, 1, 5, 5, 5, 4, 4, 4,
       4, 4,

In [48]:
kmeans.cluster_centers_

array([[ -1.31196782e-01,   5.23434589e-02,  -3.31131550e-02,
          6.61033725e-02,  -5.69456800e-02,  -9.68556355e-03,
          2.84999520e-02,  -2.64676836e-02,   1.03869581e-02,
          9.36835593e-02,  -4.33294559e-02,  -1.06340722e-01,
          6.18451159e-02,   1.06235386e-01,  -2.00249885e-02,
          1.57729727e-02,  -1.34469389e-01,  -7.33397656e-02,
          8.96429315e-02,  -3.77647850e-02,  -9.57833684e-02,
          7.57373072e-02,   8.71683236e-02,  -3.73569943e-02,
         -2.80543968e-04,   2.27685390e-02,   2.42228666e-02,
         -3.32348652e-02,  -1.35734930e-01,   8.18459693e-02,
          2.04680059e-02,   1.23391565e-01,   7.19275914e-02,
         -4.27960543e-02,   1.34437923e-02,   2.95200005e-02,
          3.16944572e-02,  -1.33075925e-01,  -1.04155146e-02,
         -2.17998912e-02,   8.32341664e-03,   6.02813668e-03,
          1.26445535e-01,  -1.78896448e-01,  -8.29863449e-02,
          1.30646166e-02,  -1.62167604e-02,  -8.28745210e-02,
        