<h1 style="color:rgb(0,120,170)">344.038, KV Multimedia Search and Retrieval (WS2023/24)</h1>
<h2 style="color:rgb(0,120,170)">Task 2_Group B</h2>

| First Name | Family Name  | Matr.Nr   |
|:-----------|:-------------|:----------|
| Harald     | Eibensteiner | K01300179 |
| Hadi       | Sanaei       | K11733444 |
| Lukas      | Troyer       | K12006666 |
| Lukas      | Wagner       | K01357626 |
| Branko     | Paunović     | K12046370 |

### Load Data & Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Track metadata; later cells look up 'id', 'artist' and 'song' in this table.
information_data = pd.read_csv('datasets/id_information_mmsr.tsv', sep='\t')

# Block-level features (BLF): correlation and spectral variants.
blf_correlation_data = pd.read_csv('datasets/id_blf_correlation_mmsr.tsv', sep='\t')
blf_spectral_data = pd.read_csv('datasets/id_blf_spectral_mmsr.tsv', sep='\t')

# MFCC features: bag-of-words and statistics variants.
mfcc_bow_data = pd.read_csv('datasets/id_mfcc_bow_mmsr.tsv', sep='\t')
mfcc_stats_data = pd.read_csv('datasets/id_mfcc_stats_mmsr.tsv', sep='\t')

# i-vector features (256 and 512 variants).
ivec256_data = pd.read_csv('datasets/id_ivec256_mmsr.tsv', sep='\t')
ivec512_data = pd.read_csv('datasets/id_ivec512_mmsr.tsv', sep='\t')

# musicnn features.
musicnn_data = pd.read_csv('datasets/id_musicnn_mmsr.tsv', sep='\t')


### Retrieve top N similar tracks

In [3]:
# Cosine similarity between query feature rows and target feature rows
def calculate_similarity(query_features, target_features):
    """Return the pairwise cosine-similarity matrix of two 2-D feature arrays.

    Entry ``[i, j]`` of the result compares row ``i`` of ``query_features``
    with row ``j`` of ``target_features``.
    """
    return cosine_similarity(query_features, target_features)

# Function to retrieve top N similar tracks
def retrieve_top_similar_tracks(query_track_id, features_data, feature_columns, information_data, num_top_similar):
    """Print the tracks whose features are most cosine-similar to a query track.

    Parameters
    ----------
    query_track_id : str
        Identifier looked up in the ``'id'`` column of ``features_data``.
    features_data : pandas.DataFrame
        One row per track, with an ``'id'`` column and the feature columns.
    feature_columns : sequence of column labels or None
        Columns of ``features_data`` that form the feature vector. ``None``
        falls back to every column after the first one.
    information_data : pandas.DataFrame
        Track metadata with ``'id'``, ``'artist'`` and ``'song'`` columns.
    num_top_similar : int
        Maximum number of tracks to list; values <= 0 list nothing.

    Returns
    -------
    None
        The ranking is printed. The query track itself is part of the
        candidate set, so it is normally listed first with similarity 1.0.
        Tracks that have no row in ``information_data`` are skipped, so fewer
        than ``num_top_similar`` rows may be shown.
    """
    query_mask = (features_data['id'] == query_track_id).to_numpy()
    if not query_mask.any():
        print(f"Track ID {query_track_id} not found in the data.")
        return

    # Honour the caller-supplied feature columns instead of assuming that the
    # id column is always the first one.
    if feature_columns is None:
        feature_matrix = features_data.iloc[:, 1:].to_numpy()
    else:
        feature_matrix = features_data.loc[:, feature_columns].to_numpy()

    # If the id occurs more than once, only its first row is used as the query.
    query_vector = feature_matrix[query_mask][:1]
    similarities = calculate_similarity(query_vector, feature_matrix)[0]

    # Reverse first, then cut: slicing with [-n:] would return *every* track
    # for n == 0 because -0 == 0.
    top_n = max(num_top_similar, 0)
    top_positions = np.argsort(similarities)[::-1][:top_n]

    # argsort yields row *positions*, so ids must be read positionally as well;
    # label-based .loc breaks on frames whose index is not 0..n-1.
    track_ids = features_data['id'].to_numpy()

    result_rows = []
    for position in top_positions:
        track_id = track_ids[position]
        info_row = information_data[information_data['id'] == track_id]
        if info_row.empty:
            continue

        result_rows.append({
            'Track ID': track_id,
            'Similarity': similarities[position],
            'Artist': info_row['artist'].values[0],
            'Song': info_row['song'].values[0]
        })

    # Explicit columns keep the frame well-formed when nothing matched. The
    # rows are already in descending similarity order, so no re-sort is needed.
    result_df = pd.DataFrame(result_rows, columns=['Track ID', 'Similarity', 'Artist', 'Song'])

    # Display sorted results
    print(f"\nTop {num_top_similar} Similar Tracks:")
    print(result_df)


### Audio-based (similarity, MFCC)

In [4]:
# Sample query ID:  GDGURAgKxNPEuXzM

# The result-list length is read interactively and kept in num_top_similar.
num_top_similar = int(input("Enter the number of top similar tracks: "))

# MFCC bag-of-words features
retrieve_top_similar_tracks(
    query_track_id="GDGURAgKxNPEuXzM",
    features_data=mfcc_bow_data,
    feature_columns=mfcc_bow_data.columns[1:],
    information_data=information_data,
    num_top_similar=num_top_similar,
)

# MFCC statistics features
retrieve_top_similar_tracks(
    query_track_id="GDGURAgKxNPEuXzM",
    features_data=mfcc_stats_data,
    feature_columns=mfcc_stats_data.columns[1:],
    information_data=information_data,
    num_top_similar=num_top_similar,
)


Enter the number of top similar tracks: 5

Top 5 Similar Tracks:
           Track ID  Similarity          Artist  \
0  GDGURAgKxNPEuXzM    1.000000  Monster Magnet   
1  1fmfftKuLUstbtui    0.940009     Iron Maiden   
2  W0I0QY8ucb4NcE3y    0.931293         Placebo   
3  P3zsw3IZzxbf237f    0.930562        Chevelle   
4  dhqEiyEoMOEDFjZE    0.928979      Candlemass   

                               Song  
1          When the Wild Wind Blows  
2                       Haemoglobin  
3                           Enemies  
4                The Dying Illusion  

Top 5 Similar Tracks:
           Track ID  Similarity                 Artist  \
0  GDGURAgKxNPEuXzM    1.000000         Monster Magnet   
1  c0oTPqvKZKkIyvkx    0.907322               Ladytron   
2  48mN3L6l1FWVFNUX    0.906402  The Birthday Massacre   
3  yUh1QII9semRCo19    0.905528            The Mission   
4  4tE73sPTNQvO0BkP    0.905316       Ringo Deathstarr   

                               Song  
1                         De

In [5]:
# Same MFCC retrieval again, without prompting: num_top_similar must already
# be defined by an earlier cell.

# MFCC bag-of-words features
retrieve_top_similar_tracks(
    query_track_id="GDGURAgKxNPEuXzM",
    features_data=mfcc_bow_data,
    feature_columns=mfcc_bow_data.columns[1:],
    information_data=information_data,
    num_top_similar=num_top_similar,
)

# MFCC statistics features
retrieve_top_similar_tracks(
    query_track_id="GDGURAgKxNPEuXzM",
    features_data=mfcc_stats_data,
    feature_columns=mfcc_stats_data.columns[1:],
    information_data=information_data,
    num_top_similar=num_top_similar,
)


Top 5 Similar Tracks:
           Track ID  Similarity          Artist  \
0  GDGURAgKxNPEuXzM    1.000000  Monster Magnet   
1  1fmfftKuLUstbtui    0.940009     Iron Maiden   
2  W0I0QY8ucb4NcE3y    0.931293         Placebo   
3  P3zsw3IZzxbf237f    0.930562        Chevelle   
4  dhqEiyEoMOEDFjZE    0.928979      Candlemass   

                               Song  
1          When the Wild Wind Blows  
2                       Haemoglobin  
3                           Enemies  
4                The Dying Illusion  

Top 5 Similar Tracks:
           Track ID  Similarity                 Artist  \
0  GDGURAgKxNPEuXzM    1.000000         Monster Magnet   
1  c0oTPqvKZKkIyvkx    0.907322               Ladytron   
2  48mN3L6l1FWVFNUX    0.906402  The Birthday Massacre   
3  yUh1QII9semRCo19    0.905528            The Mission   
4  4tE73sPTNQvO0BkP    0.905316       Ringo Deathstarr   

                               Song  
1                         Deep Blue  
2                         Goodni

### Audio-based (similarity, BLFs)

In [6]:
# No prompt here: num_top_similar must already be defined by an earlier cell.

# BLF correlation features
retrieve_top_similar_tracks(
    query_track_id="GDGURAgKxNPEuXzM",
    features_data=blf_correlation_data,
    feature_columns=blf_correlation_data.columns[1:],
    information_data=information_data,
    num_top_similar=num_top_similar,
)

# BLF spectral features
retrieve_top_similar_tracks(
    query_track_id="GDGURAgKxNPEuXzM",
    features_data=blf_spectral_data,
    feature_columns=blf_spectral_data.columns[1:],
    information_data=information_data,
    num_top_similar=num_top_similar,
)


Top 5 Similar Tracks:
           Track ID  Similarity                 Artist  \
0  GDGURAgKxNPEuXzM    1.000000         Monster Magnet   
1  6PtJA0gCyouQ1DQ3    0.774222          The Offspring   
2  mqJkX1enaX9MwXoy    0.762280            Descendents   
3  0K2vsBA9smDZxKno    0.753529           Bad Religion   
4  NMLi17kclJbJxQjs    0.746089  The Pigeon Detectives   

                               Song  
1                Stuff Is Messed Up  
2                      Victim Of Me  
3                 Markovian Process  
4                       I Found Out  

Top 5 Similar Tracks:
           Track ID  Similarity                    Artist  \
0  GDGURAgKxNPEuXzM    1.000000            Monster Magnet   
1  N3lLuewmtmcilyLd    0.942433                 Motörhead   
2  8ikd0gLo4sAMaBnE    0.939286  Unknown Mortal Orchestra   
3  RRZEwA4gez2F70f7    0.932766           Omnium Gatherum   
4  hXg4xQvc1sgGkw1B    0.930391             Paradise Lost   

                               Song  
1         

### Audio-based (similarity, i-vectors)

In [7]:
# No prompt here: num_top_similar must already be defined by an earlier cell.

# i-vector features (256 variant)
retrieve_top_similar_tracks(
    query_track_id="GDGURAgKxNPEuXzM",
    features_data=ivec256_data,
    feature_columns=ivec256_data.columns[1:],
    information_data=information_data,
    num_top_similar=num_top_similar,
)

# i-vector features (512 variant)
retrieve_top_similar_tracks(
    query_track_id="GDGURAgKxNPEuXzM",
    features_data=ivec512_data,
    feature_columns=ivec512_data.columns[1:],
    information_data=information_data,
    num_top_similar=num_top_similar,
)


Top 5 Similar Tracks:
           Track ID  Similarity           Artist  \
0  GDGURAgKxNPEuXzM    1.000000   Monster Magnet   
1  DNjUvcL366eWefpl    0.427675        Sepultura   
2  JT4j4AEG4zHM5Fjv    0.397676         Mudvayne   
3  gurWvetupCQwYE83    0.380437  Jeff Rosenstock   
4  v9b0jR48vKbO7HZa    0.376609     Manfred Mann   

                               Song  
1                      Straighthate  
2                A Cinderella Story  
3                  Darkness Records  
4                   Pretty Flamingo  

Top 5 Similar Tracks:
           Track ID  Similarity          Artist  \
0  GDGURAgKxNPEuXzM    1.000000  Monster Magnet   
1  ssLA8i7a6EryN9XM    0.310392          Thalía   
2  JT4j4AEG4zHM5Fjv    0.295929        Mudvayne   
3  DNjUvcL366eWefpl    0.281713       Sepultura   
4  v9b0jR48vKbO7HZa    0.277520    Manfred Mann   

                               Song  
1             No Puedo Vivir Sin Ti  
2                A Cinderella Story  
3                      Straigh

### Audio-based (similarity, musicnn)

In [8]:
# No prompt here: num_top_similar must already be defined by an earlier cell.

# musicnn features
retrieve_top_similar_tracks(
    query_track_id="GDGURAgKxNPEuXzM",
    features_data=musicnn_data,
    feature_columns=musicnn_data.columns[1:],
    information_data=information_data,
    num_top_similar=num_top_similar,
)


Top 5 Similar Tracks:
           Track ID  Similarity          Artist  \
0  GDGURAgKxNPEuXzM    1.000000  Monster Magnet   
1  1fmfftKuLUstbtui    0.996261     Iron Maiden   
2  qwGkARjoeH6k0JDy    0.995610         Ramones   
3  UOuWHCEIps7Mjz3E    0.995505           Oasis   
4  1X00fAREh70517ij    0.995283    Running Wild   

                               Song  
1          When the Wild Wind Blows  
2                  Carbona Not Glue  
3                       Shakermaker  
4                   Bad to the Bone  


### Evaluation