In [None]:
!pip install msclap

In [None]:
from msclap import CLAP
# Build the shared CLAP model (2023 checkpoint) used by every later cell.
# The first run downloads the checkpoint and tokenizer files (see the progress
# bars below). use_cuda=True requests GPU execution.
# NOTE(review): behavior on a CPU-only runtime is not visible here - confirm against msclap.
clap_model = CLAP(version = '2023', use_cuda=True)

CLAP_weights_2023.pth:   0%|          | 0.00/690M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
!pip install librosa
!pip install pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [None]:
import torch
import numpy as np

# Candidate genre labels, embedded once as text prompts for CLAP.
class_list = ["blues", "classical", "country", "disco", "hiphop", "jazz", "metal", "pop", "reggae", "rock"]
text_embeddings = clap_model.get_text_embeddings(class_list)

# Tracks to classify, resolved to paths inside the music folder.
music_folder = "drive/MyDrive/music"
music_file_list = ["Too_Sweet.mp3", "i_like_the_way_you_kiss_me.mp3", "Beautiful_Things.mp3", "espresso.mp3", "good_luck_babe.mp3"]
music_path_list = [f"{music_folder}/{file_name}" for file_name in music_file_list]

# One similarity row per track, one column per genre label.
music_embeddings = clap_model.get_audio_embeddings(music_path_list)
similarities = clap_model.compute_similarity(music_embeddings, text_embeddings)

# Print the three best-matching genres for every track.
for row_index, track_similarity in enumerate(similarities):
    _, top_indices = torch.topk(track_similarity, 3)
    # "Too_Sweet.mp3" -> "too sweet"
    display_name = music_file_list[row_index].split(".")[0].lower().replace("_", " ")
    print("-"*10)
    print(display_name)
    for genre_index in top_indices:
        print(class_list[genre_index], end = " ")
    print()


----------
too sweet
disco hiphop country 
----------
i like the way you kiss me
disco country rock 
----------
beautiful things
country disco rock 
----------
espresso
disco country hiphop 
----------
good luck babe
disco country jazz 


In [None]:
import os
import torch
import librosa
import pprint
import json
import numpy as np
from pydub import AudioSegment

# Input locations: full-length tracks and their timestamped lyric files.
music_folder = "drive/MyDrive/music/"
lyrics_folder = "drive/MyDrive/lyrics/"
# music_file_list = ["Too_Sweet.mp3"]
music_file_list = ["Too_Sweet.mp3", "i_like_the_way_you_kiss_me.mp3", "Beautiful_Things.mp3", "espresso.mp3", "good_luck_babe.mp3"]
# Filled by extract_feature(): music name -> list of per-clip feature dicts.
music_features = {}

# Label sets for zero-shot tagging, each embedded once up front so the
# per-clip loop only has to embed the audio.
genre__list = ["blues", "classical", "country", "disco", "hiphop", "jazz", "metal", "pop", "reggae", "rock"]
# Embed this cell's own label list; relying on `class_list` from an earlier
# cell breaks when that cell has not been run in the current kernel.
genre_embeddings = clap_model.get_text_embeddings(genre__list)
speed__list = ["fast", "medium", "slow"]
speed_embeddings = clap_model.get_text_embeddings(speed__list)
emotion__list = ["happy", "sad", "calm", "angry", "fearful", "romantic", "inspirational", "reflective"]
emotion_embeddings = clap_model.get_text_embeddings(emotion__list)

def _best_label(similarity, labels):
    """Return the entry of ``labels`` with the highest score in ``similarity``."""
    _, top_index = torch.topk(similarity, 1)
    # Flatten so this works whether or not the scores carry a leading batch axis.
    return labels[int(top_index.reshape(-1)[0])]


def extract_feature(music_name, lyric_data):
    """Compute per-clip tags and audio statistics for one track.

    Every ``.mp3`` clip in the directory ``music_name`` is processed in order
    of the integer start time encoded at the beginning of its file name.

    Args:
        music_name: Directory holding the clips; also used as the key under
            which the results are stored in the global ``music_features``.
        lyric_data: Lyric texts, where ``lyric_data[i]`` belongs to the i-th
            clip in start-time order.

    Side effects:
        Sets ``music_features[music_name]`` to a list of per-clip dicts and
        prints a completion message.

    Raises:
        IndexError: If the directory holds more clips than ``lyric_data`` has
            entries.
        ValueError: If an ``.mp3`` file name does not start with an integer.
    """
    # Keep the real file name next to the parsed start time. Rebuilding the
    # name from the integer (e.g. "<int>.0.mp3") fails whenever a start time
    # is not a whole number of milliseconds.
    clip_files = sorted(
        (int(file_name.split(".")[0]), file_name)
        for file_name in os.listdir(music_name)
        if file_name.endswith(".mp3")
    )
    single_music_feature = []

    for i, (start_time, file_name) in enumerate(clip_files):
        music_path = f"{music_name}/{file_name}"
        music_data, sampling_rate = librosa.load(music_path)

        # Zero-shot tags: the label whose text embedding best matches the clip.
        music_embeddings = clap_model.get_audio_embeddings([music_path])
        genre = _best_label(
            clap_model.compute_similarity(music_embeddings, genre_embeddings), genre__list)
        speed = _best_label(
            clap_model.compute_similarity(music_embeddings, speed_embeddings), speed__list)
        emotion = _best_label(
            clap_model.compute_similarity(music_embeddings, emotion_embeddings), emotion__list)

        # Frame-wise features averaged over the clip. The sample rate is passed
        # explicitly so the frequency-based features match the loaded audio
        # instead of relying on librosa's default.
        rms = np.mean(librosa.feature.rms(y=music_data)[0])

        spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=music_data, sr=sampling_rate)[0])
        spectral_bandwidth = np.mean(librosa.feature.spectral_bandwidth(y=music_data, sr=sampling_rate)[0])
        # Row 0 only, as in the original analysis.
        spectral_contrast = np.mean(librosa.feature.spectral_contrast(y=music_data, sr=sampling_rate)[0])
        spectral_flatness = np.mean(librosa.feature.spectral_flatness(y=music_data)[0])
        spectral_rolloff = np.mean(librosa.feature.spectral_rolloff(y=music_data, sr=sampling_rate)[0])

        tempo = librosa.feature.tempo(y=music_data, sr=sampling_rate)[0]

        # Numeric features are stored as strings rounded to 3 decimals.
        clip_feature = {
            "start_time": start_time,
            "lyric": lyric_data[i],
            "genre": genre,
            "speed": speed,
            "emotion": emotion,
            "rms": str(round(rms, 3)),
            "spectral_centroid": str(round(spectral_centroid, 3)),
            "spectral_bandwidth": str(round(spectral_bandwidth, 3)),
            "spectral_contrast": str(round(spectral_contrast, 3)),
            "spectral_flatness": str(round(spectral_flatness, 3)),
            "spectral_rolloff": str(round(spectral_rolloff, 3)),
            "tempo": str(round(tempo, 3)),
        }
        single_music_feature.append(clip_feature)
    music_features[music_name] = single_music_feature
    print(f"music_name: {music_name} finished")

def split_music(music_name, music_path, time_data):
    """Cut one track into clips and save them as MP3 files.

    Args:
        music_name: Output directory for the clips (created if missing).
        music_path: Path of the full-length audio file to slice.
        time_data: Iterable of ``(start_time, end_time)`` pairs in
            milliseconds; each pair produces ``<music_name>/<start_time>.mp3``.
    """
    # exist_ok avoids the check-then-create race and also creates any missing
    # parent directories.
    os.makedirs(music_name, exist_ok=True)
    music = AudioSegment.from_file(music_path)
    for (start_time, end_time) in time_data:
        # The segment is sliced by the millisecond positions from time_data.
        clip = music[start_time: end_time]
        save_path = f"{music_name}/{start_time}.mp3"
        clip.export(save_path, format="mp3")

def get_time_ms(line):
    """Parse the leading ``[mm:ss.xx]`` timestamp of a lyric line.

    Args:
        line: A lyric line such as ``"[01:02.50]some words"``.

    Returns:
        The timestamp in milliseconds as a float holding a whole number
        (e.g. ``62500.0``).

    Raises:
        ValueError: If the minute or second field is not numeric.
        IndexError: If the timestamp contains no ``:`` separator.
    """
    time_str = line.split("]")[0].split("[")[-1]
    fields = time_str.split(":")
    time_minute = int(fields[0])
    time_second = float(fields[1])
    # Binary floating point can give results like 1000.9999999999999 for
    # "00:01.001". Rounding to whole milliseconds keeps the value, and any
    # file name derived from it, exact. The float type is kept for callers.
    time_ms = float(round(time_minute*60*1000 + time_second*1000))
    return time_ms

def process_lyric(music_name, music_path, lyric_path):
    """Read a timestamped lyric file and split the track into one clip per line.

    Each usable line looks like ``[mm:ss.xx]text``. A clip runs from its own
    timestamp to the next usable line's timestamp; the last clip runs to the
    end of the audio file.

    Args:
        music_name: Output directory for the clips (passed to ``split_music``).
        music_path: Path of the full-length audio file.
        lyric_path: Path of the lyric text file.

    Returns:
        The lyric texts without their timestamps, in file order, with exactly
        one entry per clip written.
    """
    # (start_ms, text) for every line that carries a parseable timestamp.
    timed_lines = []
    # Explicit encoding so lyric text is not read with a platform default.
    with open(lyric_path, 'r', encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # Blank lines separate stanzas; they do not end the file.
                continue
            _, bracket, text = line.partition(']')
            if not bracket:
                continue
            try:
                start_time_ms = get_time_ms(line)
            except (ValueError, IndexError):
                # Not a timed lyric line (e.g. a metadata tag): skip it so
                # the returned texts and the clips stay aligned.
                continue
            timed_lines.append((start_time_ms, text))

    time_data = []
    for i, (start_time_ms, _) in enumerate(timed_lines):
        if i + 1 < len(timed_lines):
            end_time_ms = timed_lines[i + 1][0]
        else:
            # The last lyric line runs to the end of the track.
            duration_ms = AudioSegment.from_file(music_path).duration_seconds*1000
            end_time_ms = round(duration_ms, 1)
        time_data.append((start_time_ms, end_time_ms))
    split_music(music_name, music_path, time_data)

    return [text for _, text in timed_lines]

# For every track: slice it by its lyric timestamps, then describe each clip.
for music_file in music_file_list:
    # "Too_Sweet.mp3" -> "too_sweet"; this names both the clip directory and
    # the lyric file.
    music_name = music_file.split(".")[0].lower()
    music_path = f"{music_folder}{music_file}"
    lyric_path = f"{lyrics_folder}{music_name}.txt"
    lyric_data = process_lyric(music_name, music_path, lyric_path)
    extract_feature(music_name, lyric_data)

# Persist the per-clip features collected for all tracks.
with open("music_features.json", 'w') as out_file:
    json.dump(music_features, out_file)

music_name: too_sweet finished
music_name: i_like_the_way_you_kiss_me finished
music_name: beautiful_things finished
music_name: espresso finished
music_name: good_luck_babe finished
