In [None]:
# Colab-only: mount Google Drive at /content/drive so later cells can read and write files there.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install --find-links https://download.pytorch.org/whl/torch_stable.html wandb einops>=0.4 vector-quantize-pytorch>=0.10.15 librosa==0.10.0 torchlibrosa==0.1.0 ftfy tqdm transformers encodec==0.1.1 gdown accelerate>=0.24.0 beartype joblib h5py scikit-learn wget

In [None]:
import os
# Work from the project directory on the mounted drive; presumably this is what lets the
# later `open_musiclm` / `scripts` imports and the relative checkpoint/config names resolve.
# NOTE(review): every other Drive path in this notebook is under /content/drive/MyDrive/...;
# confirm that this directory (without "MyDrive") actually exists.
os.chdir("/content/drive/open_musiclm")

In [None]:
import os
import sys
from pathlib import Path

import torch
import torchaudio
from einops import rearrange
from torchaudio.functional import resample
from open_musiclm.config import (create_clap_quantized_from_config,
                                 create_coarse_transformer_from_config,
                                 create_semcoarsetosem_transformer_from_config,
                                 create_encodec_from_config,
                                 create_hubert_kmeans_from_config,
                                 my_load_model_config,load_model_config)
from open_musiclm.open_musiclm import (SemcoarsetosemStage,CoarseStage,
                                       get_or_compute_clap_token_ids,
                                       get_or_compute_acoustic_token_ids,
                                       get_or_compute_semantic_token_ids)
from open_musiclm.utils import int16_to_float32, float32_to_int16, zero_mean_unit_var_norm
from scripts.train_utils import disable_print


In [None]:
def make_file_name(file_string):
  """Build an output stem of the form "<description>_<number>" from an audio path.

  The description is the path component two levels above the file, with spaces
  turned into underscores. The number is whatever follows the last "_" in the
  file name, cut off at its first ".".

  Args:
    file_string: Path-like value; it is converted with ``str`` and split on "/".

  Returns:
    The combined name, without directory or extension.

  Raises:
    IndexError: If the path has fewer than three "/"-separated components.
  """
  components = str(file_string).split("/")

  # Last "_"-separated piece of the file name, truncated at its first dot.
  trailing_piece = components[-1].rsplit("_", 1)[-1]
  number = trailing_piece.partition(".")[0]

  # Third component from the end, with every space replaced by "_".
  description = "_".join(components[-3].split(" "))
  return f"{description}_{number}"



In [None]:
import random
folder = "/content/drive/MyDrive/data/instrument/train/instrument_train"
path = Path(folder)
# Collect every .wav that sits in a "vocals" directory below the dataset root.
# glob() yields entries in filesystem order, so sort to get the same starting
# order on every run.
train_files = sorted(path.glob('**/vocals/*.wav'))
print(len(train_files))

# Copy the list so that shuffling does not also reorder `train_files`.
all_files = list(train_files)
print("Total files found:", len(all_files))

# Shuffle with a dedicated, seeded generator: the processing order is reproducible
# and the global `random` state is left untouched.
random.Random(42).shuffle(all_files)
# Show only a short preview instead of dumping the whole list into the cell output.
print(all_files[:5])

In [None]:
# Set up the first generation stage (SemcoarsetosemStage), which the generation loop
# later calls with vocal semantic/coarse token ids.
# The checkpoint and config names are relative, so they resolve against the current
# working directory.
my_model_path="real_semcoarsetosem.transformer.5170.pt"
my_model_config="my_musiclm_for_semcoarsetosem.json"
# From here on `my_model_config` is the loaded config object, no longer the file name.
my_model_config = my_load_model_config(my_model_config)

# coarse_path = "/content/drive/MyDrive/my_code/mymusiclm/my-open-musiclm-main/explorer/wandb/run-20240223_102209-e01qb72w/files/coarse_generation_test.transformer.900.pt"
# Weights for the CLAP quantizer and the k-means model; both paths are reused by the
# coarse-stage cell further down.
rvq_path="/content/drive/MyDrive/data/weight/clap.rvq.950_no_fusion.pt"
kmeans_path="/content/drive/MyDrive/data/weight/kmeans_10s_no_fusion.joblib"
# Seed torch's global RNG before any model is constructed.
seed = 42
torch.manual_seed(seed)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# NOTE(review): `clap`, `wav2vec` and `encodec_wrapper` are assigned again in the
# coarse-stage cell from a different config. `clap` is not passed to the stage built
# here, so this first instance looks unused - confirm whether it can be dropped.
clap = create_clap_quantized_from_config(my_model_config, rvq_path, device)
wav2vec = create_hubert_kmeans_from_config(my_model_config, kmeans_path, device)
encodec_wrapper = create_encodec_from_config(my_model_config, device)
semcoarsetosem_transformer = create_semcoarsetosem_transformer_from_config(my_model_config,my_model_path, device)

# The stage keeps references to the wav2vec/encodec objects created above, so rebinding
# those names later does not change what this stage holds.
semcoarsetosem_stage = SemcoarsetosemStage(
    semcoarsetosem_transformer=semcoarsetosem_transformer,
    neural_codec=encodec_wrapper,
    wav2vec=wav2vec,
)

In [None]:
# Set up the second generation stage (CoarseStage), which the generation loop calls with
# reconstruct_wave=True to obtain a waveform.
model_config="/content/drive/MyDrive/my_code/mymusiclm/open-musiclm-main/configs/model/musiclm_large_small_context.json"
# From here on `model_config` is the loaded config object, no longer the path string.
model_config = load_model_config(model_config)
# The generation loop passes duration*75 as max_time_steps to the coarse stage
# (presumably seconds of audio at 75 steps per second - TODO confirm).
duration=5
# Output directory for generated and mixed audio; created if missing.
results_folder = "/content/drive/MyDrive/my_code/mymusiclm/my-open-musiclm-main/explorer/music_generation"
Path(results_folder).mkdir(parents=True, exist_ok=True)
coarse_path="/content/drive/MyDrive/data/weight/coarse.transformer.18000.pt"
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# These rebind `clap`, `wav2vec` and `encodec_wrapper`, this time built from `model_config`.
# `rvq_path` and `kmeans_path` must already be defined by the earlier setup cell.
clap = create_clap_quantized_from_config(model_config,rvq_path, device)
wav2vec = create_hubert_kmeans_from_config(model_config, kmeans_path, device)
encodec_wrapper = create_encodec_from_config(model_config, device)
coarse_transformer = create_coarse_transformer_from_config(model_config, coarse_path, device)
# Re-seed torch's global RNG after the models have been created.
torch.manual_seed(42)
coarse_stage = CoarseStage(
        coarse_transformer=coarse_transformer,
        neural_codec=encodec_wrapper,
        wav2vec=wav2vec,
        clap=clap
)
# A single text prompt, converted to CLAP token ids once and reused for every file.
# NOTE(review): the two None arguments are passed positionally; check the helper's
# signature for what they stand for.
text=["Diverse kinds of instrument and richness"]
clap_token_ids = get_or_compute_clap_token_ids(None, clap, None, text)

In [None]:
import librosa
import numpy as np
import soundfile as sf
import torchaudio
import torch
from torchaudio.functional import resample

def int16_to_float32(x):
    """Divide a tensor by 32767 and return the result as ``torch.float32``.

    NOTE(review): this redefines the helper of the same name imported from
    ``open_musiclm.utils`` earlier in the notebook.

    Args:
        x: Tensor of sample values.

    Returns:
        ``x / 32767.0`` cast to ``torch.float32``.
    """
    scaled = x / 32767.0
    return scaled.to(torch.float32)

def float32_to_int16(x):
    """Clamp a tensor to [-1, 1], scale it by 32767 and cast it to ``torch.int16``.

    NOTE(review): this redefines the helper of the same name imported from
    ``open_musiclm.utils`` earlier in the notebook.

    Args:
        x: Tensor of floating-point sample values.

    Returns:
        The clamped, scaled values as a ``torch.int16`` tensor.
    """
    clipped = x.clamp(min=-1., max=1.)
    scaled = clipped * 32767.
    return scaled.to(torch.int16)

def zero_mean_unit_var_norm(x):
    """Normalize along the last dimension: subtract the mean, divide by sqrt(var + 1e-7).

    NOTE(review): this redefines the helper of the same name imported from
    ``open_musiclm.utils`` earlier in the notebook.

    Args:
        x: Floating-point tensor; statistics are taken over ``dim=-1``.

    Returns:
        Tensor of the same shape as ``x``.
    """
    centered = x - x.mean(dim=-1, keepdim=True)
    # The small constant keeps the divisor non-zero when the variance is 0.
    spread = torch.sqrt(x.var(dim=-1, keepdim=True) + 1e-7)
    return centered / spread

def my_linear_mixing(audio1, audio2, output_file):
    """Mix two audio files, the first at half gain, and write the result to ``output_file``.

    Both files are read with torchaudio and averaged over dimension 0 when that
    dimension is larger than 1. The second signal is resampled to the first file's
    sample rate and passed through a float -> int16 -> float round trip. Both are
    then cut to the shorter length and combined as ``audio1 * 0.5 + audio2``.

    Args:
        audio1: Path of the first file; its sample rate is used for the output.
        audio2: Path of the second file.
        output_file: Destination path handed to ``sf.write``.
    """
    def _to_single_channel(wave):
        # Average over dim 0 only when it holds more than one entry.
        return wave.mean(dim=0, keepdim=True) if wave.shape[0] > 1 else wave

    # Load both inputs.
    first, first_sr = torchaudio.load(audio1)
    second, second_sr = torchaudio.load(audio2)
    print("sr1",first_sr,"sr2,",second_sr)

    second = _to_single_channel(second)
    first = _to_single_channel(first)

    # Bring the second signal to the first one's rate, then quantize it via int16.
    second = resample(second, second_sr, first_sr)
    second = int16_to_float32(float32_to_int16(second))

    # Align lengths and mix linearly.
    shared_len = min(first.shape[1], second.shape[1])
    mixed_audio = (first[:, :shared_len] * 0.5 + second[:, :shared_len]).squeeze()

    # Store the result.
    sf.write(output_file, mixed_audio, first_sr)
    print(f"{output_file} 에 믹스 파일 저장함")

def linear_mixing(audio1, audio2, output_file):
    """Mix two audio files as ``audio1 * 0.8 + audio2`` and write the result.

    Both files are loaded with librosa at their native sample rate. If the rates
    differ, the second signal is resampled to the first file's rate before mixing;
    adding the raw samples of signals with different rates would play one of them
    at the wrong speed and pitch. The signals are then cut to the shorter length.

    Args:
        audio1: Path of the first file; its sample rate is used for the output.
        audio2: Path of the second file.
        output_file: Destination path handed to ``sf.write``.
    """
    # Load both inputs at their native rates.
    y1, sr1 = librosa.load(audio1, sr=None)
    y2, sr2 = librosa.load(audio2, sr=None)

    # Put both signals on the same time base before they are added sample by sample.
    if sr2 != sr1:
        y2 = librosa.resample(y2, orig_sr=sr2, target_sr=sr1)

    # Align lengths.
    min_length = min(len(y1), len(y2))
    y1 = y1[:min_length]
    y2 = y2[:min_length]

    # Mix linearly and store the result.
    mixed_audio = y1 * 0.8 + y2
    sf.write(output_file, mixed_audio, sr1)


In [None]:
# For every vocal file: tokenize the vocals, generate instrument semantic tokens with the
# semcoarsetosem stage, turn them into a waveform with the coarse stage, save it, and
# write two mixes (ground-truth instrument + vocals, generated instrument + vocals).
cnt=0
for audio_path in all_files:
    cnt+=1
    name=make_file_name(audio_path)
    data, sample_hz = torchaudio.load(audio_path)

    # Average over dim 0 when the file has more than one entry there.
    if data.shape[0] > 1:
        data = torch.mean(data, dim=0).unsqueeze(0)

    # Normalization statistics are computed over the full clip; only afterwards are both
    # versions cut to the first 10 seconds.
    target_length = int(10 * sample_hz)
    normalized_data = zero_mean_unit_var_norm(data)

    data = data[:, :target_length]
    normalized_data = normalized_data[: , :target_length]
    audio_for_encodec = resample(data, sample_hz, encodec_wrapper.sample_rate)
    audio_for_wav2vec = resample(normalized_data, sample_hz, wav2vec.target_sample_hz)

    # Round-trip through int16 before moving the audio to the model device.
    audio_for_encodec = int16_to_float32(float32_to_int16(audio_for_encodec)).to(device)
    audio_for_wav2vec = int16_to_float32(float32_to_int16(audio_for_wav2vec)).to(device)
    vocals_semantic_token_ids = get_or_compute_semantic_token_ids(None, audio_for_wav2vec, wav2vec)
    vocals_coarse_token_ids, _ = get_or_compute_acoustic_token_ids(None, None, audio_for_encodec, encodec_wrapper, model_config.global_cfg.num_coarse_quantizers)

    generated_inst_semantic_ids = semcoarsetosem_stage.generate(
        vocals_semantic_token_ids=vocals_semantic_token_ids,
        vocals_coarse_token_ids=vocals_coarse_token_ids,
        max_time_steps=200,
        temperature=0.90,
    )
    print(generated_inst_semantic_ids.shape)
    generated_wave = coarse_stage.generate(
        clap_token_ids=clap_token_ids,
        semantic_token_ids=generated_inst_semantic_ids.squeeze(2),
        coarse_token_ids=None,
        max_time_steps=duration*75,
        reconstruct_wave=True,
        include_eos_in_output=False,
        append_eos_to_conditioning_tokens=True,
        temperature=0.95,
    )

    generated_wave = rearrange(generated_wave, 'b n -> b 1 n').detach().cpu()

    # Every item of the batch is written to the same path, so with more than one item
    # only the last one would remain on disk.
    for i, wave in enumerate(generated_wave):
        torchaudio.save(f'{results_folder}/{name}.wav', wave, encodec_wrapper.sample_rate)
        print("=============================================================")
        print(f"{results_folder}/{name}.wav 에 최종 만듦\n\n\n")
    my_inst=f"{results_folder}/{name}.wav"
    # `audio_path` is a pathlib.Path, whose .replace() renames the file on disk and accepts
    # a single target argument. Do the text substitution on the string form instead.
    gt_inst=str(audio_path).replace("vocals","instrument")
    linear_mixing(gt_inst,audio_path,f"{results_folder}/{name}_mixed_gt.wav")
    my_linear_mixing(my_inst,audio_path,f"{results_folder}/{name}_mixed.wav")