## 导入库并构建文件结构

In [1]:
from functions import separate_vocal, apply_so_vits, fuse_vocal_and_instrumental, convert_ncm, Path
from resource_manager import get_data_from_source

building file structures...done


## 定义路径

In [4]:
# Input track to process, relative to the notebook's working directory.
SONG_PATH = Path("./demo_assets/escape.wav")
# Per-song output location: named after the input file, extension included
# (i.e. "../files/output/escape.wav").
OUT_PATH = Path("../files/output/") / SONG_PATH.name

## 下载模型(来源:Hugging Face,下载可能较慢,请耐心等待)

In [5]:
# Fetch the so-vits "genshin" model bundle and the demucs "hdemucs_mmi" model.
# model_so_vits is later indexed by file name to pick individual files.
# NOTE(review): update_cache=True presumably refreshes any cached copy on
# every run — confirm against resource_manager.get_data_from_source.
model_so_vits = get_data_from_source(
    "so-vits",
    "model",
    "genshin",
    update_cache=True,
)
model_demucs = get_data_from_source(
    "demucs",
    "model",
    "hdemucs_mmi",
    update_cache=True,
)

Fetching 28 files:   0%|          | 0/28 [00:00<?, ?it/s]

Downloading hdemucs_mmi.yaml (33 bytes)
hdemucs_mmi.yaml downloaded
Downloading 75fc33f5-1941ce65.th (167407275 bytes)
75fc33f5-1941ce65.th downloaded


## 利用demucs分离音频

In [6]:
# Run the separation step on SONG_PATH; the recorded output shows the tracks
# being stored under OUT_PATH.
# The result is indexed with "vocal" and "instrumental" by the later cells.
# NOTE(review): the recorded output ends with an "already exists. Overwrite? [y/N]"
# prompt, so re-running with existing outputs may wait for input.
separated_path = separate_vocal(SONG_PATH, OUT_PATH)
print(separated_path)

Selected model is a bag of 1 models. You will see that many progress bars per track.
Separated tracks will be stored in /Users/ayano/Documents/developing/AI/song_generating_pack/files/output/escape.wav
Separating track /Users/ayano/Documents/developing/AI/song_generating_pack/src/demo_assets/escape.wav


100%|████████████████████████████████████████████████████████████████████████| 337.5/337.5 [01:09<00:00,  4.87seconds/s]


converting file to mp3...
done


File '/Users/ayano/Documents/developing/AI/song_generating_pack/files/output/escape.wav/vocals.mp3' already exists. Overwrite? [y/N] 

## 利用so-vits处理音频

In [8]:
# Convert the separated vocal track with so-vits using the "nahida" speaker.
# The checkpoint, config and cluster files are looked up by file name in the
# so-vits bundle fetched earlier.
counterfeited_path = apply_so_vits(
    separated_path["vocal"],
    output_path=OUT_PATH,
    model_path=model_so_vits["nahida_jp_G_40000.pth"],
    config_file_path=model_so_vits["nahida.json"],
    cluster=model_so_vits["nahida_jp_kmeans_10000.pt"],
    auto_predict_f0=False,
    speaker="nahida",
)
print(counterfeited_path)



[20:09:17] Decoder type: hifi-gan
[20:09:18] Loaded checkpoint '/Users/ayano/Documents/developing/AI/song_generating_pack/files/models/so-vits/genshin/nahida_jp_G_40000.pth' (epoch 378)
[20:09:18] Chunk: Chunk(Speech: True, 1764000.0)


44100 50.0
0.5


[20:09:20] F0 inference time:       1.538s, RTF: 0.038

[20:09:21] HuBERT inference time  : 1.460s, RTF: 0.036
[20:09:31] Inferece time: 10.21s, RTF: 0.25
[20:09:31] Chunk: Chunk(Speech: True, 1764000.0)
[20:09:32] F0 inference time:       0.523s, RTF: 0.013
[20:09:33] HuBERT inference time  : 1.484s, RTF: 0.036
[20:09:43] Inferece time: 10.15s, RTF: 0.25
[20:09:43] Chunk: Chunk(Speech: True, 1764000.0)
[20:09:44] F0 inference time:       0.513s, RTF: 0.013
[20:09:45] HuBERT inference time  : 1.512s, RTF: 0.037
[20:09:56] Inferece time: 10.19s, RTF: 0.25
[20:09:56] Chunk: Chunk(Speech: True, 1764000.0)
[20:09:56] F0 inference time:       0.512s, RTF: 0.012
[20:09:58] HuBERT inference time  : 1.445s, RTF: 0.035
[20:10:08] Inferece time: 10.12s, RTF: 0.25
[20:10:08] Chunk: Chunk(Speech: True, 1764000.0)
[20:10:08] F0 inference time:       0.566s, RTF: 0.014
[20:10:10] HuBERT inference time  : 1.566s, RTF: 0.038
[20:10:20] Inferece time: 10.18s, RTF: 0.25
[20:10:20] Chunk: Chunk(Speech: T

## 合并人声与伴奏

In [11]:
# Combine the converted vocal with the instrumental track from the separation
# step and print the path returned for the fused result.
output = fuse_vocal_and_instrumental(
    vocal_path=counterfeited_path,
    instrumental_path=separated_path["instrumental"],
    output_path=OUT_PATH,
    speaker="nahida",
)
print(output)

../files/output/escape.wav/voice_generated_with_nahida_counterfeited_from_nahida.wav
