In [9]:
# This code is to make *.csv dataset for LSTF from long wav files


import os
import numpy as np
import librosa
import csv
import datetime
import progressbar

SR=22050


def convert_audio_to_csv(audio_file, output_file):
    # Load the audio file
    SR = 22050  # Assuming a sample rate of 22050 (replace with your desired value)
    y, sr = librosa.load(audio_file, sr=SR, duration=10)

    # Create a progress bar
    prog_bar = progressbar.ProgressBar(maxval=len(y))

    # Convert the audio to a CSV file
    with open(output_file, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')

        # Write the header
        writer.writerow(['timeframe', 'frame_number', 'data'])
        prog_bar.start()

        # Write the audio data to the CSV file
        for i in range(len(y)):
            timeframe = i / sr
            frame_number = i
            time_struct = datetime.timedelta(seconds=timeframe)

            # Extract hours, minutes, and seconds directly from the timedelta object
            hours, remainder = divmod(time_struct.seconds, 3600)
            minutes, seconds = divmod(remainder, 60)
            formatted_timeframe = f"{hours:02d}:{minutes:02d}:{seconds:02d}"

            writer.writerow([formatted_timeframe, frame_number, y[i]])
            prog_bar.update(i + 1)  # Update the progress bar with the current index
        
        prog_bar.finish()

            
            
# Settings
audio_y_file = '/anc/data0213/audio_y.wav' # reverse audio of x2
audio_x1_file = '/anc/data0213/audio_x1.wav' # upstream recorded
audio_x2_file = '/anc/data0213/audio_x2.wav' # downstream recorded
event_file = '/hifi-gan/yolo7_detection/detection_results.txt' # event file(txt file). shape is (: , 4)
output_dir = '/LTSF-Linear/dataset'
dataset_csv = '/LTSF-Linear/dataset/audio_x2.csv'
teacher_dir = '/hifi-gan/yolo7_detection/noise_speech_data/teacher' # for fine tuning
min_length_frames = SR*2 ## 2sec
#total_length_frames = 215649000 ## 9780초

# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Load the wav file

convert_audio_to_csv(audio_x2_file, dataset_csv)




100% |########################################################################|


In [None]:
'''

fps = 30


audio_y, sr = librosa.load(audio_y_file, sr=SR)
print('dimension of audio y :', audio_y.shape[0])
print('time of audio(sec) : ', audio_y.shape[0]/SR)
print('time of audio(min) : ', audio_y.shape[0]/SR/60)

# Load the text file with specified columns and replace "_" in first column
data = np.genfromtxt(event_file, delimiter=',', dtype=None, encoding=None)

# Remove "_" from first column and convert to integer
data['f0'] = np.char.replace(data['f0'], "_", "").astype(int)
print('dimension of text data :', data.shape)
print(data[0:5])
'''

In [5]:
import torch

# Loop through the audio file with a step of segment_duration_s
segment_duration_s = 2  # 2 seconds
audio_duration_s = len(audio_x1) / SR

# From config_v3.json, Hifi-GAN
n_fft = 1024
hop_size = 256
win_size = 1024
num_mels = 80
sampling_rate = SR
fmin = 0
fmax = 8000

# Create a list to store the metadata for each segment
metadata = []

for i in range(0, int(audio_duration_s - 100), segment_duration_s):
    # Extract the segment
    start_sample = i * SR
    end_sample = (i + segment_duration_s) * SR
    segment = audio_x1[start_sample:end_sample]
    
    # for Fine tuning .npy
    segment_y = audio_y[start_sample:end_sample]
    
    # From meldataset.py
    segment_y = torch.FloatTensor(segment_y)
    segment_y = segment_y.unsqueeze(0)
    
    mel_spec = mel_spectrogram(segment_y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False)

    #### 이전 버전 작업인데 차원이 맞지 않아서, Hifi-GAN에 있는 함수를 쓰기로.
    #mel_spec = librosa.feature.melspectrogram(y=segment_y, sr=SR, n_mels=80, hop_length=256, n_fft=1024)
    #mel_spec = librosa.power_to_db(mel_spec, ref=np.max)
    
    # Save the segment
    segment_name = f"x1_segment_{i // segment_duration_s}.wav"
    segment_name_meta = f"x1_segment_{i // segment_duration_s}"
    sf.write(segment_name, segment, SR)
    
    #파일 이름이 x1(입력과 동일한)으로 되어야 fine tuning이 동작됨.
    segment_name_y = f"x1_segment_{i // segment_duration_s}.npy"
    np.save(segment_name_y, mel_spec)
    
    if i & 100 == 0 :
        print (f'progress on making training file until {segment_name}...') 
    
    # Find the corresponding rows in the text file
    start_time = i  # convert seconds to frames
    end_time = i + segment_duration_s  # convert seconds to frames
    frame_start = start_time * fps  # convert time to frames
    frame_end = end_time * fps  # convert time to frames

    # Select rows where the frame number is within the start and end frames
    segment_metadata = data[(data['f0'].astype(int) >= frame_start) & (data['f0'].astype(int) < frame_end)]

    # Keep only the columns you want (excluding the second and ninth)
    segment_metadata = segment_metadata[['f2', 'f3', 'f4', 'f5', 'f6', 'f7']]
    metadata_line = segment_name_meta
    
    # Add the segment metadata to the list
    print('Start to making metadata..')
    for j, row in enumerate(segment_metadata):
        metadata_row = ', '.join(map(str, row))
        if j == 0:
            metadata_line += "|" + metadata_row
        else:
            metadata_line += ", " + metadata_row
    metadata.append(metadata_line)


with open("metadata.txt", 'w') as f:
    for line in metadata:
        f.write(line + "\n")
        
print('Noise speech data has been made.')

write the file until x1_segment_0.wav
write the file until x1_segment_1.wav
write the file until x1_segment_2.wav
write the file until x1_segment_3.wav
write the file until x1_segment_4.wav
write the file until x1_segment_5.wav
write the file until x1_segment_6.wav
write the file until x1_segment_7.wav
write the file until x1_segment_8.wav
write the file until x1_segment_9.wav
write the file until x1_segment_10.wav
write the file until x1_segment_11.wav
write the file until x1_segment_12.wav
write the file until x1_segment_13.wav
write the file until x1_segment_14.wav
write the file until x1_segment_15.wav
write the file until x1_segment_16.wav
write the file until x1_segment_17.wav
write the file until x1_segment_18.wav
write the file until x1_segment_19.wav
write the file until x1_segment_20.wav
write the file until x1_segment_21.wav
write the file until x1_segment_22.wav
write the file until x1_segment_23.wav
write the file until x1_segment_24.wav
write the file until x1_segment_25.

In [6]:
# To Split train and validation data

# Set the random seed
np.random.seed(42)

# Read the metadata file
with open('metadata.txt', 'r') as f:
    lines = [line.strip() for line in f.readlines()]

# Shuffle the lines
np.random.shuffle(lines)

# Calculate the index for splitting the data
split_idx = int(0.8 * len(lines))
split_idx2 = int(0.9 * len(lines))

# Split the data into training and validation sets
train_lines = lines[:split_idx]
valid_lines = lines[split_idx:split_idx2]
test_lines = lines[split_idx2:]

# Write the training data to a file
with open('yolo7_detection/noise_speech_data/training.txt', 'w') as f:
    for line in train_lines:
        f.write(line + '\n')

# Write the validation data to a file
with open('yolo7_detection/noise_speech_data/validation.txt', 'w') as f:
    for line in valid_lines:
        f.write(line + '\n')
        
with open('yolo7_detection/noise_speech_data/test.txt', 'w') as f:
    for line in test_lines:
        f.write(line + '\n')

print('train, validation and test data has been made.')

train, validation and test data has been made.
