In [7]:
import os
import subprocess

from pydub import AudioSegment
import pandas as pd
from tqdm import tqdm
import librosa
import numpy as np
import simpleaudio as sa
import soundfile as sf

import pandas as pd
from tqdm import tqdm

In [8]:
ANNOTATOR_NAME = 'annotator_1'

base_path = '/dir_to_wav/'
save_path = f'/dir_to_wav/segments/{ANNOTATOR_NAME}'

annot = pd.read_csv(f'/AudioData/{ANNOTATOR_NAME}_audio.csv', sep=";")

#20 movies 
movies = ['tt0097576', 'tt0108160', 'tt0109830', 'tt0110912', 'tt0119822',
       'tt0120338', 'tt0375679', 'tt0467406', 'tt0822832', 'tt0970416',
       'tt1045658', 'tt1142988', 'tt1193138', 'tt1285016', 'tt1454029',
       'tt1568346', 'tt1570728', 'tt1632708', 'tt1907668', 'tt2267998']

annotator = annot[annot['imdb_key'].isin(movies)]

DATA = annotator.copy()

In [9]:
DATA["speech_index"] = DATA.apply(lambda row: f"{row['imdb_key']}_{str(row['speech_index']).split('_')[-1]}", axis=1)

In [10]:
DATA['imdb_key'].unique()

array(['tt0097576', 'tt0108160', 'tt0109830', 'tt0110912', 'tt0119822',
       'tt0120338', 'tt0375679', 'tt0467406', 'tt0822832', 'tt0970416',
       'tt1045658', 'tt1142988', 'tt1193138', 'tt1285016', 'tt1454029',
       'tt1568346', 'tt1570728', 'tt1632708', 'tt1907668', 'tt2267998'],
      dtype=object)

In [11]:
def convert_to_mono(file_path):
    mono_file_path = file_path.replace('.wav', '_mono.wav')
    try:
        subprocess.run(['ffmpeg', '-i', file_path, '-ac', '1', mono_file_path], check=True)
        os.replace(mono_file_path, file_path)  #replace original file with mono version
        print(f"Converted {file_path} to mono")
    except subprocess.CalledProcessError as e:
        print(f"Failed to convert {file_path}: {e}")

def process_wav_files(directory):
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith('.wav'):
                file_path = os.path.join(root, file)
                convert_to_mono(file_path)

In [12]:
def load_audio(file_path):
    y, sr = librosa.load(file_path, sr=None)  
    return y, sr

In [13]:
segment_paths = []
labels = []
start_frames = []
end_frames = []

for movie in movies: 
    file_path = os.path.join(base_path, f"{movie}.wav")
    audio_data, sr = load_audio(file_path)

    audio_length = len(audio_data)

    for index, row in tqdm(DATA.iterrows(), total=len(DATA)): 
        if row['imdb_key'] == movie:
            start_sec = row['start_frame'] / row['framerate']
            end_sec = row['end_frame'] / row['framerate']


            
            start_sample = int(start_sec * sr)
            end_sample = int(end_sec * sr)
            
            segment = audio_data[start_sample:end_sample]
    
            
            segment_file_name = f"{row['speech_index']}_{row['annotator']}.wav"
            save_path_movie = os.path.join(save_path, row['imdb_key'])
            
            if not os.path.exists(save_path_movie):
                os.makedirs(save_path_movie)
            
            segment_file_path = os.path.join(save_path_movie, segment_file_name)
            sf.write(segment_file_path, segment, sr, format='WAV')
            
            segment_paths.append(segment_file_path)
            labels.append(row['label_audio'])
            start_frames.append(row['start_frame'])
            end_frames.append(row['end_frame'])

    segments_df = pd.DataFrame({
        'segment_path': segment_paths,
        'label': labels,
        'annotation_start_frame': start_frames,
        'annotation_end_frame': end_frames
    })
    
    segments_df.to_csv(os.path.join(save_path, f'audio_segments_{ANNOTATOR_NAME}.csv'), index=False)
    
    print("Audio segments and labels have been saved.")


100%|█████████████████████████████████████| 3718/3718 [00:02<00:00, 1845.01it/s]


Audio segments and labels have been saved.


100%|█████████████████████████████████████| 3718/3718 [00:02<00:00, 1415.11it/s]


Audio segments and labels have been saved.


100%|█████████████████████████████████████| 3718/3718 [00:03<00:00, 1120.38it/s]


Audio segments and labels have been saved.


100%|█████████████████████████████████████| 3718/3718 [00:02<00:00, 1307.28it/s]


Audio segments and labels have been saved.


100%|█████████████████████████████████████| 3718/3718 [00:02<00:00, 1261.49it/s]


Audio segments and labels have been saved.


100%|█████████████████████████████████████| 3718/3718 [00:02<00:00, 1257.92it/s]


Audio segments and labels have been saved.


100%|█████████████████████████████████████| 3718/3718 [00:01<00:00, 2141.62it/s]


Audio segments and labels have been saved.


100%|█████████████████████████████████████| 3718/3718 [00:01<00:00, 3078.51it/s]


Audio segments and labels have been saved.


100%|█████████████████████████████████████| 3718/3718 [00:02<00:00, 1329.84it/s]


Audio segments and labels have been saved.


100%|█████████████████████████████████████| 3718/3718 [00:01<00:00, 2009.40it/s]


Audio segments and labels have been saved.


100%|█████████████████████████████████████| 3718/3718 [00:02<00:00, 1360.91it/s]


Audio segments and labels have been saved.


100%|█████████████████████████████████████| 3718/3718 [00:02<00:00, 1601.56it/s]


Audio segments and labels have been saved.


100%|█████████████████████████████████████| 3718/3718 [00:02<00:00, 1526.78it/s]


Audio segments and labels have been saved.


100%|█████████████████████████████████████| 3718/3718 [00:02<00:00, 1581.19it/s]


Audio segments and labels have been saved.


100%|██████████████████████████████████████| 3718/3718 [00:03<00:00, 999.10it/s]


Audio segments and labels have been saved.


100%|█████████████████████████████████████| 3718/3718 [00:02<00:00, 1419.97it/s]


Audio segments and labels have been saved.


100%|█████████████████████████████████████| 3718/3718 [00:02<00:00, 1240.93it/s]


Audio segments and labels have been saved.


100%|█████████████████████████████████████| 3718/3718 [00:02<00:00, 1613.73it/s]


Audio segments and labels have been saved.


100%|█████████████████████████████████████| 3718/3718 [00:03<00:00, 1206.42it/s]


Audio segments and labels have been saved.


100%|█████████████████████████████████████| 3718/3718 [00:02<00:00, 1266.13it/s]

Audio segments and labels have been saved.



