imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from moviepy.editor import VideoFileClip
import numpy as np
import librosa
from concurrent.futures import ThreadPoolExecutor
import os
import json
import re
import glob

Define the audio feature extraction helpers

In [None]:
# Place the raw video files in '../proc_csv/raw_videos' before running audio_proc()

def extract_audio_features(audio_path, index,savePath):
    """Summarise one audio file as time-averaged features and store them as JSON.

    The file is loaded with ``librosa.load(..., sr=None)`` and four feature
    matrices (MFCC, chroma STFT, spectral contrast, tonnetz) are computed.
    Each matrix is reduced with ``mean(axis=1)`` and written, as plain lists,
    to ``{savePath}/features_{index}.json``.

    Args:
        audio_path: Path of the audio file to analyse.
        index: Identifier embedded in the output file name (callers pass the
            DataFrame index so results can be re-aligned later).
        savePath: Directory that receives the JSON file.

    Returns:
        ``(index, feature_filename)`` on success, ``(index, None)`` if any
        step raised; the error is printed rather than propagated.
    """
    try:
        signal, sample_rate = librosa.load(audio_path, sr=None)

        # (JSON key, librosa extractor) pairs, in the order they are stored.
        extractors = (
            ('mfccs', librosa.feature.mfcc),
            ('chroma', librosa.feature.chroma_stft),
            ('spectral_contrast', librosa.feature.spectral_contrast),
            ('tonnetz', librosa.feature.tonnetz),
        )

        # Average every feature over axis 1 and convert to lists so the
        # result is JSON-serialisable.
        summary = {
            key: extractor(y=signal, sr=sample_rate).mean(axis=1).tolist()
            for key, extractor in extractors
        }

        # The index in the file name is what ties the features back to a row.
        feature_filename = f'{savePath}/features_{index}.json'
        with open(feature_filename, 'w') as handle:
            json.dump(summary, handle)

    except Exception as e:
        print(f"Error processing file {audio_path}: {e}")
        return index, None

    return index, feature_filename

def process_video_batch(start_index, end_index, df, save_path, raw_video_path):
    """Extract audio from a slice of clips and run feature extraction on each.

    For every row in ``df.iloc[start_index:end_index]`` the clip
    ``{raw_video_path}/{video_id}/{clip_id}.mp4`` is opened, its audio track is
    written to a temporary WAV under ``{save_path}/temp_audio`` and
    ``extract_audio_features`` is scheduled on a thread pool for that WAV.

    Temporary WAV files are deleted only after all scheduled extractions have
    finished; removing them right after ``submit`` would race with the worker
    thread that still has to read the file.

    Args:
        start_index: Positional start of the slice (inclusive).
        end_index: Positional end of the slice (exclusive).
        df: DataFrame with ``video_id`` and ``clip_id`` columns.
        save_path: Directory passed to ``extract_audio_features`` for the
            feature JSON files; the ``temp_audio`` directory is created
            beneath it.
        raw_video_path: Root directory containing the raw video clips.

    Returns:
        dict mapping the DataFrame index of each clip whose audio was
        extracted to that clip's duration. A clip is included even if its
        feature extraction later fails.
    """
    submitted = []  # (future, temp_audio_path) pairs, in submission order
    video_lengths = {}  # DataFrame index -> clip duration

    # Ensure the temp_audio directory exists
    temp_audio_dir = os.path.join(save_path, 'temp_audio')
    os.makedirs(temp_audio_dir, exist_ok=True)

    try:
        with ThreadPoolExecutor(max_workers=20) as executor:
            for index, row in df.iloc[start_index:end_index].iterrows():
                video_path = os.path.join(raw_video_path, f"{row['video_id']}/{row['clip_id']}.mp4")
                temp_audio_path = os.path.join(temp_audio_dir, f"{row['video_id']}_{row['clip_id']}.wav")

                try:
                    video_clip = VideoFileClip(video_path)
                    try:
                        # Read the duration while the clip is still open.
                        duration = video_clip.duration
                        video_clip.audio.write_audiofile(temp_audio_path)
                    finally:
                        # Release the reader even when the audio write fails.
                        video_clip.close()
                except Exception as e:
                    print(f"Error extracting audio from {video_path}: {e}")
                    # Nothing was scheduled for this clip, so a partially
                    # written WAV can be discarded immediately.
                    if os.path.exists(temp_audio_path):
                        os.remove(temp_audio_path)
                    continue

                future = executor.submit(extract_audio_features, temp_audio_path, index, save_path)
                submitted.append((future, temp_audio_path))

                # Store video length using the original DataFrame index
                video_lengths[index] = duration

        # Leaving the executor block waits for every worker to finish.
        for future, _ in submitted:
            index, feature_filename = future.result()
            if feature_filename:
                print(f"Features extracted and saved for index {index}: {feature_filename}")
            else:
                print(f"Failed to process video at index {index}")
    finally:
        # All workers are done (or the batch aborted): remove the temp WAVs.
        for _, temp_audio_path in submitted:
            if os.path.exists(temp_audio_path):
                os.remove(temp_audio_path)

    return video_lengths

def load_features_from_json(file_path):
    """Load a feature file and recover the DataFrame index from its name.

    The file name must end in ``features_{index}.json`` where ``index`` is a
    run of decimal digits. Only the final path component is inspected, so
    directory names cannot be mistaken for the file name.

    Args:
        file_path: Path to a JSON file written by ``extract_audio_features``.

    Returns:
        ``(index, features)`` where ``index`` is an ``int`` and ``features``
        is the decoded JSON content.

    Raises:
        ValueError: If the file name does not match the expected pattern.
    """
    # The dot is escaped and the pattern anchored so that names such as
    # "features_7xjson" or "features_7.json.bak" are rejected.
    match = re.search(r'features_(\d+)\.json$', os.path.basename(file_path))
    if not match:
        raise ValueError(f"Index not found in file name: {file_path}")
    index = int(match.group(1))

    with open(file_path, 'r') as file:
        features = json.load(file)

    return index, features

In [None]:
def audio_proc():
    """Run the full audio pipeline: extract, compile, normalise and save.

    Steps:
      1. Read ``./label.csv`` and process its rows in batches with
         ``process_video_batch``, writing each batch's video lengths to
         ``../proc_csv/videoLengths/video_lengths_batch_{n}.json``.
      2. Load every ``features_*.json`` written by the extraction step and
         build a DataFrame of 45 feature columns indexed by the label row.
      3. Inner-join the features with the video lengths and save the result
         to ``../proc_csv/AudioFeaturesMOSI.csv`` (without the index).
      4. Standardise the feature columns, min-max scale ``video_length``,
         attach the ``annotation`` column as ``class_labels`` and save to
         ``../proc_csv/AudioFeaturesMOSINormalised.csv`` (with the index).

    Raises:
        ValueError: If no row has both extracted features and a video length,
            or if a file matched by the feature glob has an unexpected name.
    """
    # Imported up front so a missing dependency fails before the long
    # extraction step rather than after it.
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.preprocessing import StandardScaler

    # Path of the label CSV, relative to the working directory
    csv_file_path = './label.csv'

    # Directory the feature JSON files are written to AND read back from.
    # process_video_batch forwards this path to extract_audio_features, so the
    # glob further down has to look in the same place.
    features_dir = '../proc_csv/temp_audio'
    raw_videos_dir = '../proc_csv/raw_videos'

    # Read the CSV file
    df = pd.read_csv(csv_file_path)

    # Keep only the columns used downstream
    filtered_df = df[['video_id', 'clip_id', 'text', 'annotation']]

    batch_size = 100
    total_batches = (len(df) + batch_size - 1) // batch_size

    # Create the directory for storing video lengths if it doesn't exist
    video_lengths_dir = '../proc_csv/videoLengths'
    os.makedirs(video_lengths_dir, exist_ok=True)

    for batch_num in range(total_batches):
        start_index = batch_num * batch_size
        end_index = min((batch_num + 1) * batch_size, len(df))
        print(f"Processing batch {batch_num + 1}/{total_batches}: Indices {start_index} to {end_index}")

        batch_video_lengths = process_video_batch(
            start_index, end_index, df,
            features_dir,
            raw_videos_dir
        )

        # Save this batch's video lengths to a file
        batch_file_path = os.path.join(video_lengths_dir, f'video_lengths_batch_{batch_num + 1}.json')
        with open(batch_file_path, 'w') as file:
            json.dump(batch_video_lengths, file)

        print(f"Saved video lengths for batch {batch_num + 1} to {batch_file_path}")

    # Read each batch's video lengths back from disk and compile them.
    # JSON object keys are strings, so the indices come back as str here.
    compiled_video_lengths = {}
    for batch_num in range(total_batches):
        batch_file_path = os.path.join(video_lengths_dir, f'video_lengths_batch_{batch_num + 1}.json')
        with open(batch_file_path, 'r') as file:
            compiled_video_lengths.update(json.load(file))

    # Load the per-clip feature files. The pattern is restricted to
    # features_*.json so unrelated JSON files are never passed to the loader.
    json_files = glob.glob(os.path.join(features_dir, 'features_*.json'))
    all_features = [load_features_from_json(file_path) for file_path in json_files]

    # Number of features for each type
    n_mfcc, n_chroma, n_contrast, n_tonnetz = 20, 12, 7, 6

    # Define column labels
    mfcc_labels = [f'mfcc_{i}' for i in range(1, n_mfcc + 1)]
    chroma_labels = [f'chroma_{i}' for i in range(1, n_chroma + 1)]
    contrast_labels = [f'contrast_{i}' for i in range(1, n_contrast + 1)]
    tonnetz_labels = [f'tonnetz_{i}' for i in range(1, n_tonnetz + 1)]

    column_labels = mfcc_labels + chroma_labels + contrast_labels + tonnetz_labels

    # Flatten each clip's features into one vector, in column_labels order
    indices = [index for index, _ in all_features]
    feature_vectors = [
        features['mfccs'] + features['chroma'] + features['spectral_contrast'] + features['tonnetz']
        for _, features in all_features
    ]

    # Build the feature table indexed by label row, sorted for alignment
    df_features = pd.DataFrame(feature_vectors, columns=column_labels)
    df_features.index = indices
    df_features.sort_index(inplace=True)

    # Convert the compiled video lengths to a DataFrame
    video_lengths_df = pd.DataFrame(list(compiled_video_lengths.items()), columns=['index', 'video_length'])
    video_lengths_df.set_index('index', inplace=True)
    if video_lengths_df.index.dtype == 'object':
        # Keys read back from JSON are strings; cast so they join on ints.
        video_lengths_df.index = video_lengths_df.index.astype('int')

    # 'index' is the frame's index after set_index, not a column, so there is
    # no 'index' column to drop after the merge.
    merged_df = df_features.merge(video_lengths_df, left_index=True, right_index=True, how='inner')

    if merged_df.empty:
        # Fail with a clear message instead of an opaque scaler error below.
        raise ValueError(
            f"No clips have both extracted features in {features_dir} "
            f"and a recorded video length in {video_lengths_dir}"
        )

    # Save the merged (un-normalised) DataFrame to a CSV file
    file_path = '../proc_csv/AudioFeaturesMOSI.csv'
    merged_df.to_csv(file_path, index=False)

    average_length = merged_df['video_length'].mean()

    print(f"Average video length: {average_length:.2f} seconds")

    # Min-max scale the video length; the scaler expects a 2D array
    scalerMinMax = MinMaxScaler()
    video_lengths_reshaped = merged_df['video_length'].values.reshape(-1, 1)
    scaled_video_lengths = scalerMinMax.fit_transform(video_lengths_reshaped)

    # Standardise every column except video_length
    feature_columns = [col for col in merged_df.columns if col not in ['video_length']]
    features_to_normalize = merged_df[feature_columns]

    scaler = StandardScaler()
    normalized_features = scaler.fit_transform(features_to_normalize)

    # Create a new DataFrame with the normalized features
    normalized_df = pd.DataFrame(normalized_features, columns=feature_columns, index=merged_df.index)
    normalized_df['video_length'] = scaled_video_lengths

    # Attach the labels by looking up each surviving index in the label table
    normalized_df['class_labels'] = filtered_df.loc[normalized_df.index, 'annotation']

    # Save the normalised DataFrame, keeping the index for later alignment
    file_path = '../proc_csv/AudioFeaturesMOSINormalised.csv'
    normalized_df.to_csv(file_path, index=True)