In [1]:
import os
import sys
import pandas as pd
from tslearn.metrics import dtw

from sklearn.preprocessing import StandardScaler

# Import library with current code functions
sys.path.append(os.path.join("..", "lib"))
import general_functions as gf, files_paths as fp

In [2]:
def preprocess_data_current_dataset(dataset_path):
    """Load a sequences CSV and split it into one tuple per ``sample_id``.

    Parameters
    ----------
    dataset_path : str
        Path of the CSV file, read with ``pandas.read_csv``.

    Returns
    -------
    list of tuple
        One entry per distinct ``sample_id`` (in ``groupby`` order), laid out as
        ``(frame_seq, sample_id, video_id, sequence_features, label_measures,
        label, seed_name, frame_start, frame_end)``:

        * ``frame_seq`` is the group's ``frame_seq`` Series,
        * ``sequence_features`` is the group's DataFrame restricted to the
          feature columns,
        * ``frame_start`` / ``frame_end`` are the first / last ``frame_seq``
          values of the group,
        * the remaining fields are taken from the group's first row.
    """
    # 'Unnamed: 0' only exists when the CSV was saved together with its index.
    # errors='ignore' lets files written with index=False load as well instead
    # of raising KeyError.
    sequences = pd.read_csv(dataset_path).drop(columns=['Unnamed: 0'], errors='ignore')

    # Select features: the columns at positions 3..24 once the index column is gone.
    features = sequences.columns.to_list()[3:25]

    grouped_data = []
    # `group` already holds exactly the rows of one sample, so there is no need
    # to re-filter the whole frame for every field.
    for _, group in sequences.groupby('sample_id'):
        frame_seq = group['frame_seq']
        grouped_data.append((
            frame_seq,
            group['sample_id'].iloc[0],
            group['video_id'].iloc[0],
            group[features],
            group['label_measures'].iloc[0],
            group['label'].iloc[0],
            group['seed_name'].iloc[0],
            frame_seq.iloc[0],
            frame_seq.iloc[-1],
        ))

    return grouped_data

In [3]:
def find_item_with_number(file_list, number):
    """Return the first path in ``file_list`` that contains ``number``.

    ``number`` is converted to a string and left-padded with zeros to a width
    of 10 characters before the substring search.

    Parameters
    ----------
    file_list : iterable of str
        Candidate paths, searched in order.
    number : int or str
        Identifier to look for.

    Returns
    -------
    str or None
        The first matching path, or ``None`` when nothing matches.
    """
    padded = str(number).zfill(10)
    return next((candidate for candidate in file_list if padded in candidate), None)

In [4]:
def distant_dtw_filter(current_occurrence):
    """Keep or reject one occurrence by comparing it with its seed using DTW.

    For every ``[column, accepted_distance]`` entry of the notebook-level
    ``columns`` list, the seed series and the occurrence series are each
    standardised with ``StandardScaler`` and their DTW distance is computed.
    All distances are computed and printed; the occurrence is rejected when at
    least one distance is greater than its accepted limit.

    Parameters
    ----------
    current_occurrence : tuple
        Tuple laid out as produced by ``preprocess_data_current_dataset``. The
        fields read here are index 1 (sample id), 2 (video id), 3 (feature
        DataFrame), 5 (label), 6 (seed name), 7 and 8 (first / last frame).

    Returns
    -------
    tuple or int
        ``current_occurrence`` itself when every distance is within its limit,
        otherwise the sentinel ``-1``.

    Raises
    ------
    FileNotFoundError
        If no path in ``FILE_LIST_SEED`` matches the occurrence's seed.

    Notes
    -----
    Relies on the notebook-level names ``FILE_LIST_SEED`` and ``columns``;
    both must be defined before this function is called.
    """
    sample_id = current_occurrence[1]
    video_id = current_occurrence[2]
    df_measure_features = current_occurrence[3]
    emotion = current_occurrence[5]
    seed_number = current_occurrence[6]
    frame_seq_start = current_occurrence[7]
    frame_seq_end = current_occurrence[8]

    seed_path = find_item_with_number(FILE_LIST_SEED, seed_number)
    if seed_path is None:
        # Without this guard pd.read_csv(None) fails with an unrelated-looking error.
        raise FileNotFoundError(
            f"No seed file found in FILE_LIST_SEED for seed {seed_number}"
        )
    df_seed = pd.read_csv(seed_path)

    # Columns at positions 3..26 of the seed file; 'label_measures' and every
    # measured column must be among them.
    df_seed_features = df_seed.iloc[:, 3:27]

    # Drop the seed rows whose label_measures is the literal string '{}'.
    filtered_df = df_seed_features[df_seed_features['label_measures'] != '{}']

    scaler = StandardScaler()
    valid = True
    dtw_measures = []

    for col, acc_distance in columns:
        y_seed = filtered_df[col].values.reshape(-1, 1)
        y_measure = df_measure_features[col].values.reshape(-1, 1)

        # Each series is standardised on its own (the scaler is re-fitted by
        # every fit_transform call), then flattened back to 1-D for dtw.
        y_seed_scaled = scaler.fit_transform(y_seed).flatten()
        y_measure_scaled = scaler.fit_transform(y_measure).flatten()

        distance = dtw(y_seed_scaled, y_measure_scaled)
        dtw_measures.append([col, distance, acc_distance])

        # Keep looping after a failure so that every measure gets reported.
        if distance > acc_distance:
            valid = False

    print(f"Sample ID: {sample_id}")
    for col, real_measure, expected_limit in dtw_measures:
        print(f"Col: {col}, DTW distance: {real_measure}, expected limit: {expected_limit}")

    if valid:
        print(f"Seed {seed_number} marking {video_id} as {emotion} from {frame_seq_start} to {frame_seq_end} has been MAINTAINED!\n")
        return current_occurrence

    print(f"Seed {seed_number} marking {video_id} as {emotion} from {frame_seq_start} to {frame_seq_end} has been REMOVED!\n")
    return -1

In [5]:
def build_new_filtered_sequence_dataset(new_grouped_data, columns, output_folder='result_sequences', source_file=None):
    """Write the kept occurrences to ``filtered_sequences_<n>_<col>_<limit>....CSV``.

    Parameters
    ----------
    new_grouped_data : list of tuple
        Occurrences to save. For each tuple, index 0 is the ``frame_seq``
        Series, 1 the sample id, 2 the video id, 3 the feature DataFrame,
        4 ``label_measures``, 5 the label and 6 the seed name.
    columns : list
        Entries whose first two items (measure name, accepted distance) are
        appended to the output filename.
    output_folder : str, optional
        Directory the CSV is written to; created when missing.
    source_file : str, optional
        Path of the sequence file the data came from. Its base name must look
        like ``<a>_<b>_<number>...``; ``<number>`` is reused in the output
        filename. When ``None`` the notebook-level ``current_sequence_file``
        is used.

    Returns
    -------
    None
        The dataset is written to disk and the destination is printed. When
        ``new_grouped_data`` is empty nothing is written and a notice is
        printed instead.

    Raises
    ------
    ValueError
        If the base name of ``source_file`` has fewer than three
        ``_``-separated parts.
    """
    if source_file is None:
        # Backward-compatible default: the variable set when the dataset was loaded.
        source_file = current_sequence_file

    # Parse the number from the file name only, so that underscores in the
    # directory part of the path cannot shift its position.
    name_parts = os.path.basename(source_file).split('_')
    if len(name_parts) < 3:
        raise ValueError(f"Cannot extract the dataset number from {source_file!r}")
    number = name_parts[2].split('.')[0]

    output_filename = f'filtered_sequences_{number}'
    output_filename += ''.join(f'_{entry[0]}_{entry[1]}' for entry in columns)
    output_filename += ".CSV"

    # Define the full path for the output file
    output_path = os.path.join(output_folder, output_filename)

    if not new_grouped_data:
        # pd.concat raises on an empty list; report the situation instead.
        print(f'No sequences to save; {output_path} was not written')
        return

    # Ensure the output directory exists
    os.makedirs(output_folder, exist_ok=True)

    all_data = []
    for item in new_grouped_data:
        frame_seq, sample_id, video_id, sequence_features_df, label_measures, label, seed_name = item[:7]

        # The single-value fields are repeated once per feature row.
        num_rows = sequence_features_df.shape[0]

        leading = pd.DataFrame({
            'frame_seq': frame_seq.tolist(),
            'sample_id': [sample_id] * num_rows,
            'video_id': [video_id] * num_rows,
        })
        trailing = pd.DataFrame({
            'label_measures': [label_measures] * num_rows,
            'label': [label] * num_rows,
            'seed_name': [seed_name] * num_rows,
        })

        # Column order: identifiers, features, labels. The feature index is
        # reset so that the three parts align row by row.
        all_data.append(
            pd.concat([leading, sequence_features_df.reset_index(drop=True), trailing], axis=1)
        )

    # Concatenate all DataFrames and save them as one CSV file
    final_df = pd.concat(all_data, ignore_index=True)
    final_df.to_csv(output_path, index=False)

    print(f'Dataset saved to {output_path}')

In [6]:
# Folder searched for sequence datasets (os.path.join with a single argument
# returns that argument unchanged).
sequences_folder = os.path.join("result_sequences")
# Folder searched for the seed files.
datase_seed_ck = os.path.join("..", "Dataset", "REF-Gold-Label")
# NOTE: 'result_sequences' is also the default output folder of
# build_new_filtered_sequence_dataset, so previously saved filtered files match
# "*.CSV" too and appear in this list.
FILE_LIST_VD_DATA_SEQUENCES = gf.find_files_in_all_subdirectories([sequences_folder], "*.CSV")
# Seed files matching the pattern fp.VD_LABELED_L0; read as a global by distant_dtw_filter.
FILE_LIST_SEED = gf.find_files_in_all_subdirectories([datase_seed_ck], fp.VD_LABELED_L0)

In [7]:
# Display the sequence files that were found.
FILE_LIST_VD_DATA_SEQUENCES

['result_sequences/DATASET_SEQUENCES_17.CSV',
 'result_sequences/DATASET_SEQUENCES_2.CSV',
 'result_sequences/DATASET_SEQUENCES_4.CSV',
 'result_sequences/filtered_sequences_17_m1_30_m3_25_m12_15_e2_20.CSV']

In [8]:
# Work on the first listed file. Which file that is depends on the order
# returned by gf.find_files_in_all_subdirectories.
# NOTE(review): confirm that ordering is stable across runs.
# current_sequence_file is also read as a global by
# build_new_filtered_sequence_dataset to derive the output filename.
current_sequence_file = FILE_LIST_VD_DATA_SEQUENCES[0]
grouped_data = preprocess_data_current_dataset(current_sequence_file)

In [9]:
# Number of occurrences labelled 'happy' (tuple index 5) before filtering.
smile_data = [item for item in grouped_data if item[5] == 'happy']
print(len(smile_data))

333


In [10]:
# NOTE(review): number_occurrence is not referenced in the visible cells.
number_occurrence = 0

# Measure, accepted distance
# Read as a global by distant_dtw_filter (an occurrence is rejected when the
# DTW distance of any listed measure is greater than its accepted distance) and
# passed to build_new_filtered_sequence_dataset, which embeds both values in
# the output filename.
columns = [
    ['m1', 30], 
    ['m3', 25], 
    ['m12', 15], 
    ['e2', 20]
]

In [11]:
# Keep only the occurrences accepted by the DTW filter.
new_grouped_data = []
for occurrence in grouped_data:
    # distant_dtw_filter returns the occurrence itself when it is kept and the
    # sentinel -1 when it is rejected, so `distance` holds an occurrence tuple
    # (or -1), not a numeric distance.
    distance = distant_dtw_filter(occurrence)
    if distance != -1:
        new_grouped_data.append(distance)

Sample ID: 0
Col: m1, DTW distance: 2.4548661728400694, expected limit: 30
Col: m3, DTW distance: 2.784821185982177, expected limit: 25
Col: m12, DTW distance: 1.8138284798272157, expected limit: 15
Col: e2, DTW distance: 2.2729101113391987, expected limit: 20
Seed 29 marking 1 as happy from 1011 to 1022 has been MAINTAINED!

Sample ID: 1
Col: m1, DTW distance: 1.5853839074941134, expected limit: 30
Col: m3, DTW distance: 3.539424157730818, expected limit: 25
Col: m12, DTW distance: 2.175161807314891, expected limit: 15
Col: e2, DTW distance: 2.4204512212194413, expected limit: 20
Seed 44 marking 1 as happy from 1069 to 1083 has been MAINTAINED!

Sample ID: 2
Col: m1, DTW distance: 1.527119992518361, expected limit: 30
Col: m3, DTW distance: 2.0936773052813624, expected limit: 25
Col: m12, DTW distance: 0.8243762135595738, expected limit: 15
Col: e2, DTW distance: 1.6665526298168125, expected limit: 20
Seed 29 marking 1 as happy from 1129 to 1140 has been MAINTAINED!

Sample ID: 3
Col:

In [12]:
# Number of occurrences labelled 'happy' (tuple index 5) that survived the DTW
# filter, so it can be compared with the count printed before filtering.
smile_data = [emotion for emotion in new_grouped_data if emotion[5] == 'happy']
print(len(smile_data))

333


In [13]:
# Save the kept occurrences; the output filename embeds the entries of `columns`.
build_new_filtered_sequence_dataset(new_grouped_data, columns)

Dataset saved to result_sequences/filtered_sequences_17_m1_30_m3_25_m12_15_e2_20.CSV
