In [1]:
import cv2
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

from joblib import Parallel, delayed

# Pipeline folders, relative to the notebook's working directory
sorted_folder = r'3_sorted'    # input CSVs (each must provide a 'FileName' column)
merged_folder = r'4_measured'  # output CSVs holding the frame-to-frame MSE values

# Root folder that holds the source images. The default is the original
# machine-specific location; set the GJ_IMAGE_DIR environment variable to point
# the notebook at the images on another machine without editing this cell.
original_folder = os.environ.get('GJ_IMAGE_DIR', r'D:\Research\GJ DL\images')

# Function to compute MSE between two frames
def compute_mse(frame1, frame2):
    """Return the mean squared error between the grayscale versions of two frames.

    Parameters
    ----------
    frame1, frame2 : numpy.ndarray
        Images accepted by ``cv2.cvtColor(..., cv2.COLOR_BGR2GRAY)``, e.g. the
        result of ``cv2.imread``. Both must have the same shape.

    Returns
    -------
    numpy.float32
        Mean of the squared per-pixel grayscale differences.

    Raises
    ------
    ValueError
        If either frame is ``None`` or the two frames differ in shape.
    """
    # cv2.imread signals an unreadable file by returning None instead of raising,
    # so fail here with a clear message rather than deep inside cvtColor.
    if frame1 is None or frame2 is None:
        raise ValueError('compute_mse received None instead of an image '
                         '(cv2.imread returns None when a file cannot be read)')

    # Differently sized frames must not be compared: numpy would either raise an
    # opaque broadcasting error or, worse, silently broadcast compatible shapes.
    if frame1.shape != frame2.shape:
        raise ValueError(f'frame shapes differ: {frame1.shape} vs {frame2.shape}')

    gray_frame1 = cv2.cvtColor(frame1, cv2.COLOR_BGR2GRAY).astype(np.float32)
    gray_frame2 = cv2.cvtColor(frame2, cv2.COLOR_BGR2GRAY).astype(np.float32)

    # Subtract in float32 so uint8 pixel differences cannot wrap around
    squared_diff = (gray_frame1 - gray_frame2) ** 2
    return np.mean(squared_diff)

# Function to extract frame number from filename
def extract_frame_number(filename):
    """Parse the integer that follows the first 'Frame' marker in ``filename``.

    The number is the text between the first 'Frame' and the next '.' (or the
    next 'Frame', whichever the split yields first). Returns ``None`` when the
    name contains no 'Frame' marker; ``int`` raises ``ValueError`` if the
    extracted text is not a valid integer.
    """
    if 'Frame' not in filename:
        return None

    after_marker = filename.split('Frame')[1]
    number_text, _, _ = after_marker.partition('.')
    return int(number_text)

def _read_frame(path):
    """Load an image with cv2.imread, raising OSError if it cannot be read."""
    frame = cv2.imread(path)
    if frame is None:
        # cv2.imread returns None on failure; name the offending path explicitly
        raise OSError(f'cv2.imread could not read image: {path}')
    return frame


def process_and_write(name_file):
    """Measure the MSE between consecutive frames listed in one sorted CSV.

    Reads ``name_file`` from ``sorted_folder``, and for every row compares the
    image named in its 'FileName' column with the image of the previous row
    (the first row is compared with itself). The per-row results are written to
    ``merged_folder`` as ``4_measured_<folder_name>.csv`` with the columns
    'frame_number' and 'Distance', where ``folder_name`` is derived from the
    last row's file name.

    Parameters
    ----------
    name_file : str
        Name of a file inside ``sorted_folder``. Names that do not end in
        '.csv' are ignored.

    Returns
    -------
    None

    Raises
    ------
    OSError
        If one of the referenced images cannot be read.
    """
    if not name_file.endswith('.csv'):
        return

    print(name_file)
    df = pd.read_csv(os.path.join(sorted_folder, name_file))

    if df.empty:
        # The output name is derived from the rows, so with no rows there is
        # nothing to name and nothing to write.
        print(f'{name_file}: no rows, nothing written')
        return

    # Work on positions rather than index labels so the "previous row" lookup
    # does not depend on the DataFrame carrying a default 0..n-1 index.
    filenames = df['FileName'].tolist()

    results = []
    last_path, last_frame = None, None

    for position, filename in enumerate(filenames):
        # Folder name is the portion of the file name before 'Frame'
        folder_name = filename.split('Frame')[0].strip('\\')

        path1 = os.path.join(original_folder, folder_name, filename)
        if position == 0:
            # First row: compare the frame with itself
            path2 = path1
        else:
            path2 = os.path.join(original_folder, folder_name, filenames[position - 1])

        frame1 = _read_frame(path1)

        # Reuse an already decoded image whenever the path is the same, instead
        # of reading every previous frame from disk a second time.
        if path2 == path1:
            frame2 = frame1
        elif path2 == last_path:
            frame2 = last_frame
        else:
            frame2 = _read_frame(path2)

        results.append({
            'frame_number': extract_frame_number(filename),
            'Distance': compute_mse(frame1, frame2)
        })

        last_path, last_frame = path1, frame1

    results_df = pd.DataFrame(results)

    os.makedirs(merged_folder, exist_ok=True)  # Create the folder if it doesn't exist

    merged_path = os.path.join(merged_folder, f'4_measured_{folder_name}.csv')
    results_df.to_csv(merged_path, index=False)

# Every directory entry is dispatched; process_and_write ignores names that do not end in '.csv'
name_files = os.listdir(sorted_folder)

# n_jobs=-1: use all available CPU cores
num_cores = -1

# Run one process_and_write task per directory entry in parallel
Parallel(n_jobs=num_cores)(
    [delayed(process_and_write)(entry) for entry in name_files]
)
# Serial alternative, handy when debugging a single file:
# for name_file in name_files:
#     process_and_write(name_file)



[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [10]:
import os
import pandas as pd
from joblib import Parallel, delayed
import math

# Fraction of rows to keep per file: 0.5 split evenly over the number of tools
number_of_tool = 1
percent_sample = 0.5 / number_of_tool
print(percent_sample)

# Input folder with the measured distances and output folder for the selected frame numbers
measured_folder = r'4_measured'
threshold_folder = r'5_threshold'

def _segment_end_indices(distances, threshold_value):
    """Greedily cut ``distances`` into consecutive runs and return each run's last position.

    A running sum starts at 1e-6 (so its inverse is always defined) and a run is
    closed at the first position where ``1 / running_sum <= threshold_value``;
    the sum then restarts at the next position. Trailing values that never reach
    the condition produce no index.
    """
    end_indices = []
    current_sum = 0 + 1e-6
    for position, distance in enumerate(distances):
        current_sum += distance
        if (1 / current_sum) <= threshold_value:
            end_indices.append(position)
            current_sum = 0 + 1e-6
    return end_indices


def process_and_write(file_name):
    """Select a subset of frames from one measured CSV and write their frame numbers.

    Reads ``file_name`` from ``measured_folder`` (columns 'frame_number' and
    'Distance') and aims for ``ceil(percent_sample * number_of_rows)`` frames:

    * if fewer rows than that have ``Distance > 0``, exactly those rows are kept;
    * otherwise the rows are cut into consecutive runs whose accumulated
      distance satisfies ``1 / sum <= threshold``, and the last row of each run
      is kept. The threshold starts at 1e-5 and is raised in steps of 1e-6
      until at least the desired number of runs is produced.

    The selected frame numbers are written to ``threshold_folder`` as
    ``5_threshold_<id>.csv`` under a 'frame_number' header, where ``<id>`` is
    the part of the file stem after its last underscore.

    Parameters
    ----------
    file_name : str
        Name of a file inside ``measured_folder``. Names that do not end in
        '.csv' are ignored.

    Returns
    -------
    None

    Notes
    -----
    Assumes 'Distance' is non-negative and free of NaN; a NaN can prevent the
    threshold search from ever reaching the desired count -- TODO confirm the
    upstream step guarantees this.
    """
    if not file_name.endswith('.csv'):
        return

    df = pd.read_csv(os.path.join(measured_folder, file_name))

    # A plain list gives positional access (matching the .iloc lookup used when
    # writing) and avoids slow per-element Series lookups in the search loop.
    distances = df['Distance'].tolist()

    print('filename:', file_name, ' number of samples:', len(distances))

    desired_indices = math.ceil(percent_sample * len(distances))

    # Positions of the rows whose distance is strictly positive
    nonzero_positions = [pos for pos, value in enumerate(distances) if value > 0]
    count_greater_than_zero = len(nonzero_positions)

    print(count_greater_than_zero < desired_indices)

    if count_greater_than_zero < desired_indices:
        # Not enough changing frames to sample from: keep every one of them
        indices_where_condition_met = nonzero_positions
    else:
        threshold_value = 0.00001
        used_threshold = threshold_value
        indices_where_condition_met = []

        # Raise the threshold step by step until enough runs are produced.
        # A larger threshold means a smaller required sum, hence more runs.
        while len(indices_where_condition_met) < desired_indices:
            indices_where_condition_met = _segment_end_indices(distances, threshold_value)
            print('number of indices:', len(indices_where_condition_met), '/ desired len:', desired_indices)

            # Remember the value that produced this result before stepping on,
            # so the report below shows the threshold actually used.
            used_threshold = threshold_value
            threshold_value += 0.000001

        print("Threshold value:", used_threshold)
        print("Number of indices meeting the condition:", len(indices_where_condition_met))

    # Identifier: the part of the file stem after the last underscore
    video_id = os.path.splitext(file_name)[0].split('_')[-1]

    os.makedirs(threshold_folder, exist_ok=True)  # Create the folder if it doesn't exist

    threshold_path = os.path.join(threshold_folder, f'5_threshold_{video_id}.csv')

    # One frame number per line under a single-column header
    with open(threshold_path, "w") as output_file:
        output_file.write('frame_number\n')
        for idx in indices_where_condition_met:
            output_file.write(str(df['frame_number'].iloc[idx]) + '\n')

# Every directory entry is dispatched; process_and_write ignores names that do not end in '.csv'
file_names = os.listdir(measured_folder)

# n_jobs=-1: use all available CPU cores
num_cores = -1

# Run one process_and_write task per directory entry in parallel
Parallel(n_jobs=num_cores)(
    [delayed(process_and_write)(entry) for entry in file_names]
)
# Serial alternative, handy when debugging a single file:
# for file_name in file_names:
#     process_and_write(file_name)




0.5


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [11]:
import pandas as pd
from tqdm import tqdm

# Folders used by this step: the output folder first, then the two inputs it combines
filtered_folder = r'6_filtered'
threshold_folder = r'5_threshold'
sorted_folder = r'3_sorted'

# Function to extract frame number from filename
def extract_frame_number(filename):
    """Return the integer found right after the first 'Frame' in ``filename``.

    The digits are taken up to the next '.'; ``None`` is returned when the name
    has no 'Frame' marker at all.
    """
    pieces = filename.split('Frame')
    if len(pieces) <= 1:
        return None
    return int(pieces[1].split('.')[0])

# Every entry of the sorted folder is visited; only names ending in '.csv' are processed
file_names = os.listdir(sorted_folder)

for file_name in tqdm(file_names, desc='Processing csv files'):
    if not file_name.endswith('.csv'):
        continue

    # Load the full listing and tag every row with the frame number parsed from its file name
    first_file = os.path.join(sorted_folder, file_name)
    df_first = pd.read_csv(first_file)
    df_first['frame_number'] = df_first['FileName'].apply(extract_frame_number)

    # The identifier is whatever follows the last '_' in the file stem
    video_id = os.path.splitext(file_name)[0].split('_')[-1]

    # Frame numbers selected by the threshold step for this identifier
    second_file = os.path.join(threshold_folder, f'5_threshold_{video_id}.csv')
    df_second = pd.read_csv(second_file)
    frame_numbers_to_keep = df_second['frame_number'].tolist()

    # Keep the selected frames, one row per file name, without the helper column
    df_filtered = (
        df_first[df_first['frame_number'].isin(frame_numbers_to_keep)]
        .drop_duplicates(subset='FileName')
        .drop('frame_number', axis=1)
    )

    os.makedirs(filtered_folder, exist_ok=True)  # Create the folder if it doesn't exist
    filtered_path = os.path.join(filtered_folder, f'6_filtered_{video_id}.csv')

    # One line per row, values stringified and joined with '---'
    with open(filtered_path, 'w') as f:
        f.writelines(
            '---'.join(str(value) for value in row.tolist()) + '\n'
            for _, row in df_filtered.iterrows()
        )

print(f"Filtered DataFrame based on matching frame numbers has been written to '{filtered_folder}' in text format with custom formatting.")

Processing csv files: 100%|██████████| 30/30 [00:01<00:00, 25.61it/s]

Filtered DataFrame based on matching frame numbers has been written to '6_filtered' in text format with custom formatting.





In [12]:
import os
import random

# Folder containing the filtered per-file listings
folder_path = r'6_filtered'

# Output file that receives the shuffled lines
output_file_path = r'MSE_0_5_v1.txt'

# Fixed seed so that re-running the cell reproduces the same line order
shuffle_seed = 42

# Collect the lines of every '.csv' file in the folder
all_lines = []

# os.listdir returns entries in arbitrary order; sort them so the input to the
# seeded shuffle (and therefore its result) is the same on every run.
for file_name in sorted(os.listdir(folder_path)):
    if not file_name.endswith('.csv'):
        continue

    file_path = os.path.join(folder_path, file_name)
    with open(file_path, 'r') as file:
        # Guarantee a trailing newline on every line: a final line without one
        # would otherwise be glued to another record once the lines are shuffled.
        all_lines.extend(
            line if line.endswith('\n') else line + '\n'
            for line in file
        )

# Shuffle with a dedicated, seeded generator (leaves the global random state untouched)
random.Random(shuffle_seed).shuffle(all_lines)

# Write the shuffled lines to the output file
with open(output_file_path, 'w') as output_file:
    output_file.writelines(all_lines)

print(f"Shuffled lines from CSV files written to: {output_file_path}")


Shuffled lines from CSV files written to: MSE_0_5_v1.txt


In [14]:
from collections import Counter

# Path of the '---'-separated text file to analyse.
# Other files previously analysed with this cell: 'train_baseline.txt',
# 'train.txt', 'train_baseline_0_3_v2.txt'.
# NOTE(review): this is not the 'MSE_0_5_v1.txt' written by the shuffling cell --
# confirm which file is meant to be analysed.
file_path = 'MSE_0_01_v1.txt'

# Number of lines seen per class label (the second '---'-separated field)
class_counts = Counter()

with open(file_path, 'r') as file:
    for line in file:
        parts = line.strip().split('---')

        # Lines with fewer than two fields carry no class label and are skipped
        if len(parts) >= 2:
            class_counts[parts[1].strip()] += 1


def _class_sort_key(item):
    """Order numeric labels by value (so '10' follows '9'), other labels after them."""
    label = item[0]
    if label.isdecimal():
        return (0, int(label), label)
    return (1, 0, label)


# Print the class counts
print("Class Counts:")
for class_num, count in sorted(class_counts.items(), key=_class_sort_key):
    print(f"Class {class_num}: {count} samples")


Class Counts:
Class 0: 77 samples
Class 1: 38 samples
Class 2: 148 samples
Class 3: 135 samples
Class 4: 225 samples
Class 5: 169 samples
Class 6: 110 samples
