In [None]:
# Filter left hand tool with ID = 0 (which appears most)
#
#
import pandas as pd
import os
from tqdm import tqdm

# Folder holding the raw per-video centroid CSV files
folder_path = r'D:\Research\Phong\ToolTracking_Project\centroid'

# Output folder for the cleaned files. It is defined (and created) before the
# loop so that the final message still works when no CSV file is found.
cleaned_folder = '1_cleaned_df'
os.makedirs(cleaned_folder, exist_ok=True)

# List all files in the folder
file_names = os.listdir(folder_path)

# print(file_names)
for file_name in tqdm(file_names, desc='Processing csv files'):
    if not file_name.endswith('.csv'):
        continue

    # Read the CSV file
    df = pd.read_csv(os.path.join(folder_path, file_name))

    # 'centroid' holds tuples stored as text, e.g. "(0.1234, 0.5678)";
    # convert them to tuples of floats rounded to 4 decimals
    df['centroid'] = df['centroid'].apply(
        lambda x: tuple(round(float(coord), 4) for coord in x.strip('()').split(','))
    )

    # Keep only the detections of class 0
    classID_df = df[df['Class ID'] == 0]

    # Keep the detections whose first centroid coordinate is at most 0.5.
    # NOTE(review): presumably the left half of a normalised frame; the cell
    # header says "left hand tool" while the original comment here said
    # "right hand tool" - confirm which one is meant.
    filtered_df = classID_df[classID_df['centroid'].apply(lambda x: x[0] <= 0.5)]

    # Keep a single row per frame (the first one encountered)
    cleaned_df = filtered_df.drop_duplicates(subset='frame_number', keep='first')

    # The identifier is the last '_'-separated part of the name (extension included)
    file_identifier = file_name.split('_')[-1]

    # Save the cleaned DataFrame to a new file
    cleaned_file_path = os.path.join(cleaned_folder, f'1_cleaned_df_{file_identifier}')
    cleaned_df.to_csv(cleaned_file_path, index=False)


print(f"Cleaned data saved to '{cleaned_folder}' folder")


In [2]:
import pandas as pd
import os
from tqdm import tqdm

# Input folder (cleaned detections) and output folder (gap-filled detections)
folder_path = r'1_cleaned_df'
corrected_data_folder = '2_corrected_data'
os.makedirs(corrected_data_folder, exist_ok=True)  # Create the folder if it doesn't exist

# List all files in the folder
file_names = os.listdir(folder_path)

# print(file_names)
for file_name in tqdm(file_names, desc='Processing csv files'):
    if not file_name.endswith('.csv'):
        continue

    # Read the CSV file and order it by frame number
    df = pd.read_csv(os.path.join(folder_path, file_name))
    df = df.sort_values(by='frame_number')

    # Last row seen so far, used to fill the gaps between detections
    previous_xy = None
    previous_frame_number = None
    previous_class_id = None
    filled_rows = []

    # Iterate through each row in the DataFrame
    for index, row in df.iterrows():
        # When frames are missing between the previous row and this one,
        # repeat the previous class and centroid for every missing frame
        if previous_xy is not None and row['frame_number'] != previous_frame_number + 1:
            for i in range(previous_frame_number + 1, row['frame_number']):
                filled_rows.append((i, previous_class_id, previous_xy))

        # Remember this row for the next iteration
        previous_xy = row['centroid']
        previous_frame_number = row['frame_number']
        previous_class_id = row['Class ID']

        # Add the current row itself
        filled_rows.append((row['frame_number'], row['Class ID'], row['centroid']))

    # Make sure frame 1 exists. The `not filled_rows` check covers a file with
    # a header but no rows, which used to raise IndexError here.
    # Only frame 1 is added; frames between 1 and the first detection stay absent.
    if not filled_rows or filled_rows[0][0] != 1:
        filled_rows.insert(0, (1, 0, (0, 0)))

    # Create a DataFrame from filled_rows, ordered by frame number
    filled_df = pd.DataFrame(filled_rows, columns=['frame_number', 'Class ID', 'centroid'])
    filled_df = filled_df.sort_values(by='frame_number')

    # The identifier is the last '_'-separated part of the name (extension included)
    file_identifier = file_name.split('_')[-1]

    # Save filled DataFrame to a new file
    corrected_data_path = os.path.join(corrected_data_folder, f'2_corrected_data_{file_identifier}')
    filled_df.to_csv(corrected_data_path, index=False)

print("Filled data saved to corrected_data folder")


Processing csv files: 100%|██████████| 42/42 [01:18<00:00,  1.86s/it]

Filled data saved to corrected_data folder





In [None]:
# import pandas as pd
# import matplotlib.pyplot as plt

# # Open the file to read data
# with open('4_condition_met_indexes.csv', 'r') as file:
#     # Read the CSV file
#     df = pd.read_csv(file)



# # Print the first few rows of the DataFrame to verify data reading
# print(df.head())

# # Calculate differences between indices
# index_diff = [df['frame_number'][i+1] - df['frame_number'][i] for i in range(len(df['frame_number'])-1)]

# # Plotting the differences between indices
# plt.figure(figsize=(10, 4))
# plt.plot(index_diff, marker='o', linestyle='-')
# plt.xlabel('Data Point')
# plt.ylabel('Frame Difference')
# plt.title('Difference Between Frame Changes')
# plt.grid(True)
# # Set y-axis limit
# plt.ylim(0, 200)
# plt.show()




In [3]:
import pandas as pd
import os
from tqdm import tqdm

# Folder with the gap-filled centroid files; its file names provide the video identifiers
folder_path = r'2_corrected_data'

# Manifest file whose lines are '---'-separated records
file_path = r'train.txt'

# Destination folder for the per-video sorted manifests
sorted_folder = '3_sorted'

# Every entry of the input folder (non-CSV names are skipped later)
file_names = os.listdir(folder_path)

def sort_train(keyword):
    """Write the manifest lines containing ``keyword`` to a sorted CSV file.

    Reads the text file at the module-level ``file_path``, keeps every line
    that contains ``keyword``, splits each kept line on '---' into the columns
    FileName, Class, NumberOfFrames and FrameRate, sorts by FileName and saves
    the result as ``<sorted_folder>/3_sorted_<keyword>.csv``.

    Nothing is written when no line contains ``keyword``.

    Args:
        keyword: Substring to look for in each manifest line; also used in
            the name of the output file.
    """
    # Collect the matching lines, then release the file before any processing
    with open(file_path, 'r') as file:
        filtered_lines = [line.strip() for line in file if keyword in line]

    # Keyword not present in the manifest: nothing to save
    if not filtered_lines:
        return

    # Create a DataFrame from filtered lines
    df = pd.DataFrame([line.split('---') for line in filtered_lines],
                      columns=['FileName', 'Class', 'NumberOfFrames', 'FrameRate'])

    # Sort the DataFrame by the first column ('FileName')
    df_sorted = df.sort_values(by='FileName')

    os.makedirs(sorted_folder, exist_ok=True)  # Create the folder if it doesn't exist

    # Name the output after the function argument (previously this read the
    # global `search_keyword`, which only worked when the caller happened to
    # set a variable with that exact name)
    sorted_path = os.path.join(sorted_folder, f'3_sorted_{keyword}.csv')

    # Write the sorted DataFrame to a new CSV file
    df_sorted.to_csv(sorted_path, index=False)

# print(file_names)
for file_name in tqdm(file_names, desc='Processing csv files'):
    # Only CSV files carry a video identifier
    if not file_name.endswith('.csv'):
        continue

    # The identifier is the last '_'-separated part of the name without extension
    stem, _extension = os.path.splitext(file_name)
    search_keyword = stem.split('_')[-1]

    sort_train(search_keyword)


print(f"Processed and saved sorted {sorted_folder} folder")


Processing csv files: 100%|██████████| 42/42 [00:00<00:00, 86.67it/s]

Processed and saved sorted 3_sorted folder





In [16]:
import pandas as pd
import os
from tqdm import tqdm

# Inputs: per-video sorted manifests and gap-filled centroid files
sorted_folder = r'3_sorted'
corrected_data_path = r'2_corrected_data'

# Output: centroid rows restricted to the frames listed in the manifest
merged_folder = r'4_merged'

# Function to extract frame number from filename
def extract_frame_number(filename):
    """Return the integer that follows the first 'Frame' marker in ``filename``.

    The digits are taken from the text right after the first 'Frame' up to the
    next '.' (or the next 'Frame' marker). Returns None when the name has no
    'Frame' marker; ``int`` raises ValueError when that text is not a number.
    """
    pieces = filename.split('Frame')
    if len(pieces) <= 1:
        return None
    digits, _, _ = pieces[1].partition('.')
    return int(digits)

# Every entry of the sorted-manifest folder; non-CSV names are skipped below
file_names = os.listdir(sorted_folder)

# print(file_names)
for file_name in tqdm(file_names, desc='Processing csv files'):
    if not file_name.endswith('.csv'):
        continue

    # Per-video manifest, with the frame number parsed out of every image name
    df_first = pd.read_csv(os.path.join(sorted_folder, file_name))
    df_first['frame_number'] = df_first['FileName'].apply(extract_frame_number)

    # The identifier is the last '_'-separated part of the name without extension
    video_id = os.path.splitext(file_name)[0].split('_')[-1]

    # Gap-filled centroids of the same video
    second_file = os.path.join(corrected_data_path, f'2_corrected_data_{video_id}.csv')
    df_second = pd.read_csv(second_file)

    # For each manifest frame, in manifest order, collect every centroid row
    # carrying that frame number (a frame listed twice is collected twice)
    matched_rows = [
        record
        for frame_number in df_first['frame_number']
        for record in df_second[df_second['frame_number'] == frame_number].values.tolist()
    ]

    # Rebuild a DataFrame using the column names of the centroid file
    columns = df_second.columns
    result_df = pd.DataFrame(matched_rows, columns=columns)

    os.makedirs(merged_folder, exist_ok=True)  # Create the folder if it doesn't exist

    # Write the matched rows to '<merged_folder>/4_merged_<video_id>.csv'
    merged_path = os.path.join(merged_folder, f'4_merged_{video_id}.csv')
    result_df.to_csv(merged_path, index=False)

print(f"Matching information based on frame numbers has been written to '{merged_folder}' folder")


Processing csv files: 100%|██████████| 30/30 [00:10<00:00,  2.77it/s]

Matching information based on frame numbers has been written to '4_merged' folder





In [17]:
import pandas as pd
import os
from tqdm import tqdm
import numpy as np

# Input folder (matched centroids per video) and output folder (measurements per video)
merged_folder = r'4_merged'
measured_folder = r'5_measured'

# Time between two consecutive rows; with 1 the velocity is expressed per row step.
# Adjust this if the real time interval between frames is known.
time_interval = 1


def euclidean_distance(point1, point2):
    """Return the Euclidean distance between two coordinate arrays."""
    return np.linalg.norm(point2 - point1)


# List all files in the folder
file_names = os.listdir(merged_folder)

# print(file_names)
for file_name in tqdm(file_names, desc='Processing csv files'):
    if file_name.endswith('.csv'):

        # Read the CSV file
        df = pd.read_csv(os.path.join(merged_folder, file_name))

        # Convert the centroid text (e.g. "(0.12, 0.34)") to a numpy array.
        # NOTE(review): eval() executes whatever text is stored in the CSV.
        # This is only acceptable because the files are produced by the earlier
        # steps; consider ast.literal_eval if the data can come from elsewhere.
        df['centroid'] = df['centroid'].apply(lambda x: np.array(eval(x)))

        # One entry is appended per row, plus a final one after the loop
        total_distances = []
        prev_xy = None
        current_interval_distance = 0

        # Iterate through each row in the DataFrame
        for index, row in df.iterrows():

            # Store the step distance computed while handling the previous row,
            # then start a new step. As a result the value stored for a row is
            # the step that ENDED at the previous row.
            # NOTE(review): confirm that this one-row lag is intended.
            total_distances.append(current_interval_distance)
            current_interval_distance = 0

            # Only rows of class 0 contribute to the distance
            if row['Class ID'] == 0:
                # The first class-0 row only provides the starting position
                if prev_xy is None:
                    prev_xy = row['centroid']
                    continue

                # Distance travelled since the previous class-0 row
                distance = euclidean_distance(prev_xy, row['centroid'])
                prev_xy = row['centroid']
                current_interval_distance += distance

        # Append the last step so that np.diff yields one value per row
        total_distances.append(current_interval_distance)

        # Difference between consecutive step distances, per time interval
        velocities = np.diff(total_distances) / time_interval

        # One result row per input row
        result_df = pd.DataFrame({
            'frame_number': df['frame_number'],  # Use frame_number from the original DataFrame
            'Distance': total_distances[:-1],  # Exclude the last step's distance
            'Velocity': velocities
        })

        # The identifier is the last '_'-separated part of the name without extension
        video_id = os.path.splitext(file_name)[0].split('_')[-1]

        os.makedirs(measured_folder, exist_ok=True)  # Create the folder if it doesn't exist

        # Save the result to '<measured_folder>/5_measured_<video_id>.csv'
        measured_path = os.path.join(measured_folder, f'5_measured_{video_id}.csv')
        result_df.to_csv(measured_path, index=False)

# The f prefix was missing, so the message printed the literal '{measured_folder}'
print(f"Results saved to '{measured_folder}' folder")


Processing csv files: 100%|██████████| 30/30 [00:03<00:00,  9.70it/s]

Results saved to '{measured_folder}' folder





In [29]:
import os
import pandas as pd

def merge_and_sort_csv(source_folder, target_folder, output_folder):
    """Stack same-named CSV files from two folders and sort them by frame number.

    For every '.csv' file in ``source_folder``, the file with the same name is
    read from ``target_folder``; the rows of both are concatenated, sorted by
    the 'frame_number' column and written to
    ``<output_folder>/6_combined_<video_id>.csv``, where ``video_id`` is the
    last '_'-separated part of the file name without its extension.

    Args:
        source_folder: Folder listed for CSV files.
        target_folder: Folder expected to hold a file of the same name.
        output_folder: Destination folder, created when a file is written.
    """
    csv_names = [name for name in os.listdir(source_folder) if name.endswith('.csv')]

    for filename in tqdm(csv_names, desc='Processing csv files'):
        # Source rows first, then the rows of the matching target file
        frames = [
            pd.read_csv(os.path.join(source_folder, filename)),
            pd.read_csv(os.path.join(target_folder, filename)),
        ]
        combined_df = pd.concat(frames, ignore_index=True).sort_values(by='frame_number')

        # Identifier used in the output file name
        stem, _ = os.path.splitext(filename)
        video_id = stem.rsplit('_', 1)[-1]

        os.makedirs(output_folder, exist_ok=True)  # Create the folder if it doesn't exist

        combined_path = os.path.join(output_folder, f'6_combined_{video_id}.csv')
        combined_df.to_csv(combined_path, index=False)
            
# Measurements of this run, measurements of the other data set, and the
# folder receiving the combined files
source_folder_path = r'5_measured'
target_folder_path = r'data1\5_measured'
output_folder_path = r'6_mixed'

# Combine every CSV file of the source folder with its counterpart
merge_and_sort_csv(
    source_folder_path,
    target_folder_path,
    output_folder_path,
)


Processing csv files: 100%|██████████| 30/30 [00:00<00:00, 72.81it/s]


In [1]:
# Use velocity and distance instead of distance only
import os
import pandas as pd
from joblib import Parallel, delayed
import math

# Input folder (combined measurements) and output folder (selected frame numbers)
measured_folder = r'6_mixed'
threshold_folder = r'7_threshold'

# Fraction of rows to select per file: an overall 0.15 divided by the number of tools
number_of_tool = 2
percent_sample = 0.15 / number_of_tool
print(percent_sample)

def process_and_write(file_name):
    """Select frame numbers of one measurement file and write them to disk.

    Reads ``<measured_folder>/<file_name>`` (columns 'Distance', 'Velocity'
    and 'frame_number' are used), picks row indices as described below and
    writes the matching frame numbers, one per line under a 'frame_number'
    header, to ``<threshold_folder>/7_threshold_<video_id>.csv``.
    Names that do not end with '.csv' are ignored. Nothing is returned.

    Selection: the target count is ``ceil(percent_sample * number_of_rows)``.
    If the file has fewer rows than that, every row is selected. Otherwise
    rows are scanned in order while accumulating ``Distance + |Velocity|``;
    whenever ``1 / accumulated_sum`` drops to the threshold or below, the
    current row is selected and the accumulation restarts at the next row.
    The threshold starts at 0.01 and grows by 0.01 per full scan until the
    number of selected rows reaches the target count.

    Relies on the module-level names ``measured_folder``, ``threshold_folder``
    and ``percent_sample``.
    """
    if file_name.endswith('.csv'):

        # Read the CSV file
        df = pd.read_csv(os.path.join(measured_folder,file_name))

        # Columns combined by the selection criterion
        distances = df['Distance']
        velocities = df['Velocity']

        print('filename:',file_name,' number of samples:', len(distances))

        # Number of row indices to select for this file
        desired_indices = math.ceil(percent_sample * len(distances))

        # Filter rows where abs(velocity) > 0
        # filtered_df = df[abs(df['Velocity']) > 0]
        # use all data
        filtered_df = df

        # Number of candidate rows (all rows, since no filter is applied above)
        count_greater_than_zero = filtered_df.shape[0]

        # Debug output: True when there are fewer candidates than requested
        print(count_greater_than_zero < desired_indices)

        if count_greater_than_zero < desired_indices:
            # Fewer candidates than requested: select every candidate row
            indices_where_condition_met = []

            # Collect the index label of each candidate row
            for index, row in filtered_df.iterrows():
                indices_where_condition_met.append(index)
        else:
            # Starting threshold; raised by 0.01 after every full scan
            threshold_value = 0.01#1  #

            # Initialize variables
            indices_where_condition_met = []  # List to store indices where condition is met
            index = 0

            # Repeat full scans until enough indices are selected
            while len(indices_where_condition_met) < desired_indices:
                # Start every scan from scratch
                indices_where_condition_met = []
                index = 0

                # Loop until all indices are read
                while index < len(distances):
                    # Small start value so that 1 / current_sum never divides by zero
                    current_sum = 0  + 1e-6

                    # Accumulate rows starting from the current index
                    for i in range(index, len(distances)):
                        # Add this row's distance and absolute velocity to the running sum
                        current_sum = current_sum + distances[i] + abs(velocities[i])  # 1e-6 is only added once, at the start

                        # Enough accumulated motion: the inverse of the sum is at or below the threshold
                        if (1 / current_sum) <= threshold_value:
                            # Add the index where the condition is met to the list
                            indices_where_condition_met.append(i)

                            # Restart the accumulation right after this row
                            index = i + 1
                            break
                    else:
                        # No break: the remaining rows never met the condition, so the scan is over
                        index = len(distances)

                    # print((1 / current_sum))
                    # print('len',len(indices_where_condition_met))
                print('number of indices:',len(indices_where_condition_met), '/ desired len:',desired_indices)
                # Adjust the threshold value for the next iteration
                threshold_value += 0.01  # Increase the threshold by a small amount

            # The printed threshold already includes the increment applied after the last scan
            print("Threshold value:", threshold_value)
            print("Number of indices meeting the condition:", len(indices_where_condition_met))

        # The identifier is the last '_'-separated part of the name without extension
        video_id = os.path.splitext(file_name)[0].split('_')[-1]

        os.makedirs(threshold_folder, exist_ok=True)  # Create the folder if it doesn't exist

        # Define the output file path + .csv
        threshold_path = os.path.join(threshold_folder, f'7_threshold_{video_id}.csv')

        # Save the frame numbers of the selected rows
        with open(threshold_path, "w") as output_file:
            # Write the header line
            output_file.write('frame_number\n')

            # One frame number per selected row
            for idx in indices_where_condition_met:
                # Selected indices are used as positions into the 'frame_number' column
                output_file.write(str(df['frame_number'].iloc[idx]) + '\n')

    # return  threshold_value

# Every entry of the input folder; process_and_write ignores non-CSV names
file_names = os.listdir(measured_folder)

# -1 lets joblib use all available CPU cores
num_cores = -1

# One delayed call per file, executed in parallel.
# Serial alternative for debugging: call process_and_write(name) in a plain loop.
tasks = (delayed(process_and_write)(name) for name in file_names)
Parallel(n_jobs=num_cores)(tasks)




0.075


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [2]:
import pandas as pd
from tqdm import tqdm

# Inputs: per-video sorted manifests and the selected frame numbers
sorted_folder = r'3_sorted'
threshold_folder = r'7_threshold'

# Output: manifest lines restricted to the selected frames
filtered_folder = r'8_filtered'

# Function to extract frame number from filename
def extract_frame_number(filename):
    """Parse the frame number found after the first 'Frame' in ``filename``.

    Returns the integer formed by the text between the first 'Frame' marker
    and the following '.' (or following 'Frame' marker), or None when
    ``filename`` contains no 'Frame' marker. A non-numeric text makes ``int``
    raise ValueError.
    """
    segments = filename.split('Frame')
    # A single segment means the marker is absent
    return int(segments[1].split('.', 1)[0]) if len(segments) > 1 else None

# Every entry of the sorted-manifest folder; non-CSV names are skipped below
file_names = os.listdir(sorted_folder)

# print(file_names)
for file_name in tqdm(file_names, desc='Processing csv files'):
    if not file_name.endswith('.csv'):
        continue

    # Per-video manifest, with the frame number parsed out of every image name
    first_file = os.path.join(sorted_folder, file_name)
    df_first = pd.read_csv(first_file)
    df_first['frame_number'] = df_first['FileName'].apply(extract_frame_number)

    # The identifier is the last '_'-separated part of the name without extension
    video_id = os.path.splitext(file_name)[0].split('_')[-1]

    # Frame numbers selected for this video
    second_file = os.path.join(threshold_folder, f'7_threshold_{video_id}.csv')
    df_second = pd.read_csv(second_file)
    frame_numbers_to_keep = df_second['frame_number'].tolist()

    # Keep the manifest rows of the selected frames, one row per file name,
    # and drop the helper column so only the original fields are written
    keep_mask = df_first['frame_number'].isin(frame_numbers_to_keep)
    df_filtered = (
        df_first[keep_mask]
        .drop_duplicates(subset='FileName')
        .drop(columns='frame_number')
    )

    os.makedirs(filtered_folder, exist_ok=True)  # Create the folder if it doesn't exist

    # Despite the .csv extension, the file uses '---' between the fields
    filtered_path = os.path.join(filtered_folder, f'8_filtered_{video_id}.csv')
    with open(filtered_path, 'w') as f:
        f.writelines(
            '---'.join(str(value) for value in row.tolist()) + '\n'
            for _, row in df_filtered.iterrows()
        )

print(f"Filtered DataFrame based on matching frame numbers has been written to '{filtered_folder}' in text format with custom formatting.")

Processing csv files: 100%|██████████| 30/30 [00:00<00:00, 46.40it/s]

Filtered DataFrame based on matching frame numbers has been written to '8_filtered' in text format with custom formatting.





In [34]:
# # Add 
# import os
# import random

# # Function to extract video name from CSV file name
# def extract_video_name(csv_file):
#     parts = csv_file.split('_')
#     video_name = parts[-1].replace('.csv', '')
#     return video_name

# # Define percentage of samples
# percent_sample = 0.15

# # Define pattern
# class_pattern = '---0---'

# # Specify the folder containing CSV files
# folder_path = r'8_filtered'

# # List all CSV files in the folder
# csv_files = [file for file in os.listdir(folder_path) if file.endswith('.csv')]

# # Specify the train.txt file path
# train_file_path = r'train.txt'

# # # Process each CSV file in the folder
# # csv_file = r'7_filtered_GJ01182018MP.csv'

# # if(1==1):
# for csv_file in csv_files:
#     # Get the full path of the CSV file
#     csv_file_path = os.path.join(folder_path, csv_file)
    
#     # Get the video name from the CSV file name
#     video_name = extract_video_name(csv_file)
    
#     # List to store lines with matching video name and class 0
#     matching_lines = []
    
#     # Read the train.txt file and search for lines
#     with open(train_file_path, 'r') as train_file:
#         for line in train_file:
#             if video_name in line and class_pattern in line:
#                 matching_lines.append(line.strip())         
    
#     if(len(matching_lines) > 0):
#         # Calculate the number of lines to select (5% of matching lines)
#         num_lines_to_select = max(1, int(percent_sample * len(matching_lines)))
        
#         # Randomly select 5% of the matching lines
#         random_selected_lines = random.sample(matching_lines, num_lines_to_select)
        
#         # Append the randomly selected lines to the target CSV file
#         with open(csv_file_path, 'a') as target_file:
#             for line in random_selected_lines:
#                 target_file.write(line + '\n')
        
#         print(f"Successfully appended {num_lines_to_select} random lines to {csv_file_path}.")
#     else:
#         print(f"Skip {csv_file_path}.")

# print("Process completed for all CSV files in the folder.")


In [3]:
import os
import random

# Folder containing the filtered per-video files ('---'-separated lines)
folder_path = r'8_filtered'

# Output file receiving all lines in shuffled order
output_file_path = r'data2Tool_av_0_15_v1.txt'

# Fixed seed so that re-running the cell reproduces the same output file;
# change the value to draw a different ordering
shuffle_seed = 42

# Collect the lines of every CSV file
all_lines = []

# os.listdir returns names in arbitrary order; sorting makes the input order
# (and therefore the seeded shuffle) reproducible
for file_name in sorted(os.listdir(folder_path)):
    # Check if the file is a CSV file
    if file_name.endswith('.csv'):
        file_path = os.path.join(folder_path, file_name)

        # Read all lines from the CSV file
        with open(file_path, 'r') as file:
            lines = file.readlines()

        # A last line without a newline would merge with the next line once
        # shuffled, so terminate every line explicitly
        all_lines.extend(line if line.endswith('\n') else line + '\n' for line in lines)

# Shuffle all lines with a dedicated, seeded generator
random.Random(shuffle_seed).shuffle(all_lines)

# Write the shuffled lines to the output file
with open(output_file_path, 'w') as output_file:
    output_file.writelines(all_lines)

print(f"Shuffled lines from CSV files written to: {output_file_path}")


Shuffled lines from CSV files written to: data2Tool_av_0_15_v1.txt


In [113]:
# # Extract at interval
# import pandas as pd

# def extract_data_at_interval(file_path, interval_percentage):
#     # Read the train.txt file and count total lines
#     with open(file_path, 'r') as file:
#         lines = file.readlines()
#         total_lines = len(lines)
    
#     # Calculate number of lines to extract (5% of total lines)
#     lines_to_extract = int(total_lines * interval_percentage)
    
#     # Determine the fixed interval for data extraction
#     interval = total_lines // lines_to_extract
    
#     # Initialize a list to store extracted data
#     extracted_data = []

#     # Iterate through lines and extract data at the fixed interval
#     for i in range(0, total_lines, interval):
#         line = lines[i].strip()
#         parts = line.split('---')
#         filename = parts[0]
#         label = int(parts[1])
#         value1 = int(parts[2])
#         value2 = int(parts[3])
#         extracted_data.append((filename, label, value1, value2))
    
#     return extracted_data

# def write_extracted_data_to_file(extracted_data, output_file_path):
#     # Write extracted data to the specified output file
#     with open(output_file_path, 'w') as output_file:
#         for data in extracted_data:
#             line = '---'.join([str(elem) for elem in data]) + '\n'
#             output_file.write(line)

# # Example usage:
# file_path = 'train.txt'
# interval_percentage = 0.2  # % of the data
# output_file_path = 'train_baseline_0_2_v1.txt'  # Path to the output file

# # Extract data from train.txt
# extracted_data = extract_data_at_interval(file_path, interval_percentage)

# # Write extracted data to the output file
# write_extracted_data_to_file(extracted_data, output_file_path)

# print(f"Extracted data has been written to {output_file_path}.")


Extracted data has been written to train_baseline_0_2_v1.txt.


In [4]:
# File to analyse; other candidates used before:
# 'train_baseline.txt', 'train.txt', 'train_baseline_0_3_v2.txt'
file_path = 'data2Tool_av_0_15_v1.txt'

# Number of lines seen per class label (labels are kept as text)
class_counts = {}

with open(file_path, 'r') as file:
    for line in file:
        parts = line.strip().split('---')

        # Lines without a second '---'-separated field carry no class
        if len(parts) < 2:
            continue

        # The class label is the second field
        class_number = parts[1].strip()
        class_counts[class_number] = class_counts.get(class_number, 0) + 1

# Print the class counts, ordered by label text
print("Class Counts:")
for class_num in sorted(class_counts):
    print(f"Class {class_num}: {class_counts[class_num]} samples")


Class Counts:
Class 0: 734 samples
Class 1: 334 samples
Class 2: 1245 samples
Class 3: 622 samples
Class 4: 1742 samples
Class 5: 1422 samples
Class 6: 997 samples
