Import libraries

In [None]:
import ast
import glob
import gzip
import os
from typing import List, Optional, Tuple

import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.signal import find_peaks
from tqdm import tqdm

Add path to the folder

The folder contains one file per fish per trial, with 4 trials per fish.
The first two characters of the filename are the fish ID and the 5th character is the trial number.
A few fish have only 3 trials.

In [None]:

# Folder that is searched for the "*.csv.gz" recordings loaded further down.
path = "/home/kkumari/PhD/fish-data/long-term-free-swim/"
# Alternative location, kept commented out:
# path = "C:/PhD/long_term_free_swim"

In [None]:
def load_and_collate_data(
    all_files: List[str],
    desired_cols: List[str],
    max_trials: int = 3,
    nrows: Optional[int] = 1000,
) -> dict:
    """Read gzipped trial CSVs and group them by fish ID.

    The fish ID is the first two characters of each file's base name. Files
    are consumed in the order given; for every fish only the first
    ``max_trials`` files are kept, and any further file for that fish is
    skipped without being opened.

    Args:
        all_files: Paths to the gzip-compressed CSV files.
        desired_cols: Names of the columns to load from each file.
        max_trials: Maximum number of trials kept per fish. The default of 3
            matches the previously hard-coded limit.
        nrows: Number of data rows read from each file, forwarded to
            ``pd.read_csv``. The default of 1000 matches the previously
            hard-coded value; pass ``None`` to read whole files.

    Returns:
        A dict mapping fish ID to ``{'df': [...], 'files': [...]}``, where
        ``'df'`` holds the loaded DataFrames and ``'files'`` the matching
        paths, in load order.
    """
    fish_data = {}

    for file in tqdm(all_files, desc="Processing files"):
        fish_id = os.path.basename(file)[:2]
        entry = fish_data.setdefault(fish_id, {'df': [], 'files': []})

        # Surplus trials are discarded anyway, so skip them before paying
        # for decompression and parsing.
        if len(entry['df']) >= max_trials:
            continue

        with gzip.open(file, 'rb') as f:
            df = pd.read_csv(f, usecols=desired_cols, nrows=nrows)

        entry['df'].append(df)
        entry['files'].append(file)

    return fish_data


In [None]:
# cartesian to spherical coordinates
def cart2sph(x,y,z):
    """Convert Cartesian coordinates to spherical ones.

    Works element-wise on scalars or array-likes accepted by NumPy.

    Returns:
        A tuple ``(azimuth, elevation, R)``: the angle of (x, y) in the
        xy-plane, the angle of z above that plane (both in radians, from
        ``np.arctan2``), and the distance from the origin.
    """
    planar_sq = x**2 + y**2
    radius = np.sqrt(planar_sq + z**2)
    elev = np.arctan2(z, np.sqrt(planar_sq))
    azim = np.arctan2(y, x)
    return azim, elev, radius

In [None]:
def moving_avg_interpolate(df, window_size=11, interpolate_method='pad'):
    """Smooth data with a centered rolling mean and return it with its first difference.

    Args:
        df: pandas Series or DataFrame to smooth.
        window_size: Number of samples in the centered rolling-mean window.
        interpolate_method: How NaNs left in the smoothed data are filled.
            ``'pad'``/``'ffill'`` forward-fill and ``'bfill'``/``'backfill'``
            back-fill; any other value is passed to ``interpolate(method=...)``.

    Returns:
        A tuple ``(sdf, ddf)``: the smoothed, gap-filled data and its
        sample-to-sample difference with NaNs back-filled.
    """
    smoothed = df.rolling(window_size, center=True).mean()

    # Filling via interpolate(method='pad'/'bfill') is deprecated in pandas;
    # ffill()/bfill() are the supported equivalents.
    if interpolate_method in ('pad', 'ffill'):
        sdf = smoothed.ffill()
    elif interpolate_method in ('bfill', 'backfill'):
        sdf = smoothed.bfill()
    else:
        sdf = smoothed.interpolate(method=interpolate_method)

    # diff() leaves NaN at the start; back-fill so the result has no leading gap.
    ddf = sdf.diff().bfill()
    return sdf, ddf


In [None]:
def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
    # Calculate step sizes and find large steps
    df_diff = df.diff(periods=1, axis=0)
    steps = np.sqrt(df["fishx"]**2 + df["fishy"]**2 + df["fishz"]**2)
    df["steps"] = steps
    max_stepsize = 0.02
    large_steps = df['steps'] > max_stepsize
    count_large_steps = np.count_nonzero(large_steps)
    print(f"Number of large steps: {count_large_steps}")
    w = 10
    selected_columns = ['fishz', 'fishy', 'fishx']
    large_step_indices = large_steps[large_steps].index.values
    for i in range(count_large_steps):
        lsi = large_step_indices[i]
        df.loc[lsi-w:lsi+w, selected_columns] = np.nan
    
    # Modify data based on coordinate conversion and thresholds
    err = 0.001
    df.loc[df['fishz'] < - (0.09 + err), selected_columns] = np.nan
    df.loc[df['fishz'] > 0 + err, selected_columns] = np.nan
    zoffset = 0.11
    azimuth, elevation, R = cart2sph(df['fishx'], df['fishy'], df['fishz'] - zoffset)
    err = 0.005
    df.loc[R > 0.2 + err, selected_columns] = np.nan
    df.loc[R < 0.11 - err, selected_columns] = np.nan
    
    return df


In [None]:
def calculate_angles(df: pd.DataFrame, peaks: np.ndarray) -> Tuple[List[float], List[float]]:
    """Compute the unwrapped heading in the xy-plane for every frame.

    The heading is ``arctan2(dy, dx)`` of the frame-to-frame displacement,
    then unwrapped so that each value lies within pi of the previous one.

    Args:
        df: DataFrame with 'fishx' and 'fishy' columns.
        peaks: Positional indices at which to sample the unwrapped headings.

    Returns:
        A tuple ``(angles, angles_at_peaks)``: the unwrapped heading for every
        row of ``df`` and the subset taken at ``peaks``.
    """
    # diff() leaves NaN in the first row; back-fill it from the next value.
    # (.bfill() replaces the deprecated .interpolate(method='bfill').)
    dx = df["fishx"].diff().bfill()
    dy = df["fishy"].diff().bfill()

    angle_wrapped = np.arctan2(dy, dx)
    last = 0
    angles = []
    for phi in angle_wrapped:
        # Shift by whole turns until phi is within pi of the previous angle.
        while phi < last - np.pi:
            phi += 2 * np.pi
        while phi > last + np.pi:
            phi -= 2 * np.pi
        last = phi
        angles.append(phi)

    angles_at_peaks = [angles[i] for i in peaks]

    print(f"Number of angles: {len(angles)}")

    return angles, angles_at_peaks


def calculate_avg_velocity(velocity: pd.Series) -> float:
    """Print and return the median of ``velocity``.

    Note that, despite the function name and the printed label, the
    statistic used is the median, not the mean.
    """
    median_value = velocity.median()
    print(f"Average velocity: {median_value}")
    return median_value


def identify_peaks(velocity: pd.Series, fHz: int) -> Tuple[np.ndarray, dict]:
    """Locate peaks in a velocity trace with ``scipy.signal.find_peaks``.

    A peak must have a height between 0.1 and 0.5, a prominence of at least
    0.05, a width of at least ``round(fHz / 100)`` samples, and lie at least
    ``round(fHz / 10)`` samples away from its neighbours.

    Args:
        velocity: Velocity trace to search.
        fHz: Used to scale the minimum peak separation and width;
            presumably the sampling rate in frames per second (TODO confirm).

    Returns:
        A tuple ``(peaks, properties)``: the indices of the detected peaks
        and the properties dict returned by ``find_peaks``.
    """
    height = (0.1, 0.5)
    # find_peaks rejects distance < 1, which round() would produce for fHz < 5.
    frames_btw_2bouts = max(1, round(fHz / 10))
    bout_width = round(fHz / 100)
    prominence = 0.05
    peaks, properties = find_peaks(
        velocity,
        height=height,
        distance=frames_btw_2bouts,
        width=bout_width,
        prominence=prominence,
    )
    print(f"Number of peaks: {len(peaks)}")
    return peaks, properties


def calculate_dangles(angles_at_peaks: List[float]) -> np.ndarray:
    """Return the change in angle between consecutive peaks.

    The angles are reduced modulo 2*pi, shifted by -pi, unwrapped and
    differenced; each difference is then wrapped into ``[-pi, pi)``.

    Args:
        angles_at_peaks: Angles (radians) sampled at successive peaks.

    Returns:
        Array of wrapped differences, one fewer than the number of input
        angles.
    """
    full_turn = 2 * np.pi
    shifted = np.mod(angles_at_peaks, full_turn) - np.pi
    raw_steps = np.diff(np.unwrap(shifted))
    dangles = np.mod(raw_steps + np.pi, full_turn) - np.pi
    print(f"Number of dangles: {len(dangles)}")
    return dangles


In [None]:
def calculate_additional_variables(fish_data: dict, fHz: int) -> Tuple[dict, dict]:
    """Derive per-trial kinematic quantities for every fish.

    For each trial DataFrame in ``fish_data[fish_id]['df']`` this rebases
    'realtime' to start at zero, filters and smooths the positions, and
    computes velocity, velocity peaks, headings and heading changes.

    The trial DataFrames are modified in place ('realtime' is rebased,
    positions are filtered and smoothed, and 'steps', 'dx', 'dy', 'dz' columns
    are added), so calling this twice on the same ``fish_data`` smooths the
    data twice.

    Args:
        fish_data: Mapping of fish ID to a dict with a 'df' list of trial
            DataFrames holding 'fishx', 'fishy', 'fishz' and 'realtime'.
        fHz: Factor that scales per-frame displacement into velocity and sets
            the peak-detection spacing; presumably the frame rate in Hz
            (TODO confirm).

    Returns:
        A tuple ``(fish_data, dangles_dict)``. ``fish_data`` is the input dict,
        where each fish entry gains the keys 'angles', 'avg_velocity',
        'angles_at_peaks', 'peak_times', 'dangles', 'velocity' and 'peaks',
        each a list with one item per trial. ``dangles_dict`` maps fish ID to
        that fish's 'dangles' list.
    """
    dangles_dict = {}

    for fish_id, data in fish_data.items():
        # One accumulator per output key; each receives one entry per trial.
        per_trial = {
            'angles': [],
            'avg_velocity': [],
            'angles_at_peaks': [],
            'peak_times': [],
            'dangles': [],
            'velocity': [],
            'peaks': [],
        }

        for df in data['df']:
            # Express time relative to the first sample of the trial.
            df["realtime"] = df["realtime"] - df["realtime"].iloc[0]

            df = preprocess_data(df)

            # Smooth each coordinate and store its per-frame difference
            # as 'dx', 'dy' and 'dz'.
            for axis in ('x', 'y', 'z'):
                df[f'fish{axis}'], df[f'd{axis}'] = moving_avg_interpolate(df[f'fish{axis}'])

            velocity = np.sqrt(df["dx"] ** 2 + df["dy"] ** 2 + df["dz"] ** 2) / (1 / fHz)
            peaks, _properties = identify_peaks(velocity, fHz)
            per_trial['peaks'].append(peaks)
            per_trial['velocity'].append(velocity)

            angles, angles_at_peaks = calculate_angles(df, peaks)
            per_trial['angles'].append(angles)
            per_trial['angles_at_peaks'].append(angles_at_peaks)

            per_trial['avg_velocity'].append(calculate_avg_velocity(velocity))

            per_trial['peak_times'].append([df["realtime"].iloc[i] for i in peaks])

            per_trial['dangles'].append(calculate_dangles(angles_at_peaks))

        data.update(per_trial)
        dangles_dict[fish_id] = per_trial['dangles']

    return fish_data, dangles_dict

In [None]:
# Usage
# Collect every "*.csv.gz" file directly under `path`, sorted by name.
all_files = sorted(glob.glob(os.path.join(path, "*.csv.gz")))
# Columns loaded from each file.
desired_cols = ['fishx', 'fishy', 'fishz', 'realtime']
# Load the files and group the resulting DataFrames by fish ID.
fish_data = load_and_collate_data(all_files, desired_cols)

In [None]:
# Bare expression: in a notebook cell this displays the collated data.
fish_data

In [None]:
# Adds the derived per-trial quantities to fish_data; `variables` receives the
# per-fish lists of dangles returned as the second value.
fish_data, variables = calculate_additional_variables(fish_data, fHz=100)

In [None]:
# Bare expression: in a notebook cell this displays the per-fish dangles.
variables

In [None]:
# def calculate_additional_variables(fish_data: dict, fHz: int):
#     # This function takes a dictionary of fish data and a frequency (fHz) as input.
#     # It calculates additional variables based on the fish data and adds them to the dictionary.

#     dangles_dict = {}  # Initialize a dictionary to hold the dangles data for each fish

#     # Iterate over each fish ID and its corresponding data in the fish_data dictionary
#     for fish_id, data in fish_data.items():
#         # Iterate over each DataFrame in the 'df' list of the current fish ID
#         for df in data['df']:
#             # Calculate the time elapsed from the start of recording for each timestamp
#             df["realtime"] = df["realtime"] - df["realtime"].iloc[0]

#             # Preprocessing code integrated here
#             # Calculate step sizes and find large steps
#             df = df.diff(periods=1, axis=0)
#             steps = np.sqrt(df["fishx"]**2 + df["fishy"]**2 + df["fishz"]**2)
#             df["steps"] = steps
#             max_stepsize = 0.02  # m : 1.5 body lengths between successive frames
#             large_steps = df['steps'] > max_stepsize
#             count_large_steps = np.count_nonzero(large_steps)
#             w = 10  # frames (represents half the window for deletion)
#             selected_columns = ['fishz', 'fishy', 'fishx']
#             large_step_indices = large_steps[large_steps].index.values
#             for i in range(0, count_large_steps):
#                 lsi = large_step_indices[i]
#                 df.loc[lsi-w:lsi+w, selected_columns] = np.nan
            
#             # Modify data based on coordinate conversion and thresholds
#             err = 0.001  # accepted error
#             df.loc[df['fishz'] < - (0.09 + err), selected_columns] = np.nan
#             df.loc[df['fishz'] > 0 + err, selected_columns] = np.nan
#             zoffset = 0.11
#             azimuth, elevation, R = cart2sph(df['fishx'], df['fishy'], df['fishz'] - zoffset)
#             err = 0.005  # accepted error
#             df.loc[R > 0.2 + err, selected_columns] = np.nan
#             df.loc[R < 0.11 - err, selected_columns] = np.nan
            
#             # Continue with original code
#             # Calculate the differences in position (velocity) in each dimension
#             dx = df["fishx"].diff().interpolate(method='bfill')
#             dy = df["fishy"].diff().interpolate(method='bfill')
#             dz = df["fishz"].diff().interpolate(method='bfill')

#             # Add the velocity components to the DataFrame
#             df["dx"] = dx
#             df["dy"] = dy
#             df["dz"] = dz

#             # Calculate the angles of motion (wrapped between -pi and pi)
#             angle_wrapped = np.arctan2(dy, dx)
#             last = 0
#             angles = []
#             for phi in angle_wrapped:
#                 while phi < last - np.pi:
#                     phi += 2 * np.pi
#                 while phi > last + np.pi:
#                     phi -= 2 * np.pi
#                 last = phi
#                 angles.append(phi)

#             # Store the angles in the fish_data dictionary under the current fish ID
#             data['angles'] = angles

#             # Calculate average velocity for each fish
#             dt = 1 / fHz
#             velocity = np.sqrt(df["dx"] ** 2 + df["dy"] ** 2 + df["dz"] ** 2) / dt
#             avg_velocity = velocity.median()
#             data['avg_velocity'] = avg_velocity

#             # Find peaks in the velocity signal representing bouts of movement
#             height = (0.1, 0.5)
#             frames_btw_2bouts = round(fHz / 10)
#             bout_width = round(fHz / 100)
#             prominence = 0.05
#             peaks, _ = find_peaks(velocity, height=height, distance=frames_btw_2bouts, width=bout_width, prominence=prominence)

#             # Extract the angles and peak times corresponding to the identified peaks
#             angles_at_peaks = [angles[i] for i in peaks]
#             data['angles_at_peaks'] = angles_at_peaks
#             data['peak_times'] = [df["realtime"].iloc[i] for i in peaks]
#             # Normalization of peak angles
#             angles_at_peaks_normalized = np.mod(angles_at_peaks, 2 * np.pi) - np.pi
#             # Unwrap normalized angles
#             angles_at_peaks_unwrapped = np.unwrap(angles_at_peaks_normalized)
#              # Calculate differences between consecutive unwrapped angles
#             angles_at_peaks_diff = np.diff(angles_at_peaks_unwrapped)
#             # Apply modulo arithmetic to ensure range between -π and π
#             angles_at_peaks_diff = np.mod(angles_at_peaks_diff + np.pi, 2 * np.pi) - np.pi
#             dangles = angles_at_peaks_diff

#             dangles_dict[fish_id] = dangles  # Add the dangles data to the dictionary for the current fish ID

#     return dangles_dict  # Return the dictionary with dangles data for each fish


In [None]:
# def calculate_turning_angle_properties(dangles):
#     # Calculate the number of clockwise and counterclockwise turns
    
#     counterclockwise_turns = np.sum(dangles > 0)
#     clockwise_turns = np.sum(dangles < 0)

#     # Calculate probability of clockwise and counterclockwise turns
#     probability_counterclockwise_turns = counterclockwise_turns / (counterclockwise_turns + clockwise_turns)
#     probability_clockwise_turns = clockwise_turns / (counterclockwise_turns + clockwise_turns)

#     # Get sequence of right and left turns as 1 and -1
#     turns = np.sign(dangles)
    
#     def streak_lengths(turns):
#         if len(turns) == 0:
#             return np.array([])  # return empty array for empty input

#         streaks = []
#         current_streak = 1  # start with a streak of 1

#         for i in range(1, len(turns)):
#             if turns[i] == turns[i - 1]:  # if current turn is same as previous
#                 current_streak += 1  # increment streak count
#             else:  # if current turn is different
#                 streaks.append(current_streak)  # add the streak to the list
#                 current_streak = 1  # reset streak count

#         streaks.append(current_streak)  # add the last streak
#         return np.array(streaks)
    
#     streaks = streak_lengths(turns)
    
#     # Now let's bundle all this data into a dictionary and return it
#     turning_properties = {
#         'counterclockwise_turns': counterclockwise_turns,
#         'clockwise_turns': clockwise_turns,
#         'probability_counterclockwise_turns': probability_counterclockwise_turns,
#         'probability_clockwise_turns': probability_clockwise_turns,
#         'turns': turns,
#         'streaks': streaks
#     }

#     return turning_properties

# dangles_dict = calculate_additional_variables(fish_data, fHz=100)

# turning_properties_dict = {}
# for fish_id, dangles in dangles_dict.items():
#     turning_properties = calculate_turning_angle_properties(dangles)
#     turning_properties_dict[fish_id] = turning_properties
# turning_properties_dict

In [None]:
# def plot_streak_distribution(turning_properties_dict):
#     for fish_id, turning_properties in turning_properties_dict.items():
#         streaks = turning_properties['streaks']

#         # Plot the distribution of streaks
#         plt.hist(streaks, bins='auto')
#         plt.xlabel('Streak Length')
#         plt.ylabel('Frequency')
#         plt.title(f'Fish ID {fish_id} - Streak Distribution')
#         plt.show()

In [None]:
# def generate_fake_fish_streaks(num_streaks, max_streak_length, avg_streak_number):
#     # Generate random turns (-1: left, 1: right)
#     turns = np.random.choice([-1, 1], size=num_streaks)

#     # Calculate streak lengths
#     streak_lengths = []
#     current_streak = 0
#     for turn in turns:
#         if turn == 0:
#             current_streak += 1
#         else:
#             streak_lengths.append(current_streak)
#             current_streak = 0
#     streak_lengths.append(current_streak)

#     # Truncate streak lengths if they exceed the maximum streak length
#     streak_lengths = [min(streak, max_streak_length) for streak in streak_lengths]

#     # Adjust the length of streak_lengths to match the average streak number
#     current_streak_number = len(streak_lengths)
#     if current_streak_number < avg_streak_number:
#         additional_streaks = avg_streak_number - current_streak_number
#         additional_lengths = np.random.randint(1, max_streak_length + 1, size=additional_streaks)
#         streak_lengths += list(additional_lengths)

#     return streak_lengths


# def calculate_average_streak_number(turning_properties_dict):
#     total_streak_number = sum([len(props['streaks']) for props in turning_properties_dict.values()])
#     total_fish_number = len(turning_properties_dict)
#     average_streak_number = total_streak_number // total_fish_number

#     return average_streak_number


# # Calculate the average streak number from the real fish data
# average_streak_number = calculate_average_streak_number(turning_properties_dict)

# # Specify the parameters for the fake fish streaks
# num_streaks = average_streak_number
# max_streak_length = 10

# # Generate fake fish streak lengths with the adjusted number of streaks
# fake_fish_streaks = generate_fake_fish_streaks(num_streaks, max_streak_length, average_streak_number)

# # Plot the streak length distributions with the fake fish streak lengths included
# plot_streak_distribution(turning_properties_dict, fake_fish_streaks)

In [None]:
# # Specify the directory path where you want to save the CSV files
# output_directory = "/home/kkumari/PhD/fish-data/output-long-term-free-swim/"

# import pandas as pd
# import numpy as np

# def save_turning_properties_as_csv(turning_properties_dict, output_directory):
#     # Iterate over the dictionary items
#     for fish_id, turning_properties in turning_properties_dict.items():
#         # Convert the arrays to string representations with proper formatting
#         turning_properties_str = {key: np.array2string(value, separator=', ') for key, value in turning_properties.items()}

#         # Specify the file path for the current fish ID
#         output_path = output_directory + f"turning_properties_{fish_id}.csv"

#         # Save the dictionary as a CSV file
#         df = pd.DataFrame.from_dict(turning_properties_str, orient='index')
#         df.to_csv(output_path)

# def load_turning_properties_from_csv(file_path):
#     # Read the CSV file into a DataFrame
#     df = pd.read_csv(file_path, index_col=0)

#     # Convert the string representations back to arrays
#     turning_properties = {}
#     for key in df.index:
#         value_str = df.loc[key].values[0]
#         value = np.fromstring(value_str[1:-1], sep=', ')
#         turning_properties[key] = value

#     return turning_properties

# # Save the dictionary as CSV files
# save_turning_properties_as_csv(turning_properties_dict, output_directory)

# # Load the CSV files back into a dictionary with correct array data
# loaded_turning_properties_dict = {}
# for fish_id in turning_properties_dict:
#     file_path = output_directory + f"turning_properties_{fish_id}.csv"
#     turning_properties = load_turning_properties_from_csv(file_path)
#     loaded_turning_properties_dict[fish_id] = turning_properties

In [None]:
# read all csv files in the directory
# all_dict_files = sorted(glob.glob(os.path.join(output_directory, "*.csv")))

In [None]:
# def plot_streak_distribution(all_dict_files):
#     # Iterate over the CSV files
#     for file in all_dict_files:
#         # Read the CSV file into a DataFrame
#         df = pd.read_csv(file)

#         # Get the fish ID from the file name
#         fish_id = file.split('/')[-1].split('.')[0].split('_')[-1]

#         # Check if 'streaks' column exists in the DataFrame
#         if 'streaks' not in df.columns:
#             print(f"Streaks column not found in {file}. Skipping...")
#             continue

#         # Extract the streaks column
#         streaks = df['streaks'].values

#         # Plot the distribution of streaks
#         plt.hist(streaks, bins='auto')
#         plt.xlabel('Streak Length')
#         plt.ylabel('Frequency')
#         plt.title(f'Fish ID {fish_id} - Streak Distribution')
#         plt.show()

In [None]:
# plot_streak_distribution(loaded_turning_properties_dict)

In [None]:
#  read first file and show the data
# NOTE(review): `all_dict_files` is only assigned in a commented-out cell of
# this notebook, so this line raises NameError unless that cell (and the one
# defining `output_directory`) is restored and run first.
df = pd.read_csv(all_dict_files[0])

In [None]:
# def plot_streak_distribution(all_dict_files):
#     # Iterate over the CSV files
#     for file in all_dict_files:
#         # Read the CSV file into a DataFrame
#         df = pd.read_csv(file)

#         # Get the fish ID from the file name
#         fish_id = file.split('/')[-1].split('.')[0].split('_')[-1]

#         # Check if 'streaks' column exists in the DataFrame
#         if 'streaks' not in df.columns:
#             print(f"Streaks column not found in {file}. Skipping...")
#             continue

#         # Extract the streaks column
#         streaks = df['streaks'].values

#         # Plot the distribution of streaks
#         plt.hist(streaks, bins='auto')
#         plt.xlabel('Streak Length')
#         plt.ylabel('Frequency')
#         plt.title(f'Fish ID {fish_id} - Streak Distribution')
#         plt.show()


In [None]:
# plot_streak_distribution(all_dict_files)

In [None]:
# def initialize_figure(num_plots: int, figsize: tuple=(12,10)) -> plt.Figure:
#     fig, ax = plt.subplots(num_plots, 1, figsize=figsize)
#     return fig, ax
