## This code takes in .csv outputs from the TweetyBERT decoder, then saves the data into cell arrays inside of .mat files, which can be then input into Jeff Markowitz's probabalistic suffix tree (PST). PST pipeline can be found here: https://github.com/jmarkow/pst


### Currently, TweetyBERT assumes ALL recordings were taken in 2024. This cell corrects that bug by using the .json file I generated for each dataset that contains the dates of recordings (found from the file properties), as well as the lesion date (that I input manually)

In [242]:
import pandas as pd
import numpy as np

recording_file_path_name = '/Users/mirandahulsey-vincent/Documents/allPythonCode/syntax_analysis/DECODER_csvs_and_jsons/USA5326_decoded.csv'
decoder_dataframe = pd.read_csv(recording_file_path_name)
path_to_json_file_with_dates = '/Users/mirandahulsey-vincent/Documents/allPythonCode/syntax_analysis/DECODER_csvs_and_jsons/USA5326_creation_data.json'
json_data = pd.read_json(path_to_json_file_with_dates)

print(decoder_dataframe.head(2))

from datetime import datetime
json_surgery_date = json_data['treatment_date'].iloc[0]  # Get the first date if it's a Series
# Convert string to datetime object
date_obj = datetime.strptime(json_surgery_date, "%Y-%m-%d")
# Convert datetime object to desired format
surgery_date = date_obj.strftime("%m.%d.%Y")
lesion_date = surgery_date
print(f"Surgery date: {surgery_date}")


                                file_name  song_present  \
0   USA5326_45324.28882476_2_2_8_1_22.wav          True   
1  USA5326_45324.32074401_2_2_8_54_34.wav         False   

                          syllable_onsets_offsets_ms  \
0  {'35': [[0.0, 83.65079365079366]], '31': [[83....   
1                                                 {}   

                    syllable_onsets_offsets_timebins  
0  {'35': [[0.0, 31]], '31': [[31.0, 72], [112.0,...  
1                                                 {}  
Surgery date: 02.20.2024


### Organize the dataframe to extract animal ID, date and time of recording

In [243]:
import pandas as pd
import json

with open(path_to_json_file_with_dates, 'r') as f:
    json_data = json.load(f)

# Extract subdirectory creation dates from JSON data
subdirectory_dates = {subdir: data['subdirectory_creation_date'] for subdir, data in json_data['subdirectories'].items()}

# Function to extract date, time, and animal_id from the file name
def find_recording_dates_and_times(recording_file_path_name):
    try:
        # Split the full path to isolate the file name
        file_name = recording_file_path_name.split('/')[-1]
        
        # Split the file name by underscores to extract relevant parts
        split_file_name_by_underscores = file_name.split('_')
        
        # Extract animal ID
        animal_id = split_file_name_by_underscores[0]
        
        # Extract month and ensure it has two digits
        month = split_file_name_by_underscores[2].zfill(2)
        
        # Extract day and ensure it has two digits
        day = split_file_name_by_underscores[3].zfill(2)
        
        # Combine month and day to form the date in MM.DD format
        date = f"{month}.{day}"
        
        # Extract time components (hour, minute, second) and ensure proper formatting
        hour = split_file_name_by_underscores[4].zfill(2)
        minute = split_file_name_by_underscores[5].zfill(2)
        second = split_file_name_by_underscores[6].replace('.wav', '').zfill(2)  # Remove file extension
        
        return animal_id, date, hour, minute, second

    except IndexError:
        print(f"Error: Unexpected format in file name {recording_file_path_name}")
        return None, None, None, None, None
    except Exception as e:
        print(f"Error processing file {recording_file_path_name}: {e}")
        return None, None, None, None, None

# Function to update the date with year from the JSON file
def update_date_with_year(row, subdirectory_dates):
    month_day = row['Date']  # Extract the month and day
    month = month_day.split('.')[0]  # Get the month part
    for subdir, date in subdirectory_dates.items():
        year, json_month, json_day = date.split('-')  # Parse the year, month, and day from the JSON date
        if json_month == month:  # Match based on the month
            return f"{json_month}.{month_day.split('.')[1]}.{year}"  # Return the updated date with year
    return month_day  # If no match, return the original date

# Function to create a table from the DataFrame and update the date
def make_table(input_data_frame, subdirectory_dates):
    # Filter rows where song is present and reset index
    only_song_data = input_data_frame[input_data_frame['song_present'] == True].reset_index(drop=True)
    num_files_with_song = only_song_data.shape[0]

    # Initialize organized DataFrame
    organized_data_frame = only_song_data.copy()  # Retain original columns

    # Add new columns for animal ID, date, hour, minute, and second
    organized_data_frame['Animal ID'] = [None] * num_files_with_song
    organized_data_frame['Date'] = [None] * num_files_with_song
    organized_data_frame['Hour'] = [None] * num_files_with_song
    organized_data_frame['Minute'] = [None] * num_files_with_song
    organized_data_frame['Second'] = [None] * num_files_with_song

    for i, row in only_song_data.iterrows():
        recording_file_path_name = row['file_name']
        try:
            animal_id, date, hour, minute, second = find_recording_dates_and_times(recording_file_path_name)
            organized_data_frame.at[i, 'Animal ID'] = animal_id
            organized_data_frame.at[i, 'Date'] = date
            organized_data_frame.at[i, 'Hour'] = hour
            organized_data_frame.at[i, 'Minute'] = minute
            organized_data_frame.at[i, 'Second'] = second
        except Exception as e:
            print(f"Error processing file {recording_file_path_name}: {e}")
            continue

    # Update the 'Date' column with the year using the JSON data
    organized_data_frame['Date'] = organized_data_frame.apply(lambda row: update_date_with_year(row, subdirectory_dates), axis=1)

    # Output the results
    print(f"First rows of organized_data_frame with updated dates: {organized_data_frame.head(3)}")

    return organized_data_frame

# Run the function and generate the table with updated dates
organized_data_frame = make_table(decoder_dataframe, subdirectory_dates)

First rows of organized_data_frame with updated dates:                                 file_name  song_present  \
0   USA5326_45324.28882476_2_2_8_1_22.wav          True   
1  USA5326_45324.34642642_2_2_9_37_22.wav          True   
2   USA5326_45324.34922587_2_2_9_42_2.wav          True   

                          syllable_onsets_offsets_ms  \
0  {'35': [[0.0, 83.65079365079366]], '31': [[83....   
1  {'31': [[0.0, 32.38095238095238], [1443.650793...   
2  {'32': [[0.0, 129.52380952380952]], '30': [[12...   

                    syllable_onsets_offsets_timebins Animal ID        Date  \
0  {'35': [[0.0, 31]], '31': [[31.0, 72], [112.0,...   USA5326  02.02.2024   
1  {'31': [[0.0, 12], [535.0, 579], [608.0, 643],...   USA5326  02.02.2024   
2  {'32': [[0.0, 48]], '30': [[47.99999999999999,...   USA5326  02.02.2024   

  Hour Minute Second  
0   08     01     22  
1   09     37     22  
2   09     42     02  


### Get the syllable order for ONE song

In [244]:
import json
import re

# Assuming this is the string you've printed
sample_one_song = organized_data_frame['syllable_onsets_offsets_ms'][0]

# Remove the extra outer single quotes and replace single quotes with double quotes
cleaned_sample = sample_one_song.strip("''")
cleaned_sample = re.sub(r"'", '"', cleaned_sample)

# Now try to load it as JSON
try:
    sample_one_song_dict = json.loads(cleaned_sample)
    print("Successfully parsed JSON.")
except json.JSONDecodeError as e:
    print(f"JSONDecodeError: {e}")

# Now you can proceed with your original logic
syllable_times = []

# Loop through each syllable label and its corresponding onset/offset pairs
for syllable_label, times in sample_one_song_dict.items():
    for onset_offset in times:
        syllable_times.append([syllable_label, onset_offset[0], onset_offset[1]])

# Convert the onset and offset times to floats if needed
syllable_times_array = np.array([[syllable_label, float(onset), float(offset)] 
                                 for syllable_label, onset, offset in syllable_times])

# Sort the array by the onset times (second column)
syllable_times_array_sorted = syllable_times_array[syllable_times_array[:, 1].argsort()]

# Display the sorted result
syllable_times_array_sorted

Successfully parsed JSON.


array([['35', '0.0', '83.65079365079366'],
       ['31', '1036.1904761904761', '1160.3174603174605'],
       ['26', '1160.3174603174605', '1246.6666666666667'],
       ['25', '1246.6666666666667', '1257.4603174603176'],
       ['32', '1257.4603174603176', '1341.1111111111113'],
       ['15', '1341.1111111111113', '1942.857142857143'],
       ['26', '194.2857142857143', '302.22222222222223'],
       ['32', '1942.857142857143', '2083.1746031746034'],
       ['33', '2083.1746031746034', '2180.3174603174602'],
       ['29', '2180.3174603174602', '2231.5873015873017'],
       ['33', '2231.5873015873017', '3750.793650793651'],
       ['31', '302.22222222222223', '456.0317460317461'],
       ['26', '3750.793650793651', '3850.634920634921'],
       ['26', '456.0317460317461', '558.5714285714286'],
       ['31', '558.5714285714286', '706.984126984127'],
       ['26', '706.984126984127', '809.5238095238096'],
       ['31', '809.5238095238096', '920.1587301587302'],
       ['31', '83.650793650793

In [96]:
# import json
# import re
# import numpy as np

# # Assuming this is the string you've printed
# sample_one_song = organized_data_frame['syllable_onsets_offsets_ms'][0]

# # Remove the extra outer single quotes and replace single quotes with double quotes
# cleaned_sample = sample_one_song.strip("''")
# cleaned_sample = re.sub(r"'", '"', cleaned_sample)

# # Now try to load it as JSON
# try:
#     sample_one_song_dict = json.loads(cleaned_sample)
#     print("Successfully parsed JSON.")
# except json.JSONDecodeError as e:
#     print(f"JSONDecodeError: {e}")

# # Now you can proceed with your original logic
# syllable_times = []

# # Loop through each syllable label and its corresponding onset/offset pairs
# for syllable_label, times in sample_one_song_dict.items():
#     for onset_offset in times:
#         syllable_times.append([syllable_label, onset_offset[0], onset_offset[1]])

# # Convert the onset and offset times to floats if needed
# syllable_times_array = np.array([[syllable_label, float(onset), float(offset)] 
#                                  for syllable_label, onset, offset in syllable_times])

# # Sort the array by the onset times (second column)
# syllable_times_array_sorted = syllable_times_array[syllable_times_array[:, 1].argsort()]

# # Extract only the syllable labels in order of onset times
# syllable_order = syllable_times_array_sorted[:, 0]

# # Display the syllable order array
# syllable_order

Successfully parsed JSON.


array(['10', '10', '9'], dtype='<U32')

## Find the syllable order for each song, and add to the dataframe

In [245]:
import json
import re
import numpy as np
import pandas as pd

# Function to generate syllable order from a single song data (syllable_onsets_offsets_ms)
def get_syllable_order(sample_one_song):
    # Clean the input and parse it as JSON
    cleaned_sample = sample_one_song.strip("''")
    cleaned_sample = re.sub(r"'", '"', cleaned_sample)
    
    try:
        sample_one_song_dict = json.loads(cleaned_sample)
    except json.JSONDecodeError as e:
        print(f"JSONDecodeError: {e}")
        return None

    syllable_times = []
    
    # Loop through each syllable label and its corresponding onset/offset pairs
    for syllable_label, times in sample_one_song_dict.items():
        for onset_offset in times:
            syllable_times.append([syllable_label, onset_offset[0], onset_offset[1]])

    # Convert the onset and offset times to floats
    syllable_times_array = np.array([[syllable_label, float(onset), float(offset)] 
                                     for syllable_label, onset, offset in syllable_times])

    # Sort the array by onset times (second column)
    syllable_times_array_sorted = syllable_times_array[syllable_times_array[:, 1].argsort()]

    # Extract only the syllable labels in order of onset times
    syllable_order = syllable_times_array_sorted[:, 0]
    
    return syllable_order

# Function to iteratively find syllable orders for each row in the DataFrame
def find_syllable_orders(organized_data_frame):
    syllable_orders = []

    for i, row in organized_data_frame.iterrows():
        sample_one_song = row['syllable_onsets_offsets_ms']
        syllable_order = get_syllable_order(sample_one_song)
        if syllable_order is not None:
            syllable_orders.append(syllable_order)
        else:
            syllable_orders.append([])  # In case of a parsing error, append an empty array

    # Add a new column for the syllable orders
    organized_data_frame['syllable_order'] = syllable_orders
    
    return organized_data_frame

# Assuming you've already loaded the data and updated the table
organized_data_frame = make_table(decoder_dataframe, subdirectory_dates)

# Now find syllable orders for each song in the organized_data_frame
organized_data_frame_with_orders = find_syllable_orders(organized_data_frame)

# Output the first few rows with syllable orders
print(organized_data_frame_with_orders[['syllable_order']])

First rows of organized_data_frame with updated dates:                                 file_name  song_present  \
0   USA5326_45324.28882476_2_2_8_1_22.wav          True   
1  USA5326_45324.34642642_2_2_9_37_22.wav          True   
2   USA5326_45324.34922587_2_2_9_42_2.wav          True   

                          syllable_onsets_offsets_ms  \
0  {'35': [[0.0, 83.65079365079366]], '31': [[83....   
1  {'31': [[0.0, 32.38095238095238], [1443.650793...   
2  {'32': [[0.0, 129.52380952380952]], '30': [[12...   

                    syllable_onsets_offsets_timebins Animal ID        Date  \
0  {'35': [[0.0, 31]], '31': [[31.0, 72], [112.0,...   USA5326  02.02.2024   
1  {'31': [[0.0, 12], [535.0, 579], [608.0, 643],...   USA5326  02.02.2024   
2  {'32': [[0.0, 48]], '30': [[47.99999999999999,...   USA5326  02.02.2024   

  Hour Minute Second  
0   08     01     22  
1   09     37     22  
2   09     42     02  
                                         syllable_order
0     [35, 31, 26, 25,

## Map each syllable label to a letter instead of a number (1->A, etc. This is necessary so we can tell labels '1' and '2' apart from '12' once they all get concatonated  into a string)

### Test this code for ONE song

In [117]:
# import json
# import re
# import numpy as np

# # Define a function to map syllable labels to unique characters
# def map_labels_to_characters(syllable_orders):
#     # Get all unique syllable labels across all rows
#     unique_labels = sorted(set(sum(syllable_orders, [])))  # Flatten the list of lists and get unique elements
    
#     # Create a mapping from each unique label to a unique character (uppercase, lowercase, digits)
#     available_chars = [chr(i) for i in range(65, 91)] + [chr(i) for i in range(97, 123)] + [chr(i) for i in range(48, 58)]  # A-Z, a-z, 0-9
#     if len(unique_labels) > len(available_chars):
#         raise ValueError("Too many unique labels to map to single characters!")
    
#     label_to_char = {label: available_chars[i] for i, label in enumerate(unique_labels)}
    
#     # Replace each label in the syllable order with its corresponding character
#     syllable_orders_mapped = []
#     for order in syllable_orders:
#         mapped_order = ''.join([label_to_char[label] for label in order])
#         syllable_orders_mapped.append(mapped_order)
    
#     return syllable_orders_mapped, label_to_char

# # Example syllable order (as shown in the sample)
# syllable_orders = [
#     ['8', '21', '22', '21', '22', '21', '22', '21', '26', '23', '2', '3', '26', '5', '11', '14']
# ]

# # Map the labels and get the string representation
# syllable_orders_mapped, label_to_char = map_labels_to_characters(syllable_orders)

# # Output the mapped syllable orders and the label-to-character mapping
# print("Mapped syllable order:", syllable_orders_mapped)
# print("Label-to-character mapping:", label_to_char)


### Do this for EVERY song in the dataframe

In [246]:
import json
import re
import numpy as np
import pandas as pd

# Function to generate syllable order from a single song data (syllable_onsets_offsets_ms)
def get_syllable_order(sample_one_song):
    # Clean the input and parse it as JSON
    cleaned_sample = sample_one_song.strip("''")
    cleaned_sample = re.sub(r"'", '"', cleaned_sample)
    
    try:
        sample_one_song_dict = json.loads(cleaned_sample)
    except json.JSONDecodeError as e:
        print(f"JSONDecodeError: {e}")
        return None

    syllable_times = []
    
    # Loop through each syllable label and its corresponding onset/offset pairs
    for syllable_label, times in sample_one_song_dict.items():
        for onset_offset in times:
            syllable_times.append([syllable_label, onset_offset[0], onset_offset[1]])

    # Convert the onset and offset times to floats
    syllable_times_array = np.array([[syllable_label, float(onset), float(offset)] 
                                     for syllable_label, onset, offset in syllable_times])

    # Sort the array by onset times (second column)
    syllable_times_array_sorted = syllable_times_array[syllable_times_array[:, 1].argsort()]

    # Extract only the syllable labels in order of onset times
    syllable_order = syllable_times_array_sorted[:, 0]
    
    return syllable_order

# Function to map syllable labels to unique characters
def map_labels_to_characters(syllable_orders):
    # Flatten the list of syllable orders (since syllable_orders is a list of lists)
    flat_syllable_orders = [label for sublist in syllable_orders for label in sublist]
    
    # Get all unique syllable labels
    unique_labels = sorted(set(flat_syllable_orders))
    
    # Create a mapping from each unique label to a unique character (uppercase, lowercase, digits)
    available_chars = [chr(i) for i in range(65, 91)] + [chr(i) for i in range(97, 123)] + [chr(i) for i in range(48, 58)]  # A-Z, a-z, 0-9
    if len(unique_labels) > len(available_chars):
        raise ValueError("Too many unique labels to map to single characters!")
    
    label_to_char = {label: available_chars[i] for i, label in enumerate(unique_labels)}
    
    # Replace each label in the syllable order with its corresponding character
    syllable_orders_mapped = []
    for order in syllable_orders:
        mapped_order = ''.join([label_to_char[label] for label in order])
        syllable_orders_mapped.append(mapped_order)
    
    return syllable_orders_mapped, label_to_char

# Function to iteratively find syllable orders for each row and map them
def find_mapped_syllable_orders(organized_data_frame):
    syllable_orders = []

    for i, row in organized_data_frame.iterrows():
        sample_one_song = row['syllable_onsets_offsets_ms']
        syllable_order = get_syllable_order(sample_one_song)
        if syllable_order is not None:
            syllable_orders.append(syllable_order)
        else:
            syllable_orders.append([])  # In case of a parsing error, append an empty array

    # Map the syllable orders to unique characters
    syllable_orders_mapped, label_to_char = map_labels_to_characters(syllable_orders)

    # Add new columns for both the raw syllable orders and the mapped syllable orders
    organized_data_frame['syllable_order'] = syllable_orders
    organized_data_frame['mapped_syllable_order'] = syllable_orders_mapped
    
    return organized_data_frame, label_to_char

# Assuming you've already loaded the data and updated the table
organized_data_frame = make_table(decoder_dataframe, subdirectory_dates)

# Now find and map syllable orders for each song in the organized_data_frame
organized_data_frame_with_orders, label_to_char = find_mapped_syllable_orders(organized_data_frame)

# Output the first few rows with both syllable orders and mapped syllable orders
# Output only the 'syllable_order' and 'mapped_syllable_order' columns
print(organized_data_frame_with_orders[['syllable_order', 'mapped_syllable_order']])

print("Label-to-character mapping:", label_to_char)

# Output only the 'syllable_order' and 'mapped_syllable_order' columns
print(organized_data_frame_with_orders[['syllable_order', 'mapped_syllable_order']].head(2))
print("Label-to-character mapping:", label_to_char)

First rows of organized_data_frame with updated dates:                                 file_name  song_present  \
0   USA5326_45324.28882476_2_2_8_1_22.wav          True   
1  USA5326_45324.34642642_2_2_9_37_22.wav          True   
2   USA5326_45324.34922587_2_2_9_42_2.wav          True   

                          syllable_onsets_offsets_ms  \
0  {'35': [[0.0, 83.65079365079366]], '31': [[83....   
1  {'31': [[0.0, 32.38095238095238], [1443.650793...   
2  {'32': [[0.0, 129.52380952380952]], '30': [[12...   

                    syllable_onsets_offsets_timebins Animal ID        Date  \
0  {'35': [[0.0, 31]], '31': [[31.0, 72], [112.0,...   USA5326  02.02.2024   
1  {'31': [[0.0, 12], [535.0, 579], [608.0, 643],...   USA5326  02.02.2024   
2  {'32': [[0.0, 48]], '30': [[47.99999999999999,...   USA5326  02.02.2024   

  Hour Minute Second  
0   08     01     22  
1   09     37     22  
2   09     42     02  
                                         syllable_order  \
0     [35, 31, 26, 

## Now, sort the data by into different groups (e.g. by days, by pre vs. post lesion group), then export them into .mat files

### Test: Get all the arrays from one day of recording:

In [134]:
# import pandas as pd

# # Filter the DataFrame for the first day of recording
# def get_first_day_songs(organized_data_frame):
#     # Find the first day of recording
#     first_day = organized_data_frame['Date'].min()
    
#     # Filter the DataFrame for the first day
#     first_day_data = organized_data_frame[organized_data_frame['Date'] == first_day]
    
#     return first_day_data

# # Generate a 1xN cell array where each cell contains the mapped syllable order for the first day
# def generate_first_day_cell_array(organized_data_frame):
#     # Get the filtered data for the first day
#     first_day_data = get_first_day_songs(organized_data_frame)
    
#     # Extract the mapped syllable orders into a list (which acts as a cell array)
#     mapped_syllable_orders = first_day_data['mapped_syllable_order'].tolist()
    
#     # Create a 1xN cell array (as a list of lists for flexibility)
#     cell_array = [mapped_syllable_orders]
    
#     return cell_array

# # Assuming organized_data_frame_with_orders contains the data
# first_day_cell_array = generate_first_day_cell_array(organized_data_frame_with_orders)

# # Output the result
# print("1xN cell array of mapped syllable orders for the first day of recording:")
# print(first_day_cell_array)


### Get the syllable string arrays from each day of recording

In [247]:
# import pandas as pd

# # Function to get the unique dates in the DataFrame
# def get_unique_dates(organized_data_frame):
#     return sorted(organized_data_frame['Date'].unique())

# # Function to generate a cell array for each day of recording
# def generate_cell_arrays_for_each_day(organized_data_frame):
#     # Get all unique dates
#     unique_dates = get_unique_dates(organized_data_frame)
    
#     # Iterate through each unique date
#     for date in unique_dates:
#         # Filter the DataFrame for the current date
#         day_data = organized_data_frame[organized_data_frame['Date'] == date]
        
#         # Extract the mapped syllable orders into a list (which acts as a cell array)
#         mapped_syllable_orders = day_data['mapped_syllable_order'].tolist()
        
#         # Create a 1xN cell array (as a list of lists for flexibility)
#         cell_array = [mapped_syllable_orders]
        
#         # Print the date and the corresponding cell array
#         print(f"Recording Date: {date}")
#         print("1xN cell array of mapped syllable orders for the day:")
#         print(cell_array)
#         print("\n")

# # Assuming organized_data_frame_with_orders contains the data
# generate_cell_arrays_for_each_day(organized_data_frame_with_orders)

Recording Date: 01.19.2024
1xN cell array of mapped syllable orders for the day:
[['aOU', 'T', 'gF', 'adIdcbHaHaHa', 'BV', 'aVaVaVaVaVaVIVIVa', 'aTZTZTZTZTbTaVa', 'adIaIaIdUdUdUdHFbWbVPa', 'PJ', 'aabdaMaHWHWHWHPab', 'aIHbdbd', 'aOdadaU', 'aTZTOPaTZTZ', 'MJVIdcdYWYdHFbWb', 'aNONOU', 'CQDhCDhB', 'aOUPVPVPVP', 'aTZTZTZIdcbWbPSPSPSPSaTZ', 'dIdcdcdcdbWXi', 'aXNPY', 'aTZTZTZPTOaZTZa', 'aGPaCVa', 'T', 'Va', 'YO', 'abdbdbddaSZHbd', 'adadadadbdbdbdbdbdbdIPa', 'agM', 'aVPghVAC', 'geB', 'aUOY', 'adbdbdbdZdZaZSbFHaYdbdb', 'aTcdcIWbWbWHadI', 'aOdUdaU', 'aCVPada', 'baOdaU', 'HUdUaUadadUONONOaY', 'T', 'daIaVPad', 'aFIdcdcdcbWXNdI', 'VaPUO', 'aOU', 'aOUZaU']]


Recording Date: 01.20.2024
1xN cell array of mapped syllable orders for the day:
[['aFbWbPSadI', 'aaYXdiWjHFWV', 'CfLKLKLKLKLKLKLKLKLE', 'aYUO', 'adPYVaVij', 'aCDhHFdbWbVBQEh', 'VadYdJcVdcF', 'abHFTPTFbW', 'aDPhahC', 'dbWbTVZBHZQ', 'adcdWdWdWbWIWaHadWIWFIFIFIbWbPI', 'aZTZTZIcbWYOPZTZT', 'IabWbdI', 'aXYadbaY', 'aadTIZbWbTZTZQ', 'adbJYVdcPRYcFbWb

## Save into three groups with an equal number of songs in each: 
1) 3.5k songs post-lesion
2) 3.5k songs from immediately pre-lesion
3) 3.5k songs from earlier pre-lesion recordings 

From TG: "Any comparison should have a balanced number of songs."

### First, make sure each song contains at least 2 syllables (this is likely just a trill that's been marked as a song).

In [248]:
# Assuming the syllables for each song are stored in a column named 'syllable_order'
# Replace 'syllable_order' with the actual column name if different.

# Define a function to count the number of syllables in each song
def count_syllables(row):
    # Assuming the syllable order is stored as a list in the 'syllable_order' column
    return len(row['syllable_order'])

# Apply the function to create a new column with the syllable count
organized_data_frame_with_orders['syllable_count'] = organized_data_frame_with_orders.apply(count_syllables, axis=1)

# Filter the DataFrame to only include rows with 2 or more syllables
filtered_data_frame = organized_data_frame_with_orders[organized_data_frame_with_orders['syllable_count'] >= 2]

# Drop the 'syllable_count' column if it's no longer needed
filtered_data_frame = filtered_data_frame.drop(columns=['syllable_count'])

# Display the filtered DataFrame
filtered_data_frame

Unnamed: 0,file_name,song_present,syllable_onsets_offsets_ms,syllable_onsets_offsets_timebins,Animal ID,Date,Hour,Minute,Second,syllable_order,mapped_syllable_order
0,USA5326_45324.28882476_2_2_8_1_22.wav,True,"{'35': [[0.0, 83.65079365079366]], '31': [[83....","{'35': [[0.0, 31]], '31': [[31.0, 72], [112.0,...",USA5326,02.02.2024,08,01,22,"[35, 31, 26, 25, 32, 15, 26, 32, 33, 29, 33, 3...",dZTSaHTabWbZTTZTZZT
1,USA5326_45324.34642642_2_2_9_37_22.wav,True,"{'31': [[0.0, 32.38095238095238], [1443.650793...","{'31': [[0.0, 12], [535.0, 579], [608.0, 643],...",USA5326,02.02.2024,09,37,22,"[31, 26, 31, 26, 31, 26, 31, 26, 32]",ZTZTZTZTa
2,USA5326_45324.34922587_2_2_9_42_2.wav,True,"{'32': [[0.0, 129.52380952380952]], '30': [[12...","{'32': [[0.0, 48]], '30': [[47.99999999999999,...",USA5326,02.02.2024,09,42,02,"[32, 30, 27, 30, 27, 30, 27, 30, 27, 30, 28, 2...",aYUYUYUYUYVTPTSZSOZS
3,USA5326_45324.29466848_2_2_8_11_6.wav,True,"{'10': [[0.0, 1457.1428571428573]], '11': [[14...","{'10': [[0.0, 540]], '11': [[540.0, 990]], '7'...",USA5326,02.02.2024,08,11,06,"[10, 11, 7, 1, 31, 23]",CDhBZQ
4,USA5326_45324.37364950_2_2_10_22_44.wav,True,"{'34': [[0.0, 62.06349206349207], [2517.619047...","{'34': [[0.0, 23], [933.0, 1004]], '35': [[23....",USA5326,02.02.2024,10,22,44,"[34, 16, 13, 16, 13, 34, 33, 29, 3, 8, 12, 35,...",cIFIFcbWXiEdVG
...,...,...,...,...,...,...,...,...,...,...,...
8002,USA5326_45359.34421464_3_8_9_33_41.wav,True,"{'32': [[0.0, 188.8888888888889], [2533.809523...","{'32': [[0.0, 70], [939.0, 940], [941.0, 942],...",USA5326,03.08.2024,09,33,41,"[32, 35, 16, 32, 16, 32, 16, 32, 16]",adIaIaIaI
8003,USA5326_45313.31158371_1_22_8_39_18.wav,True,"{'32': [[0.0, 393.968253968254], [747.46031746...","{'32': [[0.0, 146], [277.0, 282]], '0': [[146....",USA5326,01.22.2024,08,39,18,"[32, 0, 26, 33, 27, 33, 27, 32, 33, 26, 25, 26]",aATbUbUabTST
8004,USA5326_45313.41676612_1_22_11_34_36.wav,True,"{'32': [[0.0, 97.14285714285715]], '30': [[97....","{'32': [[0.0, 36]], '30': [[36.0, 351]], '8': ...",USA5326,01.22.2024,11,34,36,"[32, 15, 9, 15, 9, 15, 9, 13, 6, 10, 11, 7, 1,...",aHjHjHjFgCDhBQiY
8005,USA5326_45313.41798028_1_22_11_36_38.wav,True,"{'32': [[0.0, 2.6984126984126986], [5.39682539...","{'32': [[0.0, 1], [2.0, 3], [15.00000000000000...",USA5326,01.22.2024,11,36,38,"[32, 28, 34, 17, 28, 28, 13, 22, 28, 34, 35, 3...",aVcJVVFPVcdYVJaHaFbVaVTaV


### Figure out how many songs to include in each group:
1) Check the number of post-lesion songs, and the number of pre-lesion songs.
2) If the number of post-lesion songs is less than 1/2 of the number of pre-lesion songs, use that as the group size.
3) If the number of post-lesion songs is greater than 1/2 of the number of pre-lesin songs, use 0.5*the number of pre-lesion soongs as the group size.

In [249]:
# Function to calculate group size based on pre- and post-lesion song counts
def calculate_group_size(organized_data_frame, surgery_date):
    # Filter the DataFrame for songs recorded before and after the lesion date
    pre_lesion_songs = organized_data_frame[organized_data_frame['Date'] < surgery_date]
    post_lesion_songs = organized_data_frame[organized_data_frame['Date'] > surgery_date]
    
    # Count the number of pre-lesion and post-lesion songs
    num_pre_lesion_songs = pre_lesion_songs.shape[0]
    num_post_lesion_songs = post_lesion_songs.shape[0]
    
    # Determine the group size based on the condition
    if num_post_lesion_songs < 0.5 * num_pre_lesion_songs:
        group_size = num_post_lesion_songs  # Use the number of post-lesion songs as the group size
    else:
        group_size = int(0.5 * num_pre_lesion_songs)  # Use half the number of pre-lesion songs as the group size
    
    # Print the results
    print(f"Number of pre-lesion songs: {num_pre_lesion_songs}")
    print(f"Number of post-lesion songs: {num_post_lesion_songs}")
    print(f"Calculated group size: {group_size}")
    
    return group_size

# Example usage
group_size = calculate_group_size(filtered_data_frame, surgery_date)

# Function to calculate group size based on pre- and post-lesion song counts
def calculate_group_size(organized_data_frame, surgery_date):
    # Filter the DataFrame for songs recorded before and after the lesion date
    pre_lesion_songs = organized_data_frame[organized_data_frame['Date'] < surgery_date]
    post_lesion_songs = organized_data_frame[organized_data_frame['Date'] > surgery_date]
    
    # Count the number of pre-lesion and post-lesion songs
    num_pre_lesion_songs = pre_lesion_songs.shape[0]
    num_post_lesion_songs = post_lesion_songs.shape[0]
    
    # Determine the group size based on the condition
    if num_post_lesion_songs < 0.5 * num_pre_lesion_songs:
        group_size = num_post_lesion_songs  # Use the number of post-lesion songs as the group size
    else:
        group_size = int(0.5 * num_pre_lesion_songs)  # Use half the number of pre-lesion songs as the group size
    
    # Print the results
    print(f"Number of pre-lesion songs: {num_pre_lesion_songs}")
    print(f"Number of post-lesion songs: {num_post_lesion_songs}")
    print(f"Calculated group size: {group_size}")
    
    return group_size

# Example usage
group_size = calculate_group_size(filtered_data_frame, surgery_date)


Number of pre-lesion songs: 4329
Number of post-lesion songs: 3234
Calculated group size: 2164
Number of pre-lesion songs: 4329
Number of post-lesion songs: 3234
Calculated group size: 2164


In [227]:
def filter_post_surgery_songs(organized_data_frame, lesion_date, group_size):
    # Filter the DataFrame for songs recorded after the lesion date (post-lesion)
    post_lesion_songs = organized_data_frame[organized_data_frame['Date'] > surgery_date]
    
    # If the number of post-lesion songs is greater than or equal to the group size,
    # select the first 'group_size' number of rows (songs)
    filtered_songs = post_lesion_songs.head(group_size)
    
    # Print the results for verification
    print(f"Filtered {group_size} post-surgery songs:")
    print(filtered_songs)
    
    return filtered_songs

# Example usage
group_size = calculate_group_size(filtered_data_frame, surgery_date)  # Calculate group size
post_surgery_songs = filter_post_surgery_songs(filtered_data_frame, surgery_date, group_size)


Number of pre-lesion songs: 4429
Number of post-lesion songs: 1771
Calculated group size: 1771
Filtered 1771 post-surgery songs:
                                    file_name  song_present  \
3437   USA5509_45416.47364572_5_4_13_9_24.wav          True   
3438  USA5509_45416.47436330_5_4_13_10_36.wav          True   
3439  USA5509_45416.47541692_5_4_13_12_21.wav          True   
3440  USA5509_45416.47690761_5_4_13_14_50.wav          True   
3441  USA5509_45416.63943551_5_4_17_45_43.wav          True   
...                                       ...           ...   
5789   USA5509_45433.6200445_5_21_1_43_20.wav          True   
5790   USA5509_45433.6311298_5_21_1_45_11.wav          True   
5791   USA5509_45433.6328153_5_21_1_45_28.wav          True   
5792   USA5509_45433.6379459_5_21_1_46_19.wav          True   
5793   USA5509_45433.6410309_5_21_1_46_50.wav          True   

                             syllable_onsets_offsets_ms  \
3437  {'6': [[0.0, 277.93650793650795], [323.8095238...

In [250]:
# def filter_surgery_songs(organized_data_frame, surgery_date, group_size):
#     # Filter the DataFrame for songs recorded after the lesion date (post-lesion)
#     post_lesion_songs = organized_data_frame[organized_data_frame['Date'] > surgery_date]
    
#     # Select the first 'group_size' number of rows from the post-lesion songs
#     filtered_post_surgery_songs = post_lesion_songs.head(group_size)
    
#     # Filter the DataFrame for songs recorded on or before the lesion date (pre-lesion)
#     pre_lesion_songs = organized_data_frame[organized_data_frame['Date'] <= surgery_date]
    
#     total_pre_lesion_songs = len(pre_lesion_songs)
#     print(f"Total number of pre-lesion songs: {total_pre_lesion_songs}")
    
#     if total_pre_lesion_songs < group_size * 2:
#         raise ValueError(f"Not enough pre-lesion songs to split into two groups of size {group_size}. Total available: {total_pre_lesion_songs}")

#     # Select the last 'group_size' number of rows for the latest pre-surgery songs
#     latest_pre_surgery_songs = pre_lesion_songs.tail(group_size)
    
#     # Select the 'group_size' rows just before the latest pre-surgery songs
#     earlier_pre_surgery_songs = pre_lesion_songs.iloc[-(group_size * 2):-group_size]
    
#     print(f"Number of latest pre-surgery songs: {len(latest_pre_surgery_songs)}")
#     print(f"Number of earlier pre-surgery songs: {len(earlier_pre_surgery_songs)}")
    
#     # Create cell arrays (lists) to hold the mapped syllable orders for each group
#     mapped_syllable_orders_post = list(filtered_post_surgery_songs['mapped_syllable_order'])
#     mapped_syllable_orders_latest_pre = list(latest_pre_surgery_songs['mapped_syllable_order'])
#     mapped_syllable_orders_earlier_pre = list(earlier_pre_surgery_songs['mapped_syllable_order'])

#     # Print the results for verification
#     print(f"Filtered {group_size} post-surgery songs' mapped syllable orders:")
#     print(mapped_syllable_orders_post)
    
#     print(f"Filtered {group_size} latest pre-surgery songs' mapped syllable orders:")
#     print(mapped_syllable_orders_latest_pre)
    
#     print(f"Filtered {group_size} earlier pre-surgery songs' mapped syllable orders:")
#     print(mapped_syllable_orders_earlier_pre)
    
#     # Return the lists of mapped syllable orders for each group
#     return mapped_syllable_orders_post, mapped_syllable_orders_latest_pre, mapped_syllable_orders_earlier_pre

# # Example usage
# group_size = calculate_group_size(filtered_data_frame, surgery_date)  # Calculate group size
# mapped_post, mapped_latest_pre, mapped_earlier_pre = filter_surgery_songs(filtered_data_frame, surgery_date, group_size)


Number of pre-lesion songs: 4329
Number of post-lesion songs: 3234
Calculated group size: 2164
Total number of pre-lesion songs: 4510
Number of latest pre-surgery songs: 2164
Number of earlier pre-surgery songs: 2164
Filtered 2164 post-surgery songs' mapped syllable orders:
['gaMghghghghghghghghghghAGhR', 'aPT', 'abWXSciP', 'AGhR', 'aPVadbdOdbdbdbPdbdbaV', 'YRAGVQEdGaAGAQEadWIJWIVadcdcOcOJbWUXdiPVPV', 'aDCDhPTDSPSPCC', 'aPaPCdZaTaV', 'aBQEGCdcdMDh', 'VMYFJVYaYa', 'aPWbWbWbWbWbWbaWTSZTSIPa', 'aPTaC', 'YUNO', 'DBCQDh', 'ajdY', 'cbWXdcNfLKLKLKLKLKLKLKLKLKLKLKLKLKLKLKLKLKLKLKEQ', 'aPC', 'agMGQ', 'TVYUIdcdcbWXdUdUdaVaVa', 'adWaSaWbWbWbWabWbWaSbcNPZPZaPZaTSaWbTSa', 'CBDh', 'aZSZSZQYMCDXNVSZS', 'YAEAYGhRbWbiQMREGQa', 'aPFC', 'aUUYUYOJVa', 'aFJVM', 'CDDACGhRDC', 'aDhMCDC', 'aJWHYMFaTZTZTZTZSZSZSZQ', 'aijY', 'VVYajFYPa', 'THZWHWbWbTZTTZZTVRbJbWSZHWHW', 'ajPVPVY', 'aPVPVUdYabdbIYFcaYXVXVXai', 'aTPTPSZC', 'aPTFaC', 'adadVTPa', 'aM', 'aba', 'aPFJTPaM', 'aTUC', 'aM', 'YUVdUddUbdUdTUdZRUdbWbWbiaYaUd

## Divide data and save as a .mat file in the designated folder (NEED TO UPDATE FOLDER PATH IN LINE 65)

In [251]:
import pandas as pd
import scipy.io as sio
import os

# Function to save syllable orders as cell arrays
def save_group_to_mat(mapped_syllable_orders, group_name, animal_id, save_dir):
    # Ensure the save directory exists
    os.makedirs(save_dir, exist_ok=True)
    
    # Wrap each mapped syllable order in a list (for MATLAB cell array compatibility)
    mapped_syllable_orders_cell = [[order] for order in mapped_syllable_orders]
    
    # Prepare the data as a dictionary for saving to .mat format
    data_dict = {f'mapped_syllable_order_{group_name}': mapped_syllable_orders_cell}
    
    # Generate a file name based on the animal ID and group name
    file_name = f"{animal_id}_{group_name}.mat"
    file_path = os.path.join(save_dir, file_name)
    
    # Save the .mat file
    sio.savemat(file_path, data_dict)
    
    # Print confirmation
    print(f"Saved {file_name} to {file_path}")

# Function to process and save pre- and post-surgery syllable orders
def filter_surgery_songs(organized_data_frame, surgery_date, group_size, save_dir):
    # Filter the DataFrame for songs recorded after the lesion date (post-lesion)
    post_lesion_songs = organized_data_frame[organized_data_frame['Date'] > surgery_date]
    
    # Select the first 'group_size' number of rows from the post-lesion songs
    filtered_post_surgery_songs = post_lesion_songs.head(group_size)
    
    # Filter the DataFrame for songs recorded on or before the lesion date (pre-lesion)
    pre_lesion_songs = organized_data_frame[organized_data_frame['Date'] <= surgery_date]
    
    total_pre_lesion_songs = len(pre_lesion_songs)
    print(f"Total number of pre-lesion songs: {total_pre_lesion_songs}")
    
    if total_pre_lesion_songs < group_size * 2:
        raise ValueError(f"Not enough pre-lesion songs to split into two groups of size {group_size}. Total available: {total_pre_lesion_songs}")

    # Select the last 'group_size' number of rows for the latest pre-surgery songs
    latest_pre_surgery_songs = pre_lesion_songs.tail(group_size)
    
    # Select the 'group_size' rows just before the latest pre-surgery songs
    earlier_pre_surgery_songs = pre_lesion_songs.iloc[-(group_size * 2):-group_size]
    
    print(f"Number of latest pre-surgery songs: {len(latest_pre_surgery_songs)}")
    print(f"Number of earlier pre-surgery songs: {len(earlier_pre_surgery_songs)}")
    
    # Create lists to hold the mapped syllable orders for each group
    mapped_syllable_orders_post = list(filtered_post_surgery_songs['mapped_syllable_order'])
    mapped_syllable_orders_latest_pre = list(latest_pre_surgery_songs['mapped_syllable_order'])
    mapped_syllable_orders_earlier_pre = list(earlier_pre_surgery_songs['mapped_syllable_order'])

    # Get the animal ID (assuming it's consistent across the dataset)
    animal_id = organized_data_frame['Animal ID'].iloc[0]
    
    # Save each group of syllable orders into a separate .mat file
    save_group_to_mat(mapped_syllable_orders_post, 'post_lesion_songs', animal_id, save_dir)
    save_group_to_mat(mapped_syllable_orders_latest_pre, 'latest_pre_lesion_songs', animal_id, save_dir)
    save_group_to_mat(mapped_syllable_orders_earlier_pre, 'earlier_pre_lesion_songs', animal_id, save_dir)

save_directory = "/Users/mirandahulsey-vincent/Desktop/DECODER_cell_arrays"
# Assuming organized_data_frame_with_orders contains the data
filter_surgery_songs(organized_data_frame_with_orders, surgery_date, group_size, save_directory)


Total number of pre-lesion songs: 4608
Number of latest pre-surgery songs: 2164
Number of earlier pre-surgery songs: 2164
Saved USA5326_post_lesion_songs.mat to /Users/mirandahulsey-vincent/Desktop/DECODER_cell_arrays/USA5326_post_lesion_songs.mat
Saved USA5326_latest_pre_lesion_songs.mat to /Users/mirandahulsey-vincent/Desktop/DECODER_cell_arrays/USA5326_latest_pre_lesion_songs.mat
Saved USA5326_earlier_pre_lesion_songs.mat to /Users/mirandahulsey-vincent/Desktop/DECODER_cell_arrays/USA5326_earlier_pre_lesion_songs.mat
