## This code takes in .csv outputs from the TweetyBERT decoder, then saves the data into cell arrays inside of .mat files, which can be then input into Jeff Markowitz's probabalistic suffix tree (PST). PST pipeline can be found here: https://github.com/jmarkow/pst


In [14]:
import json
import pandas as pd
import ast
from datetime import datetime

class temp:
    def __init__(self):
        # Define the path to the JSON file
        recording_file_path_name = '/Users/mirandahulsey-vincent/Documents/allPythonCode/syntax_analysis/data_inputs/DOI_study/USA5508/DOI_USA5508_decoded_database.json'

        # Read the JSON file
        print(f"Reading JSON file: {recording_file_path_name}")
        with open(recording_file_path_name, 'r') as f:
            decoder_data = json.load(f)['results']  # Adjusted to extract the 'results' key

        # Print the total number of songs (assuming each item in 'results' is a song)
        print(f"Total songs in JSON: {len(decoder_data)}")

        # Convert to DataFrame
        decoder_dataframe = pd.DataFrame(decoder_data)
        decoder_dataframe['syllable_onsets_offsets_ms'] = decoder_dataframe['syllable_onsets_offsets_ms'].apply(self.parse_json_safe)
        decoder_dataframe['syllable_onsets_offsets_timebins'] = decoder_dataframe['syllable_onsets_offsets_timebins'].apply(self.parse_json_safe)

        self.dataframe = decoder_dataframe

    def return_dataframe(self):
        return self.dataframe


    def parse_json_safe(self, s):
            """
            Safely parse a string representation of a JSON object.
            Handles extra quotes and converts single quotes to double quotes.
            """
            if isinstance(s, dict):
                return s  # If it's already a dictionary, return it as is
            
            if pd.isna(s):
                return {}
            
            # Remove surrounding single quotes
            s = s.strip()
            if s.startswith("''") and s.endswith("''"):
                s = s[2:-2]
            elif s.startswith("'") and s.endswith("'"):
                s = s[1:-1]
            
            if not s:
                return {}
            
            try:
                # First, attempt to parse using json
                s_json = s.replace("'", '"')
                return json.loads(s_json)
            except json.JSONDecodeError:
                try:
                    # If json fails, attempt using ast.literal_eval
                    return ast.literal_eval(s)
                except (ValueError, SyntaxError) as e:
                    print(f"Error parsing string: {s}\nError: {e}")
                    return {}
    
TEMP = temp()

decoder_dataframe = TEMP.return_dataframe()

print(decoder_dataframe.head)

Reading JSON file: /Users/mirandahulsey-vincent/Documents/allPythonCode/syntax_analysis/data_inputs/DOI_study/USA5508/DOI_USA5508_decoded_database.json
Total songs in JSON: 16689
<bound method NDFrame.head of                                        file_name  song_present  \
0       USA5508_45547.42460255_9_12_11_47_40.wav         False   
1       USA5508_45547.44088145_9_12_12_14_48.wav         False   
2       USA5508_45547.44240304_9_12_12_17_20.wav         False   
3       USA5508_45547.45080348_9_12_12_31_20.wav         False   
4       USA5508_45547.45230683_9_12_12_33_50.wav         False   
...                                          ...           ...   
16684  USA5508_45595.63970474_10_30_17_46_10.wav         False   
16685  USA5508_45595.64130755_10_30_17_48_50.wav         False   
16686  USA5508_45595.64189267_10_30_17_49_49.wav         False   
16687  USA5508_45595.64458876_10_30_17_54_18.wav         False   
16688   USA5508_45595.64854736_10_30_18_0_54.wav         False   

In [15]:
import pandas as pd
import numpy as np
from datetime import datetime
import json

# Load the CSV and JSON creation_date data
save_output_to_this_file_path = '/Users/mirandahulsey-vincent/Documents/allPythonCode/syntax_analysis/data_inputs/DOI_study/USA5506'
# USE IF THE DECODER OUTPUT A CSV FILE:
#recording_file_path_name = '/Users/mirandahulsey-vincent/Documents/allPythonCode/syntax_analysis/DECODER_csvs_and_jsons/USA5509_decoded.csv'
#decoder_dataframe = pd.read_csv(recording_file_path_name)
#USE IF THE DECODER OUTPUT IS A .JSON FILE:
# recording_file_path_name = '/Users/mirandahulsey-vincent/Documents/allPythonCode/syntax_analysis/DECODER_sham_lesions/USA5271_decoded.json'
# print(f"Reading JSON file: {recording_file_path_name}")
# with open(recording_file_path_name, 'r') as f:
#     decoder_data = json.load(f)['results']  # Adjusted to extract the 'results' key
# # Convert to DataFrame
# decoder_dataframe = pd.DataFrame(decoder_data)

path_to_json_file_with_dates = '/Users/mirandahulsey-vincent/Documents/allPythonCode/syntax_analysis/data_inputs/DOI_study/USA5508/DOI_USA5508_creation_data.json'


with open(path_to_json_file_with_dates, 'r') as f:
    json_data = json.load(f)
    # Extract surgery date and format it
json_surgery_date = json_data['treatment_date']  # Assuming single treatment date
date_obj = datetime.strptime(json_surgery_date, "%Y-%m-%d")
surgery_date = date_obj.strftime("%Y.%m.%d")  # Adjust to year-first format for consistency
print(f"Surgery date: {surgery_date}")
# Extract subdirectory creation dates from JSON data
subdirectory_dates = {subdir: data['subdirectory_creation_date'] for subdir, data in json_data['subdirectories'].items()}


# Function to extract date, time, and animal_id from the file name
def find_recording_dates_and_times(recording_file_path_name):
    try:
        file_name = recording_file_path_name.split('/')[-1]
        split_file_name_by_underscores = file_name.split('_')
        animal_id = split_file_name_by_underscores[0]
        month = split_file_name_by_underscores[2].zfill(2)
        day = split_file_name_by_underscores[3].zfill(2)
        date = f"{month}.{day}"
        hour = split_file_name_by_underscores[4].zfill(2)
        minute = split_file_name_by_underscores[5].zfill(2)
        second = split_file_name_by_underscores[6].replace('.wav', '').zfill(2)
        return animal_id, date, hour, minute, second
    except IndexError:
        print(f"Error: Unexpected format in file name {recording_file_path_name}")
        return None, None, None, None, None

# Function to update the date with year from the JSON file
def update_date_with_year(row, subdirectory_dates):
    month_day = row['Date']
    month = month_day.split('.')[0]
    for subdir, date in subdirectory_dates.items():
        year, json_month, json_day = date.split('-')
        if json_month == month:
            # Return the date in the format YYYY.MM.DD
            return f"{year}.{month_day}"
    return None

# Function to create a table and filter data where song_present is True
def make_table(input_data_frame, subdirectory_dates):
    only_song_data = input_data_frame[input_data_frame['song_present'] == True].reset_index(drop=True)
    num_files_with_song = only_song_data.shape[0]
    
    organized_data_frame = only_song_data.copy()
    organized_data_frame['Animal ID'] = [None] * num_files_with_song
    organized_data_frame['Date'] = [None] * num_files_with_song
    organized_data_frame['Hour'] = [None] * num_files_with_song
    organized_data_frame['Minute'] = [None] * num_files_with_song
    organized_data_frame['Second'] = [None] * num_files_with_song

    for i, row in only_song_data.iterrows():
        recording_file_path_name = row['file_name']
        try:
            animal_id, date, hour, minute, second = find_recording_dates_and_times(recording_file_path_name)
            organized_data_frame.at[i, 'Animal ID'] = animal_id
            organized_data_frame.at[i, 'Date'] = date
            organized_data_frame.at[i, 'Hour'] = hour
            organized_data_frame.at[i, 'Minute'] = minute
            organized_data_frame.at[i, 'Second'] = second
        except Exception as e:
            print(f"Error processing file {recording_file_path_name}: {e}")
            continue

    # Apply the year correction to the date
    organized_data_frame['Date'] = organized_data_frame.apply(lambda row: update_date_with_year(row, subdirectory_dates), axis=1)
    
    # Convert the updated 'Date' column to a proper datetime object for further handling
    organized_data_frame['Date'] = pd.to_datetime(organized_data_frame['Date'], format='%Y.%m.%d', errors='coerce')
    
    print(f"First rows of organized_data_frame with updated dates: {organized_data_frame.head(3)}")
    return organized_data_frame

# Generate the organized data table
organized_data_frame = make_table(decoder_dataframe, subdirectory_dates)

# Find the unique dates from the recording
def find_unique_recording_dates(data_frame):
    # Extract the unique values from the 'Date' column
    unique_dates = data_frame['Date'].dt.strftime('%Y.%m.%d').unique()  # Convert back to string for display purposes
    return unique_dates

# Example usage
unique_dates = find_unique_recording_dates(organized_data_frame)
print("Unique recording dates:", unique_dates)

# ### Clean syllable labels formatting, then find the unique syllable labels in the dataframe
# def clean_and_convert_to_dict(row):
#     try:
#         # Strip leading and trailing single quotes and whitespace
#         row_cleaned = row.strip("''").strip()
#         # Replace single quotes with double quotes for valid JSON syntax
#         row_cleaned = row_cleaned.replace("'", '"')
#         # Convert the cleaned string to a dictionary
#         return json.loads(row_cleaned)
#     except json.JSONDecodeError as e:
#         print(f"JSON decoding failed for row: {row}\nError: {e}")
#         return None  # Return None or some default value if decoding fails

# Apply the function to all rows in the 'syllable_onsets_offsets_ms' column
organized_data_frame['syllable_onsets_offsets_ms_dict'] = organized_data_frame['syllable_onsets_offsets_ms']

# Now 'syllable_onsets_offsets_ms_dict' contains the cleaned and converted dictionaries
# Create a set to store all unique syllable labels
unique_syllable_labels = set()

# Iterate through each row in the 'syllable_onsets_offsets_ms_dict' column
for row in organized_data_frame['syllable_onsets_offsets_ms_dict']:
    if row:  # Check if the row is not None or empty
        # Add the keys (syllable labels) to the set
        unique_syllable_labels.update(row.keys())

# Convert the set to a sorted list (optional, for easier viewing)
unique_syllable_labels = sorted(unique_syllable_labels)

# Print or return the unique syllable labels
print("Unique syllable labels:", unique_syllable_labels)


Surgery date: 2024.10.21
First rows of organized_data_frame with updated dates:                                   file_name  song_present  \
0    USA5508_45548.32222481_9_13_8_57_2.wav          True   
1    USA5508_45548.32654069_9_13_9_4_14.wav          True   
2  USA5508_45548.44913589_9_13_12_28_33.wav          True   

                          syllable_onsets_offsets_ms  \
0  {'9': [[0.0, 666.5079365079365], [7226.3492063...   
1  {'30': [[0.0, 331.9047619047619], [844.6031746...   
2                  {'8': [[0.0, 2048.095238095238]]}   

                    syllable_onsets_offsets_timebins Animal ID       Date  \
0  {'9': [[0.0, 247], [2678.0, 2949]], '8': [[246...   USA5508 2024-09-13   
1  {'30': [[0.0, 123], [313.0, 502], [966.0, 1017...   USA5508 2024-09-13   
2                                {'8': [[0.0, 759]]}   USA5508 2024-09-13   

  Hour Minute Second  
0   08     57     02  
1   09     04     14  
2   12     28     33  
Unique recording dates: ['2024.09.13' '2024.09.14

### Get the syllable order for ONE song

In [16]:
# import json
# import re

# # Assuming this is the string you've printed
# sample_one_song = organized_data_frame['syllable_onsets_offsets_ms'][0]

# # Remove the extra outer single quotes and replace single quotes with double quotes
# cleaned_sample = sample_one_song.strip("''")
# cleaned_sample = re.sub(r"'", '"', cleaned_sample)

# # Now try to load it as JSON
# try:
#     sample_one_song_dict = json.loads(cleaned_sample)
#     print("Successfully parsed JSON.")
# except json.JSONDecodeError as e:
#     print(f"JSONDecodeError: {e}")

# # Now you can proceed with your original logic
# syllable_times = []

# # Loop through each syllable label and its corresponding onset/offset pairs
# for syllable_label, times in sample_one_song_dict.items():
#     for onset_offset in times:
#         syllable_times.append([syllable_label, onset_offset[0], onset_offset[1]])

# # Convert the onset and offset times to floats if needed
# syllable_times_array = np.array([[syllable_label, float(onset), float(offset)] 
#                                  for syllable_label, onset, offset in syllable_times])

# # Sort the array by the onset times (second column)
# syllable_times_array_sorted = syllable_times_array[syllable_times_array[:, 1].argsort()]

# # Display the sorted result
# syllable_times_array_sorted

In [17]:
import numpy as np

# Assuming this is the first row of your DataFrame
sample_one_song = organized_data_frame['syllable_onsets_offsets_ms'][0]

# Check if sample_one_song is already a dictionary
if isinstance(sample_one_song, dict):
    sample_one_song_dict = sample_one_song
    print("Successfully accessed the dictionary.")
else:
    print("The sample_one_song variable is not a dictionary. Please check its format.")

# Now you can proceed with your original logic
syllable_times = []

# Loop through each syllable label and its corresponding onset/offset pairs
for syllable_label, times in sample_one_song_dict.items():
    for onset_offset in times:
        syllable_times.append([syllable_label, onset_offset[0], onset_offset[1]])

# Convert the onset and offset times to floats if needed
syllable_times_array = np.array([[syllable_label, float(onset), float(offset)] 
                                 for syllable_label, onset, offset in syllable_times])

# Sort the array by the onset times (second column)
syllable_times_array_sorted = syllable_times_array[syllable_times_array[:, 1].argsort()]

# Display the sorted result
syllable_times_array_sorted


Successfully accessed the dictionary.


array([['9', '0.0', '666.5079365079365'],
       ['3', '2871.1111111111113', '3945.0793650793653'],
       ['0', '3945.0793650793653', '4522.539682539683'],
       ['15', '4522.539682539683', '6605.714285714286'],
       ['24', '6605.714285714286', '6959.20634920635'],
       ['8', '666.5079365079365', '2871.1111111111113'],
       ['18', '6959.20634920635', '7107.619047619048'],
       ['17', '7107.619047619048', '7226.349206349207'],
       ['9', '7226.349206349207', '7957.619047619048'],
       ['8', '7957.619047619048', '8537.77777777778'],
       ['11', '8537.77777777778', '8969.523809523811'],
       ['5', '8969.523809523811', '8988.4126984127'],
       ['8', '8988.4126984127', '9541.587301587302'],
       ['1', '9541.587301587302', '10434.761904761905']], dtype='<U32')

In [18]:
# import json
# import re
# import numpy as np

# # Assuming this is the string you've printed
# sample_one_song = organized_data_frame['syllable_onsets_offsets_ms'][0]

# # Remove the extra outer single quotes and replace single quotes with double quotes
# cleaned_sample = sample_one_song.strip("''")
# cleaned_sample = re.sub(r"'", '"', cleaned_sample)

# # Now try to load it as JSON
# try:
#     sample_one_song_dict = json.loads(cleaned_sample)
#     print("Successfully parsed JSON.")
# except json.JSONDecodeError as e:
#     print(f"JSONDecodeError: {e}")

# # Now you can proceed with your original logic
# syllable_times = []

# # Loop through each syllable label and its corresponding onset/offset pairs
# for syllable_label, times in sample_one_song_dict.items():
#     for onset_offset in times:
#         syllable_times.append([syllable_label, onset_offset[0], onset_offset[1]])

# # Convert the onset and offset times to floats if needed
# syllable_times_array = np.array([[syllable_label, float(onset), float(offset)] 
#                                  for syllable_label, onset, offset in syllable_times])

# # Sort the array by the onset times (second column)
# syllable_times_array_sorted = syllable_times_array[syllable_times_array[:, 1].argsort()]

# # Extract only the syllable labels in order of onset times
# syllable_order = syllable_times_array_sorted[:, 0]

# # Display the syllable order array
# syllable_order

## Find the syllable order for each song, and add to the dataframe

In [19]:
# import json
# import re
# import numpy as np
# import pandas as pd

# # Function to generate syllable order from a single song data (syllable_onsets_offsets_ms)
# def get_syllable_order(sample_one_song):
#     # Clean the input and parse it as JSON
#     cleaned_sample = sample_one_song.strip("''")
#     cleaned_sample = re.sub(r"'", '"', cleaned_sample)
    
#     try:
#         sample_one_song_dict = json.loads(cleaned_sample)
#     except json.JSONDecodeError as e:
#         print(f"JSONDecodeError: {e}")
#         return None

#     syllable_times = []
    
#     # Loop through each syllable label and its corresponding onset/offset pairs
#     for syllable_label, times in sample_one_song_dict.items():
#         for onset_offset in times:
#             syllable_times.append([syllable_label, onset_offset[0], onset_offset[1]])

#     # Convert the onset and offset times to floats
#     syllable_times_array = np.array([[syllable_label, float(onset), float(offset)] 
#                                      for syllable_label, onset, offset in syllable_times])

#     # Sort the array by onset times (second column)
#     syllable_times_array_sorted = syllable_times_array[syllable_times_array[:, 1].argsort()]

#     # Extract only the syllable labels in order of onset times
#     syllable_order = syllable_times_array_sorted[:, 0]
    
#     return syllable_order

# # Function to iteratively find syllable orders for each row in the DataFrame
# def find_syllable_orders(organized_data_frame):
#     syllable_orders = []

#     for i, row in organized_data_frame.iterrows():
#         sample_one_song = row['syllable_onsets_offsets_ms']
#         syllable_order = get_syllable_order(sample_one_song)
#         if syllable_order is not None:
#             syllable_orders.append(syllable_order)
#         else:
#             syllable_orders.append([])  # In case of a parsing error, append an empty array

#     # Add a new column for the syllable orders
#     organized_data_frame['syllable_order'] = syllable_orders
    
#     return organized_data_frame

# # Assuming you've already loaded the data and updated the table
# organized_data_frame = make_table(decoder_dataframe, subdirectory_dates)

# # Now find syllable orders for each song in the organized_data_frame
# organized_data_frame_with_orders = find_syllable_orders(organized_data_frame)

# # Output the first few rows with syllable orders
# print(organized_data_frame_with_orders[['syllable_order']])

## Map each syllable label to a letter instead of a number (1->A, etc. This is necessary so we can tell labels '1' and '2' apart from '12' once they all get concatonated  into a string)

### Test this code for ONE song

In [20]:
# import json
# import re
# import numpy as np

# # Define a function to map syllable labels to unique characters
# def map_labels_to_characters(syllable_orders):
#     # Get all unique syllable labels across all rows
#     unique_labels = sorted(set(sum(syllable_orders, [])))  # Flatten the list of lists and get unique elements
    
#     # Create a mapping from each unique label to a unique character (uppercase, lowercase, digits)
#     available_chars = [chr(i) for i in range(65, 91)] + [chr(i) for i in range(97, 123)] + [chr(i) for i in range(48, 58)]  # A-Z, a-z, 0-9
#     if len(unique_labels) > len(available_chars):
#         raise ValueError("Too many unique labels to map to single characters!")
    
#     label_to_char = {label: available_chars[i] for i, label in enumerate(unique_labels)}
    
#     # Replace each label in the syllable order with its corresponding character
#     syllable_orders_mapped = []
#     for order in syllable_orders:
#         mapped_order = ''.join([label_to_char[label] for label in order])
#         syllable_orders_mapped.append(mapped_order)
    
#     return syllable_orders_mapped, label_to_char

# # Example syllable order (as shown in the sample)
# syllable_orders = [
#     ['8', '21', '22', '21', '22', '21', '22', '21', '26', '23', '2', '3', '26', '5', '11', '14']
# ]

# # Map the labels and get the string representation
# syllable_orders_mapped, label_to_char = map_labels_to_characters(syllable_orders)

# # Output the mapped syllable orders and the label-to-character mapping
# print("Mapped syllable order:", syllable_orders_mapped)
# print("Label-to-character mapping:", label_to_char)


In [21]:
import json
import re
import numpy as np
import pandas as pd

# Function to generate syllable order from a single song data (syllable_onsets_offsets_ms)
def get_syllable_order(sample_one_song):
    # Check if sample_one_song is already a dictionary
    if isinstance(sample_one_song, dict):
        sample_one_song_dict = sample_one_song
    elif isinstance(sample_one_song, str):
        # If it's a string, attempt to clean and parse it as JSON
        cleaned_sample = sample_one_song.strip("''")
        cleaned_sample = re.sub(r"'", '"', cleaned_sample)
        
        try:
            sample_one_song_dict = json.loads(cleaned_sample)
        except json.JSONDecodeError as e:
            print(f"JSONDecodeError: {e}")
            return None
    else:
        # If it's neither a dict nor a string, return None
        print(f"Unexpected data type: {type(sample_one_song)}")
        return None

    syllable_times = []
    
    # Loop through each syllable label and its corresponding onset/offset pairs
    for syllable_label, times in sample_one_song_dict.items():
        for onset_offset in times:
            syllable_times.append([syllable_label, onset_offset[0], onset_offset[1]])

    # Convert the onset and offset times to floats
    syllable_times_array = np.array([[syllable_label, float(onset), float(offset)] 
                                     for syllable_label, onset, offset in syllable_times])

    # Sort the array by onset times (second column)
    syllable_times_array_sorted = syllable_times_array[syllable_times_array[:, 1].argsort()]

    # Extract only the syllable labels in order of onset times
    syllable_order = syllable_times_array_sorted[:, 0]
    
    return syllable_order

# Function to iteratively find syllable orders for each row in the DataFrame
def find_syllable_orders(organized_data_frame):
    syllable_orders = []

    for i, row in organized_data_frame.iterrows():
        sample_one_song = row['syllable_onsets_offsets_ms']
        syllable_order = get_syllable_order(sample_one_song)
        if syllable_order is not None:
            syllable_orders.append(syllable_order)
        else:
            syllable_orders.append([])  # In case of a parsing error, append an empty array

    # Add a new column for the syllable orders
    organized_data_frame['syllable_order'] = syllable_orders
    
    return organized_data_frame

# Assuming you've already loaded the data and updated the table
# organized_data_frame = make_table(decoder_dataframe, subdirectory_dates)

# Now find syllable orders for each song in the organized_data_frame
organized_data_frame_with_orders = find_syllable_orders(organized_data_frame)

# Output the first few rows with syllable orders
print(organized_data_frame_with_orders[['syllable_order']])


                                         syllable_order
0       [9, 3, 0, 15, 24, 8, 18, 17, 9, 8, 11, 5, 8, 1]
1        [30, 16, 30, 9, 8, 30, 8, 30, 3, 0, 30, 30, 8]
2                                                   [8]
3                                         [16, 3, 0, 8]
4                 [8, 3, 0, 15, 17, 28, 13, 17, 24, 30]
...                                                 ...
4572                                             [8, 1]
4573             [8, 5, 0, 15, 8, 18, 17, 18, 17, 8, 5]
4574               [16, 8, 11, 24, 3, 16, 24, 5, 0, 17]
4575  [8, 1, 11, 2, 24, 2, 24, 2, 24, 2, 24, 2, 24, ...
4576               [16, 24, 16, 24, 16, 24, 8, 3, 5, 0]

[4577 rows x 1 columns]


### Do this for EVERY song in the dataframe

In [22]:
# import json
# import re
# import numpy as np
# import pandas as pd

# # Function to generate syllable order from a single song data (syllable_onsets_offsets_ms)
# def get_syllable_order(sample_one_song):
#     # Clean the input and parse it as JSON
#     cleaned_sample = sample_one_song.strip("''")
#     cleaned_sample = re.sub(r"'", '"', cleaned_sample)
    
#     try:
#         sample_one_song_dict = json.loads(cleaned_sample)
#     except json.JSONDecodeError as e:
#         print(f"JSONDecodeError: {e}")
#         return None

#     syllable_times = []
    
#     # Loop through each syllable label and its corresponding onset/offset pairs
#     for syllable_label, times in sample_one_song_dict.items():
#         for onset_offset in times:
#             syllable_times.append([syllable_label, onset_offset[0], onset_offset[1]])

#     # Convert the onset and offset times to floats
#     syllable_times_array = np.array([[syllable_label, float(onset), float(offset)] 
#                                      for syllable_label, onset, offset in syllable_times])

#     # Sort the array by onset times (second column)
#     syllable_times_array_sorted = syllable_times_array[syllable_times_array[:, 1].argsort()]

#     # Extract only the syllable labels in order of onset times
#     syllable_order = syllable_times_array_sorted[:, 0]
    
#     return syllable_order

# # Function to map syllable labels to unique characters
# def map_labels_to_characters(syllable_orders):
#     # Flatten the list of syllable orders (since syllable_orders is a list of lists)
#     flat_syllable_orders = [label for sublist in syllable_orders for label in sublist]
    
#     # Get all unique syllable labels
#     unique_labels = sorted(set(flat_syllable_orders))
    
#     # Create a mapping from each unique label to a unique character (uppercase, lowercase, digits)
#     available_chars = [chr(i) for i in range(65, 91)] + [chr(i) for i in range(97, 123)] + [chr(i) for i in range(48, 58)]  # A-Z, a-z, 0-9
#     if len(unique_labels) > len(available_chars):
#         raise ValueError("Too many unique labels to map to single characters!")
    
#     label_to_char = {label: available_chars[i] for i, label in enumerate(unique_labels)}
    
#     # Replace each label in the syllable order with its corresponding character
#     syllable_orders_mapped = []
#     for order in syllable_orders:
#         mapped_order = ''.join([label_to_char[label] for label in order])
#         syllable_orders_mapped.append(mapped_order)
    
#     return syllable_orders_mapped, label_to_char

# # Function to iteratively find syllable orders for each row and map them
# def find_mapped_syllable_orders(organized_data_frame):
#     syllable_orders = []

#     for i, row in organized_data_frame.iterrows():
#         sample_one_song = row['syllable_onsets_offsets_ms']
#         syllable_order = get_syllable_order(sample_one_song)
#         if syllable_order is not None:
#             syllable_orders.append(syllable_order)
#         else:
#             syllable_orders.append([])  # In case of a parsing error, append an empty array

#     # Map the syllable orders to unique characters
#     syllable_orders_mapped, label_to_char = map_labels_to_characters(syllable_orders)

#     # Add new columns for both the raw syllable orders and the mapped syllable orders
#     organized_data_frame['syllable_order'] = syllable_orders
#     organized_data_frame['mapped_syllable_order'] = syllable_orders_mapped
    
#     return organized_data_frame, label_to_char

# # Assuming you've already loaded the data and updated the table
# organized_data_frame = make_table(decoder_dataframe, subdirectory_dates)

# # Now find and map syllable orders for each song in the organized_data_frame
# organized_data_frame_with_orders, label_to_char = find_mapped_syllable_orders(organized_data_frame)

# # Output the first few rows with both syllable orders and mapped syllable orders
# # Output only the 'syllable_order' and 'mapped_syllable_order' columns
# print(organized_data_frame_with_orders[['syllable_order', 'mapped_syllable_order']])

# print("Label-to-character mapping:", label_to_char)

# # Output only the 'syllable_order' and 'mapped_syllable_order' columns
# print(organized_data_frame_with_orders[['syllable_order', 'mapped_syllable_order']].head(2))
# print("Label-to-character mapping:", label_to_char)

In [23]:
import json
import re
import numpy as np
import pandas as pd

# Function to generate syllable order from a single song data (syllable_onsets_offsets_ms)
def get_syllable_order(sample_one_song):
    # Check if sample_one_song is already a dictionary
    if isinstance(sample_one_song, dict):
        sample_one_song_dict = sample_one_song
    elif isinstance(sample_one_song, str):
        # If it's a string, attempt to clean and parse it as JSON
        cleaned_sample = sample_one_song.strip("''")
        cleaned_sample = re.sub(r"'", '"', cleaned_sample)
        
        try:
            sample_one_song_dict = json.loads(cleaned_sample)
        except json.JSONDecodeError as e:
            print(f"JSONDecodeError: {e}")
            return None
    else:
        # If it's neither a dict nor a string, return None
        print(f"Unexpected data type: {type(sample_one_song)}")
        return None

    syllable_times = []
    
    # Loop through each syllable label and its corresponding onset/offset pairs
    for syllable_label, times in sample_one_song_dict.items():
        for onset_offset in times:
            syllable_times.append([syllable_label, onset_offset[0], onset_offset[1]])

    # Convert the onset and offset times to floats
    syllable_times_array = np.array([[syllable_label, float(onset), float(offset)] 
                                     for syllable_label, onset, offset in syllable_times])

    # Sort the array by onset times (second column)
    syllable_times_array_sorted = syllable_times_array[syllable_times_array[:, 1].argsort()]

    # Extract only the syllable labels in order of onset times
    syllable_order = syllable_times_array_sorted[:, 0]
    
    return syllable_order

# Function to map syllable labels to unique characters
def map_labels_to_characters(syllable_orders):
    # Flatten the list of syllable orders (since syllable_orders is a list of lists)
    flat_syllable_orders = [label for sublist in syllable_orders for label in sublist]
    
    # Get all unique syllable labels
    unique_labels = sorted(set(flat_syllable_orders))
    
    # Create a mapping from each unique label to a unique character (uppercase, lowercase, digits)
    available_chars = [chr(i) for i in range(65, 91)] + [chr(i) for i in range(97, 123)] + [chr(i) for i in range(48, 58)]  # A-Z, a-z, 0-9
    if len(unique_labels) > len(available_chars):
        raise ValueError("Too many unique labels to map to single characters!")
    
    label_to_char = {label: available_chars[i] for i, label in enumerate(unique_labels)}
    
    # Replace each label in the syllable order with its corresponding character
    syllable_orders_mapped = []
    for order in syllable_orders:
        mapped_order = ''.join([label_to_char[label] for label in order])
        syllable_orders_mapped.append(mapped_order)
    
    return syllable_orders_mapped, label_to_char

# Function to iteratively find syllable orders for each row and map them
def find_mapped_syllable_orders(organized_data_frame):
    syllable_orders = []

    for i, row in organized_data_frame.iterrows():
        sample_one_song = row['syllable_onsets_offsets_ms']
        syllable_order = get_syllable_order(sample_one_song)
        if syllable_order is not None:
            syllable_orders.append(syllable_order)
        else:
            syllable_orders.append([])  # In case of a parsing error, append an empty array

    # Map the syllable orders to unique characters
    syllable_orders_mapped, label_to_char = map_labels_to_characters(syllable_orders)

    # Add new columns for both the raw syllable orders and the mapped syllable orders
    organized_data_frame['syllable_order'] = syllable_orders
    organized_data_frame['mapped_syllable_order'] = syllable_orders_mapped
    
    return organized_data_frame, label_to_char

# Assuming you've already loaded the data and updated the table
# Example: organized_data_frame = make_table(decoder_dataframe, subdirectory_dates)

# Now find and map syllable orders for each song in the organized_data_frame
organized_data_frame_with_orders, label_to_char = find_mapped_syllable_orders(organized_data_frame)

# Output the first few rows with both syllable orders and mapped syllable orders
# Output only the 'syllable_order' and 'mapped_syllable_order' columns
print(organized_data_frame_with_orders[['syllable_order', 'mapped_syllable_order']])

# Print label-to-character mapping
print("Label-to-character mapping:", label_to_char)

# Display the first 2 rows as an example
print(organized_data_frame_with_orders[['syllable_order', 'mapped_syllable_order']].head(2))

                                         syllable_order  \
0       [9, 3, 0, 15, 24, 8, 18, 17, 9, 8, 11, 5, 8, 1]   
1        [30, 16, 30, 9, 8, 30, 8, 30, 3, 0, 30, 30, 8]   
2                                                   [8]   
3                                         [16, 3, 0, 8]   
4                 [8, 3, 0, 15, 17, 28, 13, 17, 24, 30]   
...                                                 ...   
4572                                             [8, 1]   
4573             [8, 5, 0, 15, 8, 18, 17, 18, 17, 8, 5]   
4574               [16, 8, 11, 24, 3, 16, 24, 5, 0, 17]   
4575  [8, 1, 11, 2, 24, 2, 24, 2, 24, 2, 24, 2, 24, ...   
4576               [16, 24, 16, 24, 16, 24, 8, 3, 5, 0]   

                      mapped_syllable_order  
0                            ZTAGPYJIZYDWYB  
1                             UHUZYUYUTAUUY  
2                                         Y  
3                                      HTAY  
4                                YTAGIRFIPU  
...            

## Now, sort the data by into different groups (e.g. by days, by pre vs. post lesion group), then export them into .mat files

### Test: Get all the arrays from one day of recording:

In [24]:
# import pandas as pd

# # Filter the DataFrame for the first day of recording
# def get_first_day_songs(organized_data_frame):
#     # Find the first day of recording
#     first_day = organized_data_frame['Date'].min()
    
#     # Filter the DataFrame for the first day
#     first_day_data = organized_data_frame[organized_data_frame['Date'] == first_day]
    
#     return first_day_data

# # Generate a 1xN cell array where each cell contains the mapped syllable order for the first day
# def generate_first_day_cell_array(organized_data_frame):
#     # Get the filtered data for the first day
#     first_day_data = get_first_day_songs(organized_data_frame)
    
#     # Extract the mapped syllable orders into a list (which acts as a cell array)
#     mapped_syllable_orders = first_day_data['mapped_syllable_order'].tolist()
    
#     # Create a 1xN cell array (as a list of lists for flexibility)
#     cell_array = [mapped_syllable_orders]
    
#     return cell_array

# # Assuming organized_data_frame_with_orders contains the data
# first_day_cell_array = generate_first_day_cell_array(organized_data_frame_with_orders)

# # Output the result
# print("1xN cell array of mapped syllable orders for the first day of recording:")
# print(first_day_cell_array)


### Get the syllable string arrays from each day of recording

In [25]:
import pandas as pd

# Function to get the unique dates in the DataFrame
def get_unique_dates(organized_data_frame):
    return sorted(organized_data_frame['Date'].unique())

# Function to generate a cell array for each day of recording
def generate_cell_arrays_for_each_day(organized_data_frame):
    # Get all unique dates
    unique_dates = get_unique_dates(organized_data_frame)
    
    # Iterate through each unique date
    for date in unique_dates:
        # Filter the DataFrame for the current date
        day_data = organized_data_frame[organized_data_frame['Date'] == date]
        
        # Extract the mapped syllable orders into a list (which acts as a cell array)
        mapped_syllable_orders = day_data['mapped_syllable_order'].tolist()
        
        # Create a 1xN cell array (as a list of lists for flexibility)
        cell_array = [mapped_syllable_orders]
        
        # Print the date and the corresponding cell array
        print(f"Recording Date: {date}")
        print("1xN cell array of mapped syllable orders for the day:")
        print(cell_array)
        print("\n")

# Assuming organized_data_frame_with_orders contains the data
generate_cell_arrays_for_each_day(organized_data_frame_with_orders)

Recording Date: 2024-09-13 00:00:00
1xN cell array of mapped syllable orders for the day:
[['ZTAGPYJIZYDWYB', 'UHUZYUYUTAUUY', 'Y', 'HTAY', 'YTAGIRFIPU', 'Y', 'Y', 'YTUYBCDYU', 'YTVG', 'YTAGAG', 'ZTWYDYDGU', 'HUYLHPHPGPUYPAUB', 'HYTWYB', 'ZYVBYZCUYDY', 'YDY', 'Y', 'HUJGUL', 'Y', 'ZWYTY', 'YU']]


Recording Date: 2024-09-14 00:00:00
1xN cell array of mapped syllable orders for the day:
[['YTBAPFGPNHNHBLNHPHPHPUYUBUBUYUBYTABABGZYTAGYJGYTAVGJGAGJPQYUBYDTTAGYZBYDY', 'YAGHTUYUYDYTVUYZCZCZCUPAUGTYUYBGHGVJQGQJGYTABUYDYAGBPHYZYWYT', 'UUDTAGYHTAUYBYUYNHVYZAGCPUZCY', 'WZGIGYYDWYBU', 'YUZ', 'HDTZABYUYUYGBUYTAGJGJY', 'YPGYZGJYDWYWYABGPYUYUYCZUHU', 'YDTAY', 'ARIJGYTAGYCDWYWYGBNHNHUYDYTGAG', 'C', 'DBGY', 'YGVWTAGPHPUYWYA', 'C', 'Y', 'HYAGIJRGUYBULPHUYATTABUGYWDT', 'HYTABYWYTUGVVZYUHPHPUZUYDYDYD', 'YDYTAHPHPYTGVGIFIJRGIQIUAGPHUY', 'HDYGYUDWYTBZHY', 'ZUYTAGCLPQLQHD', 'YUZC', 'VQZCUDWYP', 'YTUG', 'HAPGJUYUWYTAVGBYYDYTATAGYVT', 'VCUYGUZ', 'YDYBYBGRGP', 'Y', 'YUHUYZCDWYUYUYUYUYVUHVY', 'HPGUYUZPNHPLG', 'H

### Save each day's syllable cell arrays into separate .mat files (I don't recommend this, there are too few songs for PST to find a pattern from canary song, since it's so probabalistic):

In [26]:
# import pandas as pd
# import scipy.io as sio
# import os

# # Function to get the unique dates in the DataFrame
# def get_unique_dates(organized_data_frame):
#     return sorted(organized_data_frame['Date'].unique())

# # Function to generate and save .mat files for each day's recording
# def generate_and_save_mat_files(organized_data_frame, save_dir):
#     # Ensure the save directory exists
#     os.makedirs(save_dir, exist_ok=True)
    
#     # Get all unique dates
#     unique_dates = get_unique_dates(organized_data_frame)
    
#     # Get the animal ID from the first row (assuming it's consistent across the dataset)
#     animal_id = organized_data_frame['Animal ID'].iloc[0]
    
#     # Iterate through each unique date
#     for date in unique_dates:
#         # Filter the DataFrame for the current date
#         day_data = organized_data_frame[organized_data_frame['Date'] == date]
        
#         # Extract the mapped syllable orders into a list (which acts as a cell array for MATLAB)
#         mapped_syllable_orders = day_data['mapped_syllable_order'].tolist()
        
#         # Prepare the data as a dictionary for saving to .mat format
#         data_dict = {'mapped_syllable_order': mapped_syllable_orders}
        
#         # Generate a file name based on the animal ID and date
#         date_formatted = date.replace('.', '_')  # Replace dots with underscores for the file name
#         file_name = f"{animal_id}_{date_formatted}.mat"
#         file_path = os.path.join(save_dir, file_name)
        
#         # Save the .mat file
#         sio.savemat(file_path, data_dict)
        
#         # Print confirmation
#         print(f"Saved {file_name} to {file_path}")

# # Directory where .mat files will be saved (change this path as needed)
# save_directory = "/Users/mirandahulsey-vincent/Desktop/USA5288_song_cell_arrays"

# # Assuming organized_data_frame_with_orders contains the data
# generate_and_save_mat_files(organized_data_frame_with_orders, save_directory)


In [27]:
# import pandas as pd
# import scipy.io as sio
# import os

# # Function to get the unique dates in the DataFrame
# def get_unique_dates(organized_data_frame):
#     return sorted(organized_data_frame['Date'].unique())

# # Function to generate and save .mat files for each day's recording
# def generate_and_save_mat_files(organized_data_frame, save_dir):
#     # Ensure the save directory exists
#     os.makedirs(save_dir, exist_ok=True)
    
#     # Get all unique dates
#     unique_dates = get_unique_dates(organized_data_frame)
    
#     # Get the animal ID from the first row (assuming it's consistent across the dataset)
#     animal_id = organized_data_frame['Animal ID'].iloc[0]
    
#     # Iterate through each unique date
#     for date in unique_dates:
#         # Filter the DataFrame for the current date
#         day_data = organized_data_frame[organized_data_frame['Date'] == date]
        
#         # Extract the mapped syllable orders and store them as a cell array (list of lists)
#         mapped_syllable_orders = day_data['mapped_syllable_order'].tolist()
        
#         # Ensure the mapped_syllable_orders are formatted as a MATLAB cell array (list of cells in Python)
#         mapped_syllable_orders_cell = [[order] for order in mapped_syllable_orders]  # Wrap each string in a list
        
#         # Prepare the data as a dictionary for saving to .mat format
#         data_dict = {'mapped_syllable_order': mapped_syllable_orders_cell}
        
#         # Generate a file name based on the animal ID and date
#         date_formatted = date.replace('.', '_')  # Replace dots with underscores for the file name
#         file_name = f"{animal_id}_{date_formatted}.mat"
#         file_path = os.path.join(save_dir, file_name)
        
#         # Save the .mat file
#         sio.savemat(file_path, data_dict)
        
#         # Print confirmation
#         print(f"Saved {file_name} to {file_path}")

# # Directory where .mat files will be saved (change this path as needed)
# save_directory = "/Users/mirandahulsey-vincent/Desktop/USA5288_song_cell_arrays"

# # Assuming organized_data_frame_with_orders contains the data
# generate_and_save_mat_files(organized_data_frame_with_orders, save_directory)


### Save ALL data into one cell array

In [28]:
# import pandas as pd
# import scipy.io as sio
# import os

# # Function to get the unique dates in the DataFrame
# def get_unique_dates(organized_data_frame):
#     return sorted(organized_data_frame['Date'].unique())

# # Function to generate and save one .mat file with all mapped syllable orders
# def generate_and_save_combined_mat_file(organized_data_frame, save_dir):
#     # Ensure the save directory exists
#     os.makedirs(save_dir, exist_ok=True)
    
#     # Get all unique dates
#     unique_dates = get_unique_dates(organized_data_frame)
    
#     # Get the animal ID from the first row (assuming it's consistent across the dataset)
#     animal_id = organized_data_frame['Animal ID'].iloc[0]
    
#     # Initialize a list to store all mapped syllable orders from each day
#     all_mapped_syllable_orders = []
    
#     # Iterate through each unique date
#     for date in unique_dates:
#         # Filter the DataFrame for the current date
#         day_data = organized_data_frame[organized_data_frame['Date'] == date]
        
#         # Extract the mapped syllable orders and store them in the list
#         mapped_syllable_orders = day_data['mapped_syllable_order'].tolist()
        
#         # Ensure the mapped_syllable_orders are formatted as a MATLAB cell array (list of lists)
#         mapped_syllable_orders_cell = [[order] for order in mapped_syllable_orders]  # Wrap each order in a list
        
#         # Append the day's syllable orders to the overall list
#         all_mapped_syllable_orders.extend(mapped_syllable_orders_cell)
    
#     # Prepare the data as a dictionary for saving to .mat format
#     data_dict = {'all_mapped_syllable_order': all_mapped_syllable_orders}
    
#     # Generate a file name based on the animal ID
#     file_name = f"{animal_id}_all_mapped_syllable_orders.mat"
#     file_path = os.path.join(save_dir, file_name)
    
#     # Save the .mat file
#     sio.savemat(file_path, data_dict)
    
#     # Print confirmation
#     print(f"Saved combined syllable orders to {file_name} at {file_path}")

# # Directory where the combined .mat file will be saved (change this path as needed)
# save_directory = "/Users/mirandahulsey-vincent/Desktop/USA5288_song_cell_arrays"

# # Assuming organized_data_frame_with_orders contains the data
# generate_and_save_combined_mat_file(organized_data_frame_with_orders, save_directory)


### Save 2 groups: pre-lesion, and post-lesion

In [29]:
# import pandas as pd
# import scipy.io as sio
# import os

# # Function to get the unique dates in the DataFrame
# def get_unique_dates(organized_data_frame):
#     return sorted(organized_data_frame['Date'].unique())

# # Function to split and save pre-surgery and post-surgery syllable orders
# def generate_and_save_pre_post_surgery_mat_files(organized_data_frame, surgery_date, save_dir):
#     # Ensure the save directory exists
#     os.makedirs(save_dir, exist_ok=True)
    
#     # Get all unique dates
#     unique_dates = get_unique_dates(organized_data_frame)
    
#     # Get the animal ID from the first row (assuming it's consistent across the dataset)
#     animal_id = organized_data_frame['Animal ID'].iloc[0]
    
#     # Initialize lists to store pre-surgery and post-surgery mapped syllable orders
#     pre_surgery_orders = []
#     post_surgery_orders = []
    
#     # Iterate through each unique date
#     for date in unique_dates:
#         # Filter the DataFrame for the current date
#         day_data = organized_data_frame[organized_data_frame['Date'] == date]
        
#         # Extract the mapped syllable orders and store them in a list
#         mapped_syllable_orders = day_data['mapped_syllable_order'].tolist()
        
#         # Ensure the mapped_syllable_orders are formatted as a MATLAB cell array (list of lists)
#         mapped_syllable_orders_cell = [[order] for order in mapped_syllable_orders]  # Wrap each order in a list
        
#         # Compare the current date with the surgery date and store accordingly
#         if date < surgery_date:
#             pre_surgery_orders.extend(mapped_syllable_orders_cell)
#         elif date > surgery_date:
#             post_surgery_orders.extend(mapped_syllable_orders_cell)
    
#     # Prepare the data for saving in .mat format
#     data_dict_pre = {'pre_surgery_mapped_syllable_order': pre_surgery_orders}
#     data_dict_post = {'post_surgery_mapped_syllable_order': post_surgery_orders}
    
#     # Generate file names based on the animal ID and pre/post surgery
#     file_name_pre = f"{animal_id}_pre_surgery_mapped_syllable_orders.mat"
#     file_name_post = f"{animal_id}_post_surgery_mapped_syllable_orders.mat"
    
#     # Save the .mat files
#     file_path_pre = os.path.join(save_dir, file_name_pre)
#     file_path_post = os.path.join(save_dir, file_name_post)
    
#     sio.savemat(file_path_pre, data_dict_pre)
#     sio.savemat(file_path_post, data_dict_post)
    
#     # Print confirmation
#     print(f"Saved pre-surgery syllable orders to {file_name_pre} at {file_path_pre}")
#     print(f"Saved post-surgery syllable orders to {file_name_post} at {file_path_post}")

# # Directory where the .mat files will be saved (change this path as needed)
# save_directory = "/Users/mirandahulsey-vincent/Desktop/DECODER_cell_arrays"

# # Define the surgery date (change this to the actual surgery date as needed)
# #Format like so: 03.15.2024 MMDDYYY
# # surgery_date = '04.30.2024'

# # Assuming organized_data_frame_with_orders contains the data
# generate_and_save_pre_post_surgery_mat_files(organized_data_frame_with_orders, surgery_date, save_directory)


## Save into three groups with an equal number of songs in each: 
1) 3.5k songs post-lesion
2) 3.5k songs from immediately pre-lesion
3) 3.5k songs from earlier pre-lesion recordings 

From TG: "Any comparison should have a balanced number of songs."

### First, make sure each song contains at least 2 syllables (there will be no syntax trees if you just have songs with one syllabe. Plus this is likely just a trill that's been marked as a song).

In [30]:
# Assuming the syllables for each song are stored in a column named 'syllable_order'
# Replace 'syllable_order' with the actual column name if different.

# Define a function to count the number of syllables in each song
def count_syllables(row):
    # Assuming the syllable order is stored as a list in the 'syllable_order' column
    return len(row['syllable_order'])

# Apply the function to create a new column with the syllable count
organized_data_frame_with_orders['syllable_count'] = organized_data_frame_with_orders.apply(count_syllables, axis=1)

# Filter the DataFrame to only include rows with 2 or more syllables
filtered_data_frame = organized_data_frame_with_orders[organized_data_frame_with_orders['syllable_count'] >= 2]

# Drop the 'syllable_count' column if it's no longer needed
filtered_data_frame = filtered_data_frame.drop(columns=['syllable_count'])

# Display the filtered DataFrame
filtered_data_frame

Unnamed: 0,file_name,song_present,syllable_onsets_offsets_ms,syllable_onsets_offsets_timebins,Animal ID,Date,Hour,Minute,Second,syllable_onsets_offsets_ms_dict,syllable_order,mapped_syllable_order
0,USA5508_45548.32222481_9_13_8_57_2.wav,True,"{'9': [[0.0, 666.5079365079365], [7226.3492063...","{'9': [[0.0, 247], [2678.0, 2949]], '8': [[246...",USA5508,2024-09-13,08,57,02,"{'9': [[0.0, 666.5079365079365], [7226.3492063...","[9, 3, 0, 15, 24, 8, 18, 17, 9, 8, 11, 5, 8, 1]",ZTAGPYJIZYDWYB
1,USA5508_45548.32654069_9_13_9_4_14.wav,True,"{'30': [[0.0, 331.9047619047619], [844.6031746...","{'30': [[0.0, 123], [313.0, 502], [966.0, 1017...",USA5508,2024-09-13,09,04,14,"{'30': [[0.0, 331.9047619047619], [844.6031746...","[30, 16, 30, 9, 8, 30, 8, 30, 3, 0, 30, 30, 8]",UHUZYUYUTAUUY
3,USA5508_45548.45040250_9_13_12_30_40.wav,True,"{'16': [[0.0, 747.4603174603175]], '8': [[747....","{'16': [[0.0, 277]], '8': [[277.0, 1128]], '3'...",USA5508,2024-09-13,12,30,40,"{'16': [[0.0, 747.4603174603175]], '8': [[747....","[16, 3, 0, 8]",HTAY
4,USA5508_45548.45118581_9_13_12_31_58.wav,True,"{'8': [[0.0, 1122.5396825396826]], '3': [[1122...","{'8': [[0.0, 416]], '3': [[416.0, 809]], '0': ...",USA5508,2024-09-13,12,31,58,"{'8': [[0.0, 1122.5396825396826]], '3': [[1122...","[8, 3, 0, 15, 17, 28, 13, 17, 24, 30]",YTAGIRFIPU
7,USA5508_45548.45186354_9_13_12_33_6.wav,True,"{'8': [[0.0, 304.92063492063494], [2207.301587...","{'8': [[0.0, 113], [818.0, 1114], [1865.0, 196...",USA5508,2024-09-13,12,33,06,"{'8': [[0.0, 304.92063492063494], [2207.301587...","[8, 3, 30, 8, 1, 10, 11, 8, 30]",YTUYBCDYU
...,...,...,...,...,...,...,...,...,...,...,...,...
4572,USA5508_45595.54203032_10_30_15_3_23.wav,True,"{'8': [[0.0, 2703.809523809524]], '1': [[2703....","{'8': [[0.0, 1002]], '1': [[1002.0, 1513]]}",USA5508,2024-10-30,15,03,23,"{'8': [[0.0, 2703.809523809524]], '1': [[2703....","[8, 1]",YB
4573,USA5508_45595.54213990_10_30_15_3_33.wav,True,"{'8': [[0.0, 1845.7142857142858], [5191.746031...","{'8': [[0.0, 684], [1924.0, 2014], [2305.0, 37...",USA5508,2024-10-30,15,03,33,"{'8': [[0.0, 1845.7142857142858], [5191.746031...","[8, 5, 0, 15, 8, 18, 17, 18, 17, 8, 5]",YWAGYJIJIYW
4574,USA5508_45595.54229930_10_30_15_3_49.wav,True,"{'16': [[0.0, 412.8571428571429], [504.6031746...","{'16': [[0.0, 153], [187.0, 241]], '24': [[153...",USA5508,2024-10-30,15,03,49,"{'16': [[0.0, 412.8571428571429], [504.6031746...","[16, 8, 11, 24, 3, 16, 24, 5, 0, 17]",HYDPTHPWAI
4575,USA5508_45595.54247739_10_30_15_4_7.wav,True,"{'8': [[0.0, 1146.8253968253969], [3812.857142...","{'8': [[0.0, 425], [1413.0, 2195], [2668.0, 38...",USA5508,2024-10-30,15,04,07,"{'8': [[0.0, 1146.8253968253969], [3812.857142...","[8, 1, 11, 2, 24, 2, 24, 2, 24, 2, 24, 2, 24, ...",YBDLPLPLPLPLPYBLPMLPLPTLPLPLPLYAIGYGY


### Figure out how many songs to include in each group: (!!! Line 25 needs to be updated witht the surgery date!!!)
1) Check the number of post-lesion songs, and the number of pre-lesion songs.
2) If the number of post-lesion songs is less than 1/2 of the number of pre-lesion songs, use that as the group size.
3) If the number of post-lesion songs is greater than 1/2 of the number of pre-lesin songs, use 0.5*the number of pre-lesion soongs as the group size.

In [31]:
# Function to calculate group size based on pre- and post-lesion song counts
def calculate_group_size(organized_data_frame, surgery_date):
    # Filter the DataFrame for songs recorded before and after the lesion date
    pre_lesion_songs = organized_data_frame[organized_data_frame['Date'] < surgery_date]
    post_lesion_songs = organized_data_frame[organized_data_frame['Date'] > surgery_date]
    
    # Count the number of pre-lesion and post-lesion songs
    num_pre_lesion_songs = pre_lesion_songs.shape[0]
    num_post_lesion_songs = post_lesion_songs.shape[0]
    
    # Determine the group size based on the condition
    if num_post_lesion_songs < 0.5 * num_pre_lesion_songs:
        group_size = num_post_lesion_songs  # Use the number of post-lesion songs as the group size
    else:
        group_size = int(0.5 * num_pre_lesion_songs)  # Use half the number of pre-lesion songs as the group size
    
    # Print the results
    print(f"Number of pre-lesion songs: {num_pre_lesion_songs}")
    print(f"Number of post-lesion songs: {num_post_lesion_songs}")
    print(f"Calculated group size: {group_size}")
    
    return group_size

# Example usage
group_size = calculate_group_size(filtered_data_frame, surgery_date)

# Function to calculate group size based on pre- and post-lesion song counts
def calculate_group_size(organized_data_frame, surgery_date):
    # Filter the DataFrame for songs recorded before and after the lesion date
    pre_lesion_songs = organized_data_frame[organized_data_frame['Date'] < surgery_date]
    post_lesion_songs = organized_data_frame[organized_data_frame['Date'] > surgery_date]
    
    # Count the number of pre-lesion and post-lesion songs
    num_pre_lesion_songs = pre_lesion_songs.shape[0]
    num_post_lesion_songs = post_lesion_songs.shape[0]
    
    # Determine the group size based on the condition
    if num_post_lesion_songs < 0.5 * num_pre_lesion_songs:
        group_size = num_post_lesion_songs  # Use the number of post-lesion songs as the group size
    else:
        group_size = int(0.5 * num_pre_lesion_songs)  # Use half the number of pre-lesion songs as the group size
    
    # Print the results
    print(f"Number of pre-lesion songs: {num_pre_lesion_songs}")
    print(f"Number of post-lesion songs: {num_post_lesion_songs}")
    print(f"Calculated group size: {group_size}")
    
    return group_size

# Example usage
group_size = calculate_group_size(filtered_data_frame, surgery_date)


Number of pre-lesion songs: 2961
Number of post-lesion songs: 1374
Calculated group size: 1374
Number of pre-lesion songs: 2961
Number of post-lesion songs: 1374
Calculated group size: 1374


In [32]:
def filter_post_surgery_songs(organized_data_frame, lesion_date, group_size):
    # Filter the DataFrame for songs recorded after the lesion date (post-lesion)
    post_lesion_songs = organized_data_frame[organized_data_frame['Date'] > surgery_date]
    
    # If the number of post-lesion songs is greater than or equal to the group size,
    # select the first 'group_size' number of rows (songs)
    filtered_songs = post_lesion_songs.head(group_size)
    
    # Print the results for verification
    print(f"Filtered {group_size} post-surgery songs:")
    print(filtered_songs)
    
    return filtered_songs

# Example usage
group_size = calculate_group_size(filtered_data_frame, surgery_date)  # Calculate group size
post_surgery_songs = filter_post_surgery_songs(filtered_data_frame, surgery_date, group_size)


Number of pre-lesion songs: 2961
Number of post-lesion songs: 1374
Calculated group size: 1374
Filtered 1374 post-surgery songs:
                                     file_name  song_present  \
3138  USA5508_45587.32362304_10_22_8_59_22.wav          True   
3139  USA5508_45587.32378803_10_22_8_59_38.wav          True   
3140    USA5508_45587.32408585_10_22_9_0_8.wav          True   
3141   USA5508_45587.32417841_10_22_9_0_17.wav          True   
3142   USA5508_45587.32455362_10_22_9_0_55.wav          True   
...                                        ...           ...   
4572  USA5508_45595.54203032_10_30_15_3_23.wav          True   
4573  USA5508_45595.54213990_10_30_15_3_33.wav          True   
4574  USA5508_45595.54229930_10_30_15_3_49.wav          True   
4575   USA5508_45595.54247739_10_30_15_4_7.wav          True   
4576  USA5508_45595.54294520_10_30_15_4_54.wav          True   

                             syllable_onsets_offsets_ms  \
3138  {'8': [[0.0, 2253.1746031746034], [51

In [33]:
def filter_surgery_songs(organized_data_frame, surgery_date, group_size):
    # Filter the DataFrame for songs recorded after the lesion date (post-lesion)
    post_lesion_songs = organized_data_frame[organized_data_frame['Date'] > surgery_date]
    
    # Select the first 'group_size' number of rows from the post-lesion songs
    filtered_post_surgery_songs = post_lesion_songs.head(group_size)
    
    # Filter the DataFrame for songs recorded on or before the lesion date (pre-lesion)
    pre_lesion_songs = organized_data_frame[organized_data_frame['Date'] <= surgery_date]
    
    total_pre_lesion_songs = len(pre_lesion_songs)
    print(f"Total number of pre-lesion songs: {total_pre_lesion_songs}")
    
    if total_pre_lesion_songs < group_size * 2:
        raise ValueError(f"Not enough pre-lesion songs to split into two groups of size {group_size}. Total available: {total_pre_lesion_songs}")

    # Select the last 'group_size' number of rows for the latest pre-surgery songs
    latest_pre_surgery_songs = pre_lesion_songs.tail(group_size)
    
    # Select the 'group_size' rows just before the latest pre-surgery songs
    earlier_pre_surgery_songs = pre_lesion_songs.iloc[-(group_size * 2):-group_size]
    
    print(f"Number of latest pre-surgery songs: {len(latest_pre_surgery_songs)}")
    print(f"Number of earlier pre-surgery songs: {len(earlier_pre_surgery_songs)}")
    
    # Create cell arrays (lists) to hold the mapped syllable orders for each group
    mapped_syllable_orders_post = list(filtered_post_surgery_songs['mapped_syllable_order'])
    mapped_syllable_orders_latest_pre = list(latest_pre_surgery_songs['mapped_syllable_order'])
    mapped_syllable_orders_earlier_pre = list(earlier_pre_surgery_songs['mapped_syllable_order'])

    # Print the results for verification
    print(f"Filtered {group_size} post-surgery songs' mapped syllable orders:")
    print(mapped_syllable_orders_post)
    
    print(f"Filtered {group_size} latest pre-surgery songs' mapped syllable orders:")
    print(mapped_syllable_orders_latest_pre)
    
    print(f"Filtered {group_size} earlier pre-surgery songs' mapped syllable orders:")
    print(mapped_syllable_orders_earlier_pre)
    
    # Return the lists of mapped syllable orders for each group
    return mapped_syllable_orders_post, mapped_syllable_orders_latest_pre, mapped_syllable_orders_earlier_pre

# Example usage
group_size = calculate_group_size(filtered_data_frame, surgery_date)  # Calculate group size
mapped_post, mapped_latest_pre, mapped_earlier_pre = filter_surgery_songs(filtered_data_frame, surgery_date, group_size)


Number of pre-lesion songs: 2961
Number of post-lesion songs: 1374
Calculated group size: 1374
Total number of pre-lesion songs: 2987
Number of latest pre-surgery songs: 1374
Number of earlier pre-surgery songs: 1374
Filtered 1374 post-surgery songs' mapped syllable orders:
['YBVPMLPY', 'YB', 'PY', 'YTYAD', 'YDWAIATYBLPLPLPY', 'PWADYTYGYCIJIYD', 'AWLPYBLLPLPPULPLPLUPYVPY', 'ADTHPLPLYY', 'PDTYWOYA', 'AGYIA', 'AYBWLPLPLPY', 'HAZDTYPHPHPYDCY', 'YCDWY', 'HGYGIPJIJYHDPTYNBLPLPLPLPYYBLPLPUDTYWAI', 'PWYGIJIIJIPYDTOYAGPBPLPLPY', 'YCDTYBLPLPLPU', 'HPYNHDTHOPHPLYQO', 'HPHPOPOYDYBLPLPLU', 'OAPWDTYWAYIFIJIJIJIPOPY', 'HAIJIFIJIJIPOPOPHPOPYDYTW', 'POPY', 'HAPHPHPYVYDY', 'AAGW', 'GBLPLPY', 'YB', 'VWAWPAHPHPYY', 'YDTUAYCG', 'HPQPOPYDTYBU', 'GYBVPLPLPUY', 'YYAWYWGIJI', 'HUNCHDTAWZYGP', 'HYPW', 'ZLPLPLPOPYWDYAWYTAYCWYB', 'HCDNZ', 'ZYBCVPD', 'PIPYWYWATYYG', 'HHPYWAP', 'HNHYCY', 'ZBLPLPLDPTHPHPYC', 'ZTCD', 'YBVZV', 'HCDTYWAUZ', 'HCDTYZB', 'ZDTYCWA', 'YCDTYWAIZJIGYDY', 'ZWACD', 'YDZTHPC', 'YCDTYZ', 'YBLUCD

In [34]:
# print(len(mapped_post))
# print(len(mapped_latest_pre))
# print(len(mapped_earlier_pre))

# import pandas as pd
# import scipy.io as sio
# import os

# import pandas as pd
# import scipy.io as sio
# import os

# # Function to save each group into separate .mat files
# def save_group_to_mat(mapped_syllable_orders, group_name, animal_id, save_dir):
#     # Ensure the save directory exists
#     os.makedirs(save_dir, exist_ok=True)
    
#     # Prepare the data as a dictionary for saving to .mat format
#     data_dict = {'mapped_syllable_order': mapped_syllable_orders}
    
#     # Generate a file name based on the animal ID and group name
#     file_name = f"{animal_id}_{group_name}.mat"
#     file_path = os.path.join(save_dir, file_name)
    
#     # Save the .mat file
#     sio.savemat(file_path, data_dict)
    
#     # Print confirmation
#     print(f"Saved {file_name} to {file_path}")

# # Example animal ID (assuming it's consistent across the dataset)
# animal_id = "USA5443"

# # Directory where .mat files will be saved (change this path as needed)
# save_directory = "/Users/mirandahulsey-vincent/Desktop/DECODER_cell_arrays"

# # Save the post-lesion data
# save_group_to_mat(mapped_post, 'post_lesion_songs', animal_id, save_directory)

# # Save the latest pre-lesion group
# save_group_to_mat(mapped_latest_pre, 'latest_pre_lesion_songs', animal_id, save_directory)

# # Save the earlier pre-lesion group
# save_group_to_mat(mapped_earlier_pre, 'earlier_pre_lesion_songs', animal_id, save_directory)

In [35]:
import pandas as pd
import scipy.io as sio
import os

# Function to save syllable orders as cell arrays
def save_group_to_mat(mapped_syllable_orders, group_name, animal_id, save_dir):
    # Ensure the save directory exists
    os.makedirs(save_dir, exist_ok=True)
    
    # Wrap each mapped syllable order in a list (for MATLAB cell array compatibility)
    mapped_syllable_orders_cell = [[order] for order in mapped_syllable_orders]
    
    # Prepare the data as a dictionary for saving to .mat format
    data_dict = {f'mapped_syllable_order_{group_name}': mapped_syllable_orders_cell}
    
    # Generate a file name based on the animal ID and group name
    file_name = f"{animal_id}_{group_name}.mat"
    file_path = os.path.join(save_dir, file_name)
    
    # Save the .mat file
    sio.savemat(file_path, data_dict)
    
    # Print confirmation
    print(f"Saved {file_name} to {file_path}")

# Function to process and save pre- and post-surgery syllable orders
def filter_surgery_songs(organized_data_frame, surgery_date, group_size, save_dir):
    # Filter the DataFrame for songs recorded after the lesion date (post-lesion)
    post_lesion_songs = organized_data_frame[organized_data_frame['Date'] > surgery_date]
    
    # Select the first 'group_size' number of rows from the post-lesion songs
    filtered_post_surgery_songs = post_lesion_songs.head(group_size)
    
    # Filter the DataFrame for songs recorded on or before the lesion date (pre-lesion)
    pre_lesion_songs = organized_data_frame[organized_data_frame['Date'] <= surgery_date]
    
    total_pre_lesion_songs = len(pre_lesion_songs)
    print(f"Total number of pre-lesion songs: {total_pre_lesion_songs}")
    
    if total_pre_lesion_songs < group_size * 2:
        raise ValueError(f"Not enough pre-lesion songs to split into two groups of size {group_size}. Total available: {total_pre_lesion_songs}")

    # Select the last 'group_size' number of rows for the latest pre-surgery songs
    latest_pre_surgery_songs = pre_lesion_songs.tail(group_size)
    
    # Select the 'group_size' rows just before the latest pre-surgery songs
    earlier_pre_surgery_songs = pre_lesion_songs.iloc[-(group_size * 2):-group_size]
    
    print(f"Number of latest pre-surgery songs: {len(latest_pre_surgery_songs)}")
    print(f"Number of earlier pre-surgery songs: {len(earlier_pre_surgery_songs)}")
    
    # Create lists to hold the mapped syllable orders for each group
    mapped_syllable_orders_post = list(filtered_post_surgery_songs['mapped_syllable_order'])
    mapped_syllable_orders_latest_pre = list(latest_pre_surgery_songs['mapped_syllable_order'])
    mapped_syllable_orders_earlier_pre = list(earlier_pre_surgery_songs['mapped_syllable_order'])

    # Get the animal ID (assuming it's consistent across the dataset)
    animal_id = organized_data_frame['Animal ID'].iloc[0]
    
    # Save each group of syllable orders into a separate .mat file
    save_group_to_mat(mapped_syllable_orders_post, 'post_lesion_songs', animal_id, save_dir)
    save_group_to_mat(mapped_syllable_orders_latest_pre, 'latest_pre_lesion_songs', animal_id, save_dir)
    save_group_to_mat(mapped_syllable_orders_earlier_pre, 'earlier_pre_lesion_songs', animal_id, save_dir)

save_directory = "/Users/mirandahulsey-vincent/Desktop/DECODER_cell_arrays"
# Assuming organized_data_frame_with_orders contains the data
filter_surgery_songs(organized_data_frame_with_orders, surgery_date, group_size, save_directory)


Total number of pre-lesion songs: 3138
Number of latest pre-surgery songs: 1374
Number of earlier pre-surgery songs: 1374
Saved USA5508_post_lesion_songs.mat to /Users/mirandahulsey-vincent/Desktop/DECODER_cell_arrays/USA5508_post_lesion_songs.mat
Saved USA5508_latest_pre_lesion_songs.mat to /Users/mirandahulsey-vincent/Desktop/DECODER_cell_arrays/USA5508_latest_pre_lesion_songs.mat
Saved USA5508_earlier_pre_lesion_songs.mat to /Users/mirandahulsey-vincent/Desktop/DECODER_cell_arrays/USA5508_earlier_pre_lesion_songs.mat
