# This code takes a .json file from the TweetyBERT decoder from a bird with ONLY pre-treatment data. Then, it generates a .csv summary file that I can use to graph against the other lesion data.

In [1]:
import ast
import json
from datetime import datetime

import numpy as np
import pandas as pd

class DecoderDataProcessor:
    """Load decoder results and tag every song file with animal ID, date and time.

    Two JSON files are read when the object is created:

    * ``json_file_path`` -- decoder output; its ``'results'`` list becomes
      ``self.decoder_dataframe``.
    * ``json_date_file_path`` -- recording metadata; each entry under
      ``'subdirectories'`` provides a ``'subdirectory_creation_date'``
      (``YYYY-MM-DD``) that is stored in ``self.subdirectory_dates``.

    Call :meth:`organize_data` afterwards to filter the table down to files
    that contain song and to add the ``Animal ID``, ``Date``, ``Hour``,
    ``Minute`` and ``Second`` columns.
    """

    def __init__(self, json_file_path, json_date_file_path):
        self.json_file_path = json_file_path
        self.json_date_file_path = json_date_file_path
        self.decoder_dataframe = self.load_data()
        self.subdirectory_dates = self.load_subdirectory_dates()

    def load_data(self):
        """Return the ``'results'`` entries of the decoder JSON as a DataFrame."""
        print(f"Reading JSON file: {self.json_file_path}")
        # Explicit encoding: the platform default is not guaranteed to be UTF-8.
        with open(self.json_file_path, 'r', encoding='utf-8') as f:
            decoder_data = json.load(f)['results']
        return pd.DataFrame(decoder_data)

    def load_subdirectory_dates(self):
        """Return ``{subdirectory name: datetime.date}`` from the metadata JSON."""
        with open(self.json_date_file_path, 'r', encoding='utf-8') as f:
            json_data = json.load(f)
        return {
            subdir: datetime.strptime(data['subdirectory_creation_date'], "%Y-%m-%d").date()
            for subdir, data in json_data['subdirectories'].items()
        }

    def parse_json_safe(self, s):
        """Best-effort conversion of a cell value into parsed data.

        Args:
            s: A dict (returned unchanged), a string holding JSON or a
                Python-literal dict, or anything else (missing values included).

        Returns:
            The parsed value, or ``{}`` when ``s`` is not a dict/string or
            cannot be parsed. A string holding a non-dict JSON value (for
            example a list) is returned as that value.
        """
        if isinstance(s, dict):
            return s
        if not isinstance(s, str):
            # Covers None/NaN as well as any other unexpected cell type.
            return {}

        text = s.strip().strip("'")

        # 1) Proper JSON is parsed as-is, so apostrophes inside values survive.
        try:
            return json.loads(text)
        except json.JSONDecodeError:
            pass

        # 2) Python repr of a dict (single quotes, None/True/False).
        try:
            return ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            pass

        # 3) Legacy fallback: single-quoted text that is otherwise JSON.
        try:
            return json.loads(text.replace("'", '"'))
        except json.JSONDecodeError:
            return {}

    def organize_data(self):
        """Keep song-containing files and add animal/date/time columns in place.

        The two ``syllable_onsets_offsets_*`` columns are parsed with
        :meth:`parse_json_safe`, rows whose ``song_present`` is not ``True``
        are dropped, and the file name of each remaining row is split into
        ``Animal ID``, ``Date`` (datetime64, ``NaT`` when no year could be
        resolved), ``Hour``, ``Minute`` and ``Second``. The result replaces
        ``self.decoder_dataframe``.
        """
        frame = self.decoder_dataframe
        for column in ('syllable_onsets_offsets_ms', 'syllable_onsets_offsets_timebins'):
            frame[column] = frame[column].apply(self.parse_json_safe)

        frame = frame[frame['song_present'].eq(True)].reset_index(drop=True)

        # Whole-column assignment guarantees the columns exist even when no
        # row survived the filter (the row-by-row version then failed on 'Date').
        parsed = [self.extract_date_time(name) for name in frame['file_name']]
        frame['Animal ID'] = [fields[0] for fields in parsed]
        corrected_dates = [self.apply_year_correction(fields[1]) for fields in parsed]
        frame['Hour'] = [fields[2] for fields in parsed]
        frame['Minute'] = [fields[3] for fields in parsed]
        frame['Second'] = [fields[4] for fields in parsed]

        frame['Date'] = pd.to_datetime(
            pd.Series(corrected_dates, index=frame.index, dtype=object),
            format='%Y.%m.%d',
            errors='coerce',
        )
        # Keep the original column order: Animal ID, Date, Hour, Minute, Second.
        ordered = [c for c in frame.columns if c not in ('Date', 'Hour', 'Minute', 'Second')]
        self.decoder_dataframe = frame[ordered + ['Date', 'Hour', 'Minute', 'Second']]

    def extract_date_time(self, file_name):
        """Split an underscore-separated file name into its identifying fields.

        Returns:
            ``(animal_id, "month.day", hour, minute, second)`` taken from the
            underscore-separated parts 0, 2-3, 4, 5 and 6 (with a ``.wav``
            suffix removed from the last one), or five ``None`` values when
            the name is not a string or has fewer than seven parts.
        """
        try:
            parts = file_name.split('_')
            return parts[0], f"{parts[2]}.{parts[3]}", parts[4], parts[5], parts[6].replace('.wav', '')
        except (IndexError, AttributeError):
            return None, None, None, None, None

    def apply_year_correction(self, month_day):
        """Resolve a ``"month.day"`` string to ``"YYYY.MM.DD"``.

        File names carry no year, so the year is borrowed from the first
        entry of ``self.subdirectory_dates`` with the same month and day.

        Returns:
            The formatted date string, or ``None`` when ``month_day`` is
            ``None``, malformed, or matches no known recording date.
        """
        if month_day is None:
            return None
        try:
            month_text, day_text = month_day.split('.')
            month, day = int(month_text), int(day_text)
        except ValueError:
            return None
        for date in self.subdirectory_dates.values():
            if date.month == month and date.day == day:
                return date.strftime('%Y.%m.%d')
        return None

# Usage: point the processor at one bird's decoder output and at its
# recording-date metadata, then build the annotated song table.
# NOTE(review): both paths are absolute and machine-specific; they must be
# edited before running on another computer or for another bird.
json_file_path = '/Users/mirandahulsey-vincent/Documents/allPythonCode/syntax_analysis/data_inputs/Area_X_screening_phrase_durations/USA5506/USA5506_decoded_database.json'
json_date_file_path = '/Users/mirandahulsey-vincent/Documents/allPythonCode/syntax_analysis/data_inputs/Area_X_screening_phrase_durations/USA5506/USA5506_Comp2_treatment_and_recording_dates.json'

# The constructor reads both JSON files; organize_data() then keeps only the
# files flagged as containing song and adds the animal/date/time columns.
processor = DecoderDataProcessor(json_file_path, json_date_file_path)
processor.organize_data()
print(processor.decoder_dataframe.head(3))

Reading JSON file: /Users/mirandahulsey-vincent/Documents/allPythonCode/syntax_analysis/data_inputs/Area_X_screening_phrase_durations/USA5506/USA5506_decoded_database.json
                                  file_name        creation_date  \
0  USA5506_45700.49992595_2_12_13_53_12.wav  2025-02-12T13:53:38   
1  USA5506_45700.50054645_2_12_13_54_14.wav  2025-02-12T13:54:36   
2  USA5506_45700.50081602_2_12_13_54_41.wav  2025-02-12T13:55:00   

   song_present                         syllable_onsets_offsets_ms  \
0          True  {'6': [[0.0, 2255.873015873016], [2655.2380952...   
1          True  {'6': [[0.0, 1451.7460317460318]], '5': [[1451...   
2          True  {'7': [[0.0, 790.6349206349207]], '4': [[790.6...   

                    syllable_onsets_offsets_timebins Animal ID       Date  \
0  {'6': [[0.0, 836], [984.0, 985], [986.0, 987],...   USA5506 2025-02-12   
1  {'6': [[0.0, 538]], '5': [[538.0, 1126]], '4':...   USA5506 2025-02-12   
2  {'7': [[0.0, 293]], '4': [[293.0, 399], 

In [2]:
import os
import json
# Load the recording-date metadata JSON so the treatment fields can be read from it
with open(json_date_file_path, 'r') as file:
    json_data = json.load(file)

# Find the animal ID from the file name:
file_name = os.path.basename(json_date_file_path)
animal_id = file_name.split('_')[0]  # Assuming the ID is always the first part of the file name

# Extract the treatment date and type; when a key is absent from the JSON the
# "... not found" placeholder string is stored (and printed) in its place
treatment_date = json_data.get("treatment_date", "Treatment date not found")
treatment_type = json_data.get("treatment_type", "Treatment type not found")

# Report what was found; animal_id is reused later to name the output CSV files
print(f"Animal ID: {animal_id}")
print(f"Treatment Date: {treatment_date}")
print(f"Treatment Type: {treatment_type}")

Animal ID: USA5506
Treatment Date: 2026-01-01
Treatment Type: Baseline recordings only


In [3]:
# Working table for the analysis below, built from the processed decoder output
df = pd.DataFrame(processor.decoder_dataframe)

# Collect every syllable label that occurs anywhere in the table
def extract_unique_syllable_labels(df):
    """Return the sorted list of distinct keys found in 'syllable_onsets_offsets_ms'.

    Entries of that column that are not dicts are skipped. Labels are sorted
    with the default ordering, so string labels come back in lexicographic
    order ('1', '10', '2', ...).
    """
    labels_seen = {
        label
        for entry in df['syllable_onsets_offsets_ms']
        if isinstance(entry, dict)
        for label in entry
    }
    return sorted(labels_seen)

# Extract and print unique syllable labels
unique_labels = extract_unique_syllable_labels(df)
print("Unique syllable labels:", unique_labels)
# This bird has baseline recordings only, so the whole table is used as the
# pre-treatment data set (pre_treatment is another name for df, not a copy)
pre_treatment = df

Unique syllable labels: ['1', '10', '11', '12', '13', '15', '16', '17', '18', '2', '3', '4', '5', '6', '7', '9']


In [4]:
import pandas as pd

# Gather the duration of every labelled interval, grouped by syllable label
def calculate_durations(df, syllable_labels):
    """Return ``{label: [durations]}`` for the requested syllable labels.

    Each entry of df['syllable_onsets_offsets_ms'] maps a label to a list of
    [onset, offset] pairs; a duration is offset minus onset. Durations are
    listed in row order, and labels that never occur map to an empty list.
    """
    durations_dict = {label: [] for label in syllable_labels}
    for intervals_by_label in df['syllable_onsets_offsets_ms']:
        for label in syllable_labels:
            if label not in intervals_by_label:
                continue
            durations_dict[label].extend(
                span[1] - span[0] for span in intervals_by_label[label]
            )
    return durations_dict

# Calculate durations for all syllable types in the pre-treatment data
syllable_durations_pre_treatment = calculate_durations(pre_treatment, unique_labels)

# Convert the dictionary to a DataFrame for pre-treatment data: one column per
# syllable label, padded with NaN down to the length of the longest column
# NOTE(review): max() raises ValueError when there are no labels at all
max_length_pre = max(len(v) for v in syllable_durations_pre_treatment.values())
df_durations_pre = pd.DataFrame({k: pd.Series(v) for k, v in syllable_durations_pre_treatment.items()}, index=range(max_length_pre))

# Display the Pre-treatment Durations DataFrame
print("Pre-treatment Durations DataFrame:")
print(df_durations_pre)

# Save the pre-treatment durations DataFrame as a CSV file on the desktop
# NOTE(review): the path is machine-specific, and the file name has no separator
# between the animal ID and "pre_treatment" (e.g. "USA5506pre_treatment_...") --
# confirm whether an underscore was intended before relying on the name
df_durations_pre.to_csv(f'/Users/mirandahulsey-vincent/Desktop/{animal_id}pre_treatment_durations.csv', index=False)

print("CSV file has been saved to your desktop.")


Pre-treatment Durations DataFrame:
               1         10          11          12          13          15  \
0       2.698413  78.253968   89.047619   26.984127   89.047619  132.222222   
1      24.285714   5.396825   72.857143  132.222222  223.968254  118.730159   
2      18.888889  21.587302  107.936508   64.761905  269.841270  105.238095   
3      72.857143   8.095238  159.206349   10.793651  294.126984  107.936508   
4      37.777778  40.476190   97.142857  267.142857  291.428571   80.952381   
...          ...        ...         ...         ...         ...         ...   
12813        NaN        NaN         NaN         NaN         NaN   10.793651   
12814        NaN        NaN         NaN         NaN         NaN   37.777778   
12815        NaN        NaN         NaN         NaN         NaN   59.365079   
12816        NaN        NaN         NaN         NaN         NaN   32.380952   
12817        NaN        NaN         NaN         NaN         NaN  196.984127   

              16

In [5]:
import pandas as pd
import numpy as np

# Summarise every column of a durations table
def calculate_statistics(df):
    """Return ``{column: {statistic name: value}}`` for each column of ``df``.

    NaN entries are dropped before the statistics are taken. The statistics
    are 'Mean', 'SEM', 'Median', 'Standard Deviation' and 'Variance', each
    computed with the matching pandas Series method.
    """
    def _summarise(values):
        # One summary record for a single column's non-missing values
        return {
            'Mean': values.mean(),
            'SEM': values.sem(),
            'Median': values.median(),
            'Standard Deviation': values.std(),
            'Variance': values.var(),
        }

    return {column: _summarise(df[column].dropna()) for column in df}

# Calculate statistics for the pre-treatment durations (this bird has no
# post-treatment data)
stats_pre_treatment = calculate_statistics(df_durations_pre)

# Convert the dictionary to a DataFrame for better visualization; the
# transpose puts one syllable label per row and one statistic per column
df_stats_pre_treatment = pd.DataFrame(stats_pre_treatment).T

# Display the first ten rows of the statistics DataFrame
print("Pre-treatment Syllable Duration Statistics:")
print(df_stats_pre_treatment.head(10))


Pre-treatment Syllable Duration Statistics:
          Mean        SEM      Median  Standard Deviation      Variance
1    64.398997   0.944487   72.857143           43.630800  1.903647e+03
10   32.263630   6.178715   24.285714           29.632077  8.780600e+02
11   94.939502   2.504794   99.841270           49.275106  2.428036e+03
12  182.055018   2.065835  180.793651          114.500199  1.311030e+04
13  196.505610   1.067521  223.968254           88.617109  7.852992e+03
15   79.210349   0.376390   83.650794           42.613615  1.815920e+03
16   45.566927   1.725090   48.571429           31.574339  9.969389e+02
17  183.596710   0.714745  196.984127           59.748593  3.569894e+03
18   44.369528   0.939018   43.174603           28.045071  7.865260e+02
2   670.392385  13.805977   78.253968         1408.074288  1.982673e+06


In [6]:
import pandas as pd
import os
import numpy as np

# Given code: Function to calculate statistics
def calculate_statistics(df):
    """Return per-column summary statistics of ``df`` as a nested dict.

    For every column, NaN values are removed and the mean, standard error of
    the mean, median, standard deviation and variance are stored under the
    keys 'Mean', 'SEM', 'Median', 'Standard Deviation' and 'Variance'.
    """
    # (output key, pandas Series method) pairs, in the order they are reported
    statistic_methods = (
        ('Mean', 'mean'),
        ('SEM', 'sem'),
        ('Median', 'median'),
        ('Standard Deviation', 'std'),
        ('Variance', 'var'),
    )
    stats_dict = {}
    for column in df:
        observed = df[column].dropna()
        stats_dict[column] = {
            key: getattr(observed, method)() for key, method in statistic_methods
        }
    return stats_dict

# Calculate statistics for pre-treatment DataFrame (one row per syllable label)
stats_pre_treatment = calculate_statistics(df_durations_pre)
df_stats_pre_treatment = pd.DataFrame(stats_pre_treatment).T

# Zero vectors for post-treatment placeholders: this bird has no post-treatment
# recordings, so the post-treatment columns are filled with 0 (one per syllable)
zero_data_mean = [0] * len(df_stats_pre_treatment)
zero_data_median = [0] * len(df_stats_pre_treatment)
zero_data_variance = [0] * len(df_stats_pre_treatment)

# Extracting pre-treatment means, medians and variances
mean_pre_treatment = df_stats_pre_treatment['Mean']
median_pre_treatment = df_stats_pre_treatment['Median']
variance_pre_treatment = df_stats_pre_treatment['Variance']

# Combining the data into a new DataFrame; the row index is the syllable label
# carried over from the pre-treatment Series
combined_data = pd.DataFrame({
    'Mean_Pre_Treatment': mean_pre_treatment,
    'Mean_Post_Treatment': zero_data_mean,
    'Median_Pre_Treatment': median_pre_treatment,
    'Median_Post_Treatment': zero_data_median,
    'Variance_Pre_Treatment': variance_pre_treatment,
    'Variance_Post_Treatment': zero_data_variance
})

# Adding the animal_id as the first column of the DataFrame
combined_data.insert(0, 'Animal_ID', animal_id)

# Extract the directory from the provided JSON file path
directory = os.path.dirname(json_file_path)

# Save to CSV in the same directory as the JSON file
# NOTE(review): index=False drops the syllable labels, so rows in the CSV can
# only be matched to syllables by position -- confirm this is intended
combined_data.to_csv(os.path.join(directory, f'{animal_id}_treatment_comparison.csv'), index=False)

print("CSV file has been saved to the directory of the JSON file.")


CSV file has been saved to the directory of the JSON file.
