In [2]:
%%writefile stripped_analysis_utility.py

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

# Multiplied with the "Total Frames" column in calculate_peak_freq.
# NOTE(review): used there as a per-frame multiplier, i.e. like a frame
# interval rather than a rate, and it has the same value as FRAME_INTERVAL
# below -- confirm which quantity is intended.
frame_rate = 0.55
EXPERIMENT_DURATION = 420
# Total number of seconds that the experiment lasts; used as the upper limit of
# the histogram bins and in the pickle file name. The real duration tends to
# vary between experiments, so how this gets set still has to be worked out.
FRAME_INTERVAL = 0.55
# Factor that converts peak frame indices into peak times (spike_list_translator).
# NOTE(review): the original note says this is calculated as 1 / frame_rate, but
# both constants are hard-coded to 0.55 -- confirm.
# Binning is not needed for the current (non-binned) experiments; BIN_WIDTH is
# kept only because calculate_binned_stats and the pickle file name still use it.
BIN_WIDTH = 0.55

FILTER_NEURONS = True
# Whether rows whose "IsUsed" value is false are removed before the statistics
# are calculated (default is yes); also adds "_filtered" to the pickle file name.


def list_all_files_of_type(input_path, filetype):
    """Return the names of all entries in ``input_path`` that end with ``filetype``.

    The result is a plain list of bare names (not joined with ``input_path``),
    in whatever order ``os.listdir`` reports them. The match is a simple
    suffix test on the name, so ``filetype`` may be given with or without a
    leading dot.
    """
    matching_names = []
    for entry_name in os.listdir(input_path):
        if entry_name.endswith(filetype):
            matching_names.append(entry_name)
    return matching_names

def string_to_list_translator(input_string, strip_before_split="[ ]", split_on=" "):
    """Split a delimited string into its non-empty tokens.

    Every character contained in ``strip_before_split`` is first stripped from
    both ends of ``input_string``; the remainder is split on ``split_on`` and
    empty pieces (e.g. caused by repeated separators) are dropped. The tokens
    are returned as strings; converting them to numbers is left to the caller.
    """
    trimmed = input_string.strip(strip_before_split)
    return [token for token in trimmed.split(split_on) if token]

def spike_list_translator(input_string):
    """Convert one "PeakTimes" cell of a csv file into a 1-D array of peak times.

    The string is tokenised with ``string_to_list_translator``, the tokens are
    cast to integers and then multiplied by ``FRAME_INTERVAL``.
    """
    frame_tokens = string_to_list_translator(input_string)
    frame_indices = np.array(frame_tokens).astype(int)
    return frame_indices * FRAME_INTERVAL

def amplitude_list_translator(input_string):
    """Convert one "Amplitudes" cell of a csv file into a 1-D array of numbers.

    The string is tokenised with ``string_to_list_translator`` and the tokens
    are cast to ``float`` (amplitudes, unlike frame indices, carry decimals).
    """
    amplitude_tokens = string_to_list_translator(input_string)
    amplitudes = np.array(amplitude_tokens).astype(float)
    # NOTE(review): np.around with its default ``decimals=0`` rounds every
    # amplitude to the nearest whole number, which discards the decimal part
    # this function was written to keep -- confirm whether that is intended.
    return np.around(amplitudes)

def peak_df_iterator(input_path, return_name=True):
    """Lazily load every csv file found directly inside ``input_path``.

    Each file is read with ``pandas.read_csv``; the "PeakTimes" and
    "Amplitudes" columns are parsed into numeric arrays by
    ``spike_list_translator`` and ``amplitude_list_translator`` while loading.

    Parameters
    ----------
    input_path : str
        Directory that is searched (non-recursively) for csv files.
    return_name : bool, default True
        If True, yield ``(dataframe, file_name)`` tuples; if False, yield only
        the dataframe.

    Yields
    ------
    tuple[pandas.DataFrame, str] or pandas.DataFrame
    """
    column_converters = {
        "PeakTimes": spike_list_translator,
        "Amplitudes": amplitude_list_translator,
    }
    for csv_file in list_all_files_of_type(input_path, "csv"):
        csv_path = os.path.join(input_path, csv_file)
        csv_df = pd.read_csv(csv_path, converters=column_converters)
        # The former one-liner ``yield csv_df, csv_file if return_name else csv_df``
        # bound the conditional to the second tuple element only, so
        # ``return_name=False`` produced ``(csv_df, csv_df)`` instead of the
        # bare dataframe. Branch explicitly to avoid that precedence trap.
        if return_name:
            yield csv_df, csv_file
        else:
            yield csv_df

        
# Binned stats are not needed for synapses, but process_peak_csvs_to_pkl still stores them, so this function stays for now.
def calculate_binned_stats(input_df):
    """Histogram the peak times of all rows of ``input_df`` into fixed-width bins.

    The bin edges run from 0 up to ``EXPERIMENT_DURATION`` in steps of
    ``BIN_WIDTH``. Peak times outside that range are not counted.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Must contain a "PeakTimes" column holding one array of peak times per
        row. The frame is only read, never modified.

    Returns
    -------
    pandas.DataFrame
        One row per bin with the columns "Bin_Limits" (``(lower, upper)``
        edge tuple), "Spikes" (number of peaks in the bin, summed over all
        rows) and "Frequency" (``Spikes / BIN_WIDTH``).
    """
    bins = np.arange(0, EXPERIMENT_DURATION + BIN_WIDTH, BIN_WIDTH)

    peak_arrays = list(input_df["PeakTimes"].values)
    if peak_arrays:
        all_peak_times = np.hstack(peak_arrays)
    else:
        # np.hstack raises ValueError when given no arrays at all, which
        # happens when the frame has no rows (e.g. every ROI was filtered
        # out). Treat that case as "no peaks" so all bins come out as zero.
        all_peak_times = np.array([], dtype=float)

    population_spikes, _ = np.histogram(all_peak_times, bins=bins)
    population_frequency = population_spikes / BIN_WIDTH

    bin_stats = pd.DataFrame.from_dict({
        "Bin_Limits": list(zip(bins[:-1], bins[1:])),
        "Spikes": population_spikes,
        "Frequency": population_frequency})

    return bin_stats


def calculate_peak_freq(input_df):
    """Return a copy of ``input_df`` extended by peak count and peak frequency.

    Added columns:
      * "PeakCount" -- number of entries in each row's "PeakTimes" value.
      * "PeakFreq"  -- "PeakCount" divided by
        ("Total Frames" * ``frame_rate``).

    The input frame itself is left untouched.
    """
    output_df = input_df.copy()
    peak_counts = output_df["PeakTimes"].str.len()
    # The divisor is the row's total frame count scaled by the module-level
    # ``frame_rate`` constant.
    scaled_frames = input_df["Total Frames"] * frame_rate
    output_df["PeakCount"] = peak_counts
    output_df["PeakFreq"] = peak_counts / scaled_frames
    return output_df

def calculate_cell_isi(input_df): #isi == interspike interval
    """Return a copy of ``input_df`` extended by interspike-interval statistics.

    Added columns (all derived from each row's "PeakTimes" value):
      * "PeakDiff"   -- list of differences between consecutive peak times.
      * "DiffAvg"    -- mean of those differences.
      * "DiffMedian" -- median of those differences.
      * "DiffCV"     -- their standard deviation divided by "DiffAvg", times 100.

    Interspike intervals are not applicable to synapses; ``calculate_syn_stats``
    skips this step unless it is called with ``calculate_isi=True``.
    """
    def _intervals(peak_times):
        # Differences between neighbouring peaks; the leading NaN produced
        # by diff() is dropped.
        return list(pd.Series(peak_times).diff().dropna())

    def _mean(values):
        return pd.Series(values).mean()

    def _median(values):
        return pd.Series(values).median()

    def _std(values):
        return pd.Series(values).std()

    output_df = input_df.copy()
    peak_diffs = output_df["PeakTimes"].apply(_intervals)
    output_df["PeakDiff"] = peak_diffs
    output_df["DiffAvg"] = peak_diffs.apply(_mean)
    output_df["DiffMedian"] = peak_diffs.apply(_median)
    output_df["DiffCV"] = peak_diffs.apply(_std) / output_df["DiffAvg"] * 100
    return output_df

#below I will need to accurately figure out how to integrate this in, I should meet with Marti Ritter next week to do so

def calculate_peak_amplitudes(input_df):
    """Return a copy of ``input_df`` extended by per-row amplitude statistics.

    Added columns (all derived from each row's "Amplitudes" value):
      * "AvgAmplitude"  -- mean amplitude.
      * "PeakAmpMedian" -- median amplitude.
      * "PeakAmpCV"     -- standard deviation divided by "AvgAmplitude", times 100.
    """
    def _mean(values):
        return pd.Series(values).mean()

    def _median(values):
        return pd.Series(values).median()

    def _std(values):
        return pd.Series(values).std()

    output_df = input_df.copy()
    amplitudes = output_df["Amplitudes"]
    output_df["AvgAmplitude"] = amplitudes.apply(_mean)
    output_df["PeakAmpMedian"] = amplitudes.apply(_median)
    output_df["PeakAmpCV"] = amplitudes.apply(_std) / output_df["AvgAmplitude"] * 100
    return output_df


def calculate_syn_stats(input_df, calculate_freq=True, calculate_isi=False, calculate_amplitudes=True):
    """Run the selected per-row statistics and return the enriched copy.

    Each flag switches one calculation on or off:
      * ``calculate_freq``       -> ``calculate_peak_freq``
      * ``calculate_isi``        -> ``calculate_cell_isi`` (off by default)
      * ``calculate_amplitudes`` -> ``calculate_peak_amplitudes``

    The enabled calculations run in the order listed above. ``input_df`` is
    never modified; with every flag off a plain copy is returned.
    """
    stats_df = input_df.copy()

    if calculate_freq:
        stats_df = calculate_peak_freq(stats_df)

    if calculate_isi:
        stats_df = calculate_cell_isi(stats_df)

    if calculate_amplitudes:
        stats_df = calculate_peak_amplitudes(stats_df)

    return stats_df



def process_peak_csvs_to_pkl(input_path, output_path, overwrite=False):
    """Compute statistics for every peak csv in ``input_path`` and pickle them.

    For each csv file one pickle is written to ``output_path``. Its name is the
    csv's base name followed by the experiment duration, frame interval and
    bin width (plus "_filtered" when ``FILTER_NEURONS`` is on), e.g.
    ``<name>Dur420sInt550msBin550ms_filtered.pkl``.

    The pickled object is a dict with two dataframes:
      * "cell_stats"   -- result of ``calculate_syn_stats``
      * "binned_stats" -- result of ``calculate_binned_stats``

    Parameters
    ----------
    input_path : str
        Directory containing the csv files.
    output_path : str
        Directory the pickle files are written to.
    overwrite : bool, default False
        If False, files whose pickle already exists are skipped (a message is
        printed); if True they are recomputed and replaced.
    """
    for peak_df, file_name in peak_df_iterator(input_path):
        base_name = os.path.splitext(file_name)[0]
        filter_suffix = "_filtered" if FILTER_NEURONS else ""
        processed_name = (
            f"{base_name}"
            f"Dur{int(EXPERIMENT_DURATION)}s"
            f"Int{int(FRAME_INTERVAL*1000)}ms"
            f"Bin{int(BIN_WIDTH*1000)}ms"
            f"{filter_suffix}"
            ".pkl"
        )
        processed_path = os.path.join(output_path, processed_name)

        if os.path.exists(processed_path) and not overwrite:
            print(f"Processed file {processed_path} already exists!")
            continue

        if FILTER_NEURONS:
            # Keep only the rows flagged as used.
            peak_df = peak_df[peak_df["IsUsed"]]

        # This previously called ``calculate_cell_stats``, a name that is not
        # defined in this module (NameError at runtime); the per-row statistics
        # function here is ``calculate_syn_stats``. The "cell_stats" key is
        # kept unchanged so existing readers of the pickles still work.
        # Binned stats are not used for synapses but are still stored
        # alongside for now.
        processed_dict = {
            "cell_stats": calculate_syn_stats(peak_df),
            "binned_stats": calculate_binned_stats(peak_df)}

        pd.to_pickle(processed_dict, processed_path)

# Next go to the processing_utility file

Overwriting stripped_analysis_utility.py
