In [1]:
import bokeh.io
import bokeh.plotting
bokeh.io.output_notebook()
import numpy as np
import pandas as pd
import scipy.stats as stats
from pathlib import Path  
from scipy.stats import zscore



**PRE-DATA PROCESSING**

In [2]:
def df_format(raw_file):

    """
    Reshapes a raw csv file into one column of 'lardist' values per fish.

    The csv must provide a 'location' column (labels such as 'Loc01' or 'Noc01')
    and a 'lardist' column. The number of fish is taken from the characters that
    follow the first three in the LAST 'location' entry.

    Parameters:
      raw_file (csv file) : raw data, handed directly to pd.read_csv

    Returns
      (df) : one 'FISHnn' column per location holding that location's 'lardist'
      values in file order; an empty dataframe when the first 'location' entry
      starts with neither 'L' nor 'N'
    """

    raw = pd.read_csv(raw_file)
    locations = list(raw['location'].values)

    fish_total = int(locations[-1][3:])
    if fish_total < 1:
        return pd.DataFrame({})

    # The first row decides which label family ('Loc..' or 'Noc..') the file uses.
    prefix = {'L': 'Loc', 'N': 'Noc'}.get(locations[0][0])

    per_fish = {}
    if prefix is not None:
        for number in range(1, fish_total + 1):
            # Single-digit fish numbers are zero-padded to two characters.
            tag = str(number) if number >= 10 else '0' + str(number)
            rows = raw['location'] == prefix + tag
            per_fish['FISH' + tag] = list(raw.loc[rows, "lardist"].values)

    return pd.DataFrame(per_fish)

**SPECIFYING CATEGORIES**

In [3]:
def categories(genotype_file, fish_types = ['wt', 'het', 'null']):

    """

    Makes a dictionary matching each genotype to the fish that carry it

    Parameters:
      genotype_file (csv file): genotype data for fish, one column per genotype holding fish numbers
      (assuming column order is wt, het, null; if order is different, input fish types in order into fish_types kwarg)
      fish_types (list) : genotype names, matched to the csv columns by position
      (fish_types = ['wt', 'het', 'null'] is set unless specified). The list is only read, never modified.

    Returns
    dictionary (dict) : one entry per csv column, keyed by the matching fish_types name.
    Fish labels are NOT zero-padded.
    i.e. {'wt' : ['FISH1', 'FISH5', 'FISH6'], 'het' : ['FISH9', 'FISH10'], 'null' : ['FISH16']}

    Raises
    IndexError : when the csv has more columns than fish_types has names

    """

    df = pd.read_csv(genotype_file)

    dictionary = {}

    for i, column in enumerate(df.columns):
        # Every column gets an entry, even when it holds no fish at all, so
        # callers can rely on all genotypes being present in the result.
        # Blank cells are read as NaN and fail the `> 0` test, so they are skipped.
        dictionary[fish_types[i]] = ['FISH' + str(int(val)) for val in df[column] if val > 0]

    return dictionary

**CALCULATING THRESHOLD CUTOFF**

In [4]:
def _fish_id(label):

    """
    Returns the integer fish number of a 'FISH<number>' label ('FISH01' -> 1,
    'FISH1' -> 1), or None when the label does not have that shape.
    """

    text = str(label)
    number = text[4:]
    if text[:4].upper() != 'FISH' or not number.isdecimal():
        return None
    return int(number)


def calc_thresh(baseline_file, genotype_file, categories_dictionary,  p = 99):

    """

    Calculates percentile value for WT baseline fish

    The WT fish are the labels stored under the FIRST key of categories_dictionary.
    A baseline column belongs to a WT fish when its name equals one of those labels
    or names the same fish number with different zero-padding ('FISH01' vs 'FISH1').
    Only the first 600 rows of each WT column are used.

    Parameters:

      baseline_file (csv file) : file for baseline
      genotype_file (csv file) : file with fish genotype data (not read by this function)
      categories_dictionary (dict) : genotype -> list of fish labels
      p (int) : Integer percentile to calculate, default = 99

    Returns

      np.percentile(WT_values, p) (float) : calculated percentile value

    """

    wt_key = list(categories_dictionary.keys())[0]
    wt_labels = set(categories_dictionary[wt_key])

    # Matching on the fish number keeps zero-padded column names from being
    # dropped when the genotype labels are unpadded (or the other way round).
    wt_numbers = {_fish_id(label) for label in wt_labels} - {None}

    df = pd.read_csv(baseline_file)

    WT_values = []

    for column in df.columns:
        if column in wt_labels or _fish_id(column) in wt_numbers:
            WT_values.extend(df[column].iloc[:600])

    return np.percentile(WT_values, p)

**THRESHOLD PROCESSING**

In [5]:
def thresh_data_processing(file, baseline_file, genotype_file, p = 99, fish_types = ['wt', 'het', 'null']):

    """

    For each fish calculates the number of values above the calculated threshold (Time in Seizure (s)),
    the number of bouts of continuous values above threshold (# seizures), and the average length of
    those bouts (average seizure length). Only the first 600 rows of each fish column are analysed.

    The threshold is the p-th percentile returned by calc_thresh for the baseline file.
    Columns whose name does not start with 'F' (any case) are ignored.

    Parameters:
       file (csv file) : data for one condition
       baseline_file (csv file) : baseline data used to calculate the threshold
       genotype_file (csv file) : file with fish genotype data
       p (int) : Integer percentile to calculate, default = 99
       fish_types (list) : genotype names in genotype-file column order, forwarded to categories

    Returns
       output_dict (dict) : for each fish column a tuple
       (time in seizure, seizure count, average seizure length)
       i.e. {'FISH01' : (1, 1, 1.0), 'FISH02' : (0, 0, 0)}

    """

    # Use the arguments of THIS call rather than notebook globals, so the
    # function gives the same answer regardless of which cells ran before.
    categories_dictionary = categories(genotype_file, fish_types)

    thresh = calc_thresh(baseline_file, genotype_file, categories_dictionary, p)

    df = pd.read_csv(file)
    output_dict = {}

    for column in df.columns:

        if column[0].upper() != 'F':
            continue

        # 1 where the sample exceeds the threshold, 0 elsewhere (NaN compares False).
        above = (df[column].iloc[:600].to_numpy() > thresh).astype(int)

        time_in_seizure = int(above.sum())

        # A bout ends wherever a 1 is followed by a 0; padding with a trailing 0
        # also closes a bout that is still running at the last sample.
        bout_ends = np.diff(np.append(above, 0)) == -1
        num_seizures = int(np.count_nonzero(bout_ends))

        if num_seizures != 0:
            average_seizure_length = time_in_seizure / num_seizures
        else:
            average_seizure_length = 0

        output_dict[column] = (time_in_seizure, num_seizures, average_seizure_length)

    return output_dict

**LOAD AND COMPILE DATAFRAME**

In [6]:
def compile_df(base, water, one, two, five):

    """

    Stacks the per-fish result dictionaries of all five conditions into one dataframe

    Each dictionary maps a fish label to its three measurements. Every condition
    contributes three rows (one per measurement), labelled by two leading columns,
    'Condition' and 'Measurement'. Conditions are stacked in the order
    Baseline, Water, 1M, 2M, 5M and the row index is renumbered from 0.

    Parameters:
    base, water, one, two, five (dictionary) : five dictionaries, one for each condition

    Returns:
    comb_df (df): combined dataframe

    """

    row_labels = ['Time in Seizure (s)', '# Seizures', 'Average Seizure Length']

    labelled_results = (
        ('Baseline', base),
        ('Water', water),
        ('1M', one),
        ('2M', two),
        ('5M', five),
    )

    frames = []
    for condition_name, results in labelled_results:
        frame = pd.DataFrame(results)
        # Inserted in this order so 'Condition' ends up left of 'Measurement'.
        frame.insert(0, 'Measurement', row_labels)
        frame.insert(0, 'Condition', condition_name)
        frames.append(frame)

    return pd.concat(frames, ignore_index=True)

**SEPARATING DFS BY GENOTYPE**

In [7]:
def separate_genotypes(comb_df, categories_dictionary):

    """
    Splits the combined dataframe into a list of dictionaries, one for each genotype

    Every dictionary carries the 'Condition' and 'Measurement' columns plus one
    entry per fish of that genotype. Fish labels of five characters or fewer
    (e.g. 'FISH1') are zero-padded ('FISH01') before the column is looked up.

    Parameters:
    comb_df (df) : combined dataframe
    categories_dictionary (dict) : genotype -> list of fish labels

    Returns:
    all_dicts (list) : list of dictionaries, in the key order of categories_dictionary

    """

    per_genotype = []

    for fish_labels in categories_dictionary.values():
        # Fresh label lists for every genotype, so the dictionaries share no state.
        genotype_columns = {
            'Condition' : list(comb_df['Condition'].values),
            'Measurement' : list(comb_df['Measurement'].values),
        }

        for label in fish_labels:
            column = label if len(label) > 5 else label[:4] + '0' + label[-1]
            genotype_columns[column] = list(comb_df[column].values)

        per_genotype.append(genotype_columns)

    return per_genotype

**ADDING Z-SCORE**

In [8]:
def get_WT_vals(wt_df, outlier_z = 3.4):

    """
    Summarises the wildtype (control) fish: for every condition and measurement the
    WT values are collected, outliers are removed, and the standard deviation and
    mean of the remaining values are returned.

    A value is an outlier when its z-score within its own row of WT values is
    >= outlier_z or <= -outlier_z. A NaN z-score (e.g. all values identical)
    never counts as an outlier.

    Parameters:
    wt_df (df): dataframe for WT fish; the first two columns must be the
    'Condition' and 'Measurement' labels, all further columns are fish
    outlier_z (float): z-score cutoff for outlier removal, default = 3.4

    Returns:
    time_mean_sd (dict): dictionary matching each condition to a tuple with the (standard deviation, mean)
    of the time in seizure values of the WT fish
    (i.e. {'Baseline': (49, 35), 'Water': (12, 18), '1M': (22, 4), '2M': (30, 60), '5M': (52, 138)})
    seizure_count_mean_sd (dict) : does the same as previous except calculates from number of seizure values
    avg_mean_sd (dict) : does the same as previous except calculates from average seizure length values

    """

    conditions = ['Baseline', 'Water', '1M', '2M', '5M']
    measurements = ['Time in Seizure (s)', '# Seizures', 'Average Seizure Length']

    time_mean_sd = {}
    seizure_count_mean_sd = {}
    avg_mean_sd = {}

    # Same order as `measurements`, so zip pairs each measurement with its summary.
    summaries = [time_mean_sd, seizure_count_mean_sd, avg_mean_sd]

    for condition in conditions:
        condition_rows = wt_df.loc[wt_df['Condition'] == condition]

        for summary, measurement in zip(summaries, measurements):
            row = condition_rows.loc[condition_rows['Measurement'] == measurement].values
            # Drop the two leading label cells, keep the per-fish values.
            y_list = row.tolist()[0][2:]
            z_scores = zscore(y_list)

            # Outliers are actually left out here; the negated test keeps NaN z-scores.
            kept = [
                yval for yval, zval in zip(y_list, z_scores)
                if not (zval >= outlier_z or zval <= -outlier_z)
            ]

            # Sample standard deviation (ddof=1) first, mean second.
            summary[condition] = (np.std(kept, ddof=1), np.mean(kept))

    return time_mean_sd, seizure_count_mean_sd, avg_mean_sd

In [9]:
def add_z_score(condition_dictionary, wt_df):

    """
    Adds the z-score for the time in seizure, # seizures, and average seizure length of each fish
    in one genotype dictionary. Z-scores are taken against the WT mean and standard deviation
    of the same condition and measurement, as returned by get_WT_vals.

    Every input row produces two output rows: the original value followed by its z-score.
    When the WT standard deviation is 0 the z-score is reported as 0.

    Parameters:
    condition_dictionary (dict): dictionary for one genotype, generated from separate_genotypes function
    ('Condition' and 'Measurement' label lists plus one list of values per fish)
    wt_df (df) : df for WT fish

    Returns:
    final_dictionary (dict) : new dictionary with the interleaved values and z-scores

    Raises:
    KeyError : when a measurement or condition label is not one get_WT_vals summarises

    """

    time_mean_sd, seizure_count_mean_sd, avg_mean_sd = get_WT_vals(wt_df)

    # measurement -> (WT (sd, mean) per condition, label of the derived z-score row)
    # The z-score labels are reproduced exactly, including their mixed capitalisation.
    reference = {
        'Time in Seizure (s)' : (time_mean_sd, 'Time in Seizure (s) Z-Score'),
        '# Seizures' : (seizure_count_mean_sd, '# Seizures Z-score'),
        'Average Seizure Length' : (avg_mean_sd, 'Average Seizure Length Z-score'),
    }

    # Row labels come from the dictionary that was passed in, not from a notebook
    # global, so the output always lines up with the values being processed.
    conditions = list(condition_dictionary['Condition'])
    measurements = list(condition_dictionary['Measurement'])

    final_dictionary = {'Condition' : [], 'Measurement' : []}

    for condition, measurement in zip(conditions, measurements):
        final_dictionary['Condition'] += [condition, condition]
        final_dictionary['Measurement'] += [measurement, reference[measurement][1]]

    for key, values in condition_dictionary.items():

        # Only fish columns ('FISH..') carry values; skip the two label entries.
        if key[0] != 'F':
            continue

        new_vals = []

        for condition, measurement, value in zip(conditions, measurements, values):

            sd, mean = reference[measurement][0][condition]

            new_vals.append(value)

            if sd != 0:
                new_vals.append((value - mean)/sd)
            else:
                new_vals.append(0)

        final_dictionary[key] = new_vals

    return final_dictionary

**FILE UPLOAD**

In [None]:
import os

# Folder that holds one sub-folder per trial.
pathname = '[insert path here]'

# trial folder name -> names of the files inside that folder
all_trials = {}

for file_name in os.listdir(pathname):

    path2 = os.path.join(pathname, file_name)

    # Skip macOS metadata and any stray file that is not a trial folder.
    if file_name == ".DS_Store" or not os.path.isdir(path2):
        continue

    all_trials[file_name] = [
        filename for filename in os.listdir(path2) if filename != ".DS_Store"
    ]

all_trials

In [11]:
# Substring that identifies each required processed ('pro') file of a trial.
# Checked in this order; the first tag found in a file name decides its role.
input_tags = ['water', 'baseline', '1mM', '2mM', '5mM', 'genotype']

for trial in all_trials:

    file_names = all_trials[trial]
    trial_dir = Path(pathname) / trial

    # Collected afresh for every trial, so a missing file can never be silently
    # replaced by the path left over from the previous trial.
    trial_inputs = {}

    for file in file_names:

        if 'pro' not in file:
            continue

        for tag in input_tags:
            if tag in file:
                trial_inputs[tag] = str(trial_dir / file)
                break

    missing = [tag for tag in input_tags if tag not in trial_inputs]
    if missing:
        raise FileNotFoundError(
            "Trial '" + trial + "' has no processed file for: " + ', '.join(missing)
        )

    watercsv = trial_inputs['water']
    baselinecsv = trial_inputs['baseline']
    onecsv = trial_inputs['1mM']
    twocsv = trial_inputs['2mM']
    fivecsv = trial_inputs['5mM']
    gen = trial_inputs['genotype']

    categories_dictionary = categories(gen)
    print(trial)

    # The baseline file doubles as the threshold reference for every condition.
    base = thresh_data_processing(baselinecsv, baselinecsv, gen)
    water = thresh_data_processing(watercsv, baselinecsv, gen)
    one = thresh_data_processing(onecsv, baselinecsv, gen)
    two = thresh_data_processing(twocsv, baselinecsv, gen)
    five = thresh_data_processing(fivecsv, baselinecsv, gen)

    compiled_df = compile_df(base, water, one, two, five)

    wt_dict, het_dict, null_dict = separate_genotypes(compiled_df, categories_dictionary)
    wt_df = pd.DataFrame(wt_dict)
    het_df = pd.DataFrame(het_dict)
    null_df = pd.DataFrame(null_dict)

    # Z-scores of every genotype are taken against the wildtype fish (wt_df).
    wt_new_dict = add_z_score(wt_dict, wt_df)
    new_wt_df = pd.DataFrame(wt_new_dict).transpose()

    het_new_dict = add_z_score(het_dict, wt_df)
    new_het_df = pd.DataFrame(het_new_dict).transpose()

    null_new_dict = add_z_score(null_dict, wt_df)
    new_null_df = pd.DataFrame(null_new_dict).transpose()

    # Results are written into the trial's own folder, next to its input files.
    for label, frame in (('WT', new_wt_df), ('HET', new_het_df), ('NULL', new_null_df)):
        filepath = trial_dir / (label + '.csv')
        filepath.parent.mkdir(parents=True, exist_ok=True)
        frame.to_csv(filepath)

In [None]:
# NOTE(review): hard-coded absolute path. `all_trials` was listed from whatever
# `pathname` held when the file-upload cell ran; confirm both point at the same
# folder, and prefer a single configurable data directory.
pathname = '/Users/junipolansky/zebrafish_PTZ/PTZ_final_files/'

# Substring that identifies each required processed ('pro') file of a trial.
# Checked in this order; the first tag found in a file name decides its role.
input_tags = ['water', 'baseline', '1mM', '2mM', '5mM', 'genotype']

for trial in all_trials:

    file_names = all_trials[trial]
    trial_dir = Path(pathname) / trial

    # Collected afresh for every trial, so a missing file can never be silently
    # replaced by the path left over from the previous trial.
    trial_inputs = {}

    for file in file_names:

        if 'pro' not in file:
            continue

        for tag in input_tags:
            if tag in file:
                trial_inputs[tag] = str(trial_dir / file)
                break

    missing = [tag for tag in input_tags if tag not in trial_inputs]
    if missing:
        raise FileNotFoundError(
            "Trial '" + trial + "' has no processed file for: " + ', '.join(missing)
        )

    watercsv = trial_inputs['water']
    baselinecsv = trial_inputs['baseline']
    onecsv = trial_inputs['1mM']
    twocsv = trial_inputs['2mM']
    fivecsv = trial_inputs['5mM']
    gen = trial_inputs['genotype']

    categories_dictionary = categories(gen)
    print(trial)

    # The baseline file doubles as the threshold reference for every condition.
    base = thresh_data_processing(baselinecsv, baselinecsv, gen)
    water = thresh_data_processing(watercsv, baselinecsv, gen)
    one = thresh_data_processing(onecsv, baselinecsv, gen)
    two = thresh_data_processing(twocsv, baselinecsv, gen)
    five = thresh_data_processing(fivecsv, baselinecsv, gen)

    compiled_df = compile_df(base, water, one, two, five)

    wt_dict, het_dict, null_dict = separate_genotypes(compiled_df, categories_dictionary)
    wt_df = pd.DataFrame(wt_dict)
    het_df = pd.DataFrame(het_dict)
    null_df = pd.DataFrame(null_dict)

    # Z-scores of every genotype are taken against the wildtype fish (wt_df).
    wt_new_dict = add_z_score(wt_dict, wt_df)
    new_wt_df = pd.DataFrame(wt_new_dict).transpose()

    het_new_dict = add_z_score(het_dict, wt_df)
    new_het_df = pd.DataFrame(het_new_dict).transpose()

    null_new_dict = add_z_score(null_dict, wt_df)
    new_null_df = pd.DataFrame(null_new_dict).transpose()

    # Written with a '2' suffix into the trial's own folder.
    for label, frame in (('WT2', new_wt_df), ('HET2', new_het_df), ('NULL2', new_null_df)):
        filepath = trial_dir / (label + '.csv')
        filepath.parent.mkdir(parents=True, exist_ok=True)
        frame.to_csv(filepath)