In [None]:
import trackpy as tp
import os
import pandas as pd
import numpy as np
import numba
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import matplotlib.lines as mlines
import pickle
import cv2
import math
from datetime import date
import datetime
import time
from IPython.display import display
import ipywidgets as widgets
import re
import scipy.optimize as sco
import seaborn as sns
import av
from tqdm.notebook import tqdm
import winsound
import tkinter as tk
from tkinter import filedialog
import warnings
warnings.filterwarnings(action='once')

In [None]:
create_folder_list = True

def find_analysis_folders(root_folder):
    """Collect every directory named "analysis" (any letter case) below root_folder.

    The tree is traversed with os.walk. Each hit is returned as the walked
    parent path joined with the directory name, in traversal order.

    @param root_folder - directory at which the search starts

    @return list of paths to the matching directories (empty if none are found)
    """
    return [
        os.path.join(parent, name)
        for parent, subdirs, _files in os.walk(root_folder)
        for name in subdirs
        if name.lower() == "analysis"
    ]


def get_subfolders(analysis_folders):
    """List the immediate sub-directories of each folder in analysis_folders.

    Plain files are skipped, and the search does not descend further than one
    level below each given folder.

    @param analysis_folders - iterable of directory paths to inspect

    @return list of sub-directory paths, grouped in the order the folders were given
    """
    child_dirs = []
    for parent in analysis_folders:
        candidates = [os.path.join(parent, entry) for entry in os.listdir(parent)]
        child_dirs.extend(path for path in candidates if os.path.isdir(path))
    return child_dirs


# Root folder that is walked recursively for the raw .avi movies (their frame
# rates are read from these files further down in the notebook).
avi_folder = r'C:\Users\vhorowit\Desktop\BessLawrence\Data'
# avi_folder = r'G:\Shared drives\Horowitz Lab Notes\Rebecca Dalphin and Alex Axton'
# When True, the van Hove cell derives each description from the short file
# name instead of reading the info sheet's Description column.
AlexRebecca = False

# Spreadsheet export with per-movie metadata; loaded with pd.read_csv in the next cell.
info_csv = r'C:\Users\vhorowit\Desktop\BessLawrence\2023 Subdiffusion project data - Sheet1.csv'


#r'G:\Shared drives\Horowitz Lab Notes\Bess Lawrence Data'

# NOTE(review): folderlist is only assigned when create_folder_list is True, and
# the trajectory-loading cell iterates over it. With the flag off it has to be
# defined by hand before running that cell.
if create_folder_list:
    analysis_folders = find_analysis_folders(r'C:\Users\vhorowit\Desktop\BessLawrence')
    folderlist = get_subfolders(analysis_folders)

In [None]:
info_df = pd.read_csv(info_csv)

In [None]:
this_info_df = info_df[info_df['Blocking'] == "Bess"]
this_info_df

In [None]:
#folderlist = [r'G:\Shared drives\Horowitz Lab Notes\Rebecca Dalphin and Alex Axton\Analysis\2023-05-27',
#             r'G:\Shared drives\Horowitz Lab Notes\Rebecca Dalphin and Alex Axton\Analysis\2023-05-28']

#savefolder = r'G:\Shared drives\Horowitz Lab Notes\Rebecca Dalphin and Alex Axton\Analysis\viva-analysis'
savefolder = r'C:\Users\vhorowit\Desktop\BessLawrence\Viva-Analysis'
saving = True

subtract_drift_slow = True
calculate_van_hove_slow = True

# scaling, measured in microns per pixel
#scaling = 330 / 1247.96 # 20x1.0, measured 2021-06-17
#scaling = 220 / 1250.04 # 20x1.5, measured 2021-06-17
scaling = 150 / 1127.54 # 40x1.0, measured 2021-06-16
#scaling = 100 / 1130.61 # 40x1.5, measured 2021-06-16
#scaling = 80 / 914.92 # 60x1.0, measured 2021-05-28
#scaling = 60 / 1031.07 # 60x1.5, measured 2021-05-28

In [None]:
def datestring():
    """Return the current local date and time formatted as 'YYYY-MM-DD HH;MM;SS'.

    The time fields are separated by semicolons rather than colons; the result
    is used as a prefix for saved figure names.
    """
    timestamp_format = '%Y-%m-%d %H;%M;%S'
    now = datetime.datetime.today()
    return now.strftime(timestamp_format)

def remove_number_parentheses(filename):
    """Remove every parenthesised run of digits, such as "(1)" or "(12)", from filename.

    Only the parentheses and the digits between them are removed; surrounding
    whitespace is left in place. Parentheses containing anything other than
    digits are kept.

    @param filename - the string to clean

    @return the string with all "(<digits>)" groups removed
    """
    # \d+ (not \d) so that repeat counters of ten and above are stripped as well.
    return re.sub(r'\(\d+\)', '', filename)

def savefigure(savename):
    """Save the current matplotlib figure as <savename>.svg, .pdf and .png.

    The vector formats are best effort: a failure is reported (with its cause)
    and the remaining formats are still attempted. The PNG export is not
    guarded, so an error there propagates to the caller.

    @param savename - output path without file extension
    """
    # transparent true source: https://jonathansoma.com/lede/data-studio/matplotlib/exporting-from-matplotlib-to-open-in-adobe-illustrator/
    export_options = dict(dpi=600, bbox_inches='tight', transparent=True)

    for extension in ('svg', 'pdf'):
        try:
            plt.savefig(savename + '.' + extension, **export_options)
        except Exception as err:
            # Exception (not a bare except) so Ctrl-C still interrupts the notebook.
            print('Could not save ' + extension + ':', repr(err))

    plt.savefig(savename + '.png', **export_options)
    print("Saved:\n", savename + '.png')

def beep():
    """Play a short notification sound through winsound, best effort.

    Three ways of making a sound are tried in order; the function returns after
    the first one that does not raise. If all of them fail it returns silently.
    """
    # Each attempt is wrapped in a lambda so that any lookup failure is raised
    # inside the try block below, not while building the tuple.
    attempts = (
        lambda: winsound.PlaySound(r'C:\Windows\Media\Speech Disambiguation.wav', flags = winsound.SND_ASYNC),
        lambda: winsound.PlaySound("SystemHand", winsound.SND_ALIAS),
        lambda: winsound.Beep(450,150),
    )
    for play in attempts:
        try:
            play()
            return
        except Exception:
            # Fall through to the next, simpler way of making a sound.
            continue

In [None]:
# Trajectory tables and the paths of the pickles they were read from; the two
# lists are filled together so that they stay index-aligned.
df_list, filename_list = [], []

"""
Note about file names
unfiltered.pkl = every particle trajectory, immediately after it was linked.
.pkl = filtered to remove stubs
nodrift.pkl = drift has been subtracted
"""

for folder in folderlist:
    # Work from inside the folder so the bare file names below resolve against it.
    os.chdir(folder)
    for filename in os.listdir(folder):
        if not filename.endswith('control_tracer.pkl'):
            continue
        if not os.path.isfile(filename):
            print('is not a file. **************')
            continue
        df_list.append(pd.read_pickle(filename))
        filename_list.append(os.path.join(folder,filename))

In [None]:
def extract_short_file(filepath):
    """Extract the short sample label from the path of a '...control_tracer.pkl' file.

    If the path contains 'Data Taken YYYY-MM-DD, <label>control_tracer.pkl', the
    label is whatever sits between the date and the suffix. Otherwise the label
    is the part of the base name after its last comma (or the whole base name
    when it has no comma), with 'control_tracer.pkl' removed.

    @param filepath - path (or bare name) of the analysis pickle

    @return the label with leading and trailing whitespace stripped
    """
    match = re.search(r'Data Taken (\d{4}-\d{2}-\d{2}), (.*)control_tracer\.pkl', filepath)
    if match:
        return match.group(2).strip()

    base_name = os.path.basename(filepath)
    # rpartition returns the whole name as the tail when no comma is present, so
    # a comma-free name no longer loses its first character.
    _, _, tail = base_name.rpartition(',')
    return tail.replace("control_tracer.pkl", "").strip()

shortfiles = [extract_short_file(item) for item in filename_list]


analysis_files_df = pd.DataFrame({'Analysis file': filename_list, 'control_tracer.pkl': shortfiles })

In [None]:
def get_fps_from_avi(avi_file): # avi_file is a file path
    """Return the frame rate of a video file as a float.

    OpenCV is tried first. If it raises or reports 0 fps, the frame rate is
    read with PyAV instead (average rate of the first video stream).

    @param avi_file - path to the video file; must exist

    @return frames per second
    """
    assert(os.path.exists(avi_file))
    try:
        video = cv2.VideoCapture(avi_file)
        try:
            fps = video.get(cv2.CAP_PROP_FPS)
        finally:
            # Release the capture even if querying the property fails.
            video.release()
        if fps == 0:
            raise ValueError(f"fps must not be zero: {avi_file}")
    except Exception as err:
        # Exception (not a bare except) so Ctrl-C is not mistaken for a read failure.
        print('Using backup method')
        print('  reason:', repr(err))
        container = av.open(avi_file)
        try:
            fps = container.streams.video[0].average_rate
        finally:
            container.close()
    return float(fps)

# Walk avi_folder recursively and record every .avi file with its frame rate.
data_files = []
fps_list = []
for root, dirs, files in os.walk(avi_folder):
    for filename in files:
        full_path = os.path.join(root, filename)
        if filename.endswith('.avi'):
            fps = get_fps_from_avi(full_path)
            data_files.append(full_path)
            fps_list.append(fps)

datafiles_df = pd.DataFrame({"Data file":data_files, "fps": fps_list })

# Bare file name, used as the merge key against the info sheet. os.path.basename
# handles whichever path separator the platform uses, unlike splitting on '\\'.
datafiles_df['data_file name'] = datafiles_df['Data file'].apply(os.path.basename)

"""with pd.option_context('display.max_rows', None, 
                       'display.max_columns', None, 
                       'display.max_colwidth', None, 
                       'display.expand_frame_repr', False):
    display(datafiles_df)"""

    


In [None]:
# Attach the info-sheet metadata to each movie (key: bare movie file name), then
# attach the analysis pickle path (key: short label extracted from the pickle name).
# Both merges use pandas' default join type, so rows without a partner on the
# other side are dropped.
# NOTE(review): later cells zip combined_df positionally with lists that follow
# filename_list order (df_series_nodrift, shortfiles). If either merge drops or
# duplicates rows, those pairings presumably no longer line up; compare the
# len(...) cells below before trusting the results.
combined_df = datafiles_df.merge(this_info_df, on='data_file name')
combined_df = analysis_files_df.merge(combined_df, on='control_tracer.pkl')

combined_df

In [None]:
len(datafiles_df)

In [None]:
len(combined_df)

In [None]:
len(this_info_df)

In [None]:
len(analysis_files_df)

In [None]:
combined_df.fps

In [None]:
# The drift-subtraction cache is written to / read from the save folder.
os.chdir(savefolder)

start = time.time() # about 3 minutes

if subtract_drift_slow:
    # Slow path: recompute the drift-subtracted trajectories and cache them.
    from rotational_drift_subtraction import drift_subtract

    df_list_nodrift = []
    # Silence PendingDeprecationWarning only while drift_subtract runs. Leaving
    # the context manager restores the warning filters that were active before,
    # instead of installing a new "default" filter on every run of this cell.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=PendingDeprecationWarning)
        for df in tqdm(df_list):
            _, df_nodrift = drift_subtract(tracer = df, show_plots=False )
            df_list_nodrift.append(df_nodrift)

    data_dict = {'filename_list': filename_list, 'df_list_nodrift':df_list_nodrift}
    with open('drift_subtracted_trajectories_dictionary.pkl', 'wb') as file:
        pickle.dump(data_dict, file)
else:
    # Fast path: reuse the cache written by an earlier slow run.
    # NOTE: unpickling can execute arbitrary code; only load files you created yourself.
    with open('drift_subtracted_trajectories_dictionary.pkl', 'rb') as file:
        data_dict = pickle.load(file)
    # Extract elements from the dictionary
    filename_list = data_dict['filename_list']
    df_list_nodrift = data_dict['df_list_nodrift']


end = time.time()
print((end-start)/60, 'minutes elapsed')

In [None]:
df_series_nodrift = pd.DataFrame(data_dict)['df_list_nodrift']

In [None]:
# Debug switch: when True (and the lists are still the same length), keep only
# elements 10..14 of df_list_nodrift and stash a copy of the full list.
debug = False

if debug and len(filename_list) == len(df_list_nodrift):
    df_list_nodrift_orig = df_list_nodrift.copy()
    df_list_nodrift = df_list_nodrift[10:15]
else:
    ## Skip the beginning of the movie: it's the most likely to have dropped frames.
    # NOTE(review): df_list_nodrift holds one trajectory table per file, so this
    # slice drops the first five *files*, not the first frames of each movie.
    # Confirm which was intended. Re-running the cell removes five more each time.
    # df_series_nodrift was built from data_dict in the previous cell and is not
    # affected by this rebinding.
    df_list_nodrift = df_list_nodrift[5:]

In [None]:
"""
Normalizes histogram data so that the sum of probabilities is one.

@param histdata - the starting histogram

@return the normalized histogram of probabilites
"""
def manualnorm(histdata, width=None):
    """
    Scales histogram data so that the area under it is one, i.e.
    sum(result) * bin width == 1.

    @param histdata - the starting histogram (values per bin)
    @param width - bin width used for the normalization; when omitted, the
                   notebook-level variable `binwidth` is used, which must
                   already be defined at call time

    @return the rescaled histogram
    """
    if width is None:
        # Original behavior: fall back to the global set in the van Hove cell.
        width = binwidth
    return (1/(histdata.sum()*width))*histdata
    
"""
Outputs f(x) where f is a Gaussian curve.

@param x - the independent variable
@param a [0] - Gaussian amplitude
@param center [1] - Gaussian center
@param sigma [2] - Gaussian standard deviation (width)

@return f(x)
"""
def gaussian(x,a,center,sigma):
    """
    Evaluates a Gaussian curve a * exp(-(x - center)^2 / (2 * sigma^2)).

    @param x - the independent variable (scalar or array)
    @param a - Gaussian amplitude
    @param center - Gaussian center
    @param sigma - Gaussian standard deviation (width)

    @return the value of the curve at x
    """
    offset = x - center
    exponent = -(offset**2) / (2*(sigma**2))
    return a*(np.exp(exponent))

In [None]:
len(shortfiles)

In [None]:
len(df_list_nodrift)

In [None]:
shortfiles

In [None]:
shortfiles == combined_df['control_tracer.pkl']

In [None]:
calculate_van_hove_slow = True
# It takes about two minutes to do 5 lagtimes.

In [None]:
## Calculate probability distribution function (van hove)

if calculate_van_hove_slow:
    maxlagtime = 120#40 # in number of frames  
            # for 120, maybe vanhove_max of 7. for 40, perhaps 4.  (120 took 99 minutes.)
    skip = int(maxlagtime/5)
    vanhove_max_x = 7
    binwidth = 0.04
    figsize = [8, 8]
    
    show_plot = False

    start_time = time.time()
    warnings.filterwarnings("ignore", category=RuntimeWarning)
    warnings.filterwarnings("ignore", category=DeprecationWarning)

    # initializations before for loop
    binsequence = np.arange(-vanhove_max_x, vanhove_max_x, binwidth)
    trackshistlistlist = []
    trackshistlistlisty = []
    kurtosisdflist = []
    kurtosisdflisty = []
    lagtimelist = []
    color_mapping = {}
    numplots = int(maxlagtime / skip)
    i = 0
    big_klist = []
    big_xylist = []
    big_tlist = []
    big_filepathlist = []
    big_descriptionlist = []
    if show_plot:
        fig, axs = plt.subplots(int(math.ceil(numplots / 5)), 5, figsize=figsize, dpi=300)
        fig2, ax2 = plt.subplots(1, 1, figsize=figsize, dpi=300)

    assert(len(shortfiles) ==len(df_series_nodrift))   

    for (row_tuple, file, filepath, df, fps, ii) in tqdm(zip(combined_df.itertuples(), 
                                                             combined_df['control_tracer.pkl'], 
                                                             combined_df['Analysis file'], 
                                                             df_series_nodrift, 
                                                             combined_df.fps, 
                                                             range(len(combined_df)))):
        if AlexRebecca:
            this_description = re.sub(r'\(\d\)$', '', file).strip()
        else:
            this_description = row_tuple.Description

        if this_description not in color_mapping:
            color_mapping[this_description] = plt.rcParams['axes.prop_cycle'].by_key()['color'][i % len(plt.rcParams['axes.prop_cycle'])]
            i += 1

        color = color_mapping[this_description]

        tracksbyframex = df.set_index(['frame', 'particle'])['x'].unstack()
        tracksbyframey = df.set_index(['frame', 'particle'])['y'].unstack()
        trackshistlist = []
        trackshistlisty = []
        klist = []
        kylist = []
        tlist = []
        

        j = 0
        for vanhove_lagtime_inframes in range(1, maxlagtime + 1, skip):
            if ii == 0:
                lagtimelist.append(vanhove_lagtime_inframes/fps)
                
            if show_plot:
                plt.sca(axs.flat[j])
                plt.title("{:.2f} ms".format(1000 * vanhove_lagtime_inframes / fps))

            # calculate vanhove / probability distribution function
            trackshist = manualnorm(tp.motion.vanhove(
                tracksbyframex, lagtime=vanhove_lagtime_inframes, mpp=scaling, bins=binsequence, ensemble=True))
            trackshist.name = 'x Probability distribution'
            trackshisty = manualnorm(tp.motion.vanhove(
                tracksbyframey, lagtime=vanhove_lagtime_inframes, mpp=scaling, bins=binsequence, ensemble=True))
            trackshisty.name = 'y Probability distribution'
            if show_plot:
                trackshist.plot(ax=axs.flat[j], label=file, color=color)
                trackshisty.plot(ax=axs.flat[j], label=file, color=color)
            trackshistlist.append(trackshist)
            trackshistlisty.append(trackshisty)

            kx = trackshist.kurtosis()
            klist.append(kx)
            ky = trackshisty.kurtosis()
            kylist.append(ky)
            
            big_klist.append(kx)
            big_xylist.append('x')
            big_klist.append(ky)
            big_xylist.append('y')
            tlist.append(vanhove_lagtime_inframes / fps)
            big_tlist.append(vanhove_lagtime_inframes / fps)
            big_tlist.append(vanhove_lagtime_inframes / fps)
            big_filepathlist.append(filepath)
            big_filepathlist.append(filepath)
            big_descriptionlist.append(this_description)
            big_descriptionlist.append(this_description)
            j += 1

        kurtosisdf = pd.DataFrame({'time (s)': tlist, file + ' kurtosis': klist})
        kurtosisdfy = pd.DataFrame({'time (s)': tlist, file + ' kurtosis': kylist})
        if show_plot:
            plt.sca(ax2)
            plt.xlabel('lag time (s)')
            plt.ylabel('kurtosis')
            kurtosisdf.set_index('time (s)').plot(label=file, ax=ax2, color=color)

        trackshistlistlist.append(trackshistlist)
        trackshistlistlisty.append(trackshistlisty)
        kurtosisdflist.append(kurtosisdf)
        kurtosisdflisty.append(kurtosisdfy)

    big_kurtosis_dict = {
        'File': big_filepathlist,
        'Description': big_descriptionlist,
        'time (s)': big_tlist,
        'kurtosis': big_klist,
        'xy': big_xylist,
    }
    big_kurtosis_df = pd.DataFrame(big_kurtosis_dict, index=[big_tlist, big_filepathlist ])
    if show_plot:
        ax2.set_yscale('log')
        plt.ylabel('kurtosis')

        ## Consolidate the figure legend to just the unique filenames (not counting (1), (2), (3))
        # Get the legend handles and labels
        legend_handles, legend_labels = ax2.get_legend_handles_labels()

        # short_labels and new_handles are built side by side so that every
        # label is guaranteed to have a handle.
        short_labels = []
        new_handles = []
        handle_mapping = {}

        for handle, label in zip(legend_handles, legend_labels):
            match = re.search(r'(.+?) \(\d+\) kurtosis', label)
            if match:
                short_label = match.group(1)
                if short_label in handle_mapping:
                    # A repeat of a file already in the legend: keep the first handle only.
                    continue
                handle_mapping[short_label] = handle
            else:
                # Labels without a "(n) kurtosis" suffix are kept as they are. They
                # used to be missing from handle_mapping, which raised KeyError
                # when the handles were looked up.
                short_label = label
                handle_mapping.setdefault(label, handle)
            short_labels.append(short_label)
            new_handles.append(handle)

        # Set the modified handles and labels
        ax2.legend(new_handles, short_labels, bbox_to_anchor=(1, 1), loc='upper left')  # Move the legend to the right


        plt.show()

    warnings.filterwarnings("default", category=RuntimeWarning)
    warnings.filterwarnings("default", category=DeprecationWarning)

    ## Calculate the Gaussian fitting parameters and store the information
    do_curvefit = True
    show_curvefit = False

    gaussian_fit_paramsxlistlist = []
    gaussian_fit_paramsylistlist = []
    gaussian_fit_covmxlistlist = []
    gaussian_fit_covmylistlist = []
    
    big_sigmalist = []
    big_tlist = []
    big_filepathlist = []
    big_descriptionlist = []
    big_sigma_stderr = []
    big_xy_list = []

    sigma_index = 2 # Index of the parameter
    
    if show_curvefit:
        fig, axs = plt.subplots(int(math.ceil(numplots / 5)), 5, figsize=figsize, dpi=300)
        fig3, axsy = plt.subplots(int(math.ceil(numplots / 5)), 5, figsize=figsize, dpi=300)
        fig.suptitle('Probability distribution in x direction')
        fig3.suptitle('Probability distribution in y direction')

    if do_curvefit:
        for filename, file, trackshistlist, trackshistlisty, kurtosisdf, kurtosisdfy in tqdm(zip(
            filename_list, shortfiles, trackshistlistlist, trackshistlistlisty, kurtosisdflist, kurtosisdflisty
        )):
            gaussian_fit_paramsxlist = []
            gaussian_fit_paramsylist = []
            gaussian_fit_covmxlist = []
            gaussian_fit_covmylist = []

            j = 0
            for vanhove_lagtime_insecs, trackshist, trackshisty in zip(
                lagtimelist, trackshistlist, trackshistlisty
            ):

                gaussian_fit_paramsx, gaussian_fit_covmx = sco.curve_fit(gaussian, trackshist.index, trackshist.values)
                gaussian_fit_paramsy, gaussian_fit_covmy = sco.curve_fit(gaussian, trackshisty.index, trackshisty.values)

                sigma_x_stderr = np.sqrt(gaussian_fit_covmx[sigma_index, sigma_index])
                sigma_y_stderr = np.sqrt(gaussian_fit_covmy[sigma_index, sigma_index])

                
                for p in range(2):
                    big_filepathlist.append(filename)
                    big_descriptionlist.append(remove_number_parentheses(file))
                    big_tlist.append(vanhove_lagtime_insecs)

                big_xy_list.append('x')
                big_sigmalist.append(gaussian_fit_paramsx[sigma_index])
                big_sigma_stderr.append(sigma_x_stderr)
                
                big_xy_list.append('y')
                big_sigmalist.append(gaussian_fit_paramsy[sigma_index])
                big_sigma_stderr.append(sigma_y_stderr)
                
                if show_curvefit:
                    # Vary colors if this_description is different
                    this_description = re.sub(r"\(\d\)$", "", file).strip()
                    if this_description not in color_mapping:
                        color_mapping[this_description] = plt.rcParams["axes.prop_cycle"].by_key()["color"][
                            i % len(plt.rcParams["axes.prop_cycle"])
                        ]
                        i += 1
                    color = color_mapping[this_description]

                    gaussian_fitx = gaussian(trackshist.index, gaussian_fit_paramsx[0],
                                             gaussian_fit_paramsx[1], gaussian_fit_paramsx[2])
                    gaussian_fity = gaussian(trackshisty.index, gaussian_fit_paramsy[0],
                                             gaussian_fit_paramsy[1], gaussian_fit_paramsy[2])
                    (axs.flat[j]).plot(trackshist.index, gaussian_fitx, color=color, linewidth=0.3, alpha=0.3)
                    (axsy.flat[j]).plot(trackshisty.index, gaussian_fity, color=color, linewidth=0.3, alpha=0.3)

                    # Plot vanhove datapoints 
                    trackshist.plot(
                        marker=".",
                        markersize=1,
                        linestyle="",
                        ax=axs.flat[j],
                        label=file + " x",
                        color=color,
                        alpha=0.3,
                    )

                    trackshisty.plot(
                        marker=".",
                        markersize=1,
                        linestyle="",
                        ax=axsy.flat[j],
                        label=file + " y",
                        color=color,
                        alpha=0.3,
                    )

                    for ax in [axs.flat[j], axsy.flat[j]]:
                        ax.set_title("{:.2f} ms".format(1000 * vanhove_lagtime_insecs))
                        ax.set_xticks([int(trackshist.index.min())+1, 0, int(trackshist.index.max())])
                        ax.set_yscale("log")
                    try:
                        (axs.flat[j]).set_xlim(xlimx[j])
                        (axs.flat[j]).set_ylim(ylimx[j])
                        (axsy.flat[j]).set_xlim(xlimy[j])
                        (axsy.flat[j]).set_ylim(ylimy[j])
                    except:
                        pass

                gaussian_fit_paramsxlist.append(gaussian_fit_paramsx)
                gaussian_fit_paramsylist.append(gaussian_fit_paramsy)
                gaussian_fit_covmxlist.append(gaussian_fit_covmx)
                gaussian_fit_covmylist.append(gaussian_fit_covmy)

                j += 1
                

            # Store the lists for each file and lag time
            gaussian_fit_paramsxlistlist.append(gaussian_fit_paramsxlist)
            gaussian_fit_paramsylistlist.append(gaussian_fit_paramsylist)
            gaussian_fit_covmxlistlist.append(gaussian_fit_covmxlist)
            gaussian_fit_covmylistlist.append(gaussian_fit_covmylist)
    
            
        if show_curvefit:
            fig.tight_layout()
            fig3.tight_layout()
    
    
    big_g_dict = {
        'File': big_filepathlist,
        'Description': big_descriptionlist,
        'time (s)': big_tlist,
        #'color': color,
        'width sigma': big_sigmalist,
        'sigma stderr': big_sigma_stderr,
        'xy': big_xy_list
    }

    # Gaussian-fit table, indexed by (lag time, file path) like big_kurtosis_df.
    # NOTE(review): each (lag time, file) pair is appended twice (once for x, once
    # for y), so this index is not unique. The column-wise concat below presumably
    # relies on both frames having identical indexes in identical order; verify
    # with the index-comparison cells that follow.
    big_gaussian_df = pd.DataFrame(big_g_dict, index=[big_tlist, big_filepathlist ])
    big_data_df = pd.concat([big_kurtosis_df, big_gaussian_df], axis = 1)
    
    # 'File', 'Description', 'time (s)' and 'xy' exist in both frames; only the
    # first occurrence of each (the one from big_kurtosis_df) is kept.
    # Get a boolean mask of duplicated columns
    duplicate_mask = big_data_df.columns.duplicated()
    # Get the column indexes to keep (excluding duplicates)
    column_indexes_to_keep = [i for i in range(len(big_data_df.columns)) if not duplicate_mask[i]]
    # Create a new DataFrame with the selected columns (excluding duplicates)
    big_data_df = big_data_df.iloc[:, column_indexes_to_keep]
    
    ## Saving results
    # Create a dictionary to save
    data_dict_vanhove = {
        'filename_list': filename_list,
        'binsequence': binsequence,
        'trackshistlistlist': trackshistlistlist,
        'trackshistlistlisty': trackshistlistlisty,
        'kurtosisdflist': kurtosisdflist,
        'kurtosisdflisty': kurtosisdflisty,
        'gaussian_fit_paramsxlistlist': gaussian_fit_paramsxlistlist,
        'gaussian_fit_paramsylistlist': gaussian_fit_paramsylistlist,
        'gaussian_fit_covmxlistlist': gaussian_fit_covmxlistlist,
        'gaussian_fit_covmylistlist': gaussian_fit_covmylistlist,
        # lagtimelist is filled only while processing the first file, so it
        # reflects that file's frame rate.
        'lagtimelist': lagtimelist,
        'maxlagtime': maxlagtime,
        'skip': skip,
        'scaling': scaling,
        # NOTE(review): fps here is the loop variable left over from the van Hove
        # loop, i.e. the frame rate of the last file processed, not a per-file value.
        'fps': fps,
        'big_data_df': big_data_df, # added 2023-06-22
    }

    today = date.today().isoformat() # Get today's date in ISO format
    pickle_file = f'data_dict_vanhove_{today}_{len(lagtimelist)}lagtimes.pkl' # Create the pickle file name

    os.chdir(savefolder)
    # Save the dictionary to a pickle file
    with open(pickle_file, 'wb') as f:
        pickle.dump(data_dict_vanhove, f)
        
    beep()

    #calculate_van_hove_slow = False

    end_time = time.time()
    execution_time_minutes = (end_time - start_time) / 60
    print("Execution time:", execution_time_minutes, "minutes")
else: # open saved file
    root = tk.Tk() # Create a Tkinter root window
    root.lift()
    root.withdraw()  # Hide the root window
    
    print('Look for the file dialogue window!')
    beep()
    beep()

    # Open the file dialog to select the pickle file
    file_path = filedialog.askopenfilename(filetypes=[("Pickle Files", "*.pkl")])

    # Check if a file was selected
    if file_path:
        # Process the selected pickle file
        with open(file_path, 'rb') as f:
            data_dict_vanhove = pickle.load(f)
        # Extract the variables from the dictionary
        filename_list = data_dict_vanhove['filename_list']
        binsequence = data_dict_vanhove['binsequence']
        trackshistlistlist = data_dict_vanhove['trackshistlistlist']
        trackshistlistlisty = data_dict_vanhove['trackshistlistlisty']
        kurtosisdflist = data_dict_vanhove['kurtosisdflist']
        kurtosisdflisty = data_dict_vanhove['kurtosisdflisty']
        gaussian_fit_paramsxlistlist = data_dict_vanhove['gaussian_fit_paramsxlistlist']
        gaussian_fit_paramsylistlist = data_dict_vanhove['gaussian_fit_paramsylistlist']
        gaussian_fit_covmxlistlist = data_dict_vanhove['gaussian_fit_covmxlistlist']
        gaussian_fit_covmylistlist = data_dict_vanhove['gaussian_fit_covmylistlist']
        lagtimelist = data_dict_vanhove['lagtimelist']  # in seconds
        maxlagtime = data_dict_vanhove['maxlagtime']  # in number of frames
        skip = data_dict_vanhove['skip']  # in number of frames
        scaling = data_dict_vanhove['scaling']  # in microns per pixel
        fps = data_dict_vanhove['fps']  # in Hz
        try:
            big_data_df = data_dict_vanhove['big_data_df']
        except KeyError:
            # Only a missing key means "older file"; any other error is a real
            # problem and is no longer hidden behind this message.
            print('Could not extract big_data_df; this must be an older file.')
        else:
            ## If big_data_df has duplicated columns, this will remove them.
            # Get a boolean mask of duplicated columns
            duplicate_mask = big_data_df.columns.duplicated()
            # Get the column indexes to keep (excluding duplicates)
            column_indexes_to_keep = [i for i in range(len(big_data_df.columns)) if not duplicate_mask[i]]
            # Create a new DataFrame with the selected columns
            big_data_df = big_data_df.iloc[:, column_indexes_to_keep]
        print('Loaded file', file_path)
    else:
        print("No file selected.")
        


In [None]:
big_gaussian_df.index.duplicated()

In [None]:
big_kurtosis_df.index.duplicated()

In [None]:
big_gaussian_df.index == big_kurtosis_df.index

In [None]:
with pd.option_context('display.multi_sparse', False):
    display(list(big_gaussian_df.index))

In [None]:
big_kurtosis_df.index

In [None]:
big_data_df

In [None]:
big_data_df.columns

In [None]:
# Select the desired time index for plotting
time_index = 2  # Replace with the appropriate index

# Get the time value corresponding to the chosen time index
time_value = lagtimelist[time_index] 

data = big_data_df[big_data_df['time (s)'] == time_value]

# Create a violin plot
sns.violinplot(x='Description', y='width sigma', data=data, color='skyblue')
sns.stripplot(x='Description', y='width sigma', data=data, color='black', jitter=True, size=5)

plt.xlabel('')
plt.ylabel('Sigma')
plt.title('Gaussian Sigma at Time ' + str(time_value) + ' s')
plt.xticks(rotation=90)

plt.show()


In [None]:
# Create lists for Gaussian widths
gaussian_widths = []
gaussian_widthsx = []
gaussian_widthsy = []

for file, gaussian_fit_paramsxlist, gaussian_fit_paramsylist in zip(shortfiles, gaussian_fit_paramsxlistlist, gaussian_fit_paramsylistlist):
    # Calculate the standard deviation (sigma) for each lag time
    sigmas = []
    sigmas_x = []
    sigmas_y = []
    for gaussian_fit_paramsx, gaussian_fit_paramsy in zip(gaussian_fit_paramsxlist, gaussian_fit_paramsylist):
        sigma_x = gaussian_fit_paramsx[2]
        sigma_y = gaussian_fit_paramsy[2]
        sigmas.append((sigma_x, sigma_y))
        sigmas_x.append(sigma_x)
        sigmas_y.append(sigma_y)
    gaussian_widths.append((file, sigmas))
    gaussian_widthsx.append((file, sigmas_x))
    gaussian_widthsy.append((file, sigmas_y))

# Select the desired time index for plotting
time_index = 1  # Replace with the appropriate index

# Extract the sigma values at the chosen time index
sigma_values_x = [width_tuple[1][time_index][0] for width_tuple in gaussian_widths]
sigma_values_y = [width_tuple[1][time_index][1] for width_tuple in gaussian_widths]

shorterfiles = [remove_number_parentheses(file) for file in shortfiles]

# Create a pandas DataFrame
sigmadf = pd.DataFrame({'Short Names': shorterfiles, 'Sigma_X': sigma_values_x, 'Sigma_Y': sigma_values_y})

# Get the time value corresponding to the chosen time index
time_value = lagtimelist[time_index] 

# Create a violin plot
sns.violinplot(x='Short Names', y='Sigma_X', data=sigmadf, color='skyblue')
sns.stripplot(x='Short Names', y='Sigma_X', data=sigmadf, color='black', jitter=True, size=5)

plt.xlabel('')
plt.ylabel('Sigma X')
plt.title('Gaussian Sigma X at Time ' + str(time_value) + ' s')
plt.xticks(rotation=90)

plt.show()

# Create a separate violin plot for Sigma Y
sns.violinplot(x='Short Names', y='Sigma_Y', data=sigmadf, color='skyblue')
sns.stripplot(x='Short Names', y='Sigma_Y', data=sigmadf, color='black', jitter=False, size=5)

plt.xlabel('')
plt.ylabel('Sigma Y')
plt.title('Gaussian Sigma Y at Time ' + str(time_value) + ' s')
plt.xticks(rotation=90)

plt.show()


In [None]:
# Combine Sigma_X and Sigma_Y values into a single column
combined_values = pd.melt(sigmadf, id_vars='Short Names', value_vars=['Sigma_X', 'Sigma_Y'], var_name='Variable', value_name='Sigma')

# Define the order for the horizontal axis (largest to smallest)
order = combined_values.groupby('Short Names')['Sigma'].mean().sort_values(ascending=False).index

# Create a violin plot for combined Sigma_X and Sigma_Y with the specified order
sns.violinplot(x='Short Names', y='Sigma', hue='Variable', data=combined_values, palette='muted', split=True, order=order)
sns.stripplot(x='Short Names', y='Sigma', hue='Variable', data=combined_values, order=order, palette='dark', dodge=True, jitter=True, size=5)

plt.xlabel('')
# Raw string: '\m' is not a valid escape sequence in a normal string literal and
# triggers a warning on recent Python versions. The text passed to matplotlib
# is unchanged.
plt.ylabel(r'Sigma ($\mu$m)')
plt.title('Gaussian Sigma at Time ' + str(time_value) + ' s')
plt.xticks(rotation=90)

plt.legend(title='Variable')

# Figures are written into the figure-export folder, prefixed with a timestamp.
os.chdir(r'C:\Users\vhorowit\Documents\fig-expt')
datestr = datestring()
if saving:
    savefigure(datestr + 'Gaussian width violin')
#plt.show()

In [None]:
# Set up the figures used to draw the previously calculated probability distribution functions:
#   fig  / axs  : grid of panels for the x direction
#   fig3 / axsy : grid of panels for the y direction
#   fig2 / ax2  : kurtosis, fig4 / ax4 : Gaussian width

show_curvefit = True
plots_per_row = 5

# Every per-file list has to line up with filename_list before anything is drawn.
for per_file_list in (trackshistlistlist, trackshistlistlisty, kurtosisdflist, kurtosisdflisty):
    assert len(filename_list) == len(per_file_list)

# Grid size: one panel per plotted lag time, plots_per_row panels wide.
numplots = int(maxlagtime / skip)
plots_per_column = int(math.ceil(numplots / plots_per_row))
figsize = [8, max(2, plots_per_column*1.5)]

fig, axs = plt.subplots(plots_per_column, plots_per_row, figsize=figsize, dpi=300)
fig.subplots_adjust(top=.96)
fig3, axsy = plt.subplots(plots_per_column, plots_per_row, figsize=figsize, dpi=300)
fig3.subplots_adjust(top=.96, hspace=0.4)  # hspace adjusts the spacing between subplot rows

# On every row except the last: blank the x tick labels while keeping the bottom ticks enabled.
for row_index in range(plots_per_column-1):
    for col_index in range(plots_per_row):
        for panel in (axs[row_index, col_index], axsy[row_index, col_index]):
            panel.set_xticklabels([])
            panel.tick_params(bottom=True, labelbottom=True)

fig.suptitle('Probability distribution in x direction')
fig3.suptitle('Probability distribution in y direction')

fig2, ax2 = plt.subplots(1, 1)
fig4, ax4 = plt.subplots(1, 1)

# Maps each base filename to its plot colour; filled in while plotting.
color_mapping = {}
if not show_curvefit:
    # Per-panel placeholders that receive the axis limits of the data-only plots.
    xlimx, xlimy = np.zeros_like(axs.flat), np.zeros_like(axs.flat)
    ylimx, ylimy = np.zeros_like(axsy.flat), np.zeros_like(axsy.flat)
# Counts how many distinct colours have been handed out so far.
i = 0

# Plot, for every file, the x/y displacement histograms (one panel per lag time), the optional
# Gaussian fits, the Gaussian widths (ax4) and the kurtosis curves (ax2).
#
# DeprecationWarnings are silenced only inside this block: catch_warnings() restores the
# previous filters on exit, so filters installed earlier in the notebook (such as the 'once'
# filter set next to the imports) are not wiped the way warnings.resetwarnings() would.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=DeprecationWarning) # Not my fault. There's some dependency.

    # Look the colour cycle up once instead of on every new base filename.
    cycle_colors = plt.rcParams["axes.prop_cycle"].by_key()["color"]

    for file, trackshistlist, trackshistlisty, kurtosisdf, kurtosisdfy, gaussian_fit_paramsxlist, gaussian_fit_paramsylist in tqdm(
        zip(shortfiles, trackshistlistlist, trackshistlistlisty, kurtosisdflist, kurtosisdflisty, gaussian_fit_paramsxlistlist, gaussian_fit_paramsylistlist),
        total=len(shortfiles),
    ):
        # Vary colors if base_filename is different. The trailing "(n)" replicate counter is
        # removed first (any number of digits) so replicates of one sample share a colour.
        base_filename = re.sub(r"\(\d+\)$", "", file).strip()
        if base_filename not in color_mapping:
            color_mapping[base_filename] = cycle_colors[i % len(cycle_colors)]
            i += 1
        color = color_mapping[base_filename]

        # j indexes the panel (one per lag time) within the flattened axes grids.
        for j, (vanhove_lagtime_insecs, trackshist, trackshisty, gaussian_fit_paramsx, gaussian_fit_paramsy) in enumerate(zip(
            lagtimelist, trackshistlist, trackshistlisty, gaussian_fit_paramsxlist, gaussian_fit_paramsylist
        )):
            if show_curvefit:
                try:
                    gaussian_fitx = gaussian(trackshist.index, gaussian_fit_paramsx[0],
                                             gaussian_fit_paramsx[1], gaussian_fit_paramsx[2])
                    gaussian_fity = gaussian(trackshisty.index, gaussian_fit_paramsy[0],
                                             gaussian_fit_paramsy[1], gaussian_fit_paramsy[2])

                    (axs.flat[j]).plot(trackshist.index, gaussian_fitx, color=color, linewidth=0.3, alpha=0.3)
                    (axsy.flat[j]).plot(trackshisty.index, gaussian_fity, color=color, linewidth=0.3, alpha=0.3)

                    try:
                        # Best effort: reuse axis limits recorded by an earlier run with
                        # show_curvefit = False. They may not exist in this kernel session.
                        (axs.flat[j]).set_xlim(xlimx[j])
                        (axs.flat[j]).set_ylim(ylimx[j])
                        (axsy.flat[j]).set_xlim(xlimy[j])
                        (axsy.flat[j]).set_ylim(ylimy[j])
                    except Exception:
                        pass
                except Exception as err:
                    # Keep plotting the data points even when a fit cannot be drawn, but say why.
                    # Catching Exception (not a bare except) lets Ctrl-C interrupt the cell.
                    print('Failed to show Gaussian fits.', repr(err))

            # Plot vanhove datapoints
            trackshist.plot(
                marker=".",
                markersize=1,
                linestyle="",
                ax=axs.flat[j],
                label=file + " x",
                color=color,
                alpha=0.3,
            )

            trackshisty.plot(
                marker=".",
                markersize=1,
                linestyle="",
                ax=axsy.flat[j],
                label=file + " y",
                color=color,
                alpha=0.3,
            )

            # Each panel takes its x ticks from its own histogram (x panel from the x data,
            # y panel from the y data).
            for ax, hist in ((axs.flat[j], trackshist), (axsy.flat[j], trackshisty)):
                ax.set_title("{:.2f} ms".format(1000 * vanhove_lagtime_insecs))
                ax.set_xticks([int(hist.index.min())+1, 0, int(hist.index.max())])
                ax.set_yscale("log")
            if not show_curvefit:
                # Record the data-only axis limits so a later run with fits can reuse them.
                xlimx[j] = axs.flat[j].get_xlim()
                ylimx[j] = axs.flat[j].get_ylim()
                xlimy[j] = axsy.flat[j].get_xlim()
                ylimy[j] = axsy.flat[j].get_ylim()

        # Gaussian widths (third fit parameter) versus lag time, x and y in the same colour
        ax4.plot(lagtimelist, np.array(gaussian_fit_paramsxlist)[:, 2], color=color, alpha = .5)
        ax4.plot(lagtimelist, np.array(gaussian_fit_paramsylist)[:, 2], color=color, alpha = .5)

        # Kurtosis
        kurtosisdf.set_index("time (s)").plot(ax=ax2, color=color, label=file + " x", alpha = .3)
        kurtosisdfy.set_index("time (s)").plot(ax=ax2, color=color, label=file + " y", alpha = .3)

# Date prefix reused by the figure-saving cell.
datestr = datestring()

# Axis labels for the Gaussian-width (ax4) and kurtosis (ax2) figures.
plt.sca(ax4)
plt.xlabel('Time (s)')
plt.ylabel('Gaussian width (um)')
plt.sca(ax2)
plt.xlabel('Time (s)')
plt.ylabel("Kurtosis")

fig3.tight_layout()
fig.tight_layout()

# Consolidate the figure legend to just the unique filenames (not counting (1), (2), (3))

handles, legend_labels = ax2.get_legend_handles_labels()
short_labels = []    # unique legend labels, in first-seen order
handle_mapping = {}  # legend label -> first plotted handle carrying that label
for handle, label in zip(handles, legend_labels):
    match = re.search(r'(.+?) \(\d+\) kurtosis', label)  # Extract the short file names using regular expressions
    # Labels that do not follow the "<name> (<n>) kurtosis" pattern are kept verbatim. They are
    # registered in handle_mapping as well, so the handle lookup below cannot raise KeyError.
    short_label = match.group(1) if match else label
    if short_label not in handle_mapping:
        handle_mapping[short_label] = handle
        short_labels.append(short_label)

# Create new Line2D handles with solid lines and no alpha, reusing each original colour and width
new_handles = [
    mlines.Line2D(
        [], [],
        color=handle_mapping[short_label].get_color(),
        linestyle='-',
        linewidth=handle_mapping[short_label].get_linewidth(),
    )
    for short_label in short_labels
]

# Set the modified handles and labels
ax2.legend(handles=new_handles, labels=short_labels, bbox_to_anchor=(1, 1), loc='upper left')  # Move the legend to the right

beep()

In [None]:
saving = True

if saving:
    ## Save figures
    # Each figure is made current before savefigure() is called with its name
    # (presumably savefigure() writes the current figure -- confirm against its definition).
    figures_to_save = (
        (fig, 'PDx'),
        (fig2, 'kurtosis'),
        (fig3, 'PDy'),
        (fig4, 'Gaussian width'),
    )
    for target_figure, figure_name in figures_to_save:
        plt.figure(target_figure)
        savefigure(datestr + figure_name)

In [None]:

row = 2  # Select the desired row (index label) of each kurtosis DataFrame

shortnames = []  # List to store the modified short names
kurtosis_values = []  # List to store the kurtosis values

# x-direction frames first, then y-direction frames, all pooled into the same lists.
for kurtosisdf in kurtosisdflist + kurtosisdflisty:
    shortname = kurtosisdf.columns[1]  # Extract the short name from the column name
    # Remove the "(n) kurtosis" suffix; \d+ also covers replicate numbers of 10 and above.
    shortname = re.sub(r'\(\d+\)\s*kurtosis', '', shortname)
    # Second column of the selected row. .iloc makes the positional lookup explicit, because
    # integer keys on a string-labelled Series are deprecated as positions in pandas.
    kurtosis_value = kurtosisdf.loc[row].iloc[1]

    shortnames.append(shortname)
    kurtosis_values.append(kurtosis_value)

# Create plot
plt.scatter(shortnames, kurtosis_values)
plt.xlabel('Short Names')
plt.ylabel('Kurtosis')
plt.title('Kurtosis Values at Time ' + str(kurtosisdflist[0].loc[row].iloc[0]) + ' s')  # Use the time from the first DataFrame
plt.xticks(rotation=90)  # Rotate x-axis labels if needed
plt.show()


In [None]:
# Combine the shortnames and kurtosis_values into a list of tuples
data = list(zip(shortnames, kurtosis_values))

# Sort the list of tuples based on the kurtosis values
sorted_data = sorted(data, key=lambda x: x[1])  # Sort by the second element of each tuple

# Unzip the sorted list of tuples back into separate (now sorted) sequences
shortnames, kurtosis_values = zip(*sorted_data)

# Both plots below share one title. The time is read from the first DataFrame, first column of
# the selected row; .iloc makes that positional lookup explicit and avoids relying on a loop
# variable left over from another cell.
kurtosis_title = 'Kurtosis Values at Time ' + str(kurtosisdflist[0].loc[row].iloc[0]) + ' s'

# Create a scatter plot of the sorted values
plt.scatter(shortnames, kurtosis_values)
#plt.xlabel('Short Names')
plt.ylabel('Kurtosis')
plt.title(kurtosis_title)
plt.xticks(rotation=90)  # Rotate x-axis labels if needed
plt.show()

# Create a violin plot
data_df = pd.DataFrame({'Short Names': shortnames, 'Kurtosis': kurtosis_values})

sns.violinplot(x='Short Names', y='Kurtosis', data=data_df, color='skyblue')
sns.stripplot(x='Short Names', y='Kurtosis', data=data_df, color='black', jitter=True, size=5)

plt.xlabel('')
plt.ylabel('Kurtosis')
plt.title(kurtosis_title)
plt.xticks(rotation=90);

os.chdir(savefolder)
# NOTE(review): this overrides the notebook-wide `saving` flag for every cell run afterwards.
saving = False
if saving:
    datestr = datestring()
    savefigure(datestr + 'kurtosis violin plot')


In [None]:
row = 2  # Select the desired row (index label) of each kurtosis DataFrame

shortnamesx = []  # List to store the modified short names (x direction)
shortnamesy=[]  # List to store the modified short names (y direction)
kurtosis_xvalues = []  # List to store the x kurtosis values
kurtosis_yvalues = []  # List to store the y kurtosis values

# Same extraction for both directions: walk each list of DataFrames and fill its pair of lists.
for direction_dflist, direction_names, direction_values in (
    (kurtosisdflist, shortnamesx, kurtosis_xvalues),
    (kurtosisdflisty, shortnamesy, kurtosis_yvalues),
):
    for kurtosisdf in direction_dflist:
        shortname = kurtosisdf.columns[1]  # Extract the short name from the column name
        # Remove the "(n) kurtosis" suffix; \d+ also covers replicate numbers of 10 and above.
        shortname = re.sub(r'\(\d+\)\s*kurtosis', '', shortname)
        # Second column of the selected row. .iloc makes the positional lookup explicit, because
        # integer keys on a string-labelled Series are deprecated as positions in pandas.
        kurtosis_value = kurtosisdf.loc[row].iloc[1]

        direction_names.append(shortname)
        direction_values.append(kurtosis_value)

# Create a scatter plot: x-direction values first, then y-direction values in the next colour
plt.scatter(shortnamesx, kurtosis_xvalues)
plt.scatter(shortnamesy, kurtosis_yvalues)
#plt.xlabel('Short Names')
plt.ylabel('Kurtosis')
# Time taken from the first DataFrame, as in the other kurtosis plots of this notebook.
plt.title('Kurtosis Values at Time ' + str(kurtosisdflist[0].loc[row].iloc[0]) + ' s')
plt.xticks(rotation=90);  # Rotate x-axis labels