SciVis2023 Full Dataset Processing Pipeline

In [98]:
## Step 1: Split per-sensor input files into manageable chuncks

import os

input_dir = "viz-stimulus"  # "input"       # Directory containing the input files
output_dir = "output-stimulus"   #-lesion     # Directory to store the output chunks
chunk_size = 300          # Number of lines in each chunk (300 lines per 30 seconds)
line_interval = 100       # Sampling interval in milliseconds (100ms interval)
sensor_files = 50000      # Total number of input sensor files

def get_chunk_filename(sensor_id, begin_ms):
    return f"{output_dir}/simstep_{sensor_id}_{begin_ms}.csv"

def preprocess_file(input_file):
    #sensor_id = int(input_file.split(".")[0].split("_")[1], 16) # Convert hexadecimal to integer
    sensor_id = input_file.split(".")[0].split(".")[0].split("_")[1]
    line_count = 0

    with open('simsplit.log', 'a') as log_file:
        print(f"sensor_id: {sensor_id}", file=log_file)

        with open(input_file, 'r') as infile:
            chunk_lines = []
            for line in infile:
                chunk_lines.append(line.strip())
                line_count += 1
                if line_count % chunk_size == 0:
                    begin_ms = line_count * line_interval
                    chunk_filename = get_chunk_filename(sensor_id, begin_ms)

                    print(f"writing: {chunk_filename}", file=log_file)

                    with open(chunk_filename, 'w') as outfile:
                        outfile.write('\n'.join(chunk_lines))
                    chunk_lines.clear()

            # Save any remaining lines as a last chunk
            if chunk_lines:
                begin_ms = line_count * line_interval
                chunk_filename = get_chunk_filename(sensor_id, begin_ms)
                with open(chunk_filename, 'w') as outfile:
                    outfile.write('\n'.join(chunk_lines))
            
    
if __name__ == "__main__":
    os.makedirs(output_dir, exist_ok=True)

    print("Beginnng splitting process. Writing log file: simsplit.log"
    # Process each input file in the input directory
    for i in range(sensor_files):
        input_file = os.path.join(input_dir, f"0_{i}.csv")
        with open('simsplit.log', 'a') as log_file:
            print(f"Visiting: {input_file}",file=log_file)
        # break # stop after first file to testing
        preprocess_file(input_file)
        

In [None]:
# average fine grain data to chuncks
import os

input_dir = "output-stimulus"     #-lesion      # Directory containing the output chunks
output_dir = "averages-stimulus"  #-lesion   # Directory to store the averaged data
chunk_size = 100          # Number of lines in each chunk (100 lines per 10 seconds)
line_interval = 100       # Sampling interval in milliseconds (100ms interval)

def get_output_filename(sensor_id, begin_ms):
    return f"{output_dir}/averaged_{sensor_id}_{begin_ms}.csv"

def aggregate_data(input_file):
    sensor_id = int(input_file.split(".")[0].split("_")[1])
    begin_ms = int(input_file.split(".")[0].split("_")[2])
    output_filename = get_output_filename(sensor_id, begin_ms)

    with open(input_file, 'r') as infile:
        chunk_lines = []
        aggregated_data = []
        for line in infile:
            chunk_lines.append(line.strip())
            if len(chunk_lines) == chunk_size:
                avg_data = [sum(values) / len(values) for values in zip(*[map(float, line.split(";")) for line in chunk_lines])]
                avg_data[0] = begin_ms
                aggregated_data.append(avg_data)
                chunk_lines.clear()

        # Save the aggregated data to the output file
        with open(output_filename, 'w') as outfile:
            for idx, avg_data in enumerate(aggregated_data):
                if idx == 1:
                    avg_data[0] += 10000
                elif idx == 2:
                    avg_data[0] += 20000
                outfile.write(";".join(map(str, avg_data)))
                outfile.write("\n")
                begin_ms += line_interval

if __name__ == "__main__":
    os.makedirs(output_dir, exist_ok=True)

    print("Beginnng data averaging process. Writing log file: simavg.log")
    
    # Process each file in the "output" directory
    for file in os.listdir(input_dir):
        with open('simavg.log', 'a') as log_file:
            print(f"Visiting: {file}",file=log_file)
        if file.endswith(".csv"):
            input_file = os.path.join(input_dir, file)
            aggregate_data(input_file)

    print("Completed data averaging process. See log file: simavg.log")


Beginnng data averaging process. Writing log file: simavg.log


In [139]:
import os

input_dir = "averages-lesion" #"-calcium" #"-stimulus" #"-lesion"     # Directory containing the input files
output_dir = "aggregates-lesion" #"-calcium" #-stimulus" #"-lesion"  # Directory to store the aggregated data
sensor_files = 50000       # Total number of sensor files
timestamps = range(0, 970000, 30000)  # Timestamps from 0 to 970000 in steps of 30000

def get_output_filename(timestamp):
    return f"{output_dir}/simagg_{timestamp}.csv"

def process_file(input_file):
    sensor_id = int(input_file.split(".")[0].split("_")[1])
    try:
        with open(input_file, 'r') as infile:
            lines = infile.readlines()[:3]  # Process only the first 3 lines of each input file
            for idx in range(0, len(lines)):
                line = lines[idx].strip()
                timestamp = line.split(";")[0]
                output_filename = get_output_filename(timestamp)

                values = line.split(";")[1:]  # Extracting the 12 values after the first timestamp
                values_rounded = [round(float(value), 4) for value in values]  # Round each value to 4 decimal digits
                values_str = ";".join(map(str, values_rounded))

                if not os.path.exists(output_filename):
                    # Write the headings on the first line if the file doesn't exist
                    with open(output_filename, 'a') as outfile:
                        outfile.write("SensorID;" + ";".join(["Step", "Fired", "FiredFraction", "ElectricActivity",
                                                             "Inhibition", "Calcium", "TargetCalcium", "SynapticInput",
                                                             "BackgroundActivity", "GrownAxons", "ConnectedAxons",
                                                             "GrownDendrites", "ConnectedDendrites"]) + "\n")

                with open(output_filename, 'a') as outfile:
                    outfile.write(f"{sensor_id};{values_str}\n")

                with open('simagg.log', 'a') as log_file:
                    print(f"Appending line: {sensor_id};{values_str} to file: {output_filename}",file=log_file)

    except FileNotFoundError:
        with open('simagg.log', 'a') as log_file:
            print(f"File not found: {input_file}",file=log_file)

if __name__ == "__main__":
    os.makedirs(output_dir, exist_ok=True)

    print("Beginnng data aggregation process. Writing log file: simagg.log")

    # Process each file in the "averages" directory
    for i in range(sensor_files):
        for timestamp in timestamps:
            input_file = os.path.join(input_dir, f"averaged_{i}_{timestamp}.csv")
            with open('simagg.log', 'a') as log_file:
                print(f"Visiting: {input_file}",file=log_file)
            process_file(input_file)
    print("Completed data aggregation process. See log file: simagg.log")


Beginnng data aggregation process. Writing log file: simagg.log
Completed data aggregation process. See log file: simagg.log


In [150]:
# Merge Aggregates and Positions

import os
import csv

input_dir = "aggregates-stimulus" #"-lesion" #"-stimulus" #"-lesion"     # Directory containing the input files
output_dir = "merged-stimulus" #"-lesion" #"-calcium" #-stimulus" #"-lesion"  # Directory to store the aggregated data

#aggr_header = []
def merge_files():
    with open("positions.csv", 'r') as positions_file:
        positions_reader = csv.reader(positions_file)
        header = next(positions_reader)

        for filename in os.listdir(input_dir): #"-stimulus"):
            if filename.endswith(".csv"):
                timestamp = filename.split("_")[1].split(".")[0]
                print(f"Opening aggregates file: {filename}")

                with open(os.path.join(input_dir, filename), 'r') as aggregate_file:
                    aggregate_reader = csv.reader(aggregate_file)
                    aggr_header = next(aggregate_reader)  # Skip header
                    print(f"{filename} header: {aggr_header}")

                    merged_filename = os.path.join(output_dir, f"merged_data_{timestamp}.csv")
                    with open(merged_filename, 'w', newline='') as merged_file:
                        merged_writer = csv.writer(merged_file)
                        new_header = aggr_header[0].split(";")[0:]
                        new_header = header  + new_header[2:]
                        print(new_header) #[1] = header[1]
                        merged_writer.writerow(new_header) ##[1] + aggr_header[2])  ##  ["Fired", "Calcium"])

                        # Reset positions_reader to the beginning of the file
                        positions_file.seek(0)
                        next(positions_reader)  # Skip header

                        for pos_row, aggr_row_text in zip(positions_reader, aggregate_reader):
                            aggr_row = aggr_row_text[0].split(";")
                            sensor_id = aggr_row[0]
                            #print(f"Sensor ID: {sensor_id}")
                            #print(f"pos_row: {pos_row}")
                            #print(f"aggr_row: {aggr_row}")

                            data_to_write = pos_row[0:] + aggr_row[1:]#, aggr_row[2], aggr_row[3], aggr_row[4], aggr_row[5], aggr_row[6], aggr_row[7], aggr_row[8], aggr_row[9], aggr_row[10], aggr_row[11], aggr_row[12]]
                            #print(f"Data to write: {data_to_write}")
                            merged_writer.writerow(data_to_write)
                    #break # stop after first output file

if __name__ == "__main__":
    os.makedirs(output_dir, exist_ok=True)
    merge_files()


Opening aggregates file: simagg_190000.csv
simagg_190000.csv header: ['SensorID;Step;Fired;FiredFraction;ElectricActivity;Inhibition;Calcium;TargetCalcium;SynapticInput;BackgroundActivity;GrownAxons;ConnectedAxons;GrownDendrites;ConnectedDendrites']
['label', 'anatomy', '', '', 'Fired', 'FiredFraction', 'ElectricActivity', 'Inhibition', 'Calcium', 'TargetCalcium', 'SynapticInput', 'BackgroundActivity', 'GrownAxons', 'ConnectedAxons', 'GrownDendrites', 'ConnectedDendrites']
Opening aggregates file: simagg_310000.csv
simagg_310000.csv header: ['SensorID;Step;Fired;FiredFraction;ElectricActivity;Inhibition;Calcium;TargetCalcium;SynapticInput;BackgroundActivity;GrownAxons;ConnectedAxons;GrownDendrites;ConnectedDendrites']
['label', 'anatomy', '', '', 'Fired', 'FiredFraction', 'ElectricActivity', 'Inhibition', 'Calcium', 'TargetCalcium', 'SynapticInput', 'BackgroundActivity', 'GrownAxons', 'ConnectedAxons', 'GrownDendrites', 'ConnectedDendrites']
Opening aggregates file: simagg_520000.csv
s

In [168]:
# Final Merge With UMAP and Normalize
import pandas as pd
import numpy as np
import os
import re

# input_merged_dir = "merged-lesion" #"-calcium" # inpuddir -stimulus" #"
# input_umap_dir = "posinclust-lesion" #"viz-calcium-network/posinclust" #"viz-stimulus-network/posinclust" -stimulus" #"
# output_dir= "final-merged-lesion-natgrid" #"final-merged-stimulus-grid"
input_merged_dir = "merged-stimulus" #
input_umap_dir = "posinclust-stimulus" #
output_dir= "final-merged-stimulus-natgrid"


# Function to normalize values in columns
def normalize_column(col, min_value, max_value):
    # Replace NaN and Inf with specific values or handle them as needed
    col = col.fillna(0)  # Example: filling NaN with 0
    col = col.replace([np.inf, -np.inf], 0)  # Example: replacing infinite values with 0
    min_col = col.min()
    max_col = col.max()
    with open('simmergefin.log', 'a') as log_file:
        #print(f"sensor_id: {sensor_id}", file=log_file)
        print(min_col, max_col, file=log_file)
        print((col - min_col) / (max_col - min_col) * (max_value - min_value) + min_value, file=log_file)
    normalized_col = ((col - min_col) / (max_col - min_col) * (max_value - min_value) + min_value).astype(int)
    return normalized_col

# Function to post-process Clustering columns
def post_process_clustering(col):
    col_max = col.max()
    # Iterate through the range repeatedly until no more unused or solitary values are found
    for value in range(1, col_max + 1):
        count = (col == value).sum()
        if count == 0 or count == 1:
            missing_value = value
            col[col == col_max] = missing_value
            col_max -= 1
    return col
#The function iterates through the range of possible values and looks for any values that are missing or occur only once. When such a value is found, all instances of the maximum value in the column are replaced with that missing value, and col_max is decremented accordingly.
#This updated code should solve the problem without recursion and without a TypeError. You can use this function in the previous code to replace the existing post-processing function.

print("Beginnng final data merge and normalization process. Writing log file: simmergefin.log")
              
# Iterate through files in the posinclust folder
for filename in os.listdir(input_umap_dir):
    match = re.match(r'positions_(\d+)_(in|out)\.csv', filename)
    if match:
        timestamp, direction = match.groups()
        positions_umap_path = f'{input_umap_dir}/{filename}'
        #positions_isomap_path =  f'{input_isomap_dir}/{filename}'
        merged_path = f'{input_merged_dir}/merged_data_{timestamp}.csv'
        final_path = f'{output_dir}/final_merged_data_{timestamp}_{direction}.csv'

        print(f'Merging {positions_umap_path} and {merged_path} into {final_path}...')

        # Read the positions file
        positions_umap_data = pd.read_csv(positions_umap_path)
        positions_umap_data.columns = ["umap", "", "", "UMAPClustering","GridUMAPClustering"]
#         try:
#             positions_isomap_data = pd.read_csv(positions_isomap_path)
#         except FileNotFoundError:
#             print(f"File {positions_isomap_path} not found. Skipping to next file.")
#             continue  # Skip to next loop iteration
            
#         positions_isomap_data.columns = ["isomap", "", "", "IsomapGridClustering","IsomapClustering"]

        #Check and adjust "*Clustering" columns if there are zero values
        if 0 in positions_umap_data["UMAPClustering"]:
            positions_umap_data["UMAPClustering"] += 1
        if 0 in positions_umap_data["GridUMAPClustering"]:
            positions_umap_data["GridUMAPClustering"] += 1
        # if 0 in positions_isomap_data["IsomapGridClustering"]:
        #     positions_isomap_data["IsomapGridClustering"] += 1
        # if 0 in positions_isomap_data["IsomapClustering"]:
        #     positions_isomap_data["IsomapClustering"] += 1

        
        # Read the merged file, ignore header, and assign proper columns
        try:
            merged_data = pd.read_csv(merged_path) #, header=None, skiprows=1)
        except FileNotFoundError:
            print(f"File {merged_path} not found. Skipping to next file.")
            continue  # Skip to next loop iteration

        #merged_data.columns = ["label", "anatomy", "", "", "FiredClustering", "CalciumClustering"]

        # Append "Heatmaps" to first mirged column names after the 4th# Append "Clustering" to column names after the 4th
        new_columns = [col + "Heatmap" if idx > 3 else col for idx, col in enumerate(merged_data.columns)]
        new_columns[2] = ''
        new_columns[3] = ''
        
                
        # Assign the new column names to the DataFrame
        merged_data.columns = new_columns
        
        uncluster = []
        # Normalize values in columns 5 and 6 and all after in initial merge
        for col_name in new_columns[4:]: # + new_columns[11:]:  #[("FiredClustering", 9), ("CalciumClustering", 12)]:
            #col = merged_data[col_name]
            col = merged_data[col_name]
            #max_value = max(len(set(col)),20)
            if len(set(col)) < 10:
                max_value = len(set(col))
            else:
                max_value = 10
                
            if len(set(col)) > 1:
                normalized_col = normalize_column(col, 1, max_value)
            else:
                uncluster.append(col_name)
        
            if 0 in normalized_col:
                col += 1
            #normalized_col = normalize_column(col, 1, max_value)
            #normalized_col = post_process_clustering(normalized_col)
            merged_data[col_name] = normalized_col

        #new_col_name = col_name.replace("Clustering", "")
        #merged_data.rename(columns={col_name: new_col_name}, inplace=True)
        
        #for col_name in uncluster:
        with open('simmergefin.log', 'a') as log_file:
            print("uncluster",uncluster,file=log_file)
        merged_data.drop(columns=uncluster, inplace=True, errors='ignore')

        # Concatenate the merged_data and positions_data to place positions_data in columns 7-10
        final_data = pd.concat([merged_data, positions_umap_data], axis=1)

        # DONT Apply post-processing to all Clustering columns
        #for col_name in final_data.columns:
        #    if "Clustering" in col_name:
        #        final_data[col_name] = post_process_clustering(final_data[col_name])

        # Save the final data
        os.makedirs(output_dir, exist_ok=True)
        final_data.to_csv(final_path, index=False)

        
        with open('simmergefin.log', 'a') as log_file:
            print(f'Merged {positions_umap_path} and {merged_path} into {final_path}',file=log_file)
        print(f'Merged {positions_umap_path} and {merged_path} into {final_path}')

        
        # break # Stop after first file
print("Completed final data merge and normalization process. Wrote log file: simmergefin.log")

Beginnng final data merge and normalization process. Writing log file: simmergefin.log
Merging posinclust-stimulus/positions_230000_out.csv and merged-stimulus/merged_data_230000.csv into final-merged-stimulus-natgrid/final_merged_data_230000_out.csv...
Merged posinclust-stimulus/positions_230000_out.csv and merged-stimulus/merged_data_230000.csv into final-merged-stimulus-natgrid/final_merged_data_230000_out.csv
Merging posinclust-stimulus/positions_290000_in.csv and merged-stimulus/merged_data_290000.csv into final-merged-stimulus-natgrid/final_merged_data_290000_in.csv...
Merged posinclust-stimulus/positions_290000_in.csv and merged-stimulus/merged_data_290000.csv into final-merged-stimulus-natgrid/final_merged_data_290000_in.csv
Merging posinclust-stimulus/positions_810000_out.csv and merged-stimulus/merged_data_810000.csv into final-merged-stimulus-natgrid/final_merged_data_810000_out.csv...
Merged posinclust-stimulus/positions_810000_out.csv and merged-stimulus/merged_data_810000

In [132]:
# Final Merge With UMAP and and Isomap and Normalize
import pandas as pd
import numpy as np
import os
import re

input_merged_dir = "merged-calcium"
input_umap_dir = "viz-calcium-network/posinclust" #"viz-stimulus-network/posinclust"
input_isomap_dir = "Isomap2/calcium-posinclust/"
output_dir= "final-merged-calcium"

# Function to normalize values in columns
def normalize_column(col, min_value, max_value):
    # Replace NaN and Inf with specific values or handle them as needed
    col = col.fillna(0)  # Example: filling NaN with 0
    col = col.replace([np.inf, -np.inf], 0)  # Example: replacing infinite values with 0
    min_col = col.min()
    max_col = col.max()
    with open('simmergefin.log', 'a') as log_file:
        #print(f"sensor_id: {sensor_id}", file=log_file)
        print(min_col, max_col, file=log_file)
        print((col - min_col) / (max_col - min_col) * (max_value - min_value) + min_value, file=log_file)
    normalized_col = ((col - min_col) / (max_col - min_col) * (max_value - min_value) + min_value).astype(int)
    return normalized_col

# Function to post-process Clustering columns
def post_process_clustering(col):
    col_max = col.max()
    # Iterate through the range repeatedly until no more unused or solitary values are found
    for value in range(1, col_max + 1):
        count = (col == value).sum()
        if count == 0 or count == 1:
            missing_value = value
            col[col == col_max] = missing_value
            col_max -= 1
    return col
#The function iterates through the range of possible values and looks for any values that are missing or occur only once. When such a value is found, all instances of the maximum value in the column are replaced with that missing value, and col_max is decremented accordingly.
#This updated code should solve the problem without recursion and without a TypeError. You can use this function in the previous code to replace the existing post-processing function.

print("Beginnng final data merge and normalization process. Writing log file: simmergefin.log")
              
# Iterate through files in the posinclust folder
for filename in os.listdir(input_umap_dir):
    match = re.match(r'positions_(\d+)_(in|out)\.csv', filename)
    if match:
        timestamp, direction = match.groups()
        positions_umap_path = f'{input_umap_dir}/{filename}'
        positions_isomap_path =  f'{input_isomap_dir}/{filename}'
        merged_path = f'{input_merged_dir}/merged_data_{timestamp}.csv'
        final_path = f'{output_dir}/final_merged_data_{timestamp}_{direction}.csv'

        print(f'Merging {positions_umap_path}, {positions_isomap_path} and {merged_path} into {final_path}...')

        # Read the positions file
        positions_umap_data = pd.read_csv(positions_umap_path)
        positions_umap_data.columns = ["umap", "", "", "UMAPClustering","GridUMAPClustering"]
        try:
            positions_isomap_data = pd.read_csv(positions_isomap_path)
        except FileNotFoundError:
            print(f"File {positions_isomap_path} not found. Skipping to next file.")
            continue  # Skip to next loop iteration
            
        positions_isomap_data.columns = ["isomap", "", "", "IsomapGridClustering","IsomapClustering"]

        # Check and adjust "*Clustering" columns if there are zero values
        # if 0 in positions_umap_data["UMAPClustering"]:
        #     positions_umap_data["UMAPClustering"] += 1
        # if 0 in positions_umap_data["GridUMAPClustering"]:
        #     positions_umap_data["GridUMAPClustering"] += 1
        # if 0 in positions_isomap_data["IsomapGridClustering"]:
        #     positions_isomap_data["IsomapGridClustering"] += 1
        # if 0 in positions_isomap_data["IsomapClustering"]:
        #     positions_isomap_data["IsomapClustering"] += 1

        
        # Read the merged file, ignore header, and assign proper columns
        try:
            merged_data = pd.read_csv(merged_path) #, header=None, skiprows=1)
        except FileNotFoundError:
            print(f"File {merged_path} not found. Skipping to next file.")
            continue  # Skip to next loop iteration

        #merged_data.columns = ["label", "anatomy", "", "", "FiredClustering", "CalciumClustering"]

        # Append "Heatmaps" to first mirged column names after the 4th# Append "Clustering" to column names after the 4th
        new_columns = [col + "Heatmap" if idx > 3 else col for idx, col in enumerate(merged_data.columns)]
        new_columns[2] = ''
        new_columns[3] = ''
        
                
        # Assign the new column names to the DataFrame
        merged_data.columns = new_columns
        
        uncluster = []
        # Normalize values in columns 5 and 6 and all after in initial merge
        for col_name in new_columns[4:]: # + new_columns[11:]:  #[("FiredClustering", 9), ("CalciumClustering", 12)]:
            #col = merged_data[col_name]
            col = merged_data[col_name]
            #max_value = max(len(set(col)),20)
            if len(set(col)) < 10:
                max_value = len(set(col))
            else:
                max_value = 10
                
            if len(set(col)) > 1:
                normalized_col = normalize_column(col, 1, max_value)
            else:
                uncluster.append(col_name)
        
            if 0 in normalized_col:
                col += 1
            #normalized_col = normalize_column(col, 1, max_value)
            #normalized_col = post_process_clustering(normalized_col)
            merged_data[col_name] = normalized_col

        #new_col_name = col_name.replace("Clustering", "")
        #merged_data.rename(columns={col_name: new_col_name}, inplace=True)
        
        #for col_name in uncluster:
        with open('simmergefin.log', 'a') as log_file:
            print("uncluster",uncluster,file=log_file)
        merged_data.drop(columns=uncluster, inplace=True, errors='ignore')

        # Concatenate the merged_data and positions_data to place positions_data in columns 7-10
        final_data = pd.concat([merged_data, positions_umap_data, positions_isomap_data], axis=1)

        # DONT Apply post-processing to all Clustering columns
        #for col_name in final_data.columns:
        #    if "Clustering" in col_name:
        #        final_data[col_name] = post_process_clustering(final_data[col_name])

        # Save the final data
        os.makedirs(output_dir, exist_ok=True)
        final_data.to_csv(final_path, index=False)

        
        with open('simmergefin.log', 'a') as log_file:
            print(f'Merged {positions_umap_path}, {positions_isomap_path} and {merged_path} into {final_path}',file=log_file)
        print(f'Merged {positions_umap_path},{positions_isomap_path} and {merged_path} into {final_path}')

        
        # break # Stop after first file
print("Completed final data merge and normalization process. Wrote log file: simmergefin.log")


Beginnng final data merge and normalization process. Writing log file: simmergefin.log
Merging viz-calcium-network/posinclust/positions_230000_out.csv, Isomap2/calcium-posinclust//positions_230000_out.csv and merged-calcium/merged_data_230000.csv into final-merged-calcium/final_merged_data_230000_out.csv...
Merged viz-calcium-network/posinclust/positions_230000_out.csv,Isomap2/calcium-posinclust//positions_230000_out.csv and merged-calcium/merged_data_230000.csv into final-merged-calcium/final_merged_data_230000_out.csv
Merging viz-calcium-network/posinclust/positions_290000_in.csv, Isomap2/calcium-posinclust//positions_290000_in.csv and merged-calcium/merged_data_290000.csv into final-merged-calcium/final_merged_data_290000_in.csv...
Merged viz-calcium-network/posinclust/positions_290000_in.csv,Isomap2/calcium-posinclust//positions_290000_in.csv and merged-calcium/merged_data_290000.csv into final-merged-calcium/final_merged_data_290000_in.csv
Merging viz-calcium-network/posinclust/po

In [None]:
max(5,7)

In [172]:
## Clean up cluster labels final merged files 

import pandas as pd
import os

# Path to the directory where the output files from the first script are saved
# output_directory = 'final-merged-lesion-natgrid' #finalmergedcalcium #stimulus-natgrid'
# clean_directory = 'clean_final-lesion-natgrid' #-stimulus-natgrid'
# output_prefix = "lesion"  #"stimulus" #
output_directory = 'final-merged-stimulus-natgrid'
clean_directory = 'clean_final-stimulus-natgrid'
output_prefix = "stimulus" #

def post_process_clustering(col):
    col = col.copy()
    col_max = col.max()
    was_dirty = False
    max_count = (col == col_max).sum()
    print("max_count",max_count)
    if max_count == 1:
        col[col == col_max] = col_max-1
        #col_max -= 1
    value = 1
    #for value in range(1, col_max + 1:
    while value < col_max:
        count = (col == value).sum()
        if count == 0 or count == 1:
            print(f"value dirty: {value}")
            was_dirty = True
            col[col == col_max] = value
            print("col_max",col_max)
            col_max -= 1
        value += 1
    return col, was_dirty

# Path to the directory where the cleaned files will be saved
if not os.path.exists(clean_directory):
    os.makedirs(clean_directory)

# Loop through the files in the output directory
for filename in os.listdir(output_directory):
    if filename.startswith('final_merged_data_'):
        print(f"Processing file: {filename}")  # Debug statement

        # Load the file
        #file_path = os.path.join(output_directory, filename)
        #data = pd.read_csv(file_path, header=0)
        # Read the header separately
        file_path = os.path.join(output_directory, filename)
        with open(file_path, 'r') as f:
            header_line = f.readline().strip()

        # Read the data without the header
        data = pd.read_csv(file_path, header=None, skiprows=1)
        data.columns = header_line.split(',')

        # Post-process Clustering columns
        for col_name in data.columns:
            if "Clustering" in col_name or "Heatmap" in col_name:
                print(f"  Processing column: {col_name}")  # Debug statement
                was_dirty = True
                iteration = 0
                #while was_dirty :
                while was_dirty and iteration < 12:  #data[col_name].max():
                    print(f"    Iteration: {iteration}")  # Debug statement
                    cleaned_col, was_dirty = post_process_clustering(data[col_name])
                    data[col_name] = cleaned_col
                    iteration += 1

                
        # Split the filename by the underscore character
        parts = filename.split('_')

        # Change the first part to "lesion"
        parts[0] = output_prefix #"stimulus" #"lesion"

        # Join the parts back together with underscores
        new_filename = '_'.join(parts)

        # Save the cleaned data to the new directory
        cleaned_file_path = os.path.join(clean_directory, new_filename)
        data.to_csv(cleaned_file_path, index=False)

        print(f"  File {filename} processed and saved.")  # Debug statement
                  
        #break

print("Post-processing complete, cleaned files saved in", clean_directory)


Processing file: final_merged_data_360000_out.csv
  Processing column: FiredHeatmap
    Iteration: 0
max_count 1
  Processing column: FiredFractionHeatmap
    Iteration: 0
max_count 2
  Processing column: ElectricActivityHeatmap
    Iteration: 0
max_count 1
  Processing column: InhibitionHeatmap
    Iteration: 0
max_count 1
  Processing column: CalciumHeatmap
    Iteration: 0
max_count 1
  Processing column: SynapticInputHeatmap
    Iteration: 0
max_count 1
  Processing column: BackgroundActivityHeatmap
    Iteration: 0
max_count 1
  Processing column: GrownAxonsHeatmap
    Iteration: 0
max_count 1
  Processing column: ConnectedAxonsHeatmap
    Iteration: 0
max_count 1
  Processing column: GrownDendritesHeatmap
    Iteration: 0
max_count 1
  Processing column: ConnectedDendritesHeatmap
    Iteration: 0
max_count 13545
  Processing column: UMAPClustering
    Iteration: 0
max_count 14
  Processing column: GridUMAPClustering
    Iteration: 0
max_count 2
  File final_merged_data_360000_out

In [171]:
# For the lesion dataset add 100000 to all nodes on disabled list to mark them with boxes
# Also remove all columns with unvarying data
import pandas as pd
import os

# Read disabled IDs
with open('disabled.txt', 'r') as f:
    disabled_ids = f.read().splitlines()
disabled_ids = [int(id) for id in disabled_ids]

# Path to the directory containing the files
#path = 'clean_final/save_lesions/'
path = 'clean_final-lesion-natgrid'

# Iterate over all files in the directory
for filename in os.listdir(path):
    if filename.endswith('.csv'):  # Check that the file is a CSV
        
        
        file_path = os.path.join(path, filename)
        #file_path = os.path.join(output_directory, filename)
        with open(file_path, 'r') as f:
            header_line = f.readline().strip()

        # Read the data without the header
        df = pd.read_csv(file_path, header=None, skiprows=1)
        df.columns = header_line.split(',')
        #df = pd.read_csv(file_path)

        # If the first column value is in the disabled list, add 100000 to that value
        df.iloc[:, 0] = df.iloc[:, 0].apply(lambda x: x + 100000 if x in disabled_ids else x)

        # Drop the columns where all values are the same
        df = df.loc[:, df.nunique() != 1]

        # Save the modified dataframe back to the file
        df.to_csv(file_path, index=False)


In [173]:
# For the stimulus dataset add 100000 to all nodes on stimulus list to mark them with boxes
# Also remove all columns with unvarying data
import pandas as pd
import os

# Read disabled IDs
with open('stimulus.txt', 'r') as f:
    stimulus_ids = f.read().splitlines()
stimulus_ids = [int(id) for id in stimulus_ids]

# Path to the directory containing the files
#path = 'clean_final/save_stimulus/'
path = 'clean_final-stimulus-natgrid'

# Iterate over all files in the directory
for filename in os.listdir(path):
    if filename.endswith('.csv'):  # Check that the file is a CSV
        
        
        file_path = os.path.join(path, filename)
        #file_path = os.path.join(output_directory, filename)
        with open(file_path, 'r') as f:
            header_line = f.readline().strip()

        # Read the data without the header
        df = pd.read_csv(file_path, header=None, skiprows=1)
        df.columns = header_line.split(',')
        #df = pd.read_csv(file_path)

        # If the first column value is in the disabled list, add 100000 to that value
        df.iloc[:, 0] = df.iloc[:, 0].apply(lambda x: x + 100000 if x in stimulus_ids else x)

        # Drop the columns where all values are the same
        df = df.loc[:, df.nunique() != 2]

        # Save the modified dataframe back to the file
        df.to_csv(file_path, index=False)


In [130]:
# For the calcium and phase1 burn-in dataset just remove all columns with unvarying data
# Debug version below
import pandas as pd
import os


# Path to the directory containing the files
path = 'final-merged-calcium' #'clean_final_calcium/'

# Iterate over all files in the directory
for filename in os.listdir(path):
    if filename.endswith('.csv'):  # Check that the file is a CSV
               
        file_path = os.path.join(path, filename)
        #file_path = os.path.join(output_directory, filename)
        with open(file_path, 'r') as f:
            header_line = f.readline().strip()

        # Read the data without the header
        df = pd.read_csv(file_path, header=None, skiprows=1)
        df.columns = header_line.split(',')
        #df = pd.read_csv(file_path)

        # Drop the columns where all values are the same
        df = df.loc[:, df.nunique() != 2]

        # Save the modified dataframe back to the file
        df.to_csv(file_path, index=False)


In [126]:
import os
import pandas as pd

def find_csv_files(path, depth):
    if depth == 0:
        return
    for filename in os.listdir(path):
        full_path = os.path.join(path, filename)
        if os.path.isdir(full_path):
            find_csv_files(full_path, depth - 1)
        elif filename.endswith('.csv'):
            df = pd.read_csv(full_path)
            if df.iloc[:, 4].nunique() > 1 or df.iloc[:, 4].unique()[0] != 1:
                print(full_path)

find_csv_files('umap2', 4)


IndexError: single positional indexer is out-of-bounds

In [129]:
import pandas as pd
import os

# Path to the directory containing the files
path = 'final-merged-calcium' #'clean_final_calcium/'

# Iterate over all files in the directory
for filename in os.listdir(path):
    if filename.endswith('.csv'):  # Check that the file is a CSV
        print(f"Processing file: {filename}")  # DEBUG PRINT
        file_path = os.path.join(path, filename)

        # Read header from file
        with open(file_path, 'r') as f:
            header_line = f.readline().strip()
            print(f"Header: {header_line}")  # DEBUG PRINT

        # Read the data without the header
        df = pd.read_csv(file_path, header=None, skiprows=1)
        df.columns = header_line.split(',')

        # Print unique value counts for each column before dropping
        for col in df.columns:
            print(f"{col} unique count: {df[col].nunique()}")  # DEBUG PRINT

        # Drop the columns where all values are the same
        df = df.loc[:, df.nunique() != 1]

        # Print unique value counts for each column after dropping
        for col in df.columns:
            print(f"{col} unique count: {df[col].nunique()}")  # DEBUG PRINT

        # Save the modified dataframe back to the file
        df.to_csv(file_path, index=False)

        print(f"Finished processing file: {filename}\n")  # DEBUG PRINT


Processing file: final_merged_data_360000_out.csv
Header: label,anatomy,,,FiredHeatmap,FiredFractionHeatmap,ElectricActivityHeatmap,InhibitionHeatmap,CalciumHeatmap,TargetCalciumHeatmap,SynapticInputHeatmap,BackgroundActivityHeatmap,GrownAxonsHeatmap,ConnectedAxonsHeatmap,GrownDendritesHeatmap,ConnectedDendritesHeatmap,umap,,,UMAPClustering,GridUMAPClustering,isomap,,
label unique count: 50000
anatomy unique count: 49965
 unique count:     49958
    49952
    49703
    49726
    45937
    45884
dtype: int64
 unique count:     49958
    49952
    49703
    49726
    45937
    45884
dtype: int64
FiredHeatmap unique count: 10
FiredFractionHeatmap unique count: 10
ElectricActivityHeatmap unique count: 10
InhibitionHeatmap unique count: 10
CalciumHeatmap unique count: 10
TargetCalciumHeatmap unique count: 10
SynapticInputHeatmap unique count: 10
BackgroundActivityHeatmap unique count: 10
GrownAxonsHeatmap unique count: 10
ConnectedAxonsHeatmap unique count: 10
GrownDendritesHeatmap unique c