# Organize summary data for AR=1.3 clusters

This notebook takes the outputs from `extract_data_summary_all.ipynb` and `fig_clustering_dendrogram.ipynb` and produces an organized dataframe for statistical analysis of the behavioral attributes of the AR=1.3 clusters.

In [1]:
from pathlib import Path
import pandas as pd

import pickle

In [2]:
from pp_utils.core import generate_data_path_dict
from pp_utils import utils

## Set paths, load main info df and clusters

In [3]:
# Root of the processed data, relative to this notebook's folder
processed_root = Path("../data_processed/")

# Lookup of data locations; indexed by name below (e.g. data_path["main"])
data_path = generate_data_path_dict(processed_root)

In [4]:
# Folder holding the summary files this notebook reads, and later writes to
output_path = data_path["main"] / "data_summary"

# Clustering result saved as a pickle. Unpickling can execute arbitrary code,
# so only load files that were produced by this project.
cluster_pickle = output_path / "clustering_7_rostrum_only.pickle"
with open(cluster_pickle, "rb") as pickle_file:
    # Iterated cluster by cluster further down
    # (presumably one list of fname_prefix values per cluster -- TODO confirm)
    cluster_fnameprefix_7 = pickle.load(pickle_file)

### Load compiled trial info

In [5]:
# Compiled trial info; the first CSV column is used as the dataframe index
summary_csv = output_path / "data_summary_all.csv"
df_summary = pd.read_csv(summary_csv, index_col=0)

## Sort clusters

### 7 clusters

In [6]:
# Clean up clusters
# 0: TC-straight, 1: TC-very curved, 2: TC-curved (need to change sequence to: 0-2-1)
# 3: CT-very curved, 4: CT-curved, 6: CT-straight (need to change sequence to: 6-4-3)
#
# Build one dataframe per cluster, kept in the original cluster order.
# NOTE(review): utils.sort_df_in_cluster presumably returns the rows of
# df_summary matching the given list of fname_prefix values -- confirm in pp_utils.
#
# A comprehension is used so that no loop temporaries are left behind in the
# notebook namespace; in particular the name `df_sel` stays reserved for the
# final selection that is exported at the end of the notebook.
df_cluster_7 = [
    utils.sort_df_in_cluster(fp_list, df_summary)
    for fp_list in cluster_fnameprefix_7
]

In [7]:
# Change sequence: pick the clusters in the wanted order.
# Index 5 is not listed, so that cluster is left out of the cleaned list.
new_seq = [0, 2, 1, 6, 4, 3]
df_cluster_clean_7 = [df_cluster_7[cluster_idx] for cluster_idx in new_seq]

In [8]:
# Sanity check that we've swapped things correctly:
# print the cluster sizes before (first line) and after (second line) reordering
for cluster_frames in (df_cluster_7, df_cluster_clean_7):
    print([len(frame) for frame in cluster_frames])

[16, 9, 16, 6, 8, 1, 23]
[16, 16, 9, 23, 8, 6]


### Build dataframe

In [9]:
# Cluster names, listed in the same order as `new_seq` / `df_cluster_clean_7`
cluster_names = [
    "TC_Straight",  # original cluster 0 (TC-straight)
    "TC_Curved-1",  # original cluster 2 (TC-curved)
    "TC_Curved-2",  # original cluster 1 (TC-very curved)
    "CT_Straight",  # original cluster 6 (CT-straight)
    "CT_Curved-1",  # original cluster 4 (CT-curved)
    "CT_Curved-2",  # original cluster 3 (CT-very curved)
]

In [10]:
# Fill in new columns
# Label every trial in df_summary with the name of the cluster it belongs to,
# or "X" when its fname_prefix is found in none of the retained clusters.
assert len(cluster_names) == len(df_cluster_clean_7), "expected one name per cluster"

# Build the fname_prefix -> cluster name lookup once, instead of rescanning
# every cluster dataframe for every trial. setdefault keeps the first cluster
# that lists a prefix, which matches searching the clusters front to back and
# stopping at the first hit.
prefix_to_cluster = {}
for cluster_name, df_cluster in zip(cluster_names, df_cluster_clean_7):
    for fname_prefix in df_cluster["fname_prefix"]:
        prefix_to_cluster.setdefault(fname_prefix, cluster_name)

# One label per row of df_summary, in row order. Reading the column directly
# (rather than looking rows up by index label) also works when index labels repeat.
cluster_all = [
    prefix_to_cluster.get(fname_prefix, "X")
    for fname_prefix in df_summary["fname_prefix"]
]

In [11]:
# Sanity check: the number of labels must equal the expected number of trials
n_trials_expected = 247  # trial count this notebook was written against
assert len(cluster_all) == n_trials_expected, (
    f"expected {n_trials_expected} cluster labels, got {len(cluster_all)}"
)

In [12]:
# Attach the cluster labels as a new "cluster" column.
# This modifies df_summary in place; cluster_all is a plain list, so labels
# are matched to rows by position.
df_summary["cluster"] = cluster_all

In [13]:
# Keep only the trials that were assigned to a cluster (label other than "X")
is_clustered = df_summary["cluster"].ne("X")
df_sel = df_summary.loc[is_clustered]

In [14]:
# Number of trials that belong to one of the retained clusters
df_sel.shape[0]

78

In [15]:
# Sanity check: no unassigned ("X") trials remain in the selection
assert not df_sel["cluster"].eq("X").any()

### Export to CSV

In [16]:
# Write the cluster-labelled trials into the same folder as the input summary
cluster_csv = output_path / "data_summary_cluster_only.csv"
df_sel.to_csv(cluster_csv)