In [9]:
import pandas as pd
from utils import parse_name_igabasnfr
import numpy as np
import os

# Ingest the screening csv and save the processed dataframe as a pkl,
# printing basic "number of construct" stats along the way.

# (QC) minimum # pixels in variant to be considered responsive
min_responsive_px = 0

nAP = '10'
# 'ilastik-segmented-csvs' or 'responsivepx-segmented-csvs'
segmentation_type = 'ilastik-segmented-csvs'
csv_dir = r'./data/{seg}/good-{n_ap}AP-all-omit.csv'.format(seg=segmentation_type, n_ap=nAP)

# Column names applied to the csv in file order; the csv's own header row is
# discarded (header=0 together with names=...).
# NOTE(review): "DF/F_pval 2" has a space where its siblings use "_". It is kept
# unchanged because the name ends up in the saved pickle -- confirm before renaming.
col_names = [
    "construct", "plateID",
    "SNR_norm", "SNR_pval_1", "SNR_pval_2", "SNR_unnorm",
    "DF/F_norm", "DF/F_pval_1", "DF/F_pval 2", "DF/F_unnorm",
    "F0_norm", "F0_pval_1", "F0_pval_2", "F0_unnorm",
    "tau_on_norm", "tau_on_pval_1", "tau_on_pval_2", "tau_on_unnorm",
    "tau_off_norm", "tau_off_pval_1", "tau_off_pval_2", "tau_off_unnorm",
    "bleach_norm", "bleach_pval_1", "bleach_pval_2", "bleach_unnorm",
    "num pixels", "num wells", "texture",
    "ctrl num pixels", "ctrl pdi", "ctrl DF/F", "isbad",
]

df = pd.read_csv(csv_dir, names=col_names, header=0)
print('Ingested {}AP csv'.format(nAP))

# Parse every construct name once, then fan the parsed fields out into columns.
construct_info = [parse_name_igabasnfr(name) for name in df['construct']]
# (new column, index of that field within each parsed entry), in insertion order
parsed_fields = (
    ('construct_id', 4),
    ('orig_letter', 0),
    ('new_letter', 1),
    ('pos', 2),
    ('n_mutations', 3),
    ('mutation_str', 5),
)
for column, position in parsed_fields:
    df[column] = [info[position] for info in construct_info]
print('Total # constructs screened: {}'.format(df.shape[0]))

# Rows with NaN in SNR_norm are discarded; per the original note these are rows
# that had < 10(?) responsive pixels per plate.
df = df.dropna(subset=['SNR_norm'], axis='index')

# normalized num pixels = num pixels (construct) / num pixels (control)
df['num pixels_norm'] = df['num pixels'].div(df['ctrl num pixels'])

print('Filtered (> X responsive pixels) # constructs screened: {}'.format(df.shape[0]))

# QC: remove rows whose pixel count is strictly below the threshold.
# NOTE(review): rows with exactly min_responsive_px pixels are kept, so the
# message below is really ">=" rather than ">".
below_threshold = df.index[df['num pixels'] < min_responsive_px]
df.drop(index=below_threshold, inplace=True)

print('QC: Remaining {} constructs have > {} pixels / plate'.format(df.shape[0], min_responsive_px))

# Collapse the per-row screening data into one row per unique mutation
# (keyed by 'mutation_str'), separate out mis-assembled clones, add a few
# derived flags, and pickle the result next to the input csv.

id_cols = ['mutation_str', 'construct', 'n_mutations', 'construct_id', 'orig_letter', 'new_letter', 'pos', 'plateID']
median_cols = [c for c in df.columns if c not in id_cols]  # all columns that will be median'd

# Build the grouping once and reuse it for every per-mutation aggregate.
by_mutation = df.groupby('mutation_str')
df_grouped = by_mutation[median_cols].median()

# for plate ids, and construct ids, keep all in a list
plate_ids = by_mutation['plateID'].apply(list)
construct_ids = by_mutation['construct_id'].apply(list)

# Number of rows per mutation. NOTE(review): this equals the number of plates
# only if each mutation appears at most once per plate -- confirm.
df_grouped['num_plates'] = by_mutation.size()
df_grouped['num_wells'] = by_mutation['num wells'].sum()
df_grouped['plate_ids'] = plate_ids
df_grouped['construct_ids'] = construct_ids
# These fields are taken from the first row of each group.
mutation_cols = ['orig_letter', 'new_letter', 'pos', 'n_mutations']
df_grouped[mutation_cols] = by_mutation[mutation_cols].first()

print('# unique mutations screened: ' + str(df_grouped.shape[0]))


# look at mis-assembled clones (containing STOP codon)
# NOTE(review): the comparison is against the list ['STOP'], so it only matches
# if parse_name_igabasnfr yields new_letter as a list -- confirm.
# A typed boolean array keeps this a row mask even when df_grouped is empty
# (an empty plain list would be read as an empty column selection).
stop_mask = np.array([bool(f == ['STOP']) for f in df_grouped['new_letter']], dtype=bool)
df_misassembled = df_grouped.loc[stop_mask]
print('# misassembled mutations: {}'.format(df_misassembled.shape[0]))
# df_grouped carries list-valued and letter columns that cannot be averaged;
# restrict the statistics to numeric columns so they do not raise.
df_misassembled_means = df_misassembled.mean(numeric_only=True)
df_misassembled_std = df_misassembled.std(numeric_only=True)
# do something with these? baseline unresponsive data?

# is_combo: 1 if combo mutant. 0 if single mutant
df_grouped['is_combo_int'] = (df_grouped['n_mutations'] > 1).astype(int)

# drop the mis-assembled clones
df_grouped.drop(labels=df_misassembled.index, axis='index', inplace=True)

# add negative-going column (0: pos-going, 1: neg-going, according to DF/F_unnorm)
df_grouped['negative-going'] = (df_grouped['DF/F_unnorm'] < 0).astype(int)

print('# properly assembled mutations: {}'.format(df_grouped.shape[0]))
# Swap only the file extension. A plain str.replace would touch every '.csv'
# substring in the path and, if there were none, write the pickle over the csv.
pkl_path = os.path.splitext(csv_dir)[0] + '.pkl'
df_grouped.to_pickle(pkl_path)
print('saved df_grouped pickle')



Ingested 10AP csv
Total # constructs screened: 517
Filtered (> X responsive pixels) # constructs screened: 490
QC: Remaining 490 constructs have > 0 pixels / plate
# unique mutations screened: 321
# misassembled mutations: 0
# properly assembled mutations: 321
saved df_grouped pickle
