In [10]:
cd

/Users/keith_tetrad


In [16]:
import pandas as pd
from itertools import combinations
import numpy as np

class DataProcessor:
    def __init__(self):
        self.codon_variant = None

    def load_codon_variants(self, file_path):
        self.codon_variant = pd.read_csv(file_path, usecols=['barcode','library','codon_substitutions','aa_substitutions','n_codon_substitutions','n_aa_substitutions'])

    def calculate_functional_score(self, pre_selection, pre_selection_name, post_selection, post_selection_name, library, count_threshold):
        pre_selection = pre_selection.rename(columns={'count': 'count_pre'})
        post_selection = post_selection.rename(columns={'count': 'count_post'})
        
        pre_selection = pre_selection[(pre_selection['count_pre'] >= count_threshold)]
        pre_selection = pd.merge(pre_selection, self.codon_variant, on='barcode')
        post_selection = pd.merge(post_selection, self.codon_variant, on='barcode')
        
        pre_selection = pre_selection[pre_selection['library'] == library]
        post_selection = post_selection[post_selection['library'] == library]

        merged_df = pd.merge(pre_selection, post_selection[['barcode', 'count_post']], on='barcode', how='left').fillna(0)
        
        wt_barcodes = merged_df[merged_df['n_codon_substitutions'] == 0]
        pre_wt_counts = wt_barcodes['count_pre'].sum()
        post_wt_counts = wt_barcodes['count_post'].sum()

        merged_df = merged_df[merged_df['n_codon_substitutions'] > 0]
        merged_df['count_post'] = merged_df['count_post'].replace(0, 0.5).astype(float)
        merged_df['func_score'] = np.log2((merged_df['count_post'] / post_wt_counts) / (merged_df['count_pre'] / pre_wt_counts))
        
        merged_df = merged_df.rename(columns={
            'count_pre': f'{pre_selection_name}_count',
            'count_post': f'{post_selection_name}_count'
        })
        return merged_df

    def merge_dataframes(self, dataframes_dict):
        merged_group_dict = {}
        for (key1, df1), (key2, df2) in combinations(dataframes_dict.items(), 2):
            df1_renamed = df1.rename(columns={'func_score': f'{key1}_func_score'})
            df2_renamed = df2.rename(columns={'func_score': f'{key2}_func_score'})
            merged_df = pd.merge(df1_renamed, df2_renamed, on=['barcode', 'library', 'aa_substitutions', 'n_aa_substitutions','codon_substitutions', 'n_codon_substitutions'], how='outer')
            merged_df = merged_df.dropna(subset=[f'{key1}_func_score', f'{key2}_func_score'])
            merged_df['func_score_avg'] = merged_df[[f'{key1}_func_score', f'{key2}_func_score']].mean(axis=1)
            merged_df = merged_df.sort_values(by='func_score_avg').reset_index(drop=True)
            merged_group_dict[f'{key1}-and-{key2}'] = merged_df
        return merged_group_dict

    def automate_functional_score_calculations(self, sample_paths, comparison_pairs, library, count_threshold):
        samples = {name: pd.read_csv(path) for name, path in sample_paths.items()}
        func_scores = {}
        for (pre_name, post_name), comparison_name in comparison_pairs.items():
            func_scores[comparison_name] = self.calculate_functional_score(samples[pre_name], pre_name, samples[post_name], post_name, library, count_threshold)
        return self.merge_dataframes(func_scores)


In [23]:
data_processor = DataProcessor()
data_processor.load_codon_variants('/Users/keith_tetrad/kw_dms/results/variants/codon_variants.csv')

sample_paths = {
    'DMSO_B1T1': 'kw_dms/old_results/barcode_counts/LibB-231017-DMSO_bio1-1_counts.csv',
    '4U8C_B1T1': 'kw_dms/old_results/barcode_counts/LibB-231017-4u8c_bio1-1_counts.csv',
    'DMSO_B2T2': 'kw_dms/old_results/barcode_counts/LibB-231017-DMSO_bio2-1_counts.csv',
    '4u8c_B2T2': 'kw_dms/old_results/barcode_counts/LibB-231017-4u8c_bio2-2_counts.csv'
    # Add other sample paths
}


comparison_pairs = {
    ('4u8c_B1T1','DMSO_B1T1'): 'Comparison1',
    ()
    # Add other comparisons
}

func_score_comparisons = data_processor.automate_functional_score_calculations(sample_paths, comparison_pairs, 'LibB', 10)


Processing: Comparison1 with 4u8c_B1T1 and DMSO_B1T1


In [24]:
func_score_comparisons

{}