In [2]:
import glob
import os
import re

import numpy as np
import pandas as pd

# Locations of the count files to read and of the summary CSV to write
input_dir = '/Users/trollj01/PycharmProjects/pythonProject1/20250527_SGE_FinalScripts/Jurkat_Valine/Jurkat_Valine_Counts/output_csvs'
output_csv = '/Users/trollj01/PycharmProjects/pythonProject1/20250527_SGE_FinalScripts/Jurkat_Valine/Jurkat_Valine_SummaryOutputs/Jurkat_Val_RelativeReadCounts.csv'

# Create the folder that will hold the summary up front (no error if it already exists)
output_dir = os.path.dirname(output_csv)
os.makedirs(output_dir, exist_ok=True)

# Collect every entry in input_dir whose name starts with 'Counts_BS', in sorted order
count_glob = os.path.join(input_dir, 'Counts_BS*')
input_files = glob.glob(count_glob)
input_files.sort()

# Promoter names, and the same names with the '-MTS' suffix appended
promoters = ['Ef1a', 'PGK']
mts_promoters = [name + '-MTS' for name in promoters]

# CDS names; list order determines the column order of the summary
cds = [
    'GFP',
    'ilvA', 'ilvA mut 1', 'ilvA mut 2',
    'ilvG', 'ilvM',
    'ilvB', 'ilvN',
    'ilvC', 'ilvD',
]

# Accumulates one summary row (a dict) per input file
csv_data = []

def _longer_name_guard(name, all_names):
    """Build a negative lookahead that stops *name* matching a longer name.

    Some names are prefixes of others ('ilvA' vs 'ilvA mut 1'), so an
    unanchored search for the short name also hits the long one.  The
    returned fragment, placed directly after *name* in a pattern, rejects any
    tail that would turn *name* into another entry of *all_names*.

    Args:
        name: The name the pattern is looking for.
        all_names: Every name of the same kind, including *name* itself.

    Returns:
        A regex negative-lookahead fragment, or '' when no other entry of
        *all_names* starts with *name*.
    """
    tails = [
        re.escape(other[len(name):])
        for other in all_names
        if other != name and other.startswith(name)
    ]
    if not tails:
        return ""
    return f"(?!{'|'.join(tails)})"


# Pair each promoter with the lookahead its patterns must end with: a plain
# promoter must not be followed by '-MTS' (that is the MTS variant's name),
# while the MTS variants need no such guard.  Plain promoters come first so
# the summary columns keep their order.
promoter_guards = [(p, "(?!-MTS)") for p in promoters] + [(p, "") for p in mts_promoters]

# Per-CDS lookahead that keeps e.g. 'ilvA' from also matching 'ilvA mut 1'.
cd_guards = {cd: _longer_name_guard(cd, cds) for cd in cds}

# For every count file: read it, report its depth, and record the share of
# reads (in percent of all reads in the file) for each promoter/CDS pair.
for file_path in input_files:
    # The first 13 lines of each file are skipped and the columns are named here.
    df = pd.read_csv(file_path, skiprows=13, names=['BC Number', 'BC Name', 'Sequence', 'Counts'])
    total_reads = df['Counts'].sum()
    # Scale factor turning raw counts into percent of total reads; 0 avoids
    # a division by zero for files without reads.
    norm = 100 / total_reads if total_reads > 0 else 0
    quality = 'Above_50000' if total_reads >= 50000 else 'Below_50000'

    print(f"File: {file_path}, Total Reads: {total_reads}, Normalization Value: {norm:.6f}, Read Quality: {quality}")

    # Filter for usable barcodes: drop "Unassigned" rows and keep only names
    # containing a promoter joined by '__'.  na=False treats missing names as
    # non-matching instead of propagating NaN into the boolean mask.
    df_filtered = df[
        (~df['BC Name'].str.contains("Unassigned", na=False)) &
        (df['BC Name'].str.contains("Ef1a__|PGK__|CMV__|TightTRE__|NoProm__|Ef1a-MTS__|PGK-MTS__|CMV-MTS__|TightTRE-MTS__|NoProm-MTS__|__Ef1a|__PGK|__CMV|__TightTRE|__NoProm|__Ef1a-MTS|__PGK-MTS", na=False))
    ]
    bc_names = df_filtered['BC Name']

    sum_TU = {}

    for promoter, mts_guard in promoter_guards:
        for cd in cds:
            # Names may list the CDS first or the promoter first.
            cd_first = f"{re.escape(cd)}__{re.escape(promoter)}{mts_guard}"
            # When the CDS comes last, it must not continue into a longer
            # CDS name (e.g. ' mut 1').
            promoter_first = f"{re.escape(promoter)}__{re.escape(cd)}{cd_guards[cd]}{mts_guard}"
            match1 = bc_names.str.contains(cd_first, regex=True, na=False)
            match2 = bc_names.str.contains(promoter_first, regex=True, na=False)
            # OR the masks so a row is counted once even if both patterns hit.
            sum_TU[f'{promoter}_{cd}'] = df_filtered.loc[match1 | match2, 'Counts'].sum() * norm

    # Add row to output
    row = {'File': os.path.basename(file_path), 'Read_Quality': quality}
    row.update(sum_TU)
    csv_data.append(row)

# Assemble the per-file rows into one table and save it without the index column
output_df = pd.DataFrame(data=csv_data)
output_df.to_csv(path_or_buf=output_csv, index=False)
print(f"Summary saved to: {output_csv}")



File: /Users/trollj01/PycharmProjects/pythonProject1/20250527_SGE_FinalScripts/Jurkat_Valine/Jurkat_Valine_Counts/output_csvs/Counts_BS27976A, Total Reads: 238296.0, Normalization Value: 0.000420, Read Quality: Above_50000
File: /Users/trollj01/PycharmProjects/pythonProject1/20250527_SGE_FinalScripts/Jurkat_Valine/Jurkat_Valine_Counts/output_csvs/Counts_BS27977A, Total Reads: 184117.0, Normalization Value: 0.000543, Read Quality: Above_50000
File: /Users/trollj01/PycharmProjects/pythonProject1/20250527_SGE_FinalScripts/Jurkat_Valine/Jurkat_Valine_Counts/output_csvs/Counts_BS27978A, Total Reads: 624092.0, Normalization Value: 0.000160, Read Quality: Above_50000
File: /Users/trollj01/PycharmProjects/pythonProject1/20250527_SGE_FinalScripts/Jurkat_Valine/Jurkat_Valine_Counts/output_csvs/Counts_BS27979A, Total Reads: 377707.0, Normalization Value: 0.000265, Read Quality: Above_50000
File: /Users/trollj01/PycharmProjects/pythonProject1/20250527_SGE_FinalScripts/Jurkat_Valine/Jurkat_Valine_C