In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
codontab = {
    'TCA': 'S', 'TCC': 'S', 'TCG': 'S', 'TCT': 'S',    # serine
    'TTC': 'F', 'TTT': 'F',    # Fenilalanine
    'TTA': 'L', 'TTG': 'L',    # Leucine
    'TAC': 'Y', 'TAT': 'Y',    # Tirosine
    'TAA': '*', 'TAG': '*',    # Stop
    'TGC': 'C', 'TGT': 'C',    # Cisteine
    'TGA': '*',    # Stop
    'TGG': 'W',    # Triptofane
    'CTA': 'L', 'CTC': 'L', 'CTG': 'L', 'CTT': 'L',    # Leucine
    'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P',    # Proline
    'CAC': 'H', 'CAT': 'H',    # Histidine
    'CAA': 'Q', 'CAG': 'Q',    # Glutamine
    'CGA': 'R', 'CGC': 'R',    # arginine
    'CGG': 'R', 'CGT': 'R',    # arginine
    'ATA': 'I', 'ATC': 'I', 'ATT': 'I',    # Isoleucine
    'ATG': 'M',    # Methionine
    'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACT': 'T',    # Treonine
    'AAC': 'N', 'AAT': 'N',    # asparagine
    'AAA': 'K', 'AAG': 'K',    # lisine
    'AGC': 'S', 'AGT': 'S',    # serine
    'AGA': 'R', 'AGG': 'R',    # arginine
    'GTA': 'V', 'GTC': 'V', 'GTG': 'V', 'GTT': 'V',    # valine
    'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A',    # alanine
    'GAC': 'D', 'GAT': 'D',    # Aspartic Acid
    'GAA': 'E', 'GAG': 'E',    # Glutamic Acid
    'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGT': 'G'     # glicine
}

In [3]:
# map the DNA variants to the amino acid changes
def map_dna_to_aa(dna_seq):
    if dna_seq is None or pd.isnull(dna_seq):
        return np.nan
    aa_seq = []
    for i in range(0, len(dna_seq), 3):
        codon = dna_seq[i:i+3]
        if codon in codontab:
            aa = codontab[codon]
            if aa == '*':
                break
            aa_seq.append(aa)
        else:
            print(f"Unknown codon: {codon}")
            aa_seq.append('X')
    if aa == "*" and i < len(dna_seq) - 3:
        print(f"Stop codon encountered at position {i} in sequence ({i//3} AA in sequence).")
    aa_seq.append('*')
        #print(f"Warning: Sequence is longer than expected. Remaining sequence: {dna_seq[i:]}")
    return ''.join(aa_seq)

In [4]:
def apply_dna_mutations(mutations, offset=0):
    # Create a list to hold the mutated sequence
    mutated_seq = list(refseq)
    
    # Apply the mutation
    for mut in mutations.split('_'):
        ref_bp, pos, new_bp = mut[0], mut[1:-1], mut[-1]
        if 'DEL' in mut or 'INS' in mut:
            pos = mut[1:-3]
        pos = int(pos) - 1  # Convert to 0-based index
        # if a barcode was included at the start, then remove that
        pos -= offset
        if pos >= len(refseq):
            print(f"WARNING: {pos = } > {len(refseq) = } for '{mutations}'. Skipping")
            return np.nan

        if refseq[pos] != ref_bp:
            print(f"WARNING: '{refseq[pos]}' != '{ref_bp}'. "
                  f"Reference base  does not match at position {pos + 1}. Mutations: {mutations}")
            return ''.join(mutated_seq)
        # assert refseq[pos] == ref_bp, f"'{refseq[pos]}' != '{ref_bp}'. Reference base  does not match at position {pos + 1}. Mutations: {mutations}"

        try:
            # need to handle deletions and insertions
            # e.g., T551DEL, C651INS
            if "DEL" in mut:
                # Deletion: remove the base
                mutated_seq.pop(pos)
            elif "INS" in mut:
                # Insertion: copy the current base (it always accompanies a substitution mutation)
                # e.g., C651INS, C651T
                mutated_seq.insert(pos, ref_bp)
            else:
                # Substitution: replace the base
                mutated_seq[pos] = new_bp
        except IndexError:
            print(f"Error when handling mutations: '{mutations}'")
            return np.nan

    return ''.join(mutated_seq)

In [5]:
def get_aa_mutations(ref_seq, mut_seq):
    """ List the mutations between two amino acid sequences. """
    # Convert the sequences to lists of amino acids
    ref_aa = list(ref_seq)
    mut_aa = list(mut_seq)

    # TODO better handle insertions and deletions
    if len(ref_aa) != len(mut_aa):
        return np.nan
    
    # Find the positions of the mutations
    mutations = []
    for i in range(len(mut_aa)):
        if ref_aa[i] != mut_aa[i]:
            mutations += [f"{ref_aa[i]}{i+1}{mut_aa[i]}"]
            #mutations.append((i+1, ref_aa[i], mut_aa[i]))
    
    return '_'.join(mutations)

In [8]:
date = "251030"
base_dir = Path(f"/projects/bpms/jlaw/projects/other/levseq/runs/{date}")
for run_dir in base_dir.glob("*"):
    var_file = run_dir / "outputs/variants.csv"
    print(var_file)
    df = pd.read_csv(var_file)

    print(len(df["refseq"].unique()))
    refseq = df["refseq"].unique()[0]
    # print(len(df["refseq"].unique()), len(refseq))
    offset = 0
    print(df.head(2))
    aa_refseq = map_dna_to_aa(refseq)
    # print(aa_refseq)

    df_mut = df[df["P value"] < 0.05]
    print(f"{len(df) = }, {len(df_mut) = }")
    # print(df_mut.Variant.value_counts())

    print("Applying mutations")
    df_mut["AA_seq"] = df_mut.Variant.apply(lambda x: map_dna_to_aa(apply_dna_mutations(x, offset=offset)))
    df_mut = df_mut.dropna(subset="AA_seq")
    df_mut = df_mut[~df_mut.AA_seq.str.contains("X")]
    print(f"{len(df_mut) = }")

    print("Extracting AA mutations")
    df_mut["AA_mutations"] = df_mut.AA_seq.apply(lambda x: get_aa_mutations(aa_refseq, x))

    print("")
    print(run_dir.name)
    print(df_mut.AA_mutations.value_counts())
    rename_file = str(var_file).replace("variants.csv", f"{run_dir.name}_levseq_variants.csv")
    df.to_csv(rename_file, index=False)
    out_file = str(rename_file).replace(".csv", "_aa_mut.csv")
    print(out_file)
    df_mut.to_csv(out_file, index=False)
    break

/projects/bpms/jlaw/projects/other/levseq/runs/251030/ref2/outputs/variants.csv
1
   barcode_plate    name                                             refseq  \
0              4  W4VM3D  ATGGACCGTCAGCAGATTGAACAGGTTGTTAAAGCAGTTCTGGCAG...   
1              4  W4VM3D  ATGGACCGTCAGCAGATTGAACAGGTTGTTAAAGCAGTTCTGGCAG...   

   variant  index   Plate Well    Barcode         ID  P value Mixed Well  \
0      NaN    0.0  W4VM3D   A1  RB04_NB01  W4VM3D_A1      NaN        NaN   
1      NaN    1.0  W4VM3D   A2  RB04_NB02  W4VM3D_A2      NaN        NaN   

  Variant  Average mutation frequency  Alignment Count  Average error rate  \
0     NaN                         NaN              NaN                 NaN   
1     NaN                         NaN              NaN                 NaN   

   P adj. value  
0           NaN  
1           NaN  
len(df) = 96, len(df_mut) = 9
Applying mutations
Stop codon encountered at position 369 in sequence (123 AA in sequence).
Unknown codon: GA
Unknown codon: GA
Stop

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_mut["AA_seq"] = df_mut.Variant.apply(lambda x: map_dna_to_aa(apply_dna_mutations(x, offset=offset)))


In [9]:
df_mut

Unnamed: 0,barcode_plate,name,refseq,variant,index,Plate,Well,Barcode,ID,P value,Mixed Well,Variant,Average mutation frequency,Alignment Count,Average error rate,P adj. value,AA_seq,AA_mutations
16,4,W4VM3D,ATGGACCGTCAGCAGATTGAACAGGTTGTTAAAGCAGTTCTGGCAG...,,16.0,W4VM3D,B5,RB04_NB17,W4VM3D_B5,3.930562e-09,False,C239T,1.0,5.0,0.0,3.77334e-07,MDRQQIEQVVKAVLAGMAANSAPEPVTPPCGTGVFASLDDAVQAAS...,A80V
34,4,W4VM3D,ATGGACCGTCAGCAGATTGAACAGGTTGTTAAAGCAGTTCTGGCAG...,,34.0,W4VM3D,C11,RB04_NB35,W4VM3D_C11,0.0,True,C196DEL_G197DEL_C198DEL_C199DEL_T200DEL_G201DE...,1.0,11.0,0.0,,MDRQQIEQVVKAVLAGMAANSAPEPVTPPCGTGVFASLDDAVQAAS...,
41,4,W4VM3D,ATGGACCGTCAGCAGATTGAACAGGTTGTTAAAGCAGTTCTGGCAG...,,41.0,W4VM3D,D6,RB04_NB42,W4VM3D_D6,1.436498e-08,False,C231A,0.833333,6.0,0.166667,1.379038e-06,MDRQQIEQVVKAVLAGMAANSAPEPVTPPCGTGVFASLDDAVQAAS...,
47,4,W4VM3D,ATGGACCGTCAGCAGATTGAACAGGTTGTTAAAGCAGTTCTGGCAG...,,47.0,W4VM3D,D12,RB04_NB48,W4VM3D_D12,6.811273e-10,False,C231A,1.0,6.0,0.0,6.538822e-08,MDRQQIEQVVKAVLAGMAANSAPEPVTPPCGTGVFASLDDAVQAAS...,
64,4,W4VM3D,ATGGACCGTCAGCAGATTGAACAGGTTGTTAAAGCAGTTCTGGCAG...,,64.0,W4VM3D,F5,RB04_NB65,W4VM3D_F5,0.0,True,G190DEL_C191DEL_A192DEL_A193DEL_T194DEL_T195DE...,0.86569,20.0,0.097826,,MDRQQIEQVVKAVLAGMAANSAPEPVTPPCGTGVFASLDDAVQAAS...,
70,4,W4VM3D,ATGGACCGTCAGCAGATTGAACAGGTTGTTAAAGCAGTTCTGGCAG...,,70.0,W4VM3D,F11,RB04_NB71,W4VM3D_F11,1.285927e-185,True,G190DEL_C191DEL_A192DEL_A193DEL_T194DEL_T195DE...,0.957152,12.0,0.024732,1.23449e-183,MDRQQIEQVVKAVLAGMAANSAPEPVTPPCGTGVFASLDDAVQAAS...,
94,4,W4VM3D,ATGGACCGTCAGCAGATTGAACAGGTTGTTAAAGCAGTTCTGGCAG...,,94.0,W4VM3D,H11,RB04_NB95,W4VM3D_H11,7.06237e-14,False,G424T,1.0,8.0,0.0,6.779875e-12,MDRQQIEQVVKAVLAGMAANSAPEPVTPPCGTGVFASLDDAVQAAS...,A142S


In [10]:
df

Unnamed: 0,barcode_plate,name,refseq,variant,index,Plate,Well,Barcode,ID,P value,Mixed Well,Variant,Average mutation frequency,Alignment Count,Average error rate,P adj. value
0,4,W4VM3D,ATGGACCGTCAGCAGATTGAACAGGTTGTTAAAGCAGTTCTGGCAG...,,0.0,W4VM3D,A1,RB04_NB01,W4VM3D_A1,,,,,,,
1,4,W4VM3D,ATGGACCGTCAGCAGATTGAACAGGTTGTTAAAGCAGTTCTGGCAG...,,1.0,W4VM3D,A2,RB04_NB02,W4VM3D_A2,,,,,,,
2,4,W4VM3D,ATGGACCGTCAGCAGATTGAACAGGTTGTTAAAGCAGTTCTGGCAG...,,2.0,W4VM3D,A3,RB04_NB03,W4VM3D_A3,,,,,,,
3,4,W4VM3D,ATGGACCGTCAGCAGATTGAACAGGTTGTTAAAGCAGTTCTGGCAG...,,3.0,W4VM3D,A4,RB04_NB04,W4VM3D_A4,,,,,,,
4,4,W4VM3D,ATGGACCGTCAGCAGATTGAACAGGTTGTTAAAGCAGTTCTGGCAG...,,4.0,W4VM3D,A5,RB04_NB05,W4VM3D_A5,,False,#PARENT#,,6.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,4,W4VM3D,ATGGACCGTCAGCAGATTGAACAGGTTGTTAAAGCAGTTCTGGCAG...,,91.0,W4VM3D,H8,RB04_NB92,W4VM3D_H8,,,,,,,
92,4,W4VM3D,ATGGACCGTCAGCAGATTGAACAGGTTGTTAAAGCAGTTCTGGCAG...,,92.0,W4VM3D,H9,RB04_NB93,W4VM3D_H9,,,,,,,
93,4,W4VM3D,ATGGACCGTCAGCAGATTGAACAGGTTGTTAAAGCAGTTCTGGCAG...,,93.0,W4VM3D,H10,RB04_NB94,W4VM3D_H10,,,,,,,
94,4,W4VM3D,ATGGACCGTCAGCAGATTGAACAGGTTGTTAAAGCAGTTCTGGCAG...,,94.0,W4VM3D,H11,RB04_NB95,W4VM3D_H11,7.062370e-14,False,G424T,1.0,8.0,0.0,6.779875e-12


In [38]:
refseq[362]

'C'

In [24]:
df_mut.AA_mutations.value_counts().head(20)

AA_mutations
G162N                                                2
G162D                                                2
G28S_L279A                                           2
T163W                                                2
G28S_L279K                                           2
T163P                                                2
G28S_L279V                                           2
G104R_L279W                                          2
G28S                                                 2
                                                     2
T163A                                                2
G28S_G162Q                                           1
G28S_E65K_G104R_G162E                                1
G28S_G162N                                           1
G28S_G162F                                           1
G28S_G162Y                                           1
G28S_R103H_G104R_G162N                               1
R103H_G104R_G162A                                   

In [18]:
df_mut.head(2)

Unnamed: 0,barcode_plate,name,refseq,variant,index,Plate,Well,Barcode,ID,P value,Mixed Well,Variant,Average mutation frequency,Alignment Count,Average error rate,P adj. value
0,1,TtAldh_opt_P161,ATGAGCGAGAGAGTAAAGGTAGCCATTTTAGGTAGCGGTAACATAG...,,0.0,TtAldh_opt_P161,A1,RB01_NB01,TtAldh_opt_P161_A1,3.2981830000000003e-31,True,G3C_G142A_G492A_A840G,0.445652,46.0,0.375,3.166256e-29
1,1,TtAldh_opt_P161,ATGAGCGAGAGAGTAAAGGTAGCCATTTTAGGTAGCGGTAACATAG...,,1.0,TtAldh_opt_P161,A2,RB01_NB02,TtAldh_opt_P161_A2,9.560219e-20,True,G3C_G142A_G492A_A840G,0.475806,31.0,0.346774,9.17781e-18


In [20]:
df_mut.Variant.value_counts().head(50)

Variant
G492A_A840G                                                                              3
A840G                                                                                    2
G3C_G142A_G492A_A840G                                                                    2
G3C_C352G_G370A_A840G                                                                    1
G3C_G142A_C352A_T353C_G492A_A840G                                                        1
C352A_T353A_G354T_G492A_C812DEL                                                          1
G3C_G142A_A252C_G253A_G257A_A329DEL_G334DEL_C352G_T353C_G370A_G492A_A516G_G834C_A840G    1
C352T_T353G_A399DEL_A639G_A840G                                                          1
C352G_G354T_G492A_A840G                                                                  1
G3C_G48A_G50A_G52A_G142A_A252C_G253A_G257A_T353A_G354T_G368A_G492A_G834C_A840G           1
G3C_G48A_G253A_C352G_T353G_G354T_G368A_G370A_A840G                                

In [9]:
df_mut["Mixed Well"].value_counts()

Mixed Well
True     113
False     52
Name: count, dtype: int64

In [9]:
df_mut.AA_mutations.dropna().head(50)

0                                             M1I_G48S
1                                             M1I_G48S
3                                                P181R
5                                           G48S_P181H
6                                 M1I_G48S_R123H_P181R
7                                                P181G
8                           M1I_G48S_R123H_G124R_P181Q
9                                       M1I_G48S_P181T
12                                               P181L
13                                                G48S
14                                          L84F_P181H
16                                          G18S_P181E
18                                               P181H
19                                          G48C_P181D
20                                      M1I_G48S_P181R
21                                               P181W
22                                                    
24                           M1I_G48S_R86Q_P181V_Y295C
27        

In [7]:
date = "250911"
base_dir = Path(f"/projects/bpms/jlaw/projects/other/levseq/runs/{date}")
for run_name, postfix in [
    # "20250731_C199_Phusion", 
     # "20250731_I194_Phusion", 
     # "20250731_S197_Phusion",
     # "20250731_W200_Phusion",
     # "20250731_A31_Phusion",
    # "20250812_CpThim22C",
    # ("W200", "_PhusionGibson"),
    # ("I10A", "-T154"),
    # ("I10S", "-T154"),
    ("VKV3LZ_1_nnk_440_plate_3", ""),
                ]:
    # var_file = base_dir / f"20{date}_{run_name}{postfix}/outputs/variants.csv"
    var_file = base_dir / f"{run_name}/outputs/variants.csv"
    print(var_file)
    df = pd.read_csv(var_file)

    refseq = df["refseq"].unique()[0]
    print(len(df["refseq"].unique()), len(refseq))
    offset = 0
    print(df.head(2))
    aa_refseq = map_dna_to_aa(refseq)
    print(aa_refseq)

    df_mut = df[df["P value"] < 0.05]
    print(len(df), len(df_mut))
    print(df_mut.Variant.value_counts())

    print("Applying mutations")
    df_mut["AA_seq"] = df_mut.Variant.apply(lambda x: map_dna_to_aa(apply_dna_mutations(x, offset=offset)))
    df_mut = df_mut.dropna(subset="AA_seq")
    df_mut = df_mut[~df_mut.AA_seq.str.contains("X")]
    print(len(df_mut))

    print("Extracting AA mutations")
    df_mut["AA_mutations"] = df_mut.AA_seq.apply(lambda x: get_aa_mutations(aa_refseq, x))

    print("")
    print(run_name)
    print(df_mut.AA_mutations.value_counts())
    rename_file = str(var_file).replace("variants.csv", f"{run_name}_levseq_variants.csv")
    df.to_csv(rename_file, index=False)
    out_file = str(rename_file).replace(".csv", "_aa_mut.csv")
    print(out_file)
    df_mut.to_csv(out_file, index=False)
    # break

/projects/bpms/jlaw/projects/other/levseq/runs/250909/VKV3LZ_1_nnk_440_plate_3/outputs/variants.csv
1 1077
   barcode_plate                      name  \
0              4  VKV3LZ_1_nnk_440_plate_3   
1              4  VKV3LZ_1_nnk_440_plate_3   

                                              refseq  variant  index  \
0  ATGGGCAGCAGCCATCATCATCATCATCACAGCAGCGGCCTGGTGC...      NaN    0.0   
1  ATGGGCAGCAGCCATCATCATCATCATCACAGCAGCGGCCTGGTGC...      NaN    1.0   

                      Plate Well    Barcode                           ID  \
0  VKV3LZ_1_nnk_440_plate_3   A1  RB04_NB01  VKV3LZ_1_nnk_440_plate_3_A1   
1  VKV3LZ_1_nnk_440_plate_3   A2  RB04_NB02  VKV3LZ_1_nnk_440_plate_3_A2   

   P value Mixed Well   Variant  Average mutation frequency  Alignment Count  \
0      NaN      False  #PARENT#                         NaN             22.0   
1      NaN      False  #PARENT#                         NaN             33.0   

   Average error rate  P adj. value  
0                 NaN        

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_mut["AA_seq"] = df_mut.Variant.apply(lambda x: map_dna_to_aa(apply_dna_mutations(x, offset=offset)))


In [84]:
out_file = var_file.replace(".csv", "_aa_mut.csv")
print(out_file)
df_mut.to_csv(out_file, index=False)

/Users/jlaw/Dev/bpms/levseq_runs/sylvia/250801/MjMvkT154//variants_aa_mut.csv


In [16]:
df_mut.head(4)

Unnamed: 0,barcode_plate,name,refseq,variant,index,Plate,Well,Barcode,ID,P value,Mixed Well,Variant,Average mutation frequency,Alignment Count,Average error rate,P adj. value,AA_seq,AA_mutations
5,1,GsAdh_2025-04-16_plate5,ATGAAAGCTGCAGTTGTGGAACAATTTAAAAAGCCGTTACAAGTGA...,,5.0,GsAdh_2025-04-16_plate5,A6,RB01_NB06,GsAdh_2025-04-16_plate5_A6,0.0,False,C148T_G251A_A648C,0.988095,84.0,0.0,,MKAAVVEQFKKPLQVKEVEKPKISYGEVLVRIKACGVCHTDLHAAH...,P50S_G84D_K216N
8,1,GsAdh_2025-04-16_plate5,ATGAAAGCTGCAGTTGTGGAACAATTTAAAAAGCCGTTACAAGTGA...,,8.0,GsAdh_2025-04-16_plate5,A9,RB01_NB09,GsAdh_2025-04-16_plate5_A9,5.481139e-141,False,T439C,0.971429,70.0,0.014286,5.261893e-139,MKAAVVEQFKKPLQVKEVEKPKISYGEVLVRIKACGVCHTDLHAAH...,F147L
9,1,GsAdh_2025-04-16_plate5,ATGAAAGCTGCAGTTGTGGAACAATTTAAAAAGCCGTTACAAGTGA...,,9.0,GsAdh_2025-04-16_plate5,A10,RB01_NB10,GsAdh_2025-04-16_plate5_A10,6.028883e-34,False,T209C_T752C,0.916667,12.0,0.083333,5.787728e-32,MKAAVVEQFKKPLQVKEVEKPKISYGEVLVRIKACGVCHTDLHAAH...,V70A_I251T
10,1,GsAdh_2025-04-16_plate5,ATGAAAGCTGCAGTTGTGGAACAATTTAAAAAGCCGTTACAAGTGA...,,10.0,GsAdh_2025-04-16_plate5,A11,RB01_NB11,GsAdh_2025-04-16_plate5_A11,9.406419e-13,False,A941G,1.0,6.0,0.0,9.030162e-11,MKAAVVEQFKKPLQVKEVEKPKISYGEVLVRIKACGVCHTDLHAAH...,E314G


## Now map the plate and wells

In [30]:
# load the stats file previously generated
stats_file = "250507/20250423_GsAdh_epPCR_Batch1_CF15_R2_linear_fits_22000s.xlsx"

plate_data = pd.DataFrame()
for plate in range(1,7):
    sheet_name = f"Plate {plate} - Sheet1"
    sheet_data = pd.read_excel(stats_file, 
                       sheet_name=sheet_name, 
                       header=0, 
                       index_col=0).reset_index(drop=False, names="Well")
    sheet_data["Plate"] = plate
    plate_data = pd.concat([plate_data, sheet_data])

plate_data

Unnamed: 0,Well,slope,r2,y-intercept,1/slope,Plate
0,C04,8.666805e-07,0.939892,0.236645,1.153828e+06,1
1,B05,7.662667e-07,0.839006,0.255263,1.305029e+06,1
2,F03,7.279289e-07,0.932977,0.231986,1.373760e+06,1
3,B07,7.000757e-07,0.902764,0.266747,1.428417e+06,1
4,B03,6.524307e-07,0.906600,0.263941,1.532730e+06,1
...,...,...,...,...,...,...
91,E04,-2.076123e-07,0.895734,0.254311,-4.816669e+06,6
92,F05,-2.118150e-07,0.658653,0.253639,-4.721102e+06,6
93,F04,-2.181697e-07,0.585321,0.255100,-4.583589e+06,6
94,A12,-2.788953e-07,0.920436,0.327502,-3.585574e+06,6


In [34]:
df_mut_stats = pd.DataFrame()
for plate_name, df_p in df_mut.groupby("Plate"):
    plate = int(plate_name[-1])
    sheet_data = plate_data[plate_data.Plate == plate].drop(columns="Plate")  #.set_index("Well")
    
    df_p["Well"] = df_p.Well.apply(lambda w: f"{w[0]}{int(w[1:]):02d}")
    # df_p = df_p.set_index("Well")

    df_p_stats = pd.merge(df_p, sheet_data, left_on="Well", right_on="Well")
    print(len(df_p), len(df_p_stats))

    df_mut_stats = pd.concat([df_mut_stats, df_p_stats])
df_mut_stats

52 52
43 43


Unnamed: 0,barcode_plate,name,refseq,variant,index,Plate,Well,Barcode,ID,P value,...,Average mutation frequency,Alignment Count,Average error rate,P adj. value,AA_seq,AA_mutations,slope,r2,y-intercept,1/slope
0,1,GsAdh_2025-04-16_plate5,ATGAAAGCTGCAGTTGTGGAACAATTTAAAAAGCCGTTACAAGTGA...,,5.0,GsAdh_2025-04-16_plate5,A06,RB01_NB06,GsAdh_2025-04-16_plate5_A6,0.000000e+00,...,0.988095,84.0,0.000000,,MKAAVVEQFKKPLQVKEVEKPKISYGEVLVRIKACGVCHTDLHAAH...,P50S_G84D_K216N,4.148571e-07,0.981295,0.260448,2.410469e+06
1,1,GsAdh_2025-04-16_plate5,ATGAAAGCTGCAGTTGTGGAACAATTTAAAAAGCCGTTACAAGTGA...,,8.0,GsAdh_2025-04-16_plate5,A09,RB01_NB09,GsAdh_2025-04-16_plate5_A9,5.481139e-141,...,0.971429,70.0,0.014286,5.261893e-139,MKAAVVEQFKKPLQVKEVEKPKISYGEVLVRIKACGVCHTDLHAAH...,F147L,1.883223e-07,0.289903,0.284857,5.310047e+06
2,1,GsAdh_2025-04-16_plate5,ATGAAAGCTGCAGTTGTGGAACAATTTAAAAAGCCGTTACAAGTGA...,,9.0,GsAdh_2025-04-16_plate5,A10,RB01_NB10,GsAdh_2025-04-16_plate5_A10,6.028883e-34,...,0.916667,12.0,0.083333,5.787728e-32,MKAAVVEQFKKPLQVKEVEKPKISYGEVLVRIKACGVCHTDLHAAH...,V70A_I251T,-7.927568e-08,0.113331,0.300822,-1.261421e+07
3,1,GsAdh_2025-04-16_plate5,ATGAAAGCTGCAGTTGTGGAACAATTTAAAAAGCCGTTACAAGTGA...,,10.0,GsAdh_2025-04-16_plate5,A11,RB01_NB11,GsAdh_2025-04-16_plate5_A11,9.406419e-13,...,1.000000,6.0,0.000000,9.030162e-11,MKAAVVEQFKKPLQVKEVEKPKISYGEVLVRIKACGVCHTDLHAAH...,E314G,-9.524406e-09,0.003182,0.296995,-1.049934e+08
4,1,GsAdh_2025-04-16_plate5,ATGAAAGCTGCAGTTGTGGAACAATTTAAAAAGCCGTTACAAGTGA...,,11.0,GsAdh_2025-04-16_plate5,A12,RB01_NB12,GsAdh_2025-04-16_plate5_A12,3.808144e-07,...,0.800000,5.0,0.200000,3.655818e-05,MKAAVVEQFKKPLQVKEVEKPKISYGEVLVRIKACGVCHTDLHAAH...,,-7.497115e-09,0.012695,0.297973,-1.333846e+08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38,2,GsAdh_2025-04-16_plate6,ATGAAAGCTGCAGTTGTGGAACAATTTAAAAAGCCGTTACAAGTGA...,,65.0,GsAdh_2025-04-16_plate6,F06,RB02_NB66,GsAdh_2025-04-16_plate6_F6,1.543818e-27,...,0.933333,15.0,0.066667,1.482065e-25,MKAAVVEQFKKPLQVKEVEKPKISYGEVLVRIKACGVCHTDLHAAH...,Y171N,-6.613615e-09,0.003724,0.249420,-1.512032e+08
39,2,GsAdh_2025-04-16_plate6,ATGAAAGCTGCAGTTGTGGAACAATTTAAAAAGCCGTTACAAGTGA...,,66.0,GsAdh_2025-04-16_plate6,F07,RB02_NB67,GsAdh_2025-04-16_plate6_F7,9.598843e-75,...,0.956522,23.0,0.000000,9.214889e-73,MKAAVVEQFKKPLQVKEVEKPKISYGEVLVRIKACGVCHTDLHAAH...,A180T,2.346722e-07,0.808475,0.247111,4.261263e+06
40,2,GsAdh_2025-04-16_plate6,ATGAAAGCTGCAGTTGTGGAACAATTTAAAAAGCCGTTACAAGTGA...,,67.0,GsAdh_2025-04-16_plate6,F08,RB02_NB68,GsAdh_2025-04-16_plate6_F8,8.925972e-59,...,1.000000,27.0,0.000000,8.568933e-57,MKAAVVEQFKKPLQVKEVEKPKISYGEVLVRIKACGVCHTDLHAAH...,Y171N,1.347975e-07,0.392745,0.262289,7.418535e+06
41,2,GsAdh_2025-04-16_plate6,ATGAAAGCTGCAGTTGTGGAACAATTTAAAAAGCCGTTACAAGTGA...,,68.0,GsAdh_2025-04-16_plate6,F09,RB02_NB69,GsAdh_2025-04-16_plate6_F9,0.000000e+00,...,0.927928,37.0,0.036036,,MKAAVVEQFKKPLQVKEVEKPKISYGEV*,L29*,1.955329e-07,0.615141,0.254234,5.114229e+06


In [35]:
df_mut_stats.to_csv("250507/250507_variants_aa_mut_seq_stats.csv", index=False)

In [18]:
df_mut.Plate.value_counts()

Plate
GsAdh_2025-04-16_plate5    52
GsAdh_2025-04-16_plate6    43
Name: count, dtype: int64

In [19]:
df_mut.Well.value_counts()

Well
A6     2
F2     2
D8     2
D11    2
E4     2
      ..
D10    1
D3     1
D2     1
C6     1
E9     1
Name: count, Length: 62, dtype: int64