In [1]:
import os
import re
import glob
import math
from collections import defaultdict
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
CSVs = []
for root, dirs, files in os.walk("Affinity Data"):
    for file in files:
        if file.endswith(".csv"):
             CSVs.append(os.path.join(root, file))

In [3]:
len(CSVs)

378

In [4]:
def make_dir_if_not_exists(folder):
    """
    This function takes in a folder name and creates a folder if it does not exist.
    Args:
        folder: folder name
    Returns:
        None"
    """
    if not os.path.exists(folder):
        os.makedirs(folder)

def get_col_name_from_index(data, index):
    """
    This function takes in a dataframe and an index and returns the column name.
    Args:
        data: dataframe
        index: index of the column
    Returns:
        column name"""
    return data.columns[index]

def load_useful_data(data):
    """
    This function takes in a dataframe and returns a dictionary with useful data.
    Args:
        data: dataframe
    Returns:
        data_dict: dictionary with useful data
    """
    pH = list(data.columns).index('pH')

    uv_280 = list(data.columns).index('mAU')
    uv_260 = list(data.columns).index('mAU.1')
    conductivity = list(data.columns).index('mS/cm')
    sample_flow = list(data.columns).index('CV/h')
    system_flow = list(data.columns).index('CV/h.1')
    sample_pressure = list(data.columns).index('MPa')
    system_pressure = list(data.columns).index('MPa.1')
    run_log = list(data.columns).index('Logbook')

    ml_pH = pH - 1
    ml_uv_280 = uv_280 - 1
    ml_uv_260 = uv_260 - 1
    ml_conductivity = conductivity - 1
    ml_sample_flow = sample_flow - 1
    ml_system_flow = system_flow - 1
    ml_sample_pressure = sample_pressure - 1
    ml_system_pressure = system_pressure - 1
    ml_run_log = run_log - 1

    data_dict = {'pH': [get_col_name_from_index(data, ml_pH), get_col_name_from_index(data, pH)],
                'UV_280': [get_col_name_from_index(data, ml_uv_280), get_col_name_from_index(data, uv_280)], 
                'UV_260': [get_col_name_from_index(data, ml_uv_260), get_col_name_from_index(data, uv_260)], 
                'Conductivity': [get_col_name_from_index(data, ml_conductivity), get_col_name_from_index(data, conductivity)], 
                'Sample Flow': [get_col_name_from_index(data, ml_sample_flow), get_col_name_from_index(data, sample_flow)], 
                'System_flow': [get_col_name_from_index(data, ml_system_flow), get_col_name_from_index(data, system_flow)], 
                'Sample Pressure': [get_col_name_from_index(data, ml_sample_pressure), get_col_name_from_index(data, sample_pressure)], 
                'System Pressure': [get_col_name_from_index(data, ml_system_pressure), get_col_name_from_index(data, system_pressure)],
                'Run Log': [get_col_name_from_index(data, ml_run_log), get_col_name_from_index(data, run_log)]}
    return data_dict

def get_resin_and_serotype(name):
    """
    This function takes in a name and returns the resin used and serotype of AAV.
    Args:
        name: name of the file
    Returns:
        resin: resin used
        serotype: serotype of AAV
    """
    resin = re.findall(r'AAV[A-Z]\d+', name)
    serotype = re.findall(r'AAV*\d+', name)
    if len(resin) == 0:
        resin = 'Unknown'
    if len(serotype) == 0:
        serotype = 'Unknown'
    return resin[0], serotype[0]

def get_resin(name):
    """
    This function takes in a name and returns the resin used.
    Args:
        name: name of the file
    Returns:
        resin: resin used
    """
    resin = re.findall(r'AAV[A-Z]\d+', name)
    if len(resin) == 0:
        resin = re.findall(r'AAVX', name)
        if len(resin) == 0:
            resin = re.findall(r'AAVx', name)
            if len(resin) == 0:
                resin = 'Unknown'
    return resin[0]

def get_serotype(name):
    """
    This function takes in a name and returns the serotype of AAV.
    Args:
        name: name of the file
    Returns:
        serotype: serotype of AAV
    """
    serotype = re.findall(r'AAV*\d+', name)
    if len(serotype) == 0:
        serotype = 'Unknown'
    return serotype[0]

def get_column_volume(name):
    """
    This function takes in a name and returns the column volume.
    Args:
        name: name of the file
    Returns:    
        column_volume: column volume
    """
    column_volume = re.findall(r'\d+(?:\.\d+)?[mM][lL]', name)
    if len(column_volume) == 0:
        column_volume = 'Unknown'
    return column_volume[0]

def is_pure(name):
    """
    This function takes in a name and returns if the sample is pure or not.
    Args:
        name: name of the file
    Returns:
        pure: True if the sample is pure, False otherwise
    """
    pure = re.findall(r'[pP]ure', name)
    if len(pure) == 0:
        pure = False
    else:
        pure = True
    return pure

# def plot_data(data, folder, name, data_dict, columns=['UV_280', 'Conductivity']):
#     """
#     This function takes in a dataframe and plots the data.
#     Args:
#         data: dataframe
#         folder: folder to save the plots
#         name: name of the plot
#         data_dict: dictionary with useful data
#         columns: list of columns to plot
#     Returns:
#         None"""
#     plt.rcParams["figure.figsize"] = (20,10)
#     for key in columns:
#         plt.plot( data[data_dict[key][0]], data[data_dict[key][1]], label=key)
#     resin, serotype = get_resin_and_serotype(name)
#     plt.title(f'Resin: {resin}, Serotype: {serotype}')
#     plt.xlabel('Volume (ml)')
#     plt.ylabel('mAU')
#     plt.legend()
#     plt.savefig(f'{folder}/plots/{name}.png')
#     plt.clf()

def plot_data(data, folder, name, data_dict, columns=['UV_280', 'Conductivity']):
    """
    This function takes in a dataframe and plots the data.
    Args:
        data: dataframe
        folder: folder to save the plots
        name: name of the plot
        data_dict: dictionary with useful data
        columns: list of columns to plot
    Returns:
        None"""
    plt.rcParams["figure.figsize"] = (20,10)
    fig, ax1 = plt.subplots()
    ax2 = ax1.twinx()
    ax1.plot(data[data_dict[columns[0]][0]], data[data_dict[columns[0]][1]], 'g-', label=columns[0])
    ax2.plot(data[data_dict[columns[0]][0]], data[data_dict[columns[1]][1]], 'b-', label=columns[1]) 
    ax1.set_xlabel('Volume (ml)')
    ax1.set_ylabel('mAU', color='g')
    ax2.set_ylabel('mS/cm', color='b')
    resin, serotype = get_resin_and_serotype(name)
    plt.title(f'Resin: {resin}, Serotype: {serotype}')
    plt.legend()
    fig.savefig(f'{folder}/plots/{name}.png')
    fig.clf()
    fig.clear()
    
    # plt.savefig(f'{folder}/plots/{name}.png')
    # plt.clf()

def get_ph_and_cond_at_elution(df, data_dict):
    """
    This function takes in a dataframe and a dictionary with useful data and returns the pH and conductivity at elution.
    Args:
        df: dataframe
        data_dict: dictionary with useful data
    Returns:
        elution_ph: pH at elution
        elution_cond: conductivity at elution
    """
    ml_pH_col = data_dict['pH'][0]
    ml_cond_col = data_dict['Conductivity'][0]
    ml_log_col = data_dict['Run Log'][0]
    # print('elu')
    elution_ph_index = df[ml_pH_col][round(df[ml_pH_col]) == round(df[ml_log_col][4])].index[1]
    elution_cond_index = df[ml_cond_col][round(df[ml_cond_col]) == round(df[ml_log_col][4])].index[1]
    elution_ph = round(df['pH'][elution_ph_index], 2)
    elution_cond = round(df['mS/cm'][elution_cond_index], 2)
    return elution_ph, elution_cond

def get_ph_and_cond_at_wash(df, data_dict):
    """
    This function takes in a dataframe and a dictionary with useful data and returns the pH and conductivity at wash.
    Args:
        df: dataframe
        data_dict: dictionary with useful data
    Returns:
        wash_ph: pH at wash
        wash_cond: conductivity at wash
    """
    ml_pH_col = data_dict['pH'][0]
    ml_cond_col = data_dict['Conductivity'][0]
    ml_log_col = data_dict['Run Log'][0]
    # print('wash')
    wash_ph_index = df[ml_pH_col][round(df[ml_pH_col ]) == round(df[ml_log_col][3])].index[1]
    wash_cond_index = df[ml_cond_col][round(df[ml_cond_col]) == round(df[ml_log_col][3])].index[1]
    wash_ph = round(df['pH'][wash_ph_index], 2)
    wash_cond = round(df['mS/cm'][wash_cond_index], 2)
    return wash_ph, wash_cond

def get_ph_and_cond_at_equilibration(df, data_dict):
    """
    This function takes in a dataframe and a dictionary with useful data and returns the pH and conductivity at equilibration.
    Args:
        df: dataframe
        data_dict: dictionary with useful data
    Returns:
        equilibration_ph: pH at equilibration
        equilibration_cond: conductivity at equilibration
    """
    ml_pH_col = data_dict['pH'][0]
    ml_cond_col = data_dict['Conductivity'][0]
    ml_log_col = data_dict['Run Log'][0]
    # print('eqil')
    equilibration_ph_index = df[ml_pH_col][round(df[ml_pH_col ]) == round(df[ml_log_col][1])].index[1]
    equilibration_cond_index = df[ml_cond_col][round(df[ml_cond_col]) == round(df[ml_log_col][1])].index[1]
    equilibration_ph = round(df['pH'][equilibration_ph_index], 2)
    equilibration_cond = round(df['pH'][equilibration_cond_index], 2)
    return equilibration_ph, equilibration_cond

In [102]:
CSVs = []
for root, dirs, files in os.walk("Affinity Data"):
    for file in files:
        if file.endswith(".csv"):
             CSVs.append(os.path.join(root, file))
CSVs.remove('Affinity Data/AAV2/New AAV2/AAV2 HEK Lysate  - A10 (T) Run 3 002.csv')
CSVs.remove('Affinity Data/AAVA3/AAVA3_AAV2 SF9_F3 002.csv')
CSVs.remove('Affinity Data/AAVX_2/Column Evaluation - AAVx 2ml SAS-AM012921.csv')
CSVs.remove('Affinity Data/AAVX_2/AAVX Resin For NCTM - 1ml Column HETP and ASYM.csv')
CSVs.remove('Affinity Data/AAVX/AAVX 0.5ML Run SAS102122 004_fl.csv')

In [6]:
df = pd.read_csv('Affinity Data/AAV2/New AAV2/AAV2 HEK Lysate  - A10 (T) Run 3 002.csv', skiprows = [0,1], delimiter='\t', encoding='utf_16',  on_bad_lines='skip', low_memory=False)

In [7]:
df.head()

Unnamed: 0,ml,mAU,ml.1,mS/cm,ml.2,Injection,ml.3,Logbook,ml.4,Fraction,...,ml.20,MPa.4,ml.21,pH,ml.22,mAU.2,ml.23,mAU.3,ml.24,%.1
0,-12.524871,0.701526,-12.524871,0.377653,0.000743,,-12.524871,Method Settings,0.0,2.A.3,...,-12.524871,0.014598,-12.524871,0.0,-12.524871,0.0,-12.524871,0.0,-12.524871,0.0
1,-12.522776,10.294412,-12.522776,0.377685,,,-12.524787,Equilibration,11.506289,Waste(Frac),...,-12.522776,0.203077,-12.519631,6.042378,-12.522251,-1.048699,-12.522251,-0.6682,-12.519631,0.0
2,-12.520679,9.743042,-12.520679,0.377629,,,-0.002127,Sample Application,11.507009,2.A.4,...,-12.520679,0.249384,-12.514392,6.046434,-12.519631,-1.29827,-12.519631,-0.673204,-12.514392,0.0
3,-12.518584,9.307805,-12.518584,0.377646,,,11.506289,Column Wash,21.507234,Waste(Frac),...,-12.518584,0.208721,-12.509153,6.047227,-12.517012,-1.491331,-12.517012,-0.702261,-12.509152,0.0
4,-12.516488,8.953231,-12.516488,0.377796,,,21.508069,Elution,21.516107,5.A.7,...,-12.516488,0.080551,-12.503914,6.047294,-12.514392,-1.614316,-12.514392,-0.717134,-12.503913,0.0


In [103]:
frame = {'resin': [], 'serotype': [], 'file': [], 'Column Volume (mL)':[], 'Pure':[],
         'Elution pH': [],'Wash pH': [],'Equlibration pH': [],'Elution Conductivity': [],
         'Wash Conductivity': [],'Equilibration Conductivity': []}
for csv in CSVs:
    name = csv.split('/')[-1][:-4]
    resin, serotype = get_resin_and_serotype(name)
    if resin == 'U':
        resin = get_resin(name)
    if serotype == 'U':
        serotype = csv.split('/')[1]
    pure = is_pure(name)
    col_vol = get_column_volume(name)
    # print(csv)
    try:
        df = pd.read_csv(csv, skiprows = [0,1], delimiter='\t', encoding='utf_16', low_memory=False)
        data_dict = load_useful_data(df)
    except Exception as e:
        print(e, csv)
    
    try:
        elution_ph, elution_cond = get_ph_and_cond_at_elution(df, data_dict)
        wash_ph, wash_cond = get_ph_and_cond_at_wash(df, data_dict)
        equilibration_ph, equilibration_cond = get_ph_and_cond_at_equilibration(df, data_dict)
        frame['resin'].append(resin)
        frame['serotype'].append(serotype)
        frame['file'].append(name)
        frame['Pure'].append(pure)
        frame['Column Volume (mL)'].append(col_vol[:-2])
        frame['Elution pH'].append(elution_ph)
        frame['Wash pH'].append(wash_ph)
        frame['Equlibration pH'].append(equilibration_ph)
        frame['Elution Conductivity'].append(elution_cond)
        frame['Wash Conductivity'].append(wash_cond)
        frame['Equilibration Conductivity'].append(equilibration_cond)
    except Exception as e:
        print(e, csv)

    



data = pd.DataFrame(frame)
data[['Column Diameter (mm)', 'Coulmn Height (cm)']] = 'U'
# data.loc[['Elution pH','Wash pH','Equlibration pH','Elution Conductivity','Wash Conductivity','Equilibration Conductivity']] = 'U'

cannot convert float NaN to integer Affinity Data/AAV2/AAVx method with LG6 cleaning 001.csv
cannot convert float NaN to integer Affinity Data/AAVX_3/AAVX 0.5ml - AAV5 HEK elution 001.csv
cannot convert float NaN to integer Affinity Data/AAVX/AAVX Run 1 Pure SAS10112022 001.csv
cannot convert float NaN to integer Affinity Data/AAVX/AAVX 0.5ml - AAV5 HEK elution 001.csv
cannot convert float NaN to integer Affinity Data/AAV9_with_LigaGuard/ethanol column cleaning 001.csv
cannot convert float NaN to integer Affinity Data/AAV9_with_LigaGuard/LigaGuard AAV fixed volume 50 mL load 001.csv
cannot convert float NaN to integer Affinity Data/AAV9_with_LigaGuard/LigaGuard AAV 50 mL 002.csv
cannot convert float NaN to integer Affinity Data/AAV9_with_LigaGuard/LigaGuard AAV 50 mL 001.csv
cannot convert float NaN to integer Affinity Data/AAV9/AAVx 9th cycle 50 mL load 001.csv
cannot convert float NaN to integer Affinity Data/AAVX_2/AAVX HEK293 2mL column method(637483396913297566).csv
cannot convert

In [104]:
data

Unnamed: 0,resin,serotype,file,Column Volume (mL),Pure,Elution pH,Wash pH,Equlibration pH,Elution Conductivity,Wash Conductivity,Equilibration Conductivity,Column Diameter (mm),Coulmn Height (cm)
0,AAVx,AAV2,AAVx method LG6 toyo Load 31.5 mL cycle 1 001,,False,7.43,7.55,7.44,16.14,15.85,7.44,U,U
1,AAVx,AAV2,AAVx method with LG6 Load 32 mL S2 cycle 8 001,,False,7.66,7.51,2.05,16.27,14.35,2.05,U,U
2,AAVx,AAV2,AAVx method with LG6 Load cycle 3 001,,False,2.22,7.34,7.38,38.60,16.27,7.38,U,U
3,AAVx,AAV2,AAVx method with LG6 Load 33 mL cycle 10 001,,False,7.58,7.48,2.19,16.50,15.16,2.19,U,U
4,AAVx,AAV2,AAVx method no LG6 Load 35 mL cycle 2 001,,False,7.41,7.58,7.40,16.19,13.44,7.40,U,U
...,...,...,...,...,...,...,...,...,...,...,...,...,...
356,AAVW8,AAV10,AAVW8_AAV10 HEK lysate 003,,False,7.45,7.32,6.13,4.11,2.95,6.13,U,U
357,AAVA5,AAV10,AAVA5_AAV10 HEK lysate 004,,False,7.33,7.35,6.03,3.15,2.91,6.03,U,U
358,AAVA1,AAV10,AAVA1_AAV10 HEK lysate 001,,False,7.31,7.34,5.93,3.14,2.93,5.93,U,U
359,AAVA2,AAV10,AAVA2_AAV10 HEK lysate 001,,False,7.43,7.38,6.11,3.98,2.97,6.11,U,U


In [105]:
for i in range(len(data)):
    if data['resin'][i] == 'U' and (data['serotype'][i] in ['AAV2', 'AAV6', 'AAV9', 'AAV9_with_LigaGuard']):
        # data['resin'][i] = 'AAVX'
        data.loc[i, 'resin'] = 'AAVX'

In [106]:
data

Unnamed: 0,resin,serotype,file,Column Volume (mL),Pure,Elution pH,Wash pH,Equlibration pH,Elution Conductivity,Wash Conductivity,Equilibration Conductivity,Column Diameter (mm),Coulmn Height (cm)
0,AAVx,AAV2,AAVx method LG6 toyo Load 31.5 mL cycle 1 001,,False,7.43,7.55,7.44,16.14,15.85,7.44,U,U
1,AAVx,AAV2,AAVx method with LG6 Load 32 mL S2 cycle 8 001,,False,7.66,7.51,2.05,16.27,14.35,2.05,U,U
2,AAVx,AAV2,AAVx method with LG6 Load cycle 3 001,,False,2.22,7.34,7.38,38.60,16.27,7.38,U,U
3,AAVx,AAV2,AAVx method with LG6 Load 33 mL cycle 10 001,,False,7.58,7.48,2.19,16.50,15.16,2.19,U,U
4,AAVx,AAV2,AAVx method no LG6 Load 35 mL cycle 2 001,,False,7.41,7.58,7.40,16.19,13.44,7.40,U,U
...,...,...,...,...,...,...,...,...,...,...,...,...,...
356,AAVW8,AAV10,AAVW8_AAV10 HEK lysate 003,,False,7.45,7.32,6.13,4.11,2.95,6.13,U,U
357,AAVA5,AAV10,AAVA5_AAV10 HEK lysate 004,,False,7.33,7.35,6.03,3.15,2.91,6.03,U,U
358,AAVA1,AAV10,AAVA1_AAV10 HEK lysate 001,,False,7.31,7.34,5.93,3.14,2.93,5.93,U,U
359,AAVA2,AAV10,AAVA2_AAV10 HEK lysate 001,,False,7.43,7.38,6.11,3.98,2.97,6.11,U,U


In [107]:
for i in range(len(data)):
    if data['Column Volume (mL)'][i] == '':
        data.loc[i, ['Coulmn Height (cm)', 'Column Diameter (mm)', 'Column Volume (mL)']] = [2.55, 0.5, 5]

In [108]:
data

Unnamed: 0,resin,serotype,file,Column Volume (mL),Pure,Elution pH,Wash pH,Equlibration pH,Elution Conductivity,Wash Conductivity,Equilibration Conductivity,Column Diameter (mm),Coulmn Height (cm)
0,AAVx,AAV2,AAVx method LG6 toyo Load 31.5 mL cycle 1 001,5,False,7.43,7.55,7.44,16.14,15.85,7.44,0.5,2.55
1,AAVx,AAV2,AAVx method with LG6 Load 32 mL S2 cycle 8 001,5,False,7.66,7.51,2.05,16.27,14.35,2.05,0.5,2.55
2,AAVx,AAV2,AAVx method with LG6 Load cycle 3 001,5,False,2.22,7.34,7.38,38.60,16.27,7.38,0.5,2.55
3,AAVx,AAV2,AAVx method with LG6 Load 33 mL cycle 10 001,5,False,7.58,7.48,2.19,16.50,15.16,2.19,0.5,2.55
4,AAVx,AAV2,AAVx method no LG6 Load 35 mL cycle 2 001,5,False,7.41,7.58,7.40,16.19,13.44,7.40,0.5,2.55
...,...,...,...,...,...,...,...,...,...,...,...,...,...
356,AAVW8,AAV10,AAVW8_AAV10 HEK lysate 003,5,False,7.45,7.32,6.13,4.11,2.95,6.13,0.5,2.55
357,AAVA5,AAV10,AAVA5_AAV10 HEK lysate 004,5,False,7.33,7.35,6.03,3.15,2.91,6.03,0.5,2.55
358,AAVA1,AAV10,AAVA1_AAV10 HEK lysate 001,5,False,7.31,7.34,5.93,3.14,2.93,5.93,0.5,2.55
359,AAVA2,AAV10,AAVA2_AAV10 HEK lysate 001,5,False,7.43,7.38,6.11,3.98,2.97,6.11,0.5,2.55


In [109]:
data.to_csv('Affinity Data/affinity_data.csv', index=False)

In [None]:
['AAV2', 'AAV6', 'AAV9', 'AAV9_with_LigaGuard']
[]

In [50]:
data['serotype'].unique()

array(['AAV2', 'AAVA3', 'AAV9', 'AAV1', 'AAV6', 'AAV10', 'AAV7', 'AAV4',
       'AAV5', 'AAV3', 'AAV8', 'AAVX_3', 'AAVX', 'AAV9_with_LigaGuard',
       'AAVX_2'], dtype=object)

In [None]:
['Ligand ID', 'Serotype', 'Equil Residence Vol', 'Equil Conductivity', 'Equil pH', 'Elution Residence Vol' 'Elution Conductivity', 'Elution pH', 'Wash Residence Vol', 'Wash pH', 'Wash Conductivity']

In [61]:
def make_dir_if_not_exists(folder):
    """
    This function takes in a folder name and creates a folder if it does not exist.
    Args:
        folder: folder name
    Returns:
        None"
    """
    if not os.path.exists(folder):
        os.makedirs(folder)

def get_col_name_from_index(data, index):
    """
    This function takes in a dataframe and an index and returns the column name.
    Args:
        data: dataframe
        index: index of the column
    Returns:
        column name"""
    return data.columns[index]

def load_useful_data(data):
    """
    This function takes in a dataframe and returns a dictionary with useful data.
    Args:
        data: dataframe
    Returns:
        data_dict: dictionary with useful data
    """
    pH = list(data.columns).index('pH')

    uv_280 = list(data.columns).index('mAU')
    uv_260 = list(data.columns).index('mAU.1')
    conductivity = list(data.columns).index('mS/cm')
    sample_flow = list(data.columns).index('CV/h')
    system_flow = list(data.columns).index('CV/h.1')
    sample_pressure = list(data.columns).index('MPa')
    system_pressure = list(data.columns).index('MPa.1')
    run_log = list(data.columns).index('Logbook')

    ml_pH = pH - 1
    ml_uv_280 = uv_280 - 1
    ml_uv_260 = uv_260 - 1
    ml_conductivity = conductivity - 1
    ml_sample_flow = sample_flow - 1
    ml_system_flow = system_flow - 1
    ml_sample_pressure = sample_pressure - 1
    ml_system_pressure = system_pressure - 1
    ml_run_log = run_log - 1

    data_dict = {'pH': [get_col_name_from_index(data, ml_pH), get_col_name_from_index(data, pH)],
                'UV_280': [get_col_name_from_index(data, ml_uv_280), get_col_name_from_index(data, uv_280)], 
                'UV_260': [get_col_name_from_index(data, ml_uv_260), get_col_name_from_index(data, uv_260)], 
                'Conductivity': [get_col_name_from_index(data, ml_conductivity), get_col_name_from_index(data, conductivity)], 
                'Sample Flow': [get_col_name_from_index(data, ml_sample_flow), get_col_name_from_index(data, sample_flow)], 
                'System_flow': [get_col_name_from_index(data, ml_system_flow), get_col_name_from_index(data, system_flow)], 
                'Sample Pressure': [get_col_name_from_index(data, ml_sample_pressure), get_col_name_from_index(data, sample_pressure)], 
                'System Pressure': [get_col_name_from_index(data, ml_system_pressure), get_col_name_from_index(data, system_pressure)],
                'Run Log': [get_col_name_from_index(data, ml_run_log), get_col_name_from_index(data, run_log)]}
    return data_dict

def get_resin_and_serotype(name):
    """
    This function takes in a name and returns the resin used and serotype of AAV.
    Args:
        name: name of the file
    Returns:
        resin: resin used
        serotype: serotype of AAV
    """
    resin = re.findall(r'AAV[A-Z]\d+', name)
    serotype = re.findall(r'AAV*\d+', name)
    if len(resin) == 0:
        resin = 'Unknown'
    if len(serotype) == 0:
        serotype = 'Unknown'
    return resin[0], serotype[0]

def get_resin(name):
    """
    This function takes in a name and returns the resin used.
    Args:
        name: name of the file
    Returns:
        resin: resin used
    """
    resin = re.findall(r'AAV[A-Z]\d+', name)
    if len(resin) == 0:
        resin = re.findall(r'AAVX', name)
        if len(resin) == 0:
            resin = re.findall(r'AAVx', name)
            if len(resin) == 0:
                resin = 'Unknown'
    return resin[0]

def get_serotype(name):
    """
    This function takes in a name and returns the serotype of AAV.
    Args:
        name: name of the file
    Returns:
        serotype: serotype of AAV
    """
    serotype = re.findall(r'AAV*\d+', name)
    if len(serotype) == 0:
        serotype = 'Unknown'
    return serotype[0]

def get_column_volume(name):
    """
    This function takes in a name and returns the column volume.
    Args:
        name: name of the file
    Returns:    
        column_volume: column volume
    """
    column_volume = re.findall(r'\d+(?:\.\d+)?[mM][lL]', name)
    if len(column_volume) == 0:
        column_volume = 'Unknown'
    return column_volume[0]

def is_pure(name):
    """
    This function takes in a name and returns if the sample is pure or not.
    Args:
        name: name of the file
    Returns:
        pure: True if the sample is pure, False otherwise
    """
    pure = re.findall(r'[pP]ure', name)
    if len(pure) == 0:
        pure = False
    else:
        pure = True
    return pure

# def plot_data(data, folder, name, data_dict, columns=['UV_280', 'Conductivity']):
#     """
#     This function takes in a dataframe and plots the data.
#     Args:
#         data: dataframe
#         folder: folder to save the plots
#         name: name of the plot
#         data_dict: dictionary with useful data
#         columns: list of columns to plot
#     Returns:
#         None"""
#     plt.rcParams["figure.figsize"] = (20,10)
#     for key in columns:
#         plt.plot( data[data_dict[key][0]], data[data_dict[key][1]], label=key)
#     resin, serotype = get_resin_and_serotype(name)
#     plt.title(f'Resin: {resin}, Serotype: {serotype}')
#     plt.xlabel('Volume (ml)')
#     plt.ylabel('mAU')
#     plt.legend()
#     plt.savefig(f'{folder}/plots/{name}.png')
#     plt.clf()

def plot_data(data, folder, name, data_dict, columns=['UV_280', 'Conductivity']):
    """
    This function takes in a dataframe and plots the data.
    Args:
        data: dataframe
        folder: folder to save the plots
        name: name of the plot
        data_dict: dictionary with useful data
        columns: list of columns to plot
    Returns:
        None"""
    plt.rcParams["figure.figsize"] = (20,10)
    fig, ax1 = plt.subplots()
    ax2 = ax1.twinx()
    ax1.plot(data[data_dict[columns[0]][0]], data[data_dict[columns[0]][1]], 'g-', label=columns[0])
    ax2.plot(data[data_dict[columns[0]][0]], data[data_dict[columns[1]][1]], 'b-', label=columns[1]) 
    ax1.set_xlabel('Volume (ml)')
    ax1.set_ylabel('mAU', color='g')
    ax2.set_ylabel('mS/cm', color='b')
    resin, serotype = get_resin_and_serotype(name)
    plt.title(f'Resin: {resin}, Serotype: {serotype}')
    plt.legend()
    fig.savefig(f'{folder}/plots/{name}.png')
    fig.clf()
    fig.clear()
    
    # plt.savefig(f'{folder}/plots/{name}.png')
    # plt.clf()

In [None]:
cannot convert float NaN to integer Affinity Data/AAV2/AAVx method with LG6 cleaning 001.csv
cannot convert float NaN to integer Affinity Data/AAVX_3/AAVX 0.5ml - AAV5 HEK elution 001.csv
cannot convert float NaN to integer Affinity Data/AAVX/AAVX Run 1 Pure SAS10112022 001.csv
cannot convert float NaN to integer Affinity Data/AAVX/AAVX 0.5ml - AAV5 HEK elution 001.csv
cannot convert float NaN to integer Affinity Data/AAV9_with_LigaGuard/ethanol column cleaning 001.csv
cannot convert float NaN to integer Affinity Data/AAV9_with_LigaGuard/LigaGuard AAV fixed volume 50 mL load 001.csv
cannot convert float NaN to integer Affinity Data/AAV9_with_LigaGuard/LigaGuard AAV 50 mL 002.csv
cannot convert float NaN to integer Affinity Data/AAV9_with_LigaGuard/LigaGuard AAV 50 mL 001.csv
cannot convert float NaN to integer Affinity Data/AAV9/AAVx 9th cycle 50 mL load 001.csv
cannot convert float NaN to integer Affinity Data/AAVX_2/AAVX HEK293 2mL column method(637483396913297566).csv
cannot convert float NaN to integer Affinity Data/AAVX_2/AAVX HEK293 0.1mL column DBC Study Part I - 033121 001 - 050721 JAH 001.csv