In [1]:
import os
import re
import glob
from collections import defaultdict
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [10]:
def make_dir_if_not_exists(folder):
    """
    This function takes in a folder name and creates a folder if it does not exist.
    Args:
        folder: folder name
    Returns:
        None"
    """
    if not os.path.exists(folder):
        os.makedirs(folder)

def get_col_name_from_index(data, index):
    """
    This function takes in a dataframe and an index and returns the column name.
    Args:
        data: dataframe
        index: index of the column
    Returns:
        column name"""
    return data.columns[index]

def load_useful_data(data):
    """
    This function takes in a dataframe and returns a dictionary with useful data.
    Args:
        data: dataframe
    Returns:
        data_dict: dictionary with useful data
    """
    pH = list(data.columns).index('pH')

    uv_280 = list(data.columns).index('mAU')
    uv_260 = list(data.columns).index('mAU.1')
    conductivity = list(data.columns).index('mS/cm')
    sample_flow = list(data.columns).index('CV/h')
    system_flow = list(data.columns).index('CV/h.1')
    sample_pressure = list(data.columns).index('MPa')
    system_pressure = list(data.columns).index('MPa.1')

    ml_pH = pH - 1
    ml_uv_280 = uv_280 - 1
    ml_uv_260 = uv_260 - 1
    ml_conductivity = conductivity - 1
    ml_sample_flow = sample_flow - 1
    ml_system_flow = system_flow - 1
    ml_sample_pressure = sample_pressure - 1
    ml_system_pressure = system_pressure - 1

    data_dict = {'pH': [get_col_name_from_index(data, ml_pH), get_col_name_from_index(data, pH)],
                'UV_280': [get_col_name_from_index(data, ml_uv_280), get_col_name_from_index(data, uv_280)], 
                'UV_260': [get_col_name_from_index(data, ml_uv_260), get_col_name_from_index(data, uv_260)], 
                'Conductivity': [get_col_name_from_index(data, ml_conductivity), get_col_name_from_index(data, conductivity)], 
                'Sample Flow': [get_col_name_from_index(data, ml_sample_flow), get_col_name_from_index(data, sample_flow)], 
                'System_flow': [get_col_name_from_index(data, ml_system_flow), get_col_name_from_index(data, system_flow)], 
                'Sample Pressure': [get_col_name_from_index(data, ml_sample_pressure), get_col_name_from_index(data, sample_pressure)], 
                'System Pressure': [get_col_name_from_index(data, ml_system_pressure), get_col_name_from_index(data, system_pressure)]}
    return data_dict

def get_resin_and_serotype(name):
    """
    This function takes in a name and returns the resin used and serotype of AAV.
    Args:
        name: name of the file
    Returns:
        resin: resin used
        serotype: serotype of AAV
    """
    resin = re.findall(r'AAV[A-Z]\d+', name)
    serotype = re.findall(r'AAV*\d+', name)
    if len(resin) == 0:
        resin = 'Unknown'
    if len(serotype) == 0:
        serotype = 'Unknown'
    return resin[0], serotype[0]

# def plot_data(data, folder, name, data_dict, columns=['UV_280', 'Conductivity']):
#     """
#     This function takes in a dataframe and plots the data.
#     Args:
#         data: dataframe
#         folder: folder to save the plots
#         name: name of the plot
#         data_dict: dictionary with useful data
#         columns: list of columns to plot
#     Returns:
#         None"""
#     plt.rcParams["figure.figsize"] = (20,10)
#     for key in columns:
#         plt.plot( data[data_dict[key][0]], data[data_dict[key][1]], label=key)
#     resin, serotype = get_resin_and_serotype(name)
#     plt.title(f'Resin: {resin}, Serotype: {serotype}')
#     plt.xlabel('Volume (ml)')
#     plt.ylabel('mAU')
#     plt.legend()
#     plt.savefig(f'{folder}/plots/{name}.png')
#     plt.clf()

def plot_data(data, folder, name, data_dict, columns=['UV_280', 'Conductivity']):
    """
    This function takes in a dataframe and plots the data.
    Args:
        data: dataframe
        folder: folder to save the plots
        name: name of the plot
        data_dict: dictionary with useful data
        columns: list of columns to plot
    Returns:
        None"""
    plt.rcParams["figure.figsize"] = (20,10)
    fig, ax1 = plt.subplots()
    ax2 = ax1.twinx()
    ax1.plot(data[data_dict[columns[0]][0]], data[data_dict[columns[0]][1]], 'g-', label=columns[0])
    ax2.plot(data[data_dict[columns[0]][0]], data[data_dict[columns[1]][1]], 'b-', label=columns[1]) 
    ax1.set_xlabel('Volume (ml)')
    ax1.set_ylabel('mAU', color='g')
    ax2.set_ylabel('mS/cm', color='b')
    resin, serotype = get_resin_and_serotype(name)
    plt.title(f'Resin: {resin}, Serotype: {serotype}')
    plt.legend()
    fig.savefig(f'{folder}/plots/{name}.png')
    fig.clf()
    fig.clear()
    
    # plt.savefig(f'{folder}/plots/{name}.png')
    # plt.clf()

In [15]:
folder ='AAV10/'
CSVs = glob.glob(f'{folder}/*.csv')

for csv in CSVs:
    name = csv.split('/')[-1][:-4]
    # print(name)
    data = pd.read_csv(csv, skiprows = [0,1], delimiter='\t', encoding='utf_16', low_memory=False)
    data_dict = load_useful_data(data)
    make_dir_if_not_exists(f'{folder}/plots')
    plot_data(data, folder, name, data_dict)

  fig, ax1 = plt.subplots()


<Figure size 2000x1000 with 0 Axes>

<Figure size 2000x1000 with 0 Axes>

<Figure size 2000x1000 with 0 Axes>

<Figure size 2000x1000 with 0 Axes>

<Figure size 2000x1000 with 0 Axes>

<Figure size 2000x1000 with 0 Axes>

<Figure size 2000x1000 with 0 Axes>

<Figure size 2000x1000 with 0 Axes>

<Figure size 2000x1000 with 0 Axes>

<Figure size 2000x1000 with 0 Axes>

<Figure size 2000x1000 with 0 Axes>

<Figure size 2000x1000 with 0 Axes>

<Figure size 2000x1000 with 0 Axes>

<Figure size 2000x1000 with 0 Axes>

<Figure size 2000x1000 with 0 Axes>

<Figure size 2000x1000 with 0 Axes>

<Figure size 2000x1000 with 0 Axes>

<Figure size 2000x1000 with 0 Axes>

<Figure size 2000x1000 with 0 Axes>

<Figure size 2000x1000 with 0 Axes>

<Figure size 2000x1000 with 0 Axes>

<Figure size 2000x1000 with 0 Axes>

<Figure size 2000x1000 with 0 Axes>

In [117]:
!rm -rf AAV10/plots

In [102]:
folder ='SEC'

TXTs = glob.glob(f'{folder}/*.txt')
ARWs = glob.glob(f'{folder}/*.arw')

In [98]:
data = defaultdict(list)

def make_dir_if_not_exists(folder):
    """
    This function takes in a folder name and creates a folder if it does not exist.
    Args:
        folder: folder name
    Returns:
        None"
    """
    if not os.path.exists(folder):
        os.makedirs(folder)

def plot_sec_data(data, sample_set, sample_name, channel):
    """
    This function takes in a dataframe and plots the data.
    Args:
        data: dataframe
    Returns:
        None
    """
    plt.rcParams["figure.figsize"] = (20,10)
    plt.plot(data[0], data[1])
    plt.title(f'Sample Set: {sample_set} | Sample Name: {sample_name} | Channel: {channel}')
    plt.xlabel('Volume (ml)')
    plt.ylabel('mAU')
    # plt.legend()
    plt.savefig(f'{folder}/plots/{name}.png')
    plt.clf()

with tqdm(total=len(TXTs)) as t:
    for txt in TXTs:
        f = open(txt).read().split('\t')
        filename = txt.split('/')[-1][:-4]
        SampleName = f[19].replace('"', '')
        Channel = f[20].replace('"', '')
        SampleSetName = f[18].replace('"', '')
        data[filename].extend([SampleSetName, SampleName, Channel])


with tqdm(total=len(ARWs)) as t:
    for arw in ARWs:
        filename = arw.split('/')[-1][:-4]
        if filename in data:
            details = data[filename]
            sample_set = details[0].strip(" ")
            sample_name = details[1].strip(" ")
            channel = details[2].strip(" ")
            sample_set_dir = make_dir_if_not_exists(details[0])
            sample_name_dir = make_dir_if_not_exists(f'{details[0]}/{details[1]}')
            if details[2] != 'PDA Spectrum':
                df = pd.read_csv(arw, header=None, delimiter='\t')
                df.to_csv(f'{details[0]}/{details[1]}/{details[2]}.csv', header=False, index=False)
            t.update(1)

In [48]:
data

defaultdict(list,
            {'kviral2154': ['FEED FOR REK AND BDM',
              'AAVA3 DBC 6H8',
              'PDA Ch2 280nm@4.8nm'],
             'kviral2151': ['FEED FOR REK AND BDM',
              'AAVA3 DBC 6H8',
              'ACQUITY FLR ChA '],
             'kviral2153': ['FEED FOR REK AND BDM',
              'AAVA3 DBC 6H8',
              'PDA Ch1 260nm@4.8nm'],
             'kviral2152': ['FEED FOR REK AND BDM',
              'AAVA3 DBC 6H8',
              'PDA Spectrum']})

In [18]:
data[['ml', 'mAU']].dropna().to_csv('test.csv', index=False)