In [1]:
import glob
from sys import exit, path
from os.path import join, expanduser, exists, splitext, commonpath
from io import StringIO

import pandas as pd

from bokeh.io import output_file, export_png, export_svgs, show, output_notebook
from bokeh.transform import linear_cmap
from bokeh.plotting import figure
from bokeh.models import BoxAnnotation, Span, Label, tickers
from bokeh.layouts import gridplot
import bokeh.palettes
import bokeh_catplot
output_notebook()

# Put project-local source directories on the import path ahead of the local imports below.
# Presumably analyze_fictrac / analyze_stimulus live in the first directory — verify.
path.insert(1, expanduser('~/src/noexiit/software/analyses'))
# NOTE(review): this is an absolute, machine-specific path with no "~" in it, so
# expanduser() has nothing to expand here; the notebook will only import
# cmocean_cmaps on a machine that has this exact directory — confirm intent.
path.insert(1, expanduser('/home/hank-x299/src/cmocean-bokeh/'))
from cmocean_cmaps import get_all_cmocean_colours 
import analyze_fictrac
import analyze_stimulus

In [4]:
# glob.glob() makes no ordering guarantee (the order depends on the file system),
# and later cells index into this list (gc_fids[0]), so sort the matches to keep
# the notebook reproducible across machines and re-runs.
gc_fids = sorted(glob.glob("/mnt/2TB/data_in/noexiit_data/gc-fid/dalotia_on_ball/*/*/*/*.txt"))

In [5]:
gc_fids

['/mnt/2TB/data_in/noexiit_data/gc-fid/dalotia_on_ball/2020-07-22/dalotia_0/t_0/dalotia_0_t_0.txt',
 '/mnt/2TB/data_in/noexiit_data/gc-fid/dalotia_on_ball/2020-07-22/dalotia_0/t_noBeetle_postExpt/dalotia_0_t_noBeetle_postExpt.txt',
 '/mnt/2TB/data_in/noexiit_data/gc-fid/dalotia_on_ball/2020-07-23/dalotia_0/t_0/dalotia_0_t_0.txt',
 '/mnt/2TB/data_in/noexiit_data/gc-fid/dalotia_on_ball/2020-07-23/dalotia_0/t_noBeetle_postExpt/dalotia_0_t_noBeetle_postExpt.txt',
 '/mnt/2TB/data_in/noexiit_data/gc-fid/dalotia_on_ball/2020-07-24/dalotia_0/t_0/dalotia_0_t_0.txt',
 '/mnt/2TB/data_in/noexiit_data/gc-fid/dalotia_on_ball/2020-07-24/dalotia_0/t_noBeetle_postExpt/dalotia_0_t_noBeetle_postExpt.txt',
 '/mnt/2TB/data_in/noexiit_data/gc-fid/dalotia_on_ball/2020-07-24/dalotia_1/t_1/dalotia_1_t_1.txt',
 '/mnt/2TB/data_in/noexiit_data/gc-fid/dalotia_on_ball/2020-07-24/dalotia_1/t_noBeetle_postExpt/dalotia_1_t_noBeetle_postExpt.txt',
 '/mnt/2TB/data_in/noexiit_data/gc-fid/dalotia_on_ball/2020-07-24/daloti

In [6]:
def extract_GC_FID_chromatogram(txt,
                                start_marker="[Chromatogram (Ch2)]",
                                end_marker="[Status Trace (Column Oven Temperature)]"):

    """
    Extract the chromatogram from the GC-FID .txt file,
    so it can be read into Pandas via pd.read_csv().

    Lines are collected starting after the first line that begins with
    `start_marker` and stopping at the first line that begins with
    `end_marker`; neither marker line is included in the result.

    Parameters:
    -----------
    txt: Path to the raw text file outputted by the Shimadzu
         GC-2010 GC-FID machine.
    start_marker: Prefix of the line that opens the section to extract.
                  Defaults to the channel-2 chromatogram header.
    end_marker: Prefix of the line that ends the extraction. Defaults to
                the column oven temperature status trace header.

    Returns:
    --------
    A StringIO object containing the chromatogram from
    the GC-FID output. The buffer is empty if `start_marker` is never
    found, or if `end_marker` appears before it.
    """

    chromatogram = []
    do_store = False

    with open(txt, "r") as f:
        # Iterate over the file lazily: reading stops at the end marker, so
        # the remainder of the file never has to be loaded into memory.
        for line in f:
            if line.startswith(start_marker):
                do_store = True
                continue
            if line.startswith(end_marker):
                break

            if do_store:
                chromatogram.append(line)

    return StringIO("".join(chromatogram))

In [7]:
# Quick check on the first file: the extractor hands back an in-memory
# StringIO buffer (not a DataFrame), ready to be passed to pd.read_csv().
extract_GC_FID_chromatogram(gc_fids[0])

<_io.StringIO at 0x7f34516c95f0>

In [8]:
# Pull the chromatogram section of the first file into an in-memory buffer.
gc_fid_txt = extract_GC_FID_chromatogram(gc_fids[0])
# NOTE(review): skiprows=4 presumably skips section metadata lines that sit
# between the section header and the column-name row — confirm against a raw file.
df = pd.read_csv(gc_fid_txt, skiprows=4)
df

Unnamed: 0,R.Time (min),Intensity
0,0.00067,-360
1,0.00133,-361
2,0.00200,-361
3,0.00267,-361
4,0.00333,-361
...,...,...
17994,11.99667,-619
17995,11.99733,-619
17996,11.99800,-620
17997,11.99867,-620


In [26]:
# Single-trace view of the chromatogram currently held in `df`.
p = figure(
    title="SPME GC-FID from D. coriaria with degastered live L. occidentale",
    x_axis_label="retention time (mins)",
    y_axis_label="intensity",
    width=1000,
    height=400,
    background_fill_color="#f8f5f2",
)

# One major tick per unit of retention time.
p.xaxis.ticker = tickers.SingleIntervalTicker(interval=1)

# Frame and grid colours.
p.border_fill_color = "#f8f5f2"
p.ygrid.grid_line_color = "#efe8e2"
p.xgrid.grid_line_color = "#efe8e2"

# Intensity against retention time.
p.line(
    x=df["R.Time (min)"],
    y=df["Intensity"],
    line_width=3,
    alpha=0.7,
    color="#1f78b4",
)

show(p)

For the entire dataset:

In [10]:
# Parse every GC-FID file into a DataFrame: extract the chromatogram section
# and hand the resulting buffer straight to pandas, one file at a time.
chromatograms = [
    pd.read_csv(extract_GC_FID_chromatogram(gc_fid), skiprows=4)
    for gc_fid in gc_fids
]

In [11]:
def parse_GC_FID_txts(paths):
    """
    Generates a list of chromatogram dataframes from a list
    of GC-FID .txt outputs. Adds metadata from each file's path.

    Assumes every file sits in a directory structure that ends in
    'date/animal/trial/<file>.txt'. The metadata is read from each path's
    own last three directories, so the result for a file does not depend
    on which other paths are passed alongside it (a single path, or paths
    that all share the same date or animal, are handled correctly).

    Parameters:
    ------------
    paths: list of paths to the GC-FID text files

    Returns:
    -------
    A list of GC-FID dataframes with metadata, one per path, each with
    added 'date', 'animal' and 'trial' columns. Empty if `paths` is empty.

    Raises:
    -------
    ValueError: if a path has fewer than three parent directories.
    """

    # Function-scope import: pathlib is not part of the notebook's import cell.
    from pathlib import PurePath

    dfs = []
    # Loop variable deliberately not called `path`, which would shadow the
    # `path` name imported from `sys` at the top of the notebook.
    for gc_fid_path in paths:

        parts = PurePath(gc_fid_path).parts
        if len(parts) < 4:
            raise ValueError(
                f"Expected a path ending in 'date/animal/trial/<file>', got: {gc_fid_path}"
            )
        # The three directories directly above the file.
        date, animal, trial = parts[-4:-1]

        chromatogram = extract_GC_FID_chromatogram(gc_fid_path)

        df = pd.read_csv(chromatogram, skiprows=4)
        df["date"] = date
        df["animal"] = animal
        df["trial"] = trial

        dfs.append(df)

    return dfs

In [12]:
# Stack every per-file chromatogram (with its date/animal/trial columns) into one long frame.
# NOTE: pd.concat() is called without ignore_index, so the index is not renumbered
# and index labels repeat across files; select rows by the metadata columns, not by label.
concat_df = pd.concat(parse_GC_FID_txts(gc_fids))
concat_df

Unnamed: 0,R.Time (min),Intensity,date,animal,trial
0,0.00067,-360,2020-07-22,dalotia_0,t_0
1,0.00133,-361,2020-07-22,dalotia_0,t_0
2,0.00200,-361,2020-07-22,dalotia_0,t_0
3,0.00267,-361,2020-07-22,dalotia_0,t_0
4,0.00333,-361,2020-07-22,dalotia_0,t_0
...,...,...,...,...,...
17995,11.99700,119,2020-08-08,dalotia_4,t_noBeetle_postExpt
17996,11.99767,118,2020-08-08,dalotia_4,t_noBeetle_postExpt
17997,11.99833,118,2020-08-08,dalotia_4,t_noBeetle_postExpt
17998,11.99900,117,2020-08-08,dalotia_4,t_noBeetle_postExpt


Plot GC-FID traces for each animal. Although multiple traces for the same animal are plotted together, note that SPME GC-FID traces cannot actually be compared with one another.

In [28]:
# Colours for the beetle trials, based on `bokeh.palettes.Paired4`.
# Defined once, outside the loop, since it never changes between animals.
palettes = ["#1f78b4", "#a6cee3"]

# One figure per (date, animal); group is a df:
for name, group in concat_df.groupby(["date", "animal"]):

    # Use helper function to unconcatenate into list of dfs:
    dfs_by_trial = analyze_fictrac.unconcat_df(group, "trial")

    p = figure(background_fill_color="#f8f5f2",
           width=1000,
           height=400,
           x_axis_label="retention time (mins)",
           y_axis_label="intensity",
           title=f"SPME GC-FID for D. coriaria with degastered live L. occidentale {name}")

    # Figure-level styling does not depend on the traces, so apply it once
    # per figure instead of once per trace.
    p.border_fill_color = "#f8f5f2"
    p.xgrid.grid_line_color = "#efe8e2"
    p.ygrid.grid_line_color = "#efe8e2"
    p.xaxis.ticker = tickers.SingleIntervalTicker(interval=0.5)

    # The loop variable is `trial_df`, not `df`, so this cell does not
    # overwrite the notebook-level `df` that the single-trace plot cell uses.
    for i, trial_df in enumerate(dfs_by_trial):

        # Positional lookup: works whatever index labels the helper leaves behind.
        trial = trial_df["trial"].iloc[0]

        if "noBeetle" in trial:
            # Control (no-beetle) runs are always drawn in grey.
            palette = "#a8a9aa"
        else:
            # Wrap around the palette rather than zip()-ing against it, so an
            # animal with more trials than colours does not silently lose traces.
            palette = palettes[i % len(palettes)]

        p.line(trial_df["R.Time (min)"],
               trial_df["Intensity"],
               color=palette,
               alpha=0.8,
               line_width=3,
               legend_label=trial
              )

    # The legend only exists once at least one labelled trace has been added.
    p.legend.background_fill_color = "#f8f5f2"

    show(p)

In [None]:
# TODO: Use peak data to label retention times for each peak in the trace
# TODO: Build presence-absence matrix for each compound