Algorithm to buffer-subtract, truncate, and area-normalize FTIR spectra

5/25/18 Updated directory structure handling

5/29/18 Revised buffer subtraction to use least squares regression in place of manual input of acceptable minima for iteration.
 - changed the buffer subtraction-constant iteration from a static list to a modifiable numpy arange iterator
 - modified the buffer subtraction mean value to obtain better buffer subtraction output

To Do:  
  - Investigate ways to better subtract the buffer, especially if air bubbles are present/slight changes in cell pathlength (peak area based normalization, with/without band narrowing, baseline correction, etc.)
  


In [1]:
#Import required modules
import sys, os, math, warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import optimize

from scipy.integrate import simps
from scipy import integrate

from tkinter import Tk, filedialog
from tkinter.filedialog import askopenfilename, askopenfilenames
from matplotlib import rcParams
from matplotlib import rc
%matplotlib inline

#Ignores ALL warnings. Should delete this before any serious alterations...
warnings.filterwarnings("ignore")

create dataframes


In [2]:
import re
numbers = re.compile(r'(\d+)')

def numericalSort(value):
    """Build a sort key that orders filenames by their embedded numbers.

    The string is split on runs of digits and every digit run is converted
    to ``int``, so ``'s10'`` sorts after ``'s2'``.

    Args:
        value: The string (typically a filename or path) to build a key for.

    Returns:
        list: Alternating text and integer pieces, for example
        ``'file10.csv'`` becomes ``['file', 10, '.csv']``.
    """
    pieces = numbers.split(value)
    # The pattern has one capture group, so the digit runs sit at odd positions.
    return [int(piece) if pos % 2 else piece for pos, piece in enumerate(pieces)]

In [3]:
def select_file():
    """Ask the user to pick one combined data file through a Tk file dialog.

    The dialog is restricted to ``*.csv`` and ``*.txt`` files.

    Returns:
        tuple: ``(filename, folder_path)`` - the base name of the chosen file
        and the directory that contains it. Both are empty strings when the
        dialog returns an empty selection.
    """
    ftypes = [
            ('csv or text files', '*.csv *.txt')
            ]

    dialog_title = '''Select the datafile containing wavenumber in column 1, 
                                           buffer spectra in column 2, and sample spectra in remaining columns.'''

    # Keep a handle on the hidden root window so it can be destroyed once the
    # dialog is closed; otherwise every call leaves another Tk root alive.
    root = Tk()
    root.withdraw()
    try:
        filename = str(askopenfilename(filetypes=ftypes, title=dialog_title))
    finally:
        root.destroy()

    # Split folder and base name with the same os.path call so the two halves
    # always agree, whatever separator the dialog returned.
    folder_path, filename = os.path.split(filename)
    return filename, folder_path

In [4]:
def select_files():
    """Ask the user for the buffer, reference and sample files via Tk dialogs.

    Three dialogs are shown in order: one buffer file, one reference file,
    then any number of sample files. The sample files are ordered with
    ``numericalSort`` before being appended.

    Returns:
        tuple: ``(filenames, folder_path)`` where ``filenames`` is the list
        ``[buffer, reference, sample_1, ...]`` of base names and
        ``folder_path`` is the directory of the buffer file. Only that one
        directory is returned, so the other files are expected to live in
        the same folder.
    """
    ftypes = [
            ('csv or text files', '*.csv *.txt')
            ]

    # Keep a handle on the hidden root window so it can be destroyed once the
    # dialogs are closed; otherwise every call leaves another Tk root alive.
    root = Tk()
    root.withdraw()
    try:
        buffer_file = str(askopenfilename(filetypes=ftypes, title='Choose the buffer file.'))
        reference_file = str(askopenfilename(filetypes=ftypes, title='Choose the reference file.'))
        data_files = [str(i) for i in askopenfilenames(filetypes=ftypes,title='Choose the sample files.')]
    finally:
        root.destroy()

    folder_path = os.path.split(buffer_file)[0]

    # Sort the sample files numerically by the numbers appearing in their path
    data_files = sorted(data_files, key=numericalSort)

    # Merge buffer, reference and sorted sample files, keeping base names only
    filenames = [os.path.basename(path)
                 for path in [buffer_file, reference_file] + data_files]

    return filenames, folder_path

In [5]:
def create_df_from_single_file(data_filename, folder_path):
    """Load a combined spectra CSV (with a header row) into a DataFrame.

    Args:
        data_filename: Name of the CSV file inside ``folder_path``.
        folder_path: Directory that contains the file.

    Returns:
        tuple: ``(df, filenames)``. ``filenames`` is the list of column
        labels applied to ``df``: ``'freq'`` for the first column, then every
        remaining header truncated at its first ``'.'``. ``df`` is sorted by
        descending first column, limited to rows whose first column lies
        strictly between 999 and 3999, and re-indexed from 0.
    """
    df = pd.read_csv(os.path.join(folder_path, data_filename))

    # Index.get_values() no longer exists in pandas >= 1.0, so iterate the
    # column Index directly instead.
    filenames = ['freq'] + [col.split('.')[0] for col in df.columns[1:]]
    df.columns = filenames

    # Ensure the dataframe is sorted by descending wavenumber
    df = df.sort_values(by=df.columns[0], ascending=False)

    # Truncate to the 1000 - 3999 wavenumber range to start (bounds exclusive)
    freq = df[df.columns[0]]
    df = df[(freq < 3999) & (freq > 999)].reset_index(drop=True)

    return df, filenames

In [6]:
def create_df_from_multiple_files(data_filenames, folder_path):
    """Assemble one DataFrame from separate header-less spectrum files.

    The first file supplies two columns, labelled ``'freq'`` and
    ``'buffer'``. From every further file only the second column is read and
    appended, labelled with the file name truncated at its first ``'.'``.
    Columns are joined side by side on the row index, so the files are
    expected to list the same wavenumbers in the same order.

    Args:
        data_filenames: File names inside ``folder_path``; the first entry is
            the buffer file.
        folder_path: Directory that contains all the files.

    Returns:
        tuple: ``(df, filenames)``. ``filenames`` is the list of column
        labels of ``df``. ``df`` is sorted by descending ``'freq'``, limited
        to rows whose ``'freq'`` lies strictly between 999 and 3999, and
        re-indexed from 0.
    """
    df = pd.read_csv(os.path.join(folder_path, data_filenames[0]), header=None)
    df.columns = ['freq', 'buffer']
    filenames = ['freq', 'buffer']

    # Collect the per-file columns and concatenate once, rather than
    # re-copying the growing frame on every iteration.
    pieces = [df]
    for name in data_filenames[1:]:
        title = name.split('.')[0]
        signal = pd.read_csv(os.path.join(folder_path, name), usecols=[1], header=None)
        signal.columns = [title]
        pieces.append(signal)
        filenames.append(title)
    df = pd.concat(pieces, axis=1)

    # Ensure the dataframe is sorted by descending wavenumber
    df = df.sort_values(by=['freq'], ascending=False)
    df = df.reset_index(drop=True)

    # Truncate to the 1000 - 3999 wavenumber range to start (bounds exclusive)
    freq = df[df.columns[0]]
    df = df[(freq < 3999) & (freq > 999)].reset_index(drop=True)

    return df, filenames

In [7]:
def fun(c, df):
    """Objective for the buffer-scaling fit: spread of the smoothed residual.

    Subtracts ``c`` times the second column from the third column, keeps the
    rows whose first column lies strictly between 1690 and 2500, smooths the
    residual with a centred 12-point rolling mean and returns the absolute
    difference between the largest and smallest smoothed value.

    Args:
        c: Trial scaling factor applied to the buffer column.
        df: DataFrame with the x-axis in column 0, the buffer in column 1
            and the sample in column 2. It is not modified.

    Returns:
        The peak-to-peak range of the smoothed residual inside the window.
    """
    freq = df[df.columns[0]]
    residual = df[df.columns[2]] - c*df[df.columns[1]]

    #orig 2500 - 1720,new 2200, 1710; peptide: 2000-2200
    in_window = (freq < 2500) & (freq > 1690)

    # Rolling mean minimizes the impact of noise on the max/min estimate
    smoothed = residual[in_window].rolling(window=12, center=True, min_periods=1).mean()

    return abs(smoothed.max() - smoothed.min())

In [8]:
def get_constant(df):
    """Fit the buffer scaling factor and return the buffer-subtracted signal.

    ``fun`` is minimized with ``scipy.optimize.minimize`` starting from 1.1;
    the fitted factor is then multiplied by 0.99 (presumably to stay slightly
    short of over-subtraction - confirm), printed, and used to subtract the
    scaled second column from the third column.

    Args:
        df: DataFrame with the buffer in column 1 and the sample in column 2.

    Returns:
        The third column minus the scaled second column. Despite the function
        name, the subtracted data is returned, not the factor itself.
    """
    fit = optimize.minimize(fun, 1.1, args=(df,))

    d = 0.99*fit.x
    print('buffer subtraction factor = ', d)

    return df[df.columns[2]] - d*df[df.columns[1]]

In [9]:
def buffer_subtract(df):
    """Add a baseline-corrected, buffer-subtracted column to ``df``.

    The buffer-subtracted signal from ``get_constant`` is shifted so that its
    value near 1730 on the x-axis becomes zero, and stored in a new column
    named ``<third column name>_subtracted``.

    Args:
        df: DataFrame with the x-axis in column 0, the buffer in column 1
            and the sample in column 2. It is modified in place.

    Returns:
        The same DataFrame object, with the extra column added.

    Raises:
        ValueError: If ``df`` has no rows.
    """
    result = get_constant(df)

    if len(df) == 0:
        raise ValueError('Cannot baseline-correct an empty spectrum.')

    # Address the output column by name rather than by position 3, so a frame
    # with extra columns cannot have the wrong column overwritten.
    subtracted_col = df.columns[2] + '_subtracted'

    freq = df[df.columns[0]]
    in_window = result[(freq > 1729) & (freq < 1731)]

    if len(in_window) > 0:
        # First row inside the (1729, 1731) window, as before
        baseline_value = in_window.values[0]
    else:
        # No sampled point falls inside the window (e.g. coarse spacing):
        # fall back to the point closest to 1730 instead of raising IndexError.
        nearest_pos = (freq - 1730).abs().values.argmin()
        baseline_value = result.values[nearest_pos]

    df[subtracted_col] = result-baseline_value

    return df

In [10]:
def crop(df):
    """Return the rows of ``df`` that fall in the 1600-1705 cm-1 region.

    Args:
        df: DataFrame whose first column holds the frequency axis.

    Returns:
        A new DataFrame containing only the rows whose first column is
        strictly greater than 1599 and strictly less than 1706, re-indexed
        from 0. The input frame is left untouched.
    """
    freq = df[df.columns[0]]
    keep = (freq > 1599) & (freq < 1706)
    return df[keep].reset_index(drop=True)

The following cell processes the FTIR data

In [11]:
#Block for creating basic UI and output figures/csvs.
#
# Workflow: ask for the input layout, load the raw spectra, buffer-subtract
# every sample column, then write the full-range and Amide I (cropped)
# results to csv files and a plot of the full-range result to a png.


buff_subtracted_dataframes = [] #create empty list to store buffer subtracted data
amideI_dataframes = []  #create empty list to store Amide I Area normalized data

# Compare the typed answer as text instead of eval()-ing it: eval would
# execute whatever expression the user types. Anything other than "1"
# selects the multi-file mode.
source_file = input("""Enter 1 to import all data from a single csv (with headers), or 2 for individual data and buffer files (no headers) """).strip() == '1'

if source_file:
    raw_data_filename, directory = select_file()

    rawData_df, raw_data_filenames = create_df_from_single_file(raw_data_filename, directory)  #Creates the raw signal dataframe of buffer and each protein
    print(raw_data_filenames)
    print(rawData_df.head(3))

else:
    raw_data_filenames, directory = select_files() #Calls select_files to read and pass back the filenames of interest in a list

    rawData_df, raw_data_filenames = create_df_from_multiple_files(raw_data_filenames, directory)  #Creates the raw signal dataframe of buffer and each protein

num_files = len(raw_data_filenames)-2 #length excludes the 'freq' and buffer columns
if num_files < 1:
    # Fail with a clear message instead of an obscure error further down
    raise ValueError('No sample spectra to process: the input must contain at least one '
                     'spectrum besides the wavenumber axis and the buffer.')

initial_x_df = rawData_df.iloc[:, 0:2].copy()  #copies x-axis and buffer data for loop dataframe


for raw_data_file in raw_data_filenames[2:]: #Loops through every sample column name
    raw_data_file_df1 = rawData_df[[raw_data_file]].copy()  #Raw signal of the current sample

    raw_data_file_df2 = pd.concat([initial_x_df, raw_data_file_df1], axis=1)  #x-axis, buffer, sample

    protein = raw_data_file_df2.columns[2] #Gets column name of current spectra being processed
    print('Currently processing file: ' + protein)

    subtr_df = buffer_subtract(raw_data_file_df2).copy() #adds buffer subtracted spectra to df

    full_x_df = subtr_df.iloc[:, [0]].copy()  #copies x-axis data for building buffer subtracted dataframe
    buffsub_col = subtr_df[[protein + '_subtracted']] #Extract only the buffer subtracted column
    buff_subtracted_dataframes.append(buffsub_col)

    crop_df = crop(subtr_df) #truncates data to amide I region
    crop_x_df = crop_df.iloc[:, [0]].copy()  #copies x-axis data for building the Amide I dataframe
    amideI_col = crop_df[[protein + '_subtracted']]
    amideI_dataframes.append(amideI_col)


all_subtracted_df = pd.concat(buff_subtracted_dataframes, axis=1)
all_subtracted_df = pd.concat([full_x_df, all_subtracted_df], axis=1)
all_subtracted_df.plot(x='freq', figsize=(14,10))

# Join with a path separator so the figure is written inside the data folder,
# next to the csv outputs.
plt.savefig(os.path.join(directory, 'subtr' + '.png'))
plt.show()
plt.close()

all_amideI_df = pd.concat(amideI_dataframes, axis=1)
all_amideI_df = pd.concat([crop_x_df, all_amideI_df], axis=1)

all_subtracted_df.to_csv(os.path.join(directory, 'BufferSubtracted' + '.csv'), index=False)
all_amideI_df.to_csv(os.path.join(directory, 'Amide_I_Buffer_Subtracted' + '.csv'), index=False)

print('\nDone!')


Enter 1 to import all data from a single csv (with headers), or 2 for individual data and buffer files (no headers)  2


TclError: no display name and no $DISPLAY environment variable