# Pre-process Data

author: Binbin Zhu

This page shows how the 520 raw data files are combined into a single file.

In [1]:
pip install peakutils

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import glob
import os
import seaborn as sns
import matplotlib.pyplot as plt
import peakutils


## Define smooth function and baseline correction function

I used a moving average to smooth the data and kept only the data points whose wavelength is strictly between 500 and 2000.

In [3]:
# Define a function to smooth data by moving average
def smooth(data, window=10, lower=500, upper=2000):
    """Smooth data with a trailing moving average and crop it to an index range.

    Parameters
    ----------
    data : pandas.DataFrame or pandas.Series
        Values indexed by a numeric axis (the index is compared against
        ``lower`` and ``upper``).
    window : int, default 10
        Number of consecutive rows averaged by the rolling mean.
    lower, upper : numeric, default 500 and 2000
        Exclusive bounds: rows whose index is ``<= lower`` or ``>= upper``
        are removed.

    Returns
    -------
    Same type as ``data``
        The rolling mean restricted to rows with ``lower < index < upper``.

    Notes
    -----
    The first ``window - 1`` rows of a rolling mean are NaN. They are only
    removed here if their index falls outside the kept range.
    """
    rolling_average = data.rolling(window=window).mean()
    index = rolling_average.index
    # Same rows the original removed via a label-based drop, expressed as a mask.
    outside = (index >= upper) | (index <= lower)
    return rolling_average.loc[~outside]

I used a polynomial fitting method to correct the baseline. The baseline estimation algorithm is peakutils.baseline( ), which computes the baseline of the given data. The baseline is then subtracted from the data to obtain the corrected values.

In [4]:
# Define a function to correct baseline by polynomial fitting method
def baseline_correction(data):
    """Return ``data`` with its estimated baseline subtracted.

    The baseline is whatever ``peakutils.baseline(data)`` returns for the
    input; the result is the element-wise difference ``data - baseline``.
    """
    return data - peakutils.baseline(data)

## Combine data

I combined all raw negative and positive data separately. Then, the spectral data was denoised with the functions defined above. The negative dataframe contains 280 rows x 1556 columns and the positive dataframe contains 240 rows x 1556 columns. I also added one column to indicate the diagnosis, negative (N) or positive (P), so each dataframe finally has 1557 columns.

In [5]:
# Combine all negatives into one
path_neg = ['19-441C', '19-442C', '19-445C', '19-447C', '19-448C', '19-450C', '19-452C', '19-453C', '19-455C', '19-456C', '19-460C','19-462C', '19-483C', '19-512C']
neg = []
for folder in path_neg:
    # glob.glob returns matches in arbitrary, OS-dependent order; sorting makes
    # the sample order (and therefore the row order of the result) reproducible.
    files = sorted(glob.glob(os.path.join(folder, '*.CSV')))
    if not files:
        # Fail with the folder name instead of pandas' generic
        # "No objects to concatenate" error.
        raise FileNotFoundError('No *.CSV files found in folder {!r}'.format(folder))
    # Each file is read with its first column as the index; files are placed side by side.
    combined = pd.concat((pd.read_csv(f, index_col=0, header=None) for f in files), axis=1, ignore_index=True)
    neg.append(combined)
all_neg = pd.concat(neg, axis=1, ignore_index=True)

col = len(all_neg.columns)

# Denoise one column at a time, then turn each column into a single row.
frames = []
for i in range(col):
    data = smooth(all_neg.iloc[:, [i]])
    data = baseline_correction(data)
    frames.append(data.transpose())
denoised_neg = pd.concat(frames, ignore_index=True)
denoised_neg['diagnosis'] = 'N'
denoised_neg.shape

(280, 1557)

In [6]:
# Combine all positive into one
path_pos = ['19-443C+', '19-444C+', '19-446C+', '19-449C+', '19-451C+', '19-454C+', '19-457C+', '19-458C+', '19-459C+', '19-461C+', '19-464C+', '19-511C+']
pos = []
for folder in path_pos:
    # glob.glob returns matches in arbitrary, OS-dependent order; sorting makes
    # the sample order (and therefore the row order of the result) reproducible.
    files = sorted(glob.glob(os.path.join(folder, '*.CSV')))
    if not files:
        # Fail with the folder name instead of pandas' generic
        # "No objects to concatenate" error.
        raise FileNotFoundError('No *.CSV files found in folder {!r}'.format(folder))
    # Each file is read with its first column as the index; files are placed side by side.
    combined = pd.concat((pd.read_csv(f, index_col=0, header=None) for f in files), axis=1, ignore_index=True)
    pos.append(combined)
all_pos = pd.concat(pos, axis=1, ignore_index=True)

col = len(all_pos.columns)

# Denoise one column at a time, then turn each column into a single row.
frames = []
for i in range(col):
    data = smooth(all_pos.iloc[:, [i]])
    data = baseline_correction(data)
    frames.append(data.transpose())
denoised_pos = pd.concat(frames, ignore_index=True)
denoised_pos['diagnosis'] = 'P'
denoised_pos.shape

(240, 1557)

Combine the negative and positive data and save the results as CSV files: the raw spectra in all_data.csv and the denoised spectra in all_denoised_data.csv.

In [7]:
# Combine all of raw spectral data
# all_neg and all_pos store one sample per column and share the index read from
# the first column of the raw files, so they must be joined side by side (axis=1).
# Stacking them along rows with ignore_index=True would discard that index and
# pad the columns missing from the smaller frame with NaN.
all_data = pd.concat([all_neg, all_pos], axis=1, ignore_index=True)
all_data.to_csv('all_data.csv')
# Combine all of denoised negative and positive
# The denoised frames store one sample per row, so these are stacked along rows.
all_denoised_data = pd.concat([denoised_neg, denoised_pos], ignore_index=True)
all_denoised_data.to_csv('all_denoised_data.csv')
all_denoised_data

Unnamed: 0,500.3769,501.3411,502.3054,503.2696,504.2338,505.1981,506.1623,507.1265,508.0908,509.055,...,1992.046,1993.01,1993.974,1994.9379999999999,1995.9029999999998,1996.8670000000002,1997.8310000000001,1998.795,1999.76,diagnosis
0,1.037038,1.088265,1.233976,1.347499,1.295797,1.109784,0.973557,1.075938,1.347899,1.531108,...,2.191152,2.406767,2.603309,2.681419,2.691378,2.741203,2.844648,2.844278,2.618507,N
1,6.211912,5.902306,5.713722,5.602013,5.524272,5.470924,5.471811,5.552292,5.657490,5.664200,...,5.040420,5.092443,4.931562,4.788789,4.728263,4.662078,4.481508,4.203521,3.969624,N
2,4.988583,5.032977,5.098549,5.159239,5.215715,5.250594,5.218340,5.068592,4.799335,4.586536,...,4.833457,4.806668,4.909719,5.051993,5.056172,4.897813,4.761009,4.737644,4.811733,N
3,5.035774,5.125359,5.208042,5.284477,5.301366,5.258965,5.189198,5.122413,5.059652,4.897291,...,1.877656,2.043109,2.016493,1.957189,2.030062,2.231939,2.419572,2.477660,2.322978,N
4,6.314343,5.932537,5.653940,5.497581,5.446961,5.467883,5.500728,5.474824,5.418199,5.397387,...,4.336513,4.385986,4.522248,4.492372,4.181757,3.667499,3.182403,2.944281,2.972792,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
515,4.917788,4.679630,4.202548,3.748590,3.576837,3.715020,4.030479,4.360715,4.602141,4.636234,...,1.364448,1.683123,2.096895,2.503461,2.737105,2.759245,2.765097,2.883808,3.002274,P
516,1.867906,1.947775,2.042472,2.101455,2.078990,2.001350,1.950086,2.012657,2.109786,2.151017,...,1.061596,0.978107,0.857280,0.781880,0.738261,0.692152,0.653411,0.629868,0.603550,P
517,5.851671,5.838641,5.660157,5.436936,5.383870,5.503249,5.682234,5.780665,5.770482,5.732470,...,4.353588,4.230101,4.034263,3.818549,3.702072,3.756657,3.885366,4.022213,4.217064,P
518,6.680988,6.596028,6.692590,6.863513,6.935155,6.906429,6.862923,6.905526,6.990236,7.128619,...,1.447391,1.187667,0.890533,0.786869,0.921657,1.111083,1.183753,1.167107,1.182761,P
