# 3 Kiwi Convert to CSV

## Setup

In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
df_all = pd.read_csv("data/raw_kiwi/raw_gold_dm2.txt", sep="\t")
print(f"Number of rows: {len(df_all)}")
df_all.head(5)

Number of rows: 25990


Unnamed: 0,RecordID,SpecimenID,Category,RefVal,Timestamp,FirmwareVersion,DeviceID,WavelengthInterpolationC0,WavelengthInterpolationC1,WavelengthInterpolationC2,...,WavelengthInterpolationC5,ReferenceIntegrationTime,ReferenceScansToAverage,SampleIntegrationTime,SpecimenScansToAverage,Wavelengths,DarkReferenceSpectra,LightReferenceSpectra,DarkSpecimenSpectra,LightSpecimenSpectra
0,553f820f-0ccf-49ce-92b5-bb8f73185ce7,28362,Train,11.208791,1/31/2023 9:12:57 AM,,,304.153,3.31737,0.000452,...,0.0,65.6,1,19.7,1,"304.153,307.470825,310.789551,314.109131,317.4...","4045,3963.471,3952.40161,3957.3916,3947.58081,...","4439.83057,4356.86035,4357.85938,4342.615,4331...","3821.117,3743.9126,3727.445,3730.90747,3715.42...","4378.68652,4290.241,4278.24756,4281.49951,4273..."
1,844de5d8-f575-47cb-9258-bba211119ef6,28363,Train,11.208791,1/31/2023 9:13:16 AM,,,304.153,3.31737,0.000452,...,0.0,65.7,1,19.2,1,"304.153,307.470825,310.789551,314.109131,317.4...","4075,3979.72,3977.17529,3981.44458,3974.68848,...","4454.727,4377.766,4360.023,4362.524,4347.023,4...","3848.872,3766.70947,3770.94946,3763.38818,3754...","4404.018,4324.567,4301.32,4309.31836,4297.0737..."
2,e337c655-01e0-41c2-bc8a-ac163c942af9,28364,Train,9.333333,1/31/2023 9:15:20 AM,,,304.153,3.31737,0.000452,...,0.0,65.7,1,19.0,1,"304.153,307.470825,310.789551,314.109131,317.4...","4092,4005.99414,3993.31885,3996.47485,3992.155...","4506.65674,4419.93066,4409.183,4392.441,4395.1...","3881.62451,3802.20386,3800.142,3797.519,3792.4...","4442.448,4342.48438,4348.9834,4340.736,4349.23..."
3,fe7a6fcc-5ffd-4bc4-a480-e9ef2b34477d,28365,Train,9.333333,1/31/2023 9:15:32 AM,,,304.153,3.31737,0.000452,...,0.0,65.7,1,18.0,1,"304.153,307.470825,310.789551,314.109131,317.4...","4141,4056.97656,4057.36841,4053.94458,4044.790...","4513.303,4433.32959,4433.084,4415.08643,4410.3...","3914.60254,3835.595,3837.64917,3832.80322,3826...","4463.7876,4378.082,4369.33252,4373.33447,4349...."
4,d51841d2-9772-4296-a66a-5676f603dd3f,28366,Train,11.576355,1/31/2023 9:15:46 AM,,,304.153,3.31737,0.000452,...,0.0,65.7,1,19.5,1,"304.153,307.470825,310.789551,314.109131,317.4...","4153,4066.56665,4058.07935,4053.97876,4048.403...","4541.943,4452.46973,4441.47461,4438.228,4430.7...","3915.96338,3840.59546,3830.38135,3831.93555,38...","4478.48828,4401.02637,4385.784,4395.53125,4387..."


In [4]:
df_all.columns

Index(['RecordID', 'SpecimenID', 'Category', 'RefVal', 'Timestamp',
       'FirmwareVersion', 'DeviceID', 'WavelengthInterpolationC0',
       'WavelengthInterpolationC1', 'WavelengthInterpolationC2',
       'WavelengthInterpolationC3', 'WavelengthInterpolationC4',
       'WavelengthInterpolationC5', 'ReferenceIntegrationTime',
       'ReferenceScansToAverage', 'SampleIntegrationTime',
       'SpecimenScansToAverage', 'Wavelengths', 'DarkReferenceSpectra',
       'LightReferenceSpectra', 'DarkSpecimenSpectra', 'LightSpecimenSpectra'],
      dtype='object')

In [54]:
wavelengths = (
    df_all["Wavelengths"]
    .str.split(",", expand=True)
    .head(1)
    .transpose()
    .rename(columns={0: "wavelength"})
)
wavelengths["wavelength"] = wavelengths["wavelength"].astype(float).round(decimals=0).astype(int)
wavelengths

Unnamed: 0,wavelength
0,304
1,307
2,311
3,314
4,317
...,...
251,1133
252,1136
253,1139
254,1142


In [55]:
spectra_types = []
for col in ["DarkReferenceSpectra", "LightReferenceSpectra", "DarkSpecimenSpectra", "LightSpecimenSpectra"]:
    spectra = (
        df_all[col]
        .str.split(",", expand=True)
    )
    col = re.sub(r"(?<!^)(?=[A-Z])", "_", col).lower()
    spectra.columns = wavelengths["wavelength"].to_list()
    spectra = spectra.melt(
        var_name="wavelength", 
        value_name=col, 
        ignore_index=False
    )
    spectra[col] = spectra[col].astype(float)
    spectra.reset_index(inplace=True, names="spectra_index")
    spectra.set_index(["spectra_index", "wavelength"], inplace=True)
    spectra_types.append(spectra)
spectra = pd.concat(spectra_types, axis=1).reset_index().sort_values(by=["spectra_index", "wavelength"])
del spectra_types
spectra

Unnamed: 0,spectra_index,wavelength,dark_reference_spectra,light_reference_spectra,dark_specimen_spectra,light_specimen_spectra
0,0,304,4045.00000,4439.83057,3821.11700,4378.68652
23411,0,307,3963.47100,4356.86035,3743.91260,4290.24100
46822,0,311,3952.40161,4357.85938,3727.44500,4278.24756
70233,0,314,3957.39160,4342.61500,3730.90747,4281.49951
93644,0,317,3947.58081,4331.61670,3715.42236,4273.99365
...,...,...,...,...,...,...
5899571,23410,1133,7029.22600,8145.40100,6788.52930,7627.19238
5922982,23410,1136,7033.72700,8118.91162,6784.44336,7605.96500
5946393,23410,1139,7048.00146,8132.91100,6777.10352,7574.72500
5969804,23410,1142,7044.40527,8121.66300,6759.50400,7534.25600


In [56]:
spectra["absorbance"] = -np.log(
    (spectra["light_specimen_spectra"] - spectra["dark_specimen_spectra"])
    /(spectra["light_reference_spectra"] - spectra["dark_reference_spectra"])
)
spectra

  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,spectra_index,wavelength,dark_reference_spectra,light_reference_spectra,dark_specimen_spectra,light_specimen_spectra,absorbance
0,0,304,4045.00000,4439.83057,3821.11700,4378.68652,-0.345130
23411,0,307,3963.47100,4356.86035,3743.91260,4290.24100,-0.328420
46822,0,311,3952.40161,4357.85938,3727.44500,4278.24756,-0.306360
70233,0,314,3957.39160,4342.61500,3730.90747,4281.49951,-0.357171
93644,0,317,3947.58081,4331.61670,3715.42236,4273.99365,-0.374646
...,...,...,...,...,...,...,...
5899571,23410,1133,7029.22600,8145.40100,6788.52930,7627.19238,0.285854
5922982,23410,1136,7033.72700,8118.91162,6784.44336,7605.96500,0.278347
5946393,23410,1139,7048.00146,8132.91100,6777.10352,7574.72500,0.307618
5969804,23410,1142,7044.40527,8121.66300,6759.50400,7534.25600,0.329631
