In [64]:
import os
import jcamp
import pandas as pd

# Directory containing the JCAMP-DX files
dir = "nist_files"

# Initialize an empty list to store data from each file
all_data = []

# Loop through all .jdx files in the directory
for filename in os.listdir(dir):
    file_path = os.path.join(dir, filename)
    
    # Read the JCAMP-DX file
    jdx_data = jcamp.jcamp_readfile(file_path)
    
    # Extract all available data
    data = {'filename': filename}
    for key, value in jdx_data.items():
        data[key] = value
    
    all_data.append(data)

# Create a pandas DataFrame from the collected data
df = pd.DataFrame(all_data)

# save to csv
df


Unnamed: 0,filename,title,jcamp-dx,data type,origin,owner,cas registry no,molform,$nist source,state,...,interferogram zerofill,spectral interval after zerofilling,apodization,folding limits,number of interferograms averaged per single channel spectrum,$spectra version,$uncertainty in y,sample description,pressure,temperature
0,nist_files/file_8028.jdx,2-Ethoxycinnamic acid,4.24,INFRARED SPECTRUM,EPA-IR VAPOR PHASE LIBRARY,SRD/NIST\nCollection (C) 2018 copyright by the...,69038-81-9,C 11 H 12 O 3,MSDC-IR,gas,...,,,,,,,,,,
1,nist_files/file_12061.jdx,DIAMYL MALEATE,4.24,INFRARED SPECTRUM,DOW CHEMICAL COMPANY,COBLENTZ SOCIETY\nCollection (C) 2018 copyrigh...,10099-71-5,C14 H24 O4,COBLENTZ,"SOLUTION (10% CCl4 FOR 2.5-7.5, 10% CS2 FOR 7....",...,,,,,,,,,,
2,nist_files/file_14410.jdx,"M-NITRO CARBANILIC ACID, n-UNDECYL ESTER",4.24,INFRARED SPECTRUM,"CITRUS EXPERIMENT STATION, UNIVERSITY OF FLORI...",COBLENTZ SOCIETY\nCollection (C) 2018 copyrigh...,95126-58-2,C18 H28 N2 O4,COBLENTZ,SOLID (1.7 mg / 200 mg KBr),...,,,,,,,,,,
3,nist_files/file_6005.jdx,"ACETOPHENONE, 2,4,5-TRIHYDROXY",4.24,INFRARED SPECTRUM,"TENNESSEE EASTMAN COMPANY, RESEARCH LABORATORIES",COBLENTZ SOCIETY\nCollection (C) 2018 copyrigh...,1818-27-5,C8 H8 O4,COBLENTZ,SOLID (KBr PELLET),...,,,,,,,,,,
4,nist_files/file_9336.jdx,"GLYCINONITRILE, 2,2-DIPHENYL-",4.24,INFRARED SPECTRUM,WYANDOTTE CHEMICALS CORP.,Copyright (C) 1987 by the Coblentz Society\nCo...,52460-99-8,C14 H12 N2,COBLENTZ,SOLID (OIL MULL),...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15767,nist_files/file_13366.jdx,1-Benzyl-5-phenylbarbituric acid,4.24,INFRARED SPECTRUM,EPA-IR VAPOR PHASE LIBRARY,SRD/NIST\nCollection (C) 2018 copyright by the...,72846-00-5,C 17 H 14 N 2 O 3,MSDC-IR,gas,...,,,,,,,,,,
15768,nist_files/file_7302.jdx,"2,4,6-trichloroborazine",4.24,INFRARED SPECTRUM,,COBLENTZ SOCIETY\nCollection (C) 2018 copyrigh...,933-18-6,H3 B3 Cl3 N3,COBLENTZ,,...,,,,,,,,,,
15769,nist_files/file_8031.jdx,ETHYLBENZOYL ACETATE,4.24,INFRARED SPECTRUM,DOW CHEMICAL COMPANY,COBLENTZ SOCIETY\nCollection (C) 2018 copyrigh...,94-02-0,C11 H12 O3,COBLENTZ,"SOLUTION (10% CCl4 FOR 5000-1330, 10% CS2 FOR ...",...,,,,,,,,,,
15770,nist_files/file_12078.jdx,"ACETIC ACID, NITRILOTRI-, TRISODIUM SALT",4.24,INFRARED SPECTRUM,STANDARD OIL OF OHIO,COBLENTZ SOC.\nCollection (C) 2018 copyright b...,,C6 H6 N Na3 O6,COBLENTZ,SOLID (OIL MULL),...,,,,,,,,,,


In [31]:
# Print columns as a numbered list
for i, column in enumerate(df.columns, start=1):
    print(f"{i}. {column}")


1. filename
2. title
3. jcamp-dx
4. data type
5. origin
6. owner
7. cas registry no
8. molform
9. $nist source
10. state
11. xunits
12. yunits
13. xfactor
14. yfactor
15. deltax
16. firstx
17. lastx
18. firsty
19. maxx
20. minx
21. maxy
22. miny
23. npoints
24. xydata
25. end
26. x
27. y
28. class
29. date
30. names
31. source reference
32. $nist image
33. spectrometer/data system
34. path length
35. sampling procedure
36. resolution
37. data processing
38. mp
39. instrument parameters
40. $nist id
41. bp
42. xlabel
43. ylabel
44. $nist doc file
45. $nist psd file
46. aperture
47. external diffuse reflectance accessory
48. beamsplitter
49. detector (dia. det. port in sphere)
50. sphere diameter
51. acquisition mode
52. scanner speed
53. coadded scans
54. phase resolution
55. phase correction
56. zerofilling
57. spectral range
58. spectral resolution
59. wavenumber accuracy
60. apodization function
61. low pass filter
62. switch gain on
63. cas name
64. instrument resolution
65. ir sour

In [71]:
import requests
from tqdm import tqdm

def cas_to_smiles(cas_number):
    # PubChem PUG REST API URL for converting CAS to SMILES
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{cas_number}/property/CanonicalSMILES/JSON"
    
    # Send a request to PubChem
    response = requests.get(url)
    
    # Check if the request was successful
    if response.status_code == 200:
        data = response.json()
        # Extract SMILES from the response
        smiles = data['PropertyTable']['Properties'][0]['CanonicalSMILES']
        return smiles
    else:
        return None

# Convert values in the "cas registry no" column to values in the "smiles" column with tqdm progress bar
df['smiles'] = [cas_to_smiles(x) if pd.notna(x) else None for x in tqdm(df['cas registry no'], desc="Converting CAS to SMILES")]


Converting CAS to SMILES: 100%|██████████| 15772/15772 [34:30<00:00,  7.62it/s] 


In [72]:
df.to_csv('nist_data_with_smiles.csv', index=False)

In [52]:
state_counts = df['state'].value_counts().reset_index()
state_counts.columns = ['state', 'count']
state_counts = state_counts.sort_values(by='state', ascending=True)
state_counts

Unnamed: 0,state,count
821,,1
79,(NEAT),10
689,CAPILLARY,1
520,FILM,1
341,FILM FROM C2HCl3 SOLUTION,2
...,...,...
530,VAPOR AT 55 mmHg PRESSURE; $$ RESEARCH PURITY,1
898,VISC. PASTE,1
0,gas,6683
45,liquid,19


In [53]:
state_counts.to_csv('state_counts.csv', index=False)