### Cutting Data Frame in Half

In [None]:
# Halve the dataframe so that only one molecule from each enantiomeric pair is used

In [1]:
# Enable IPython's autoreload extension in mode 2 so that imported modules are reloaded
# before each cell executes, picking up edits made to local development files
%load_ext autoreload
%autoreload 2

In [13]:
# Imports the project's Utils helper module (aliased as model_helpers),
# the NumPy library, and the pandas library
import Utils as model_helpers
import numpy as np
import pandas as pd

In [14]:
# Load the full enantiomer dataset from the CSV file in the working directory
enantiomer_data = pd.read_csv(filepath_or_buffer="enantiomer_data.csv")

In [15]:
# Compute one value per enantiomeric pair (rows that share the same 'N'):
# model_helpers.log_abs and model_helpers.harmonic are each applied to every pair's rows
pair_groups = enantiomer_data.groupby('N')
half_log_abs = pair_groups.apply(model_helpers.log_abs)
half_det = pair_groups.apply(model_helpers.harmonic)

In [16]:
# Build a new data frame holding one odorant per enantiomeric pair (every second row of the
# original dataset, starting at row 0) and attach that pair's log_abs and det values.
#
# The per-pair values were produced by groupby('N'), which orders its groups by ascending 'N',
# while iloc[::2] picks rows purely by position. The values are attached positionally, so first
# verify that the selected rows list each pair exactly once in ascending 'N' order; otherwise a
# mis-ordered or incomplete input would silently attach values to the wrong molecule.
half_enantiomer_data = enantiomer_data.iloc[::2].copy()
expected_pair_ids = np.sort(enantiomer_data['N'].dropna().unique())
assert np.array_equal(half_enantiomer_data['N'].to_numpy(), expected_pair_ids), \
    "Every second row must be the first member of a distinct pair, ordered by ascending 'N'"
half_enantiomer_data.loc[:, 'log_abs'] = half_log_abs.values
half_enantiomer_data.loc[:, 'det'] = half_det.values

In [17]:
# Sanity check: the new 'log_abs' column must contain exactly as many null values
# as the per-pair log_abs values it was filled from
source_null_count = half_log_abs.isna().sum()
column_null_count = half_enantiomer_data['log_abs'].isna().sum()
assert source_null_count == column_null_count

In [18]:
# Preview the first five rows to confirm the 'log_abs' and 'det' columns were added
half_enantiomer_data.head(n=5)

Unnamed: 0.1,Unnamed: 0,Molecule Name,Pubchem ID #,Note,SMILES String,Other SMILES,Method,Contributor,Detection Threshold,Detection Units,Normalized Detection Threshold,Molecule Odour,Resources,N,log_abs,det
0,0,(R)-(-)-gamma-ionone,11389922,,CC(=O)/C=C/[C@H]1C(=C)CCCC1(C)C,,,,11.0,ppb water,11.0,"Weak green, fruity, pineapple-like odor with m...",Rows 66-100 are from here: https://www.jstage....,0,2.196295,-0.856627
2,2,(4R)-(-)-carvone,439570,,CC1=CC[C@H](CC1=O)C(=C)C,,,,2.0,ppb,2.0,"sweet spearmint, fresh herbal",Rows 122 - 193 are from here: https://github.c...,1,1.812913,0.595429
4,4,"(4R,7R)-(+)-galaxolide",14177988,,C[C@H]1COCC2=CC3=C(C=C12)C([C@H](C3(C)C)C)(C)C,,,,0.44,ppb in air,0.44,weak to almost odorless,Rows 224-267 are from here: https://github.com...,2,2.643453,-2.699956
6,6,"(4R,4aS,6R)-(+) nootkatone",1268142,,C[C@@H]1CC(=O)C=C2[C@]1(C[C@@H](CC2)C(=C)C)C,,,,15.0,ppm,15000.0,grapefruit odor,Rows 370-407 are from here: https://github.com...,3,3.643453,4.477023
8,8,"(2S,4R)-(+) cis-2-methyl-4-propyl-1,3-oxathiane",6931728,,CCC[C@@H]1CCO[C@@H](S1)C,,,,2.0,ppb,2.0,"ctypical sulfurous, with a rubbery onion note;...",Rows 424-435 are from here: https://github.com...,4,0.30103,0.425969


In [19]:
# Keep only rows with a usable SMILES string:
#  - drop duplicated SMILES strings so their perceptual features are not counted twice
#  - drop rows whose SMILES string is missing or contains the text 'NaN' / 'nan'
half_enantiomer_data = half_enantiomer_data.drop_duplicates(subset=['SMILES String'])
smiles_column = half_enantiomer_data['SMILES String']
# na=True makes genuinely missing entries match, so they are filtered out as well
missing_smiles = smiles_column.str.contains('NaN', na=True) | smiles_column.str.contains('nan', na=True)
half_enantiomer_data = half_enantiomer_data.loc[~missing_smiles]

In [20]:
# Verify that every remaining SMILES string occurs exactly once
smiles_column = half_enantiomer_data['SMILES String']
assert len(smiles_column) == len(smiles_column.unique()), "Number of SMILES strings should equal number of unique SMILES strings at this stage"

In [21]:
# Assert that no missing SMILES strings remain. Comparing against the text 'nan' alone would
# never flag a genuinely missing value (a float NaN is not equal to any string), so check for
# real missing values explicitly as well as for the literal 'nan' / 'NaN' placeholders.
smiles_column = half_enantiomer_data['SMILES String']
assert smiles_column.notna().all() and not smiles_column.isin(['nan', 'NaN']).any(), "There should be no NaN SMILES strings at this point"

In [22]:
# Discard every row whose 'log_abs' value is missing
half_enantiomer_data = half_enantiomer_data.dropna(subset=['log_abs'])

In [23]:
# Confirm that neither the 'log_abs' nor the 'det' column has any null values left
for required_column in ('log_abs', 'det'):
    assert not half_enantiomer_data[required_column].isnull().any()

In [24]:
# Save the cleaned, one-molecule-per-pair data frame for downstream use.
# NOTE(review): the row index is written as well, which shows up as an 'Unnamed: 0' column
# when the file is read back with default settings - confirm downstream readers expect that.
half_enantiomer_data.to_csv(path_or_buf="half_enantiomer_data.csv")