In [9]:
from modules import data

# Load the records from the JSON export through the project-local `data` module.
records = data.from_json('./data/json/ind.json')
# NOTE(review): `min_quality` is defined outside this notebook; presumably it keeps only
# entries whose match quality is at least 90 — confirm in `modules/data`.
# Its return value is discarded here, so it is expected to modify each record in place.
for record in records:
    record.min_quality(90)
    

In [10]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

def to_df(records, sig_path: str = './data/sigs.xlsx') -> pd.DataFrame:
    """
    Build a standardized feature table with one row per record.

    Each record contributes one row, indexed by
    ``'<envr> <medium> <species> <time>h <trial>'``, whose columns are the summed
    ``Area`` of every compound listed in the first column of the spreadsheet at
    ``sig_path``. Compounds absent from a record count as 0.0. The feature columns
    are standard-scaled, then three label columns (``Envr``, ``Medium``,
    ``Species``) are appended for classification tasks.

    Parameters
    ----------
    records : iterable
        Objects exposing ``envr``, ``medium``, ``species``, ``time``, ``trial``
        and a ``data`` DataFrame that contains at least the columns ``Compound``,
        ``Area``, ``Retention Time``, ``Relative Area``, ``ID``, ``CAS Number``,
        ``Quality``, ``Type`` and ``Width``.
    sig_path : str, optional
        Excel file whose first column lists the compounds to keep.

    Returns
    -------
    pd.DataFrame
        Scaled features followed by the ``Envr``, ``Medium`` and ``Species`` columns.

    Raises
    ------
    KeyError
        If a listed compound does not occur in any record, or if a record's
        ``data`` lacks one of the expected columns.

    Notes
    -----
    The scaler is fit on every row passed in. If the result is later divided into
    train and test sets, the test rows have already influenced the scaling
    statistics; fit the scaler on the training rows alone when an unbiased
    evaluation is required.
    """
    # Materialize once: the records are walked again below to build the label
    # columns, which would silently yield nothing for a one-shot iterator.
    records = list(records)

    synonyms = {
        'Ethyl Alcohol': 'Ethanol',
        'Limonene': 'D-Limonene',
    }
    dropped_columns = ['Retention Time', 'Relative Area', 'ID', 'CAS Number', 'Quality', 'Type', 'Width']

    per_record = []
    for record in records:
        column_name = f'{record.envr} {record.medium} {record.species} {record.time}h {record.trial}'
        # Every step returns a new frame, so `record.data` itself is never modified.
        areas = (
            record.data
            .drop(dropped_columns, axis=1)
            .rename({'Area': column_name}, axis=1)
            .set_index('Compound')
            .rename(index=synonyms)
        )
        # Synonym renaming can produce duplicate compound names; merge them by summing.
        per_record.append(areas.groupby(areas.index).sum())

    # A single concat instead of growing the frame inside the loop, which re-copies
    # the accumulated data on every iteration.
    output = pd.concat(per_record, axis=1) if per_record else pd.DataFrame()
    output = output.fillna(0.0).T

    # Filtering out significant compounds
    sig_list = pd.read_excel(sig_path).iloc[:, 0].to_list()
    missing = [name for name in sig_list if name not in output.columns]
    if missing:
        raise KeyError(f'Significant compounds not found in any record: {missing}')
    output = output[sig_list]

    # Standard Scaling
    output = pd.DataFrame(StandardScaler().fit_transform(output), index=output.index, columns=output.columns)

    # Rows are in the same order as `records`, so the labels line up positionally.
    output['Envr'] = [record.envr for record in records]
    output['Medium'] = [record.medium for record in records]
    output['Species'] = [record.species for record in records]

    return output

# Build the scaled feature table (with the Envr/Medium/Species columns appended) and display it.
records_df = to_df(records)
records_df

Compound,"Benzenemethanol, .Alpha.-Methyl-,Acetate","Pyrazine, 2,5-Dimethyl","Heptane, 3-Methylene",Decane,"Benzene, 1,3-Bis(1,1-Dimethylethyl)","2H-Pyran, 2-Ethoxy-3,4-Dihydro","2-Propenoic Acid, 2-Ethylhexyl Ester",2-Octanone,"Octane, 4-Methyl","Pyrazine, 2-Ethyl-6-Methyl","2,4-Dimethyl-1-Heptene","Undecane, 2-Methyl",Envr,Medium,Species
Ae LB BS 0h 1,-0.135291,-0.385571,-0.242499,-0.250930,-0.263297,-0.108069,-0.10028,-0.098141,-0.266833,-0.216286,-0.261246,-0.318946,Ae,LB,BS
Ae LB BS 24h 1,-0.135291,-0.359123,-0.242499,-0.400475,-0.263297,-0.108069,-0.10028,-0.098141,-0.414593,-0.216286,-0.261246,-0.318946,Ae,LB,BS
Ae LB BS 3h 1,-0.135291,-0.257642,-0.242499,-0.137516,-0.200083,-0.108069,-0.10028,-0.098141,-0.183342,-0.216286,0.122919,-0.318946,Ae,LB,BS
Ae LB BS 9h 1,-0.135291,-0.306654,-0.242499,-0.400475,-0.263297,-0.108069,-0.10028,-0.098141,-0.414593,-0.216286,-0.261246,-0.318946,Ae,LB,BS
Ae LB Ctrl 0h 1,-0.135291,-0.420724,-0.242499,-0.400475,-0.263297,-0.108069,-0.10028,-0.098141,-0.370298,-0.216286,-0.261246,-0.318946,Ae,LB,Ctrl
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
An TSB EC 9h 2,-0.135291,1.570522,-0.242499,1.112307,-0.263297,-0.108069,-0.10028,-0.098141,-0.414593,-0.216286,-0.261246,-0.318946,An,TSB,EC
An TSB SA 0h 2,-0.135291,-0.420724,-0.242499,-0.400475,-0.263297,-0.108069,-0.10028,-0.098141,-0.414593,-0.216286,-0.261246,-0.318946,An,TSB,SA
An TSB SA 24h 2,-0.135291,2.021104,-0.242499,1.679154,0.106905,-0.108069,-0.10028,-0.098141,-0.414593,-0.216286,-0.261246,-0.318946,An,TSB,SA
An TSB SA 3h 2,-0.135291,2.392083,-0.242499,0.671535,-0.263297,-0.108069,-0.10028,-0.098141,-0.414593,-0.216286,-0.261246,-0.318946,An,TSB,SA


In [11]:
from sklearn.preprocessing import LabelEncoder

# One encoder per label column, kept so the integer codes can be mapped back later.
envr_encoder, med_encoder, spec_encoder = LabelEncoder(), LabelEncoder(), LabelEncoder()

# Replace each label column with its integer codes (overwrites the columns in `records_df`).
for label_column, label_encoder in (
    ('Envr', envr_encoder),
    ('Medium', med_encoder),
    ('Species', spec_encoder),
):
    records_df[label_column] = label_encoder.fit_transform(records_df[label_column])

records_df

Compound,"Benzenemethanol, .Alpha.-Methyl-,Acetate","Pyrazine, 2,5-Dimethyl","Heptane, 3-Methylene",Decane,"Benzene, 1,3-Bis(1,1-Dimethylethyl)","2H-Pyran, 2-Ethoxy-3,4-Dihydro","2-Propenoic Acid, 2-Ethylhexyl Ester",2-Octanone,"Octane, 4-Methyl","Pyrazine, 2-Ethyl-6-Methyl","2,4-Dimethyl-1-Heptene","Undecane, 2-Methyl",Envr,Medium,Species
Ae LB BS 0h 1,-0.135291,-0.385571,-0.242499,-0.250930,-0.263297,-0.108069,-0.10028,-0.098141,-0.266833,-0.216286,-0.261246,-0.318946,0,0,0
Ae LB BS 24h 1,-0.135291,-0.359123,-0.242499,-0.400475,-0.263297,-0.108069,-0.10028,-0.098141,-0.414593,-0.216286,-0.261246,-0.318946,0,0,0
Ae LB BS 3h 1,-0.135291,-0.257642,-0.242499,-0.137516,-0.200083,-0.108069,-0.10028,-0.098141,-0.183342,-0.216286,0.122919,-0.318946,0,0,0
Ae LB BS 9h 1,-0.135291,-0.306654,-0.242499,-0.400475,-0.263297,-0.108069,-0.10028,-0.098141,-0.414593,-0.216286,-0.261246,-0.318946,0,0,0
Ae LB Ctrl 0h 1,-0.135291,-0.420724,-0.242499,-0.400475,-0.263297,-0.108069,-0.10028,-0.098141,-0.370298,-0.216286,-0.261246,-0.318946,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
An TSB EC 9h 2,-0.135291,1.570522,-0.242499,1.112307,-0.263297,-0.108069,-0.10028,-0.098141,-0.414593,-0.216286,-0.261246,-0.318946,1,3,2
An TSB SA 0h 2,-0.135291,-0.420724,-0.242499,-0.400475,-0.263297,-0.108069,-0.10028,-0.098141,-0.414593,-0.216286,-0.261246,-0.318946,1,3,3
An TSB SA 24h 2,-0.135291,2.021104,-0.242499,1.679154,0.106905,-0.108069,-0.10028,-0.098141,-0.414593,-0.216286,-0.261246,-0.318946,1,3,3
An TSB SA 3h 2,-0.135291,2.392083,-0.242499,0.671535,-0.263297,-0.108069,-0.10028,-0.098141,-0.414593,-0.216286,-0.261246,-0.318946,1,3,3


In [12]:
import numpy as np

# Feature columns are everything except the three label columns.
feature_names = records_df.columns.drop(['Envr', 'Medium', 'Species']).to_list()
n_features = len(feature_names)

# Sorted distinct original label values, recovered from the encoded columns.
envr_string, medium_string, species_string = (
    np.unique(fitted.inverse_transform(records_df[label_column]))
    for fitted, label_column in (
        (envr_encoder, 'Envr'),
        (med_encoder, 'Medium'),
        (spec_encoder, 'Species'),
    )
)

In [19]:
from sklearn.model_selection import StratifiedShuffleSplit

def _labels_and_values(frame):
    """Return the Envr, Medium and Species columns of `frame` plus its remaining columns as an array."""
    features = frame.drop(['Envr', 'Medium', 'Species'], axis=1).to_numpy()
    return frame['Envr'], frame['Medium'], frame['Species'], features


# Single 80/20 shuffle split with a fixed seed.
stratified_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# Stratify on the combination of all three labels, joined with '_'.
combined_labels = (
    records_df['Envr'].astype(str)
    + '_' + records_df['Medium'].astype(str)
    + '_' + records_df['Species'].astype(str)
)

train_idx, test_idx = next(stratified_split.split(records_df, combined_labels))
train_records, test_records = records_df.iloc[train_idx], records_df.iloc[test_idx]

(train_envr_labels,
 train_medium_labels,
 train_species_labels,
 train_values) = _labels_and_values(train_records)

(test_envr_labels,
 test_medium_labels,
 test_species_labels,
 test_values) = _labels_and_values(test_records)


In [22]:
from sklearn.svm import SVC

# Fit a fresh linear-kernel SVM per target (environment, medium, species) and print
# its test accuracy; `linear_svc` ends up bound to the species model.
for target_train, target_test in (
    (train_envr_labels, test_envr_labels),
    (train_medium_labels, test_medium_labels),
    (train_species_labels, test_species_labels),
):
    linear_svc = SVC(kernel='linear')
    linear_svc.fit(train_values, target_train)
    print(linear_svc.score(test_values, target_test))

0.5769230769230769
0.6153846153846154
0.36538461538461536


In [23]:
# Fit a fresh polynomial-kernel SVM per target (environment, medium, species) and print
# its test accuracy. The variable is still called `linear_svc` although the kernel is
# 'poly'; the name is kept so the binding seen by other cells does not change.
for target_train, target_test in (
    (train_envr_labels, test_envr_labels),
    (train_medium_labels, test_medium_labels),
    (train_species_labels, test_species_labels),
):
    linear_svc = SVC(kernel='poly')
    linear_svc.fit(train_values, target_train)
    print(linear_svc.score(test_values, target_test))

0.5384615384615384
0.38461538461538464
0.2692307692307692


In [24]:
# Fit a fresh RBF-kernel SVM per target (environment, medium, species) and print
# its test accuracy. The variable is still called `linear_svc` although the kernel is
# 'rbf'; the name is kept so the binding seen by other cells does not change.
for target_train, target_test in (
    (train_envr_labels, test_envr_labels),
    (train_medium_labels, test_medium_labels),
    (train_species_labels, test_species_labels),
):
    linear_svc = SVC(kernel='rbf')
    linear_svc.fit(train_values, target_train)
    print(linear_svc.score(test_values, target_test))

SyntaxError: unterminated string literal (detected at line 5) (2320397119.py, line 5)

In [15]:
# NOTE(review): `train_datas`, `train_labels`, `test_datas` and `test_labels` are not
# defined in any cell visible here — confirm they exist before a Restart & Run All.

# Polynomial-kernel SVM: fit, then print test accuracy.
poly_svc = SVC(kernel='poly').fit(train_datas, train_labels)
print(poly_svc.score(test_datas, test_labels))

# RBF-kernel SVM: fit, then print test accuracy.
rbf_svc = SVC(kernel='rbf').fit(train_datas, train_labels)
print(rbf_svc.score(test_datas, test_labels))

0.057692307692307696
0.07692307692307693


In [16]:
from sklearn.svm import SVC
from sklearn.feature_selection import RFECV

# Recursive feature elimination with cross-validation around a linear-kernel SVM.
estim = SVC(kernel='linear')
rfecv = RFECV(estim).fit(train_datas, train_labels)
print(rfecv.score(test_datas, test_labels))

# Rank 1 marks a selected feature; higher ranks were eliminated.
rfecv.ranking_

0.1346153846153846


array([1, 3, 5, 4, 1, 2, 1, 1, 1, 1, 1, 1])

In [17]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 Gini-split trees. The seed is fixed so that the bootstrap samples
# and per-split feature sub-sampling (and hence the score and importances) are
# reproducible across kernel restarts; 42 matches the seed used for the train/test split.
# NOTE(review): `train_datas`, `train_labels`, `test_datas` and `test_labels` are not
# defined in any cell visible here — confirm they exist before a Restart & Run All.
rf_classifier = RandomForestClassifier(n_estimators=100, criterion='gini', random_state=42)
rf_classifier.fit(train_datas, train_labels)
print(rf_classifier.score(test_datas, test_labels))

0.23076923076923078


In [18]:
# Display the fitted forest's importance score for each input feature column.
rf_classifier.feature_importances_

array([0.07246035, 0.00326706, 0.00601165, 0.00941389, 0.17086381,
       0.0184134 , 0.17602503, 0.03971703, 0.1268241 , 0.23519973,
       0.07668146, 0.06512249])