## Import Libraries

In [1]:
import importlib
import os
from sklearn.model_selection import train_test_split
import pandas as pd 

# Load the project-local helper modules. On the first run of this cell the
# names are not bound yet, so they are imported; on later runs they are
# reloaded so edits to their source files take effect without restarting the
# kernel.
try:
    _local_modules = (dataloader, model, analysis)
except NameError:
    # First execution: nothing to reload yet.
    import dataloader
    import model
    import analysis
else:
    # Reload outside the `try` so that a genuine failure inside one of the
    # modules (syntax error, failing import, ...) is raised instead of being
    # silently swallowed and replaced by a no-op `import` of the stale module.
    for _module in _local_modules:
        importlib.reload(_module)

## Load Data

In [2]:
# Resolve the tested-molecules CSV against the current working directory and
# read it into `data`.
tested_molecules_path = os.path.abspath("all_tested_molecules.csv")
data = pd.read_csv(tested_molecules_path)

In [None]:
# Build the descriptor table for the tested molecules. remove_fingerprints is
# passed as False (presumably this keeps the fingerprint columns -- confirm in
# dataloader).
descriptors = dataloader.get_molecular_descriptors(data, remove_fingerprints=False)
# Preview the first rows as the cell output.
descriptors.head()

In [None]:
# Derive the labels for the tested molecules from the same raw table that the
# descriptors were computed from.
labels = dataloader.get_labels(data)
# Preview the first rows as the cell output.
labels.head()

## PCA Analysis

In [None]:
# Replace `descriptors` with the output of analysis.ScaleDescriptors.
# NOTE(review): this runs on the full data set, before the train/test split
# further down -- if the scaling is fitted here, the held-out rows influence
# it. Confirm this is acceptable.
descriptors = analysis.ScaleDescriptors(descriptors)

In [None]:
# Run the correlation analysis on the scaled descriptors; the result is reused
# below for both the tested and the untested molecules.
highly_correlated_pairs = analysis.correlation(descriptors)

In [None]:
# Replace `descriptors` with the output of analysis.remove_colinear, which is
# given the correlated pairs found above (presumably drops one column per
# pair -- confirm in analysis).
descriptors = analysis.remove_colinear(descriptors, highly_correlated_pairs)

In [None]:
# Variance plot over all remaining descriptors with percentage=0.95; the return
# value is kept as the component count for the later PCA steps.
num_components = analysis.plot_variance(descriptors, percentage=0.95)

In [None]:
# Loadings plot for the full descriptor set, using the component count
# obtained from the variance plot above.
analysis.plot_loadings(descriptors, labels, num_components)

In [None]:
# Rank the descriptors for the chosen number of components.
# The result is indexed by descriptor name and is thresholded in the next section.
feature_rankings = analysis.feature_rankings(descriptors, num_components)
# Show the rankings as the cell output.
feature_rankings

## PCA Selected Descriptors

In [None]:
# Keep only the descriptors whose ranking value exceeds 0.07 and collect their
# index labels into a plain list.
rankings_above_threshold = feature_rankings[feature_rankings > 0.07]
selected_descriptors = list(rankings_above_threshold.index)
# Show the selection as the cell output.
selected_descriptors

In [None]:
# Variance plot restricted to the PCA-selected descriptors; the return value is
# kept as the component count for this subset.
# NOTE(review): percentage=0.9 here, whereas the other two variance plots use
# 0.95 -- confirm the difference is intentional.
num_components_selected = analysis.plot_variance(descriptors[selected_descriptors], percentage=0.9)

In [None]:
# Loadings plot for the PCA-selected descriptors, using the component count
# computed for that same subset.
analysis.plot_loadings(descriptors[selected_descriptors], labels, num_components_selected)

## PCA Literature

In [None]:
# Positional column ranges that make up the literature descriptor subset.
# NOTE(review): these are positions in `descriptors.columns` as it stands when
# this cell runs, so any change to the upstream column set shifts them --
# consider selecting by column name instead.
literature_column_slices = [
    slice(0, 5),
    slice(19, 20),
    slice(28, 42),
    slice(52, 64),
    slice(93, 96),
    slice(101, 102),
]
literature_descriptors = [
    column
    for column_slice in literature_column_slices
    for column in descriptors.columns[column_slice].tolist()
]
# Report how many descriptors were picked. `list.count()` needs an argument
# (it counts occurrences of one value), so use `len()` for the list size.
print(len(literature_descriptors))



In [None]:
# Variance plot restricted to the literature descriptors with percentage=0.95;
# the return value is kept as the component count for this subset.
num_components_literature = analysis.plot_variance(descriptors[literature_descriptors], percentage=0.95)

In [None]:
# Loadings plot for the literature descriptor subset. Use the component count
# computed for this same subset (num_components_literature), not the one
# derived from the PCA-selected descriptors.
analysis.plot_loadings(descriptors[literature_descriptors], labels, num_components_literature)

## Model training

In [None]:
# Hold out 20% of the molecules for testing (fixed random_state for
# reproducibility). Exactly one of the three lines below should be active; it
# chooses the feature set (PCA-selected / literature / all descriptors) and
# must match the choices made in the training and prediction cells.
#X_train, X_test, y_train, y_test = train_test_split(descriptors[selected_descriptors], labels, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(descriptors[literature_descriptors], labels, test_size=0.2, random_state=0)
#X_train, X_test, y_train, y_test = train_test_split(descriptors, labels, test_size=0.2, random_state=0)

In [None]:
# Train on the training split. The component count passed here must belong to
# the same feature set that was chosen for the split above; the active line
# uses the literature descriptors. degrees and use_pca are lists of candidate
# values (presumably searched over by model.train -- confirm in model).
#pipeline = model.train(X_train, y_train, num_components_selected, degrees=[1,2], use_pca=[True, False])
pipeline = model.train(X_train, y_train, num_components_literature, degrees=[1,2], use_pca=[True, False])
#pipeline = model.train(X_train, y_train, num_components, degrees=[1,2], use_pca=[True, False])

In [None]:
# Evaluate the trained pipeline on the held-out split and keep the returned
# score.
accuracy = model.test(pipeline, X_test, y_test)

## Predict new molecules

In [None]:
# Resolve the untested-molecules CSV against the current working directory and
# read it into `new_data`.
untested_molecules_path = os.path.abspath("untested_molecules.csv")
new_data = pd.read_csv(untested_molecules_path)

In [None]:
# Build the descriptor table for the untested molecules with the same call and
# the same remove_fingerprints setting as for the tested molecules.
new_descriptors = dataloader.get_molecular_descriptors(new_data, remove_fingerprints=False)
# Preview the first rows as the cell output.
new_descriptors.head()

In [None]:
# Apply the colinearity removal to the new molecules, reusing the correlated
# pairs that were computed on the tested molecules.
# NOTE(review): the tested descriptors went through analysis.ScaleDescriptors
# before this step, but no such call is made for new_descriptors -- confirm the
# pipeline handles scaling itself, otherwise the inputs are on a different
# scale than the training data.
new_descriptors = analysis.remove_colinear(new_descriptors, highly_correlated_pairs)

In [None]:
# Build the label table for the untested molecules; it is handed to
# model.predict below (presumably to be filled with predictions -- confirm in
# model).
new_labels = dataloader.get_labels(new_data)
# Preview the first rows as the cell output.
new_labels.head()

In [None]:
# Predict for the untested molecules. The active line must use the same
# feature set the pipeline was trained on (here: the literature descriptors);
# the commented-out lines are the PCA-selected and all-descriptor variants.
#new_labels = model.predict(pipeline, new_descriptors[selected_descriptors], new_labels)
new_labels = model.predict(pipeline, new_descriptors[literature_descriptors], new_labels)
#new_labels = model.predict(pipeline, new_descriptors, new_labels)

In [None]:
# Show the first 100 rows of the predictions as the cell output.
new_labels.head(100)

In [None]:
# Write the first 100 rows of the predictions to predicted_molecules.csv,
# without the index column.
# NOTE(review): rows beyond the first 100 are not written -- confirm the
# truncation is intended.
new_labels.head(100).to_csv("predicted_molecules.csv", index=False)