## Import Libraries

In [1]:
import importlib
import os
from sklearn.model_selection import train_test_split
import pandas as pd 

# Re-running this cell picks up edits to the project modules without
# restarting the kernel. On the first run the module names are not bound yet,
# so the reload raises NameError and we fall back to a plain import.
# Only NameError is caught: any other failure (e.g. a SyntaxError or a missing
# dependency raised while reloading) must surface instead of being silently
# replaced by the stale, already-cached module.
try:
    importlib.reload(dataloader)
    importlib.reload(model)
    importlib.reload(analysis)
except NameError:
    import dataloader
    import model
    import analysis

ModuleNotFoundError: No module named 'rdkit'

## Load Data

In [None]:
# Load all_tested_molecules.csv into a DataFrame. abspath resolves the bare
# file name against the current working directory, so the notebook has to be
# run from the folder that contains the CSV.
data = pd.read_csv(os.path.abspath("all_tested_molecules.csv"))

In [None]:
# Build the descriptor table from the raw molecule data.
# remove_fingerprints=True presumably leaves fingerprint columns out of the
# result -- confirm in dataloader.get_molecular_descriptors.
descriptors = dataloader.get_molecular_descriptors(data, remove_fingerprints=True)
# Last expression of the cell: shows a preview (assumes a pandas object).
descriptors.head()

In [None]:
# Extract the target labels from the same raw data the descriptors came from,
# so both are derived from one source table.
labels = dataloader.get_labels(data)
# Last expression of the cell: shows a preview (assumes a pandas object).
labels.head()

## Correlation Filtering, Scaling, and PCA

In [None]:
# Compute the correlated descriptor pairs once on the tested molecules. The
# result is kept in a variable because it is reused later to filter the
# untested molecules with the same pairs.
highly_correlated_pairs = analysis.correlation(descriptors)

In [None]:
# Filter the descriptors using the correlated pairs found above (presumably
# dropping one column of each pair -- confirm in analysis.remove_colinear).
# `descriptors` is rebound, so re-running this cell alone operates on the
# already-filtered table.
descriptors = analysis.remove_colinear(descriptors, highly_correlated_pairs)

In [None]:
# Scale the filtered descriptors and rebind `descriptors` to the scaled result.
# NOTE(review): this runs before train_test_split, so if ScaleDescriptors fits
# its scaling statistics on the data it receives, the test rows influence the
# scaling of the training rows -- confirm against analysis.ScaleDescriptors.
descriptors = analysis.ScaleDescriptors(descriptors)
# Last expression of the cell: displays the scaled descriptors.
descriptors

In [None]:
# Plot the variance analysis and keep the returned component count, which is
# reused by the loadings plot, the feature ranking and model training.
# Presumably the number of principal components needed to reach 90% explained
# variance (percentage=0.9) -- confirm in analysis.plot_variance.
num_components = analysis.plot_variance(descriptors, percentage=0.9)

In [None]:
# Plot the loadings for the component count chosen in the variance cell.
analysis.plot_loadings(descriptors, num_components)

In [None]:
# Rank the descriptors for the same component count; as the last expression of
# the cell, whatever feature_rankings returns is displayed by the notebook.
analysis.feature_rankings(descriptors, num_components)

## Model Training

In [None]:
# Hold out 20% of the molecules for evaluation; the fixed seed keeps the split
# identical across notebook runs.
X_train, X_test, y_train, y_test = train_test_split(
    descriptors,
    labels,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Train on the training split only, passing the component count chosen in the
# variance cell. The returned object is reused for testing and prediction.
pipeline = model.train(X_train, y_train, num_components)

In [None]:
# Evaluate the trained pipeline on the held-out split. The metric is whatever
# model.test returns; it is stored under the name `accuracy` and printed.
accuracy = model.test(pipeline, X_test, y_test)
print(accuracy)

## Predict New Molecules

In [None]:
# Load untested_molecules.csv into a DataFrame. As with the tested set, the
# bare file name is resolved against the current working directory.
new_data = pd.read_csv(os.path.abspath("untested_molecules.csv"))

In [None]:
# Build descriptors for the untested molecules with the same helper and the
# same remove_fingerprints=True setting used for the tested molecules.
new_descriptors = dataloader.get_molecular_descriptors(new_data, remove_fingerprints=True)
# Last expression of the cell: shows a preview (assumes a pandas object).
new_descriptors.head()

In [None]:
# Obtain the label table for the untested molecules; it is later handed to
# model.predict and rebound to that call's result.
# NOTE(review): these molecules are untested, so presumably the label columns
# are empty placeholders to be filled by the prediction -- confirm against
# dataloader.get_labels and the CSV layout.
new_labels = dataloader.get_labels(new_data)
# Last expression of the cell: shows a preview (assumes a pandas object).
new_labels.head()

In [None]:
# Reuse the correlated pairs computed on the tested molecules (not recomputed
# on the new data) so the new descriptors get the same filtering as the
# descriptors the model was trained on.
new_descriptors = analysis.remove_colinear(new_descriptors, highly_correlated_pairs)

In [None]:
# Scale the new descriptors with the same helper used for the training data.
# NOTE(review): this is a separate call on the new data only. If
# ScaleDescriptors fits fresh scaling statistics on each call, the new
# molecules are scaled differently from the data the pipeline was trained
# on -- confirm against analysis.ScaleDescriptors.
new_descriptors = analysis.ScaleDescriptors(new_descriptors)

In [None]:
# Run the trained pipeline on the new descriptors. `new_labels` is passed in
# and then rebound to whatever model.predict returns, so the table loaded
# earlier is no longer reachable under this name afterwards.
new_labels = model.predict(pipeline, new_descriptors, new_labels)

In [None]:
# Display up to the first 100 rows of the prediction result (assumes a pandas
# object).
new_labels.head(100)

In [None]:
# Write the prediction result to predicted_molecules.csv in the current
# working directory, without the index column.
# NOTE(review): only the first 100 rows (head(100)) are exported; any further
# rows are not saved -- confirm this cut-off is intended.
new_labels.head(100).to_csv("predicted_molecules.csv", index=False)