In [2]:
# Import packages
import numpy as np
import pandas as pd
import time
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans

import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import scipy
from scipy.stats import spearmanr
from scipy.stats import norm
import pickle

In [None]:
# Load the training dataset; it must provide a SMILES column.
df = pd.read_csv("Data/OMG_DFT_COSMOC_chi.csv")

# Parse every SMILES string and compute its radius-3 Morgan fingerprint.
molecules = df.SMILES.apply(Chem.MolFromSmiles)
fp = molecules.apply(lambda m: AllChem.GetMorganFingerprint(m, radius=3))
# One mapping per molecule: substructure hash code -> stored value for that code.
fp_n = fp.apply(lambda m: m.GetNonzeroElements())

# Collect every hash code seen in the dataset (repeats included).
HashCode = []
for i in fp_n:
    HashCode.extend(i.keys())

unique_set = set(HashCode)
unique_list = list(unique_set)
# Corr_df pairs each column position ('index') with its hash code (column 0).
Corr_df = pd.DataFrame(unique_list).reset_index()

# Hash code -> column position, built once. This is the same mapping Corr_df
# holds, but a dict lookup avoids scanning the whole frame for every key of
# every molecule.
code_to_position = {code: position for position, code in enumerate(unique_list)}

MY_finger = []
# The loop variable is deliberately still called `polymer`: a later cell reads
# it after this loop has finished.
for polymer in fp_n:
    my_finger = [0] * len(unique_list)
    for key in polymer.keys():
        index = code_to_position[key]
        my_finger[index] = polymer[key]
    MY_finger.append(my_finger)
# Rows are molecules (same order as df); columns are positions in unique_list.
X = pd.DataFrame(MY_finger)

# For each column, count the molecules in which that substructure is absent
# (value == 0); used afterwards to keep only the most frequent substructures.
Zero_Sum = (X == 0).astype(int).sum()


38


In [33]:
# Keep only the substructures that are absent from fewer than NumberOfZero
# molecules; this threshold controls the dimension of the Morgan fingerprint
# by substructure frequency.
NumberOfZero = 47000
Columns = Zero_Sum[Zero_Sum < NumberOfZero].index
print(len(Columns))

# NOTE(review): `polymer` is whatever the last fingerprint loop left behind,
# so this stores the hash codes of a single molecule only - confirm that this
# is what is meant to be saved as the substructure list.
Substructure_list = list(polymer.keys())
X_count = X[Columns]

X_count

435


Unnamed: 0,575,588,653,777,817,895,1063,2158,2254,2474,...,150479,150500,150768,151782,151794,152055,152081,152271,152546,152602
0,1,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47671,0,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
47672,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
47673,0,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
47674,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
# Persist the fingerprint vocabulary so other datasets can be encoded with
# exactly the same columns. Context managers guarantee each file is flushed
# and closed even if pickle.dump raises.
with open("Corr_All.pickle", "wb") as pickle_out:
    pickle.dump(Corr_df, pickle_out)

with open("unique_list_All.pickle", "wb") as pickle_out:
    pickle.dump(unique_list, pickle_out)

with open("polymer.keys_All.pickle", "wb") as pickle_out:
    pickle.dump(Substructure_list, pickle_out)

with open("Columns_All.pickle", "wb") as pickle_out:
    pickle.dump(Columns, pickle_out)

In [35]:
# Reload the saved fingerprint vocabulary. Files are opened with context
# managers so the handles are closed deterministically.
# SECURITY: pickle.load can execute arbitrary code; only load pickle files
# that this notebook (or another trusted source) produced.
with open("Corr_All.pickle", "rb") as pickle_in:
    Corr_df = pickle.load(pickle_in)
with open("unique_list_All.pickle", "rb") as pickle_in:
    unique_list = pickle.load(pickle_in)
with open("Columns_All.pickle", "rb") as pickle_in:
    Columns = pickle.load(pickle_in)
with open("polymer.keys_All.pickle", "rb") as pickle_in:
    Substructure_list = pickle.load(pickle_in)

In [36]:
def process_MFF_dataset(file_path, unique_list, Corr_df, Columns):
    """Encode a CSV of SMILES strings with an existing fingerprint vocabulary.

    Parameters
    ----------
    file_path : str or path-like
        CSV file to read; it must contain a ``SMILES`` column.
    unique_list : sequence
        Full vocabulary of hash codes; only its length is used here, to size
        each fingerprint vector.
    Corr_df : pandas.DataFrame
        Lookup table with the hash code in column ``0`` and the corresponding
        vector position in column ``'index'``.
    Columns : index-like
        Column labels (vector positions) to keep in the returned frame.

    Returns
    -------
    pandas.DataFrame
        One row per input molecule, restricted to ``Columns``. Hash codes that
        do not appear in ``Corr_df`` are ignored, so they contribute nothing.
    """
    data_exp = pd.read_csv(file_path)

    molecules = data_exp.SMILES.apply(Chem.MolFromSmiles)
    fp = molecules.apply(lambda m: AllChem.GetMorganFingerprint(m, radius=3))
    fp_n = fp.apply(lambda m: m.GetNonzeroElements())

    # Build the hash code -> position lookup once instead of re-listing and
    # re-scanning Corr_df for every key of every molecule. setdefault keeps
    # the first occurrence, matching the original `.values[0]` selection.
    code_to_position = {}
    for code, position in zip(Corr_df[0], Corr_df['index']):
        code_to_position.setdefault(code, position)

    n_features = len(unique_list)
    MY_finger = []
    for polymer in fp_n:
        my_finger = [0] * n_features
        for key, value in polymer.items():
            position = code_to_position.get(key)
            # Codes outside the training vocabulary are skipped.
            if position is not None:
                my_finger[position] = value
        MY_finger.append(my_finger)
    exp_MFF = pd.DataFrame(MY_finger)
    exp_MFF = exp_MFF[Columns]

    return exp_MFF

# Encode the experimental chi dataset with the vocabulary and column
# selection prepared in the earlier cells.
exp_MFF = process_MFF_dataset(
    file_path="Data/Experiment_chi_data.csv",
    unique_list=unique_list,
    Corr_df=Corr_df,
    Columns=Columns,
)