## Notebook for generating the multicomponent dataset splits (for LGBM and other models)

### Random splitting

In [2]:
import warnings
warnings.filterwarnings('ignore')
import requests
import os
import json
import csv
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from rdkit import Chem
import rdkit.Chem.rdMolDescriptors as MolDescriptors
import rdkit.Chem.Descriptors as Descriptors
from sklearn.model_selection import train_test_split

In [3]:
%%bash
# Print the current working directory, then a long-format listing of it sorted
# by modification time with the most recently changed entries last.
pwd
ls -l -t -r

/project/rcc/hyadav/TransPolymer/data
total 355698
-rw-rw-r-- 1 hyadav rcc-staff     10987 Jun 28 09:51 EPS.csv
-rw-rw-r-- 1 hyadav rcc-staff     11263 Jun 28 09:51 Eea.csv
-rw-rw-r-- 1 hyadav rcc-staff     17922 Jun 28 09:51 Egb.csv
-rw-rw-r-- 1 hyadav rcc-staff    144834 Jun 28 09:51 Egc.csv
-rw-rw-r-- 1 hyadav rcc-staff     11299 Jun 28 09:51 Ei.csv
-rw-rw-r-- 1 hyadav rcc-staff     11758 Jun 28 09:52 Nc.csv
-rw-rw-r-- 1 hyadav rcc-staff    171771 Jun 28 09:52 OPV.csv
-rw-rw-r-- 1 hyadav rcc-staff    171806 Jun 28 09:52 OPV_aug.csv
-rw-rw-r-- 1 hyadav rcc-staff   3861748 Jun 28 09:52 PE_I.csv
-rw-rw-r-- 1 hyadav rcc-staff     25123 Jun 28 09:52 PE_II.csv
-rw-rw-r-- 1 hyadav rcc-staff     28810 Jun 28 09:52 PE_II_aug.csv
-rw-rw-r-- 1 hyadav rcc-staff     19747 Jun 28 09:52 Xc.csv
drwxrwsr-x 2 hyadav rcc-staff      4096 Jun 28 09:52 original datasets
-rw-rw-r-- 1 hyadav rcc-staff 245980630 Jun 28 09:52 pretrain.csv
-rw-rw-r-- 1 hyadav rcc-staff  47564048 Jun 28 09:52 pretrain_1M.csv
-

In [4]:
# Read the full multicomponent table; the split target is its 'conductivity_log' column.
df_comp = pd.read_csv('all_multi_comp.csv')
y = df_comp['conductivity_log']

# Stage 1: hold out 10% of all rows (and the matching targets) as the test set.
(rest_df_comp, test_df_comp,
 y_rest_comp, y_test_comp) = train_test_split(
    df_comp, y, random_state=0, test_size=0.1
)

# Stage 2: take 11% of the remaining rows as the validation set
# (0.11 * 0.9 ~= 0.10 of the full table); the rest is the training set.
(train_df_comp, val_df_comp,
 y_train_comp, y_val_comp) = train_test_split(
    rest_df_comp, y_rest_comp, random_state=0, test_size=0.11
)

In [5]:
# Write the three random splits to CSV (train, then val, then test),
# leaving the pandas index out of each file.
for _split_frame, _csv_name in (
    (train_df_comp, 'random_train_multi_comp.csv'),
    (val_df_comp, 'random_val_multi_comp.csv'),
    (test_df_comp, 'random_test_multi_comp.csv'),
):
    _split_frame.to_csv(_csv_name, index=False)



In [4]:
# Load 'all_multi_comp_comb.csv' into `df`; the bare `df` on the last line
# renders the frame as this cell's output.
# Per the recorded outputs, its columns are solv_comb_sm, salt_sm and conductivity_log.
df = pd.read_csv('all_multi_comp_comb.csv')
df

Unnamed: 0,solv_comb_sm,salt_sm,conductivity_log
0,O=C1OCCO1.COC(=O)OC,[Li+].F[P-](F)(F)(F)(F)F,2.459589
1,O=C1OCCO1.CCCCOC(=O)OC,[Li+].F[P-](F)(F)(F)(F)F,1.774952
2,O=C1OCCO1.CC(C)COC(=O)OC,[Li+].F[P-](F)(F)(F)(F)F,1.686399
3,O=C1OCCO1.CCC(C)OC(=O)OC,[Li+].F[P-](F)(F)(F)(F)F,1.871802
4,CCOC(=O)OCC,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.741937
...,...,...,...
10191,CS(=O)C.COCCOC,[Li+].F[P-](F)(F)(F)(F)F,1.547563
10192,CS(=O)C.COCCOC,[Li+].F[P-](F)(F)(F)(F)F,2.624669
10193,CS(=O)C.COCCOC,[Li+].F[P-](F)(F)(F)(F)F,2.772589
10194,CS(=O)C.COCCOC,[Li+].F[P-](F)(F)(F)(F)F,2.687847


In [6]:
# Random train/val/test split of 'all_multi_comp_add.csv', using the same
# two-stage scheme (10% test, then 11% of the remainder as validation) and the
# same seed as the `df_comp` split.
df_add = pd.read_csv('all_multi_comp_add.csv')

# The target has to be taken from `df_add` itself so that its rows line up with
# the frame being split. (The previous version assigned `y` from `df_comp` and
# then passed an undefined `y_add` to train_test_split.)
# NOTE(review): an earlier run of this cell ended in KeyError: 'conductivity_log';
# confirm that all_multi_comp_add.csv really contains this column.
y_add = df_add['conductivity_log']

# Stage 1: hold out 10% of the rows as the test set.
rest_df_add, test_df_add, y_rest_add, y_test_add = train_test_split(
    df_add, y_add, test_size=0.1, random_state=0
)

# Stage 2: 11% of the remaining rows become the validation set; the rest is training data.
train_df_add, val_df_add, y_train_add, y_val_add = train_test_split(
    rest_df_add, y_rest_add, test_size=0.11, random_state=0
)

KeyError: 'conductivity_log'

In [5]:
# Keep the column index of `df` in `cols`; the bare name on the last line
# shows it as this cell's output.
cols = df.columns
cols

Index(['solv_comb_sm', 'salt_sm', 'conductivity_log'], dtype='object')

In [6]:
# Column-name lists for a per-component table layout: up to four numbered
# solvents plus one salt and the target, and the accompanying numeric columns.
# NOTE(review): neither list is referenced by the other cells visible here.
cols_main = [f'solvent_{idx}' for idx in range(1, 5)] + ['salt_1', 'conductivity_log']
cols_add = ['conc_salt', 'temperature'] + [f'solv_ratio_{idx}' for idx in range(1, 5)]

In [7]:
# Rebind the notebook-level name `y` to the 'conductivity_log' column of `df`;
# it is the target passed to the split of `df` below.
y = df['conductivity_log']

In [9]:
# Stage 1 of the split: set aside 10% of `df` (with the matching targets) as the
# test set, using a fixed seed, then display the held-out rows.
(rest_df, test_df,
 y_rest, y_test) = train_test_split(
    df, y, random_state=0, test_size=0.1
)
test_df

Unnamed: 0,solv_comb_sm,salt_sm,conductivity_log
6642,CC1COC(=O)O1.O=C1OCCO1,[Li+].F[P-](F)(F)(F)(F)F,2.063161
3424,CC1COC(=O)O1.CCOC(=O)OCC,[Li+].[B-]12(OC(=O)C(=O)O1)OC(=O)C(=O)O2,2.004640
3779,CC1COC(=O)O1.C1COC(=O)O1,[Li+].[B-]12(OC(=O)C(=O)O1)OC(=O)C(=O)O2,2.011817
7066,CC1COC(=O)O1.CCOC(=O)OCC,[Li+].[B-](F)(F)(F)F,-1.681759
176,O=C1OCCO1,[Li+].F[As-](F)(F)(F)(F)F,2.754934
...,...,...,...
7780,CC1CCCO1.O=C1OCCO1.CC1COC(=O)O1,[Li+].F[As-](F)(F)(F)(F)F,1.266948
7497,C1COB(OCCOB2OCCCO2)OC1,[Li+].[O-]Cl(=O)(=O)=O,-8.727914
5473,CC1COC(=O)O1.CCOC(=O)OCC,CC[N+](CC)(CC)CC.F[P-](F)(F)(F)(F)F,2.120264
8756,COCCOCCOCCOCCOC,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,1.079429


In [10]:
# Stage 2 of the split: 11% of the rows left after the test hold-out become the
# validation set (0.11 * 0.9 ~= 0.10 of `df`); the remainder is the training
# set, which is displayed as the cell output.
(train_df, val_df,
 y_train, y_val) = train_test_split(
    rest_df, y_rest, random_state=0, test_size=0.11
)
train_df

Unnamed: 0,solv_comb_sm,salt_sm,conductivity_log
8357,CCOCCOCCF.O=C1OCCO1,[Li+].F[P-](F)(F)(F)(F)F,1.887070
4397,CC1COC(=O)O1,[Li+].F[P-](F)(F)(F)(F)F,1.077993
4649,CC1COC(=O)O1.CCOC(=O)OCC,[Li+].F[P-](F)(F)(F)(F)F,0.362224
8266,COCCOC.COCCOCC(F)(F)C(F)F,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,1.857859
1124,CC1COC(=O)O1,[Li+].F[P-](F)(F)(F)(F)F,1.629241
...,...,...,...
102,FC(F)C(F)(F)COC(F)(F)C(F)F.COC(=O)OCC(F)(F)F.O...,[Li+].F[P-](F)(F)(F)(F)F,1.098612
1664,CCCCCCCCCC1COC(=O)O1,[Li+].C(F)(F)(F)S(=O)(=O)[O-],-3.028255
8946,COCCOCCOCCOCCOC,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,1.182280
2190,CC1COC(=O)O1.CCOC(=O)OCC,[Li+].F[P-](F)(F)(F)(F)F,2.515274


In [11]:
# Display the validation split produced by the second train_test_split call.
val_df

Unnamed: 0,solv_comb_sm,salt_sm,conductivity_log
1992,O=C1OCCO1.CCOC(=O)OC.CC(=O)OC,[Li+].F[P-](F)(F)(F)(F)F,2.541602
9701,CS(=O)(=O)F,[Li+].C(F)(F)(F)S(=O)(=O)[N-]S(=O)(=O)C(F)(F)F,0.576613
7558,C1COB(OCCOB2OCCCO2)OC1.C1COC(=O)O1,[Li+].[O-]Cl(=O)(=O)=O,2.271271
7189,O=C1CCCO1,[B-](C1=CC=CC=C1)(C2=CC=CC=C2)(C3=CC=CC=C3)C4=...,-1.525329
2568,CC1COC(=O)O1.CCOC(=O)OCC,[Li+].F[P-](F)(F)(F)(F)F,1.332102
...,...,...,...
1822,C1COC(=O)O1.CCOC(=O)OC.COC(=O)OC,[Li+].[B-]12(OC(=O)C(=O)O1)OC(=O)C(=O)O2,0.683489
2923,CC1COC(=O)O1,[Li+].[B-]12(OC(=O)C(=O)O1)OC(=O)C(=O)O2,1.087214
6641,CC1COC(=O)O1.O=C1OCCO1,[Li+].F[P-](F)(F)(F)(F)F,2.389038
6578,CC1COC(=O)O1.CCOC(=O)OCC,[B-](F)(F)(F)F.CC[N+](CC)(CC)CC,1.590862


In [15]:
# Save the validation split to CSV, leaving out the pandas index.
val_df.to_csv(
    'val_multi_comp_2d.csv',
    index=False,
)

In [16]:
# Save the test split to CSV, leaving out the pandas index.
test_df.to_csv(
    'test_multi_comp_2d.csv',
    index=False,
)

In [17]:
# Save the training split to CSV, leaving out the pandas index.
train_df.to_csv(
    'train_multi_comp_2d.csv',
    index=False,
)