### Preparing conda environment 
~~~
conda create -n synthesis python=3.7 -y
conda activate synthesis
pip install pandas numpy
pip install ase==3.21.1

~~~

### Creating a MySQL database of the COD (optional; the SQLite route below is the recommended one)
```sh
sudo apt-get --purge remove mysql-server mysql-common mysql-client
sudo apt update && sudo apt dist-upgrade && sudo apt autoremove
sudo apt-get install -y mysql-server mysql-client

# Go to the directory where you have unpacked cod-cifs-mysql.tgz:
cd Local_Data/syn/data_bases/cod/mysql/
sudo mysql -u root -p
```
~~~sql
mysql> create database cod default character set utf8;
mysql> use cod;
mysql> SOURCE ./data.sql;
mysql> ALTER TABLE data DISABLE KEYS;
mysql> LOAD DATA LOCAL INFILE 'data.txt' INTO TABLE data CHARACTER SET utf8 FIELDS TERMINATED BY '\t';
mysql> ALTER TABLE data ENABLE KEYS;
mysql> quit
~~~

### [Creating a SQLite database of the COD](https://wiki.crystallography.net/creatingSQLdatabase/)

~~~sh
sudo apt-get install sqlite3 libsql-translator-perl
~~~
Go to the directory where you have unpacked cod-cifs-mysql.tgz:
~~~sh
sqlt -f MySQL -t SQLite data.sql | sqlite3 cod.db
~~~

**Note:** the `perl` command in the block below has a stray `\` before the `$` sign; remove that backslash before running it.
The command should be: `perl -pe 's/"/\\"/g; s/\\\n$/\\n/' data.txt > data.txt.sqlite`

~~~sh
perl -pe 's/"/\\"/g; s/\\\n\$/\\n/' data.txt > data.txt.sqlite 
sqlite3 -separator "$(echo -e "\t")" cod.db '.import data.txt.sqlite data'

# The database which is created is in the file 'cod.db'. 
# You can now query it:
sqlite3 cod.db 'select count(*) from data'
~~~


In [2]:
from utility.utility_general import *
from ase.formula import Formula
from ase.spacegroup import Spacegroup
import ase
import imp
from tqdm import tqdm, tqdm_notebook
import crystals_tools
from joblib import Parallel, delayed
# Re-import crystals_tools so edits made to the module are picked up.
imp.reload(crystals_tools)

# Name of this notebook and the directory used for this run's results;
# both are read again by the final "Saving the notebook" cell.
notebook_filename = 'positive_data_preparation.ipynb'
result_dir = os.path.expanduser('results/data_preparation/run_002')
print(f'Results:  {result_dir}')

Results:  results/data_preparation/run_002


## Gathering all the information from the COD's CIF files

In [2]:
# Collect the COD CIF files matched by '**/*.cif' under the local COD mirror.
cif_files = list_all_files(
    f'{local_data_path}/data_bases/cod/cif', pattern='**/*.cif')
print('COD CIF files: {:,}'.format(len(cif_files)))

COD CIF files: 454,771


### Reading CIF files

In [3]:
# Parse all COD CIF files in parallel.
# The file list is cut into chunks of `n` paths; each chunk is handed to
# crystals_tools.cif_parser in a joblib worker, and `output` ends up as one
# result list per chunk (flattened by the next cell).
time_1 = datetime.now()
n = 500  # CIF files per worker call
files_split = [cif_files[i:i + n] for i in range(0, len(cif_files), n)]
iterable = files_split
from tqdm import tqdm
output = Parallel(n_jobs=64)(
    delayed(crystals_tools.cif_parser)(it, error_ok=True)
    for it in tqdm(iterable, leave=False, position=0))
time_2 = datetime.now()
# str(timedelta) only has a ".ffffff" suffix when the microseconds are
# non-zero, so cut at the dot rather than slicing off a fixed 7 characters
# (which would print an empty run time for a whole-second duration).
print('Finished converting to Atom Objects. Run time:',
      str(time_2 - time_1).split('.')[0])

                                                   

Finished converting to Atom Objects. Run time: 0:37:01


In [4]:
# Flatten the per-chunk parser results into one list and persist it, once as
# a single pickle and once split into 10 roughly equal parts (000.pkl-009.pkl).
print(f'Saving the results: {data_path}/data_bases/cod/cif_pickle/')
crystals = [entry for chunk in output for entry in chunk]
save_var(crystals,
         f'{data_path}/data_bases/cod/cif_pickle/all.pkl',
         make_path=True)

# 11 evenly spaced integer boundaries delimit the 10 consecutive slices.
ind = np.linspace(0, len(crystals), 11, dtype=int)
for i, (start, stop) in enumerate(zip(ind[:-1], ind[1:])):
    path = f'{data_path}/data_bases/cod/cif_pickle/{i:03}.pkl'
    print(path)
    save_var(crystals[start:stop], path)

Saving the results: 
/home/adavar2/Data/syn/data_bases/cod/cif_pickle/000.pkl
/home/adavar2/Data/syn/data_bases/cod/cif_pickle/001.pkl
/home/adavar2/Data/syn/data_bases/cod/cif_pickle/002.pkl
/home/adavar2/Data/syn/data_bases/cod/cif_pickle/003.pkl
/home/adavar2/Data/syn/data_bases/cod/cif_pickle/004.pkl
/home/adavar2/Data/syn/data_bases/cod/cif_pickle/005.pkl
/home/adavar2/Data/syn/data_bases/cod/cif_pickle/006.pkl
/home/adavar2/Data/syn/data_bases/cod/cif_pickle/007.pkl
/home/adavar2/Data/syn/data_bases/cod/cif_pickle/008.pkl
/home/adavar2/Data/syn/data_bases/cod/cif_pickle/009.pkl


In [143]:
# Reload the flattened list of parsed COD entries written by the saving cell
# above, so the processing below can start without re-parsing the CIF files.
crystals = load_var(f'{data_path}/data_bases/cod/cif_pickle/all.pkl')

### Processing

In [165]:
# Run crystals_tools.atoms_find_min_dist on every parsed entry in parallel and
# collect the returned dicts into a DataFrame; the summary cell below merges
# this table on its 'filename' column.
min_dist = Parallel(n_jobs=64)(
    delayed(crystals_tools.atoms_find_min_dist)(
        entry, return_dict=True, ok_error=True)
    for entry in tqdm_notebook(crystals))

min_dist_df = pd.DataFrame(min_dist)
save_var(min_dist_df, 'tmp/min_dist_df.pkl')

HBox(children=(IntProgress(value=0, max=454771), HTML(value='')))

In [193]:
# Build one summary row per parsed COD entry.
# Entries that are not ase Atoms objects are treated as failed parses: only
# their id and filename are recorded and 'error' is set to True.
ids = np.zeros((len(crystals), ), dtype=int)
err = np.zeros((len(crystals), ), dtype=bool)
sg = np.array([None] * len(crystals))
sym = np.array([None] * len(crystals))
# NaN marks "not computed" (failed entries); np.nan is the spelling that
# exists in every NumPy release (the np.NaN alias was removed in NumPy 2.0).
occ_min = np.full((len(crystals), ), np.nan)
files = np.array([None] * len(crystals))
# Wrap the list (not the enumerate object) so the progress bar knows its total.
for i, at in enumerate(tqdm_notebook(crystals)):
    if not isinstance(at, ase.atoms.Atoms):
        # The COD id is the file stem, e.g. ".../1000000.cif" -> 1000000
        ids[i] = int(at['filename'].split('/')[-1].split('.')[0])
        files[i] = at['filename']
        err[i] = True
        continue
    ids[i] = int(at.info['filename'].split('/')[-1].split('.')[0])
    files[i] = at.info['filename']
    sg[i] = at.info.get('spacegroup')
    sym[i] = at.symbols.get_chemical_formula()
    try:
        if 'occupancy' in at.info:
            # Smallest "first value" over the per-site occupancy dicts
            occ_min[i] = min(
                list(site.values())[0]
                for site in at.info['occupancy'].values())
        else:
            # No occupancy information: treated as fully occupied
            occ_min[i] = 1
    except Exception:
        # Unusable occupancy data: 0 makes the later "== 1" filter drop it
        occ_min[i] = 0
crystals_df = pd.DataFrame({
    'id': ids,
    'spacegroup': sg,
    'formula': sym,
    'minimum occupancy': occ_min,
    'error': err,
    'db': 'cod',
    'filename': files
})

# Attach the minimum-interatomic-distance results, matched by file name.
# To check if merging works:
# crystals_tools.atoms_find_min_dist(crystals_tools.cif_parser(crystals_df['filename'][454770]))
crystals_df = pd.merge(crystals_df, min_dist_df, how='left', on='filename')

save_var(crystals_df, f'{data_path}/data_bases/cod/cif_pickle/df.pkl')
crystals_df

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Unnamed: 0,id,spacegroup,formula,minimum occupancy,error,db,filename,min_atomic_dist
0,1000000,14 P 21/c\n setting 1\n centrosymmetric 1...,C20H68Al4N8O32P8,1.000,False,cod,/home/adavar2/Local_Data/syn/data_bases/cod/ci...,0.819938
1,1000001,19 P 21 21 21\n setting 1\n centrosymmetr...,C428H568N56O104,1.000,False,cod,/home/adavar2/Local_Data/syn/data_bases/cod/ci...,0.894671
2,1000002,14 P 21/c\n setting 1\n centrosymmetric 1...,C12H12O28Sr4,1.000,False,cod,/home/adavar2/Local_Data/syn/data_bases/cod/ci...,0.635776
3,1000003,14 P 21/c\n setting 1\n centrosymmetric 1...,C12O24Sr4,1.000,False,cod,/home/adavar2/Local_Data/syn/data_bases/cod/ci...,1.174074
4,1000004,2 P -1\n setting 1\n centrosymmetric 1\n...,C58H60Cu2I2P4,1.000,False,cod,/home/adavar2/Local_Data/syn/data_bases/cod/ci...,0.929547
...,...,...,...,...,...,...,...,...
454766,9016726,166 R -3 m\n setting 1\n centrosymmetric 1...,Cl6Fe3,1.000,False,cod,/home/adavar2/Local_Data/syn/data_bases/cod/ci...,2.489383
454767,9016727,14 P 21/c\n setting 1\n centrosymmetric 1...,Ag4Mn4Pb12S48Sb20,0.049,False,cod,/home/adavar2/Local_Data/syn/data_bases/cod/ci...,2.415235
454768,9016728,167 R -3 c\n setting 1\n centrosymmetric 1...,Cr12O18,1.000,False,cod,/home/adavar2/Local_Data/syn/data_bases/cod/ci...,1.964360
454769,9016729,,,,True,cod,/home/adavar2/Local_Data/syn/data_bases/cod/ci...,0.000000


In [31]:
# Reload the per-crystal summary table saved as df.pkl by the processing cell.
crystals_df = load_var(f'{data_path}/data_bases/cod/cif_pickle/df.pkl')

## Converting crystals with partial occupancies to supercells

In [30]:
def cif_2_supercell(input_list,
                    super_cell_executable=None,
                    verbose=True,
                    output_list=None):
    """Run the external Supercell program once per input CIF file.

    Parameters
    ----------
    input_list : str or list of str
        Path(s) of the CIF file(s) to convert. Anything that is not a list
        is wrapped into a one-element list.
    super_cell_executable : str, optional
        Executable to invoke. When None it is obtained from
        ``prepare_supercell()``.
    verbose : bool, default True
        Print a banner and, per file, the output directory and output path.
    output_list : str or list of str, optional
        Output path given to the program's ``-o`` option, one per input.
        When None, each input path is reused with ``'/cod/'`` replaced by
        ``'/cod_supercell/'`` and ``'.cif'`` replaced by ``'_supercell_'``.
        Entries beyond ``len(input_list)`` are ignored.

    Raises
    ------
    ValueError
        If ``output_list`` has fewer entries than ``input_list``.
    FileNotFoundError
        If an input file does not exist (raised when that file is reached,
        so earlier files have already been processed).

    Returns
    -------
    None
    """
    if verbose:
        print('Running the Supercell Program to handle the occupancies')
    if super_cell_executable is None:
        super_cell_executable = prepare_supercell()
    if not isinstance(input_list, list):
        input_list = [input_list]
    if output_list is None:
        output_list = [
            path.replace('/cod/', '/cod_supercell/').replace(
                '.cif', '_supercell_') for path in input_list
        ]
    elif not isinstance(output_list, list):
        # A bare string would otherwise be indexed character by character.
        output_list = [output_list]
    if len(output_list) < len(input_list):
        # Fail before any work is done instead of with an IndexError midway.
        raise ValueError(
            f'output_list has {len(output_list)} entries but input_list has '
            f'{len(input_list)}')

    for i, (cif_path, out_file) in enumerate(zip(input_list, output_list)):
        if not os.path.exists(cif_path):
            raise FileNotFoundError(cif_path)

        out_dir = os.path.dirname(out_file)
        if verbose:
            print(out_dir)
        # An output path without a directory part needs no directory creation
        # (os.makedirs('') would raise).
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        if verbose:
            print(i, out_file)
        # NOTE(review): paths are interpolated unquoted; whether Command goes
        # through a shell (and so breaks on spaces) is not visible here.
        command = Command(
            f'{super_cell_executable} -i {cif_path} -o {out_file} -n r1 -v 0')
        # timeout=2 is passed through to Command.run (presumably seconds -
        # TODO confirm against the Command helper).
        command.run(timeout=2)

In [24]:
# Count the entries whose minimum site occupancy is below 1.
partial_occupancy = crystals_df['minimum occupancy'] < 1
n_po = np.count_nonzero(partial_occupancy)
print('Crystals with partial occupancies: {:,}'.format(n_po))

Crystals with partial occupancies: 142,568


In [112]:
# NOTE(review): `supercell` is reloaded here but no import of it is visible in
# this notebook; it must already be imported for this cell to run.
imp.reload(supercell)
# COD ids of every entry with a minimum occupancy below 1 ...
sup_cell_inp = list(crystals_df[crystals_df['minimum occupancy'] < 1]['id'])
# ... mapped to CIF paths. The root matches the directory the CIF files are
# listed from earlier in this notebook ('<local_data_path>/data_bases/cod/cif');
# the previous f'{local_data_path}cod/cif/' had no separator after
# local_data_path and lacked the 'data_bases/' segment.
sup_cell_inp = crystals_tools.crystal_id_2_relative_path(
    sup_cell_inp, path=f'{local_data_path}/data_bases/cod/cif/')
# A = supercell.cif_2_supercell(sup_cell_inp[:5], verbose=False)

Unnamed: 0,input_file,status,output_file
0,/home/adavar2/Local_Data/exp_syn/cod/cif/1/00/...,0,/home/adavar2/Local_Data/exp_syn/cod_supercell...
1,/home/adavar2/Local_Data/exp_syn/cod/cif/1/00/...,3,/home/adavar2/Local_Data/exp_syn/cod_supercell...
2,/home/adavar2/Local_Data/exp_syn/cod/cif/1/00/...,0,/home/adavar2/Local_Data/exp_syn/cod_supercell...
3,/home/adavar2/Local_Data/exp_syn/cod/cif/1/00/...,3,/home/adavar2/Local_Data/exp_syn/cod_supercell...
4,/home/adavar2/Local_Data/exp_syn/cod/cif/1/00/...,3,/home/adavar2/Local_Data/exp_syn/cod_supercell...


### Converting to Supercell

In [None]:
# Run the Supercell conversion in parallel: the input paths are split into
# chunks of `n` and each chunk is passed to supercell.cif_2_supercell in a
# joblib worker.
time_1 = datetime.now()
n = 100  # CIF files per worker call
files_split = split_arr_into_chunks_of_n(sup_cell_inp, n)
iterable = files_split
from tqdm import tqdm
output = Parallel(n_jobs=20)(
    delayed(supercell.cif_2_supercell)(it, verbose=False)
    for it in tqdm(iterable, leave=False, position=0))
time_2 = datetime.now()
# str(timedelta) only has a ".ffffff" suffix when the microseconds are
# non-zero, so cut at the dot rather than slicing off a fixed 7 characters.
print('Finished using the Supercell program. Run time:',
      str(time_2 - time_1).split('.')[0])

### Reading the generated supercells

In [6]:
# Collect the generated supercell CIF files matched by '**/*.cif'.
# Note: this rebinds `cif_files`, which previously held the original COD files.
cif_files = list_all_files(
    f'{local_data_path}/data_bases/cod/cif_supercell', pattern='**/*.cif')
print('COD Supercell CIF files: {:,}'.format(len(cif_files)))

COD Supercell CIF files: 37,454


In [7]:
# Parse the generated supercell CIF files in parallel, `n` files per
# crystals_tools.cif_parser call; `output` holds one result list per chunk.
time_1 = datetime.now()
n = 100  # CIF files per worker call
files_split = [cif_files[i:i + n] for i in range(0, len(cif_files), n)]
iterable = files_split
from tqdm import tqdm
output = Parallel(n_jobs=64)(
    delayed(crystals_tools.cif_parser)(it, error_ok=True)
    for it in tqdm(iterable, leave=False, position=0))
time_2 = datetime.now()
# str(timedelta) only has a ".ffffff" suffix when the microseconds are
# non-zero, so cut at the dot rather than slicing off a fixed 7 characters.
print('Finished converting to Atom Objects. Run time:',
      str(time_2 - time_1).split('.')[0])

                                                   

Finished converting to Atom Objects. Run time: 0:01:24


In [8]:
# Flatten the per-chunk parser results for the supercells and persist them.
print('Saving the results: ')
sc_crystals = [entry for chunk in output for entry in chunk]
save_var(
    sc_crystals,
    f'{data_path}/data_bases/cod/cif_supercell_pickle/all.pkl',
    make_path=True,
)

Saving the results: 


In [189]:
# Reload the flattened list of parsed supercell entries saved by the cell above.
sc_crystals = load_var(f'{data_path}/data_bases/cod/cif_supercell_pickle/all.pkl')

In [190]:
# Run crystals_tools.atoms_find_min_dist on every parsed supercell in parallel
# and collect the returned dicts into a DataFrame; the summary cell below
# merges this table on its 'filename' column.
sc_min_dist = Parallel(n_jobs=64)(
    delayed(crystals_tools.atoms_find_min_dist)(
        entry, return_dict=True, ok_error=True)
    for entry in tqdm_notebook(sc_crystals))

sc_min_dist_df = pd.DataFrame(sc_min_dist)
save_var(sc_min_dist_df, 'tmp/sc_min_dist_df.pkl')

HBox(children=(IntProgress(value=0, max=37454), HTML(value='')))

In [194]:
# Build one summary row per parsed supercell entry.
# Entries that are not ase Atoms objects are treated as failed parses: only
# their id and filename are recorded and 'error' is set to True.
ids = np.zeros((len(sc_crystals), ), dtype=int)
err = np.zeros((len(sc_crystals), ), dtype=bool)
sg = np.array([None] * len(sc_crystals))
sym = np.array([None] * len(sc_crystals))
# NaN marks "not computed" (failed entries); np.nan is the spelling that
# exists in every NumPy release (the np.NaN alias was removed in NumPy 2.0).
occ_min = np.full((len(sc_crystals), ), np.nan)
files = np.array([None] * len(sc_crystals))
# Wrap the list (not the enumerate object) so the progress bar knows its total.
for i, at in enumerate(tqdm_notebook(sc_crystals)):
    if not isinstance(at, ase.atoms.Atoms):
        # The id is the part of the file stem before the first '_'
        ids[i] = int(at['filename'].split('/')[-1].split('.')[0].split('_')[0])
        files[i] = at['filename']
        err[i] = True
        continue
    ids[i] = int(at.info['filename'].split('/')[-1].split('.')[0].split('_')[0])
    files[i] = at.info['filename']
    sg[i] = at.info.get('spacegroup')
    sym[i] = at.symbols.get_chemical_formula()
    try:
        if 'occupancy' in at.info:
            # Smallest "first value" over the per-site occupancy dicts
            occ_min[i] = min(
                list(site.values())[0]
                for site in at.info['occupancy'].values())
        else:
            # No occupancy information: treated as fully occupied
            occ_min[i] = 1
    except Exception:
        # Unusable occupancy data: 0 makes the later "== 1" filter drop it
        occ_min[i] = 0
sc_crystals_df = pd.DataFrame({
    'id': ids,
    'spacegroup': sg,
    'formula': sym,
    'minimum occupancy': occ_min,
    'error': err,
    'db': 'cod-sc',
    'filename': files
})

# Attach the minimum-interatomic-distance results, matched by file name.
# To check if merging works:
# crystals_tools.atoms_find_min_dist(crystals_tools.cif_parser(sc_crystals_df['filename'][37452]))
sc_crystals_df = pd.merge(sc_crystals_df, sc_min_dist_df, how='left', on='filename')

save_var(sc_crystals_df, f'{data_path}/data_bases/cod/cif_supercell_pickle/df.pkl')
sc_crystals_df

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Unnamed: 0,id,spacegroup,formula,minimum occupancy,error,db,filename,min_atomic_dist
0,1000023,1 P 1\n setting 1\n centrosymmetric 0\n ...,Cu3Fe4O24P6,1.0,False,cod-sc,/home/adavar2/Local_Data/syn/data_bases/cod/ci...,1.508306
1,1000030,1 P 1\n setting 1\n centrosymmetric 0\n ...,Ba2Cu3O7Y,1.0,False,cod-sc,/home/adavar2/Local_Data/syn/data_bases/cod/ci...,1.850666
2,1000049,1 P 1\n setting 1\n centrosymmetric 0\n ...,K4O8S2,1.0,False,cod-sc,/home/adavar2/Local_Data/syn/data_bases/cod/ci...,1.340000
3,1000066,1 P 1\n setting 1\n centrosymmetric 0\n ...,Ba2Cu2O7PdY,1.0,False,cod-sc,/home/adavar2/Local_Data/syn/data_bases/cod/ci...,1.844018
4,1000077,1 P 1\n setting 1\n centrosymmetric 0\n ...,HCr12F36,1.0,False,cod-sc,/home/adavar2/Local_Data/syn/data_bases/cod/ci...,1.897003
...,...,...,...,...,...,...,...,...
37449,9016658,1 P 1\n setting 1\n centrosymmetric 0\n ...,Mg16O32Si8,1.0,False,cod-sc,/home/adavar2/Local_Data/syn/data_bases/cod/ci...,1.660093
37450,9016666,1 P 1\n setting 1\n centrosymmetric 0\n ...,H3Mg10O19Si4,1.0,False,cod-sc,/home/adavar2/Local_Data/syn/data_bases/cod/ci...,1.364036
37451,9016683,1 P 1\n setting 1\n centrosymmetric 0\n ...,B190,1.0,False,cod-sc,/home/adavar2/Local_Data/syn/data_bases/cod/ci...,1.499742
37452,9016691,1 P 1\n setting 1\n centrosymmetric 0\n ...,Al16Mg6O96Si24,1.0,False,cod-sc,/home/adavar2/Local_Data/syn/data_bases/cod/ci...,1.633527


In [None]:
# Reload the supercell summary table saved as df.pkl by the cell above.
sc_crystals_df = load_var(f'{data_path}/data_bases/cod/cif_supercell_pickle/df.pkl')

## Removing similar structures and structures used in the case studies

### Removing incompatible crystals
* removing min_atomic_dist <= 0.5
* removing min_occupancy < 1
* removing broken CIF files
* removing entries that contain the dummy element X, i.e. atomic_number < 1 ("neutronium")

In [3]:
# Start from the saved summary tables of the original and supercell entries.
crystals_df = load_var(f'{data_path}/data_bases/cod/cif_pickle/df.pkl')
sc_crystals_df = load_var(f'{data_path}/data_bases/cod/cif_supercell_pickle/df.pkl')

# Keep only entries that parsed without error and whose minimum occupancy is 1.
crystals_df = crystals_df.loc[
    crystals_df['minimum occupancy'].eq(1) & crystals_df['error'].eq(False)]
sc_crystals_df = sc_crystals_df.loc[
    sc_crystals_df['minimum occupancy'].eq(1)
    & sc_crystals_df['error'].eq(False)]
# A supercell entry is only kept when its id is absent from the filtered
# original entries, i.e. the structure is available solely as a supercell.
sc_crystals_df = sc_crystals_df.loc[
    ~sc_crystals_df['id'].isin(crystals_df['id'])]

In [4]:
# Stack the original and supercell entries and drop the columns that were only
# needed for the filtering done in the previous cell.
df = (pd.concat([crystals_df, sc_crystals_df])
      .drop(['minimum occupancy', 'error'], axis=1)
      .reset_index(drop=True))
# Store file names relative to local_data_path. regex=False makes the path a
# literal string; on pandas < 2.0 the pattern is otherwise compiled as a
# regular expression, so characters such as '.' or '+' in the path would not
# match literally.
df['filename'] = df['filename'].str.replace(local_data_path, '', regex=False)
# removing min_atomic_dist <= 0.5
df = df[df['min_atomic_dist'] > 0.5].reset_index(drop=True)
df

Unnamed: 0,id,spacegroup,formula,db,filename,min_atomic_dist
0,1000000,14 P 21/c\n setting 1\n centrosymmetric 1...,C20H68Al4N8O32P8,cod,/data_bases/cod/cif/1/00/00/1000000.cif,0.819938
1,1000001,19 P 21 21 21\n setting 1\n centrosymmetr...,C428H568N56O104,cod,/data_bases/cod/cif/1/00/00/1000001.cif,0.894671
2,1000002,14 P 21/c\n setting 1\n centrosymmetric 1...,C12H12O28Sr4,cod,/data_bases/cod/cif/1/00/00/1000002.cif,0.635776
3,1000003,14 P 21/c\n setting 1\n centrosymmetric 1...,C12O24Sr4,cod,/data_bases/cod/cif/1/00/00/1000003.cif,1.174074
4,1000004,2 P -1\n setting 1\n centrosymmetric 1\n...,C58H60Cu2I2P4,cod,/data_bases/cod/cif/1/00/00/1000004.cif,0.929547
...,...,...,...,...,...,...
342283,9016658,1 P 1\n setting 1\n centrosymmetric 0\n ...,Mg16O32Si8,cod-sc,/data_bases/cod/cif_supercell/9/01/66/9016658_...,1.660093
342284,9016666,1 P 1\n setting 1\n centrosymmetric 0\n ...,H3Mg10O19Si4,cod-sc,/data_bases/cod/cif_supercell/9/01/66/9016666_...,1.364036
342285,9016683,1 P 1\n setting 1\n centrosymmetric 0\n ...,B190,cod-sc,/data_bases/cod/cif_supercell/9/01/66/9016683_...,1.499742
342286,9016691,1 P 1\n setting 1\n centrosymmetric 0\n ...,Al16Mg6O96Si24,cod-sc,/data_bases/cod/cif_supercell/9/01/66/9016691_...,1.633527


In [5]:
# Removing Neutroniums: drop every row whose formula contains the symbol 'X'.
# The row positions double as index labels because df was re-indexed from 0
# by the previous cell.
f = pd.Series([
    position for position, formula in enumerate(df['formula'])
    if 'X' in Formula(formula)
])
df = df.drop(f).reset_index(drop=True)

### Removing electrode and thermoelectric materials

In [6]:
# Match COD entries against the electrode-material table on the pair
# (reduced Hill formula, space-group number) and export the matches.
electrode_df = pd.read_csv(
    f'{data_path}/data_banks/electrode-mp/mp_electrode_materials.csv')
electrode_df['Formula_hill'] = pd.Series([
    Formula(discharged).reduce()[0].format('hill')
    for discharged in electrode_df['formula_discharge']
])
# Reduced formula, i.e. stoichiometry rather than absolute cell composition.
df['Formula_hill'] = [
    Formula(formula).reduce()[0].format('hill') for formula in df['formula']
]
# Space-group number: the more conservative criterion for removal.
df['sgn'] = [spacegroup.no for spacegroup in df['spacegroup']]

cod_electrode_df = pd.merge(
    df, electrode_df, on=['Formula_hill', 'sgn'],
    how='inner').drop(columns=['spacegroup'])
cod_electrode_df.to_csv(
    f'{data_path}/electrode_materials/cod-electrode-materials.csv')
cod_electrode_df

Unnamed: 0,id,formula,db,filename,min_atomic_dist,Formula_hill,sgn,index,battid,working_ion,max_voltage,capacity_vol,formula_charge,id_charge,formula_discharge,id_discharge,hall,url,path
0,1000075,Bi2Li2O8Pd4,cod,/data_bases/cod/cif/1/00/00/1000075.cif,1.952114,BiLiO4Pd2,129,1535,mp-25201_Li,Li,4.138586,408.336004,Bi(PdO2)2,mp-25201,LiBi(PdO2)2,mp-559893,P 4ab 2ab -1ab,https://materialsproject.org/materials/mp-559893/,/home/adavar2/Data/syn/electrode_materials/dow...
1,1000268,Cu2La8Li2O16,cod,/data_bases/cod/cif/1/00/02/1000268.cif,1.786956,CuLa4LiO8,65,2178,mp-21496_Li,Li,3.933088,239.350906,La4CuO8,mp-772213,LiLa4CuO8,mp-21496,-C 2 2,https://materialsproject.org/materials/mp-21496/,/home/adavar2/Data/syn/electrode_materials/dow...
2,1001384,Na4O28P8Ti4,cod,/data_bases/cod/cif/1/00/13/1001384.cif,1.492221,NaO7P2Ti,14,3686,mp-560506_Na,Na,1.609686,315.744642,TiP2O7,mp-761010,NaTiP2O7,mp-560506,-P 2ybc,https://materialsproject.org/materials/mp-560506/,/home/adavar2/Data/syn/electrode_materials/dow...
3,1001384,Na4O28P8Ti4,cod,/data_bases/cod/cif/1/00/13/1001384.cif,1.492221,NaO7P2Ti,14,3687,mp-17461_Na,Na,1.764832,323.582741,TiP2O7,mp-761041,NaTiP2O7,mp-17461,-P 2ybc,https://materialsproject.org/materials/mp-17461/,/home/adavar2/Data/syn/electrode_materials/dow...
4,1001385,Na4O28P8Ti4,cod,/data_bases/cod/cif/1/00/13/1001385.cif,1.489720,NaO7P2Ti,14,3686,mp-560506_Na,Na,1.609686,315.744642,TiP2O7,mp-761010,NaTiP2O7,mp-560506,-P 2ybc,https://materialsproject.org/materials/mp-560506/,/home/adavar2/Data/syn/electrode_materials/dow...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277,1544720,KMn8O16,cod-sc,/data_bases/cod/cif_supercell/1/54/47/1544720_...,1.886621,KMn8O16,1,2872,mp-1003316_K,K,4.272564,512.975679,MnO2,mp-19395,KMn8O16,mp-1016153,P 1,https://materialsproject.org/materials/mp-1016...,/home/adavar2/Data/syn/electrode_materials/dow...
278,1533548,Cu8Ga4O28Sr8Y4,cod-sc,/data_bases/cod/cif_supercell/1/53/35/1533548_...,1.613466,Cu2GaO7Sr2Y,1,3128,mvc-15173_Y,Y,3.487032,511.353349,Sr6YGa3(Cu2O7)3,mvc-15173,Sr2YGaCu2O7,mp-556575,P 1,https://materialsproject.org/materials/mp-556575/,/home/adavar2/Data/syn/electrode_materials/dow...
279,2018541,Fe4Li4O16P4,cod-sc,/data_bases/cod/cif_supercell/2/01/85/2018541_...,1.521004,FeLiO4P,1,62,mp-767263_Li,Li,3.692890,255.900638,LiFe2(PO4)2,mp-767263,LiFePO4,mp-774251,P 1,https://materialsproject.org/materials/mp-774251/,/home/adavar2/Data/syn/electrode_materials/dow...
280,4344259,Li8NbO6,cod-sc,/data_bases/cod/cif_supercell/4/34/42/4344259_...,1.768640,Li8NbO6,1,1806,mp-37399_Li,Li,0.162446,327.063919,Li7NbO6,mp-37399,Li8NbO6,mp-774697,P 1,https://materialsproject.org/materials/mp-774697/,/home/adavar2/Data/syn/electrode_materials/dow...


In [7]:
# Ids to exclude: COD entries matched to electrode materials above, and the
# 'cod' rows of the thermoelectric table (whose 'atoms' column is used as id).
electrode_ids = cod_electrode_df['id']
thermo_ids = load_var(f'{data_path}/thermoelectric_materials/atoms_list_top_10.pkl')
thermo_ids = thermo_ids[thermo_ids['data_set'] == 'cod'].rename({'atoms': 'id'}, axis=1)

# Remove the thermoelectric ids first, then the electrode ids; each count is
# taken on what is left at that point.
for message, excluded_ids in (
        ('Removing {:,} theremoelectric samples from the COD.', thermo_ids['id']),
        ('Removing {:,} electrode samples from the COD.', electrode_ids)):
    ind = df['id'].isin(excluded_ids)
    print(message.format(np.count_nonzero(ind)))
    df = df[~ind]

Removing 87 theremoelectric samples from the COD.
Removing 264 electrode samples from the COD.


### Removing similar crystals

In [8]:
# Drop rows without a space group or formula. The explicit .copy() makes df an
# independent frame, so the column assignments below do not write into a
# filtered slice (which triggers pandas' SettingWithCopyWarning).
df = df[df['spacegroup'].notnull() & df['formula'].notnull()].copy()
# Reduced Hill formula, i.e. stoichiometry rather than absolute composition.
df['Formula_hill'] = [
    Formula(formula).reduce()[0].format('hill') for formula in df['formula']
]
# Space-group number: the more conservative criterion for removal.
df['sgn'] = [spacegroup.no for spacegroup in df['spacegroup']]

# keep=False counts every member of a duplicate group, not only the extras.
print('Number of duplicates based on exact stoichiometry only: {:,}'
      .format(np.count_nonzero(df.duplicated(['Formula_hill'], keep=False))))
print('Number of duplicates based on spacegroups only: {:,}'
      .format(np.count_nonzero(df.duplicated(['sgn'], keep=False))))
print('Number of duplicates based on the stoichiometry and spacegroup: {:,}'
      .format(np.count_nonzero(df.duplicated(['Formula_hill', 'sgn'], keep=False))))

# Keep the first entry of each (stoichiometry, space group) pair, then remove
# the two helper columns again.
df = (df.drop_duplicates(['Formula_hill', 'sgn'])
      .drop(['Formula_hill', 'sgn'], axis=1)
      .reset_index(drop=True))
df

Number of duplicates based on exact stoichiometry only: 124,488
Number of duplicates based on spacegroups only: 341,934
Number of duplicates based on the stoichiometry and spacegroup: 84,208


Unnamed: 0,id,spacegroup,formula,db,filename,min_atomic_dist
0,1000000,14 P 21/c\n setting 1\n centrosymmetric 1...,C20H68Al4N8O32P8,cod,/data_bases/cod/cif/1/00/00/1000000.cif,0.819938
1,1000001,19 P 21 21 21\n setting 1\n centrosymmetr...,C428H568N56O104,cod,/data_bases/cod/cif/1/00/00/1000001.cif,0.894671
2,1000002,14 P 21/c\n setting 1\n centrosymmetric 1...,C12H12O28Sr4,cod,/data_bases/cod/cif/1/00/00/1000002.cif,0.635776
3,1000003,14 P 21/c\n setting 1\n centrosymmetric 1...,C12O24Sr4,cod,/data_bases/cod/cif/1/00/00/1000003.cif,1.174074
4,1000004,2 P -1\n setting 1\n centrosymmetric 1\n...,C58H60Cu2I2P4,cod,/data_bases/cod/cif/1/00/00/1000004.cif,0.929547
...,...,...,...,...,...,...
285616,9016520,1 P 1\n setting 1\n centrosymmetric 0\n ...,Al4Ca2Mg6Na2O48Si16,cod-sc,/data_bases/cod/cif_supercell/9/01/65/9016520_...,1.283376
285617,9016605,1 P 1\n setting 1\n centrosymmetric 0\n ...,Fe15Si32,cod-sc,/data_bases/cod/cif_supercell/9/01/66/9016605_...,2.226056
285618,9016621,1 P 1\n setting 1\n centrosymmetric 0\n ...,H4Cl2Cr4F2O20Pb8,cod-sc,/data_bases/cod/cif_supercell/9/01/66/9016621_...,0.981248
285619,9016666,1 P 1\n setting 1\n centrosymmetric 0\n ...,H3Mg10O19Si4,cod-sc,/data_bases/cod/cif_supercell/9/01/66/9016666_...,1.364036


## Creating the COD banks

In [54]:
import shutil
# Rebuild the bank from scratch: any existing copy is deleted first.
if exists(local_data_path + '/data_banks/cod'):
    shutil.rmtree(local_data_path + '/data_banks/cod')

# Copy every retained CIF file to the same relative location under
# data_banks/. Iterating the 'filename' column directly gives the progress
# bar a total and avoids building a Series per row with iterrows().
for filename in tqdm_notebook(df['filename']):
    output_path = local_data_path + os.path.dirname(filename).replace(
        '/data_bases/', '/data_banks/')
    output_file = output_path + '/' + os.path.basename(filename)
    os.makedirs(output_path, exist_ok=True)
    shutil.copyfile(local_data_path + filename, output_file)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [11]:
# df = load_var(f'{local_data_path}/data_banks/cod/df.pkl')

In [12]:
# Count the CIF files now present in the bank and store the final table next
# to them, both as a pickle and as a CSV.
cif_files = list_all_files(
    f'{local_data_path}/data_banks/cod/', pattern='**/*.cif')
print('COD CIF files in the COD bank: {:,}'.format(len(cif_files)))
save_var(df, f'{local_data_path}/data_banks/cod/df.pkl')
# CSV variant: 'path' keeps the original file name, 'filename' loses its
# '/data_bases/cod' part, and the space-group objects are replaced by their
# symbol with the number kept in 'sgn'.
tmp = df.assign(
    path=df['filename'],
    filename=df['filename'].str.replace('/data_bases/cod', ''),
    sgn=[spacegroup.no for spacegroup in df['spacegroup']],
    spacegroup=[spacegroup.symbol for spacegroup in df['spacegroup']],
)
tmp.to_csv(f'{local_data_path}/data_banks/cod/df.csv')

COD CIF files in the COD bank: 285,621


## Preparing a data set of electrode and thermoelectric materials 

In [257]:
# Load the saved electrode ids (wrapped in a Series) and the thermoelectric
# table. NOTE(review): cod_ids.pkl presumably holds COD ids of electrode
# materials - confirm against the code that writes it.
electrode_ids = pd.Series(load_var(f'{data_path}/electrode_materials/cod_ids.pkl'))
thermo_ids = load_var(f'{data_path}/thermoelectric_materials/atoms_list_top_10.pkl')
# Disabled: restrict to the 'cod' rows and expose the 'atoms' column as 'id'.
# thermo_ids = thermo_ids[thermo_ids['data_set'] == 'cod'].rename({'atoms': 'id'}, axis=1)

## Saving the notebook

In [57]:
from IPython.display import display, Javascript
display(Javascript('IPython.notebook.save_checkpoint();'))
!cp $notebook_filename $result_dir$notebook_filename
!jupyter nbconvert $notebook_filename --to html --output-dir $result_dir

<IPython.core.display.Javascript object>

[NbConvertApp] Converting notebook positive_data_preparation.ipynb to html
[NbConvertApp] Writing 695651 bytes to results/data_preparation/run_002/positive_data_preparation.html
