# Select structures

For the first iteration, we will use only structures from CoRE-ASR, hoping that this makes our lives a bit eaiser. 

We will also apply some additional filters like requiring carbons in the structure and dropping lanthanides and those for which our naive filters for clashing atoms report an error.

In [1]:
import os
from pathlib import Path 

import pandas as pd 
import matplotlib.pyplot as plt 
%matplotlib inline 

from pymatgen import Structure

In [2]:
df = pd.read_csv('2020-05-26-CORE_ASR.csv')

In [3]:
df.head()

Unnamed: 0,density,vpa,packing fraction,0-norm,2-norm,3-norm,5-norm,7-norm,10-norm,MagpieData minimum Number,...,metal_metallic_radius,metal_metallic_radius_c12,metal_oxidation_states,metal_vdw_radius,name,clashing,unbound,hydrogens,cif_problem,planes
0,1.648287,16.227836,0.12405,4,0.555556,0.48075,0.449949,0.445434,0.444531,1.0,...,118.0,127.0,"[7, 6, 4, 3, 2]",205.0,/Users/kevinmaikjablonka/Dropbox (LSMO)/proj73...,False,False,True,False,"['xz', 'xy', 'yz', 'xx', 'yy', 'zz']"
1,0.517074,34.453372,0.033666,5,0.589921,0.524647,0.49539,0.489748,0.488099,1.0,...,118.0,128.0,"[2, 1]",196.0,/Users/kevinmaikjablonka/Dropbox (LSMO)/proj73...,False,False,True,False,"['xz', 'xy', 'yz', 'xx', 'yy', 'zz']"
2,0.948963,25.380789,0.068103,3,0.677631,0.614911,0.581937,0.574422,0.571946,6.0,...,125.0,143.0,[3],184.0,/Users/kevinmaikjablonka/Dropbox (LSMO)/proj73...,False,False,False,False,"['xz', 'xy', 'yz']"
3,1.729486,15.569487,0.138372,4,0.55629,0.47212,0.423111,0.407,0.397075,1.0,...,134.0,144.0,"[2, 1]",211.0,/Users/kevinmaikjablonka/Dropbox (LSMO)/proj73...,False,False,True,False,"['xz', 'xy', 'yz', 'xx', 'yy', 'zz']"
4,1.313768,17.926967,0.090857,5,0.53397,0.446212,0.393012,0.376139,0.367538,1.0,...,138.0,151.0,[2],218.0,/Users/kevinmaikjablonka/Dropbox (LSMO)/proj73...,False,True,True,False,"['xz', 'xy', 'yz']"


In [27]:
from pathlib import Path

In [29]:
df['refcode']= [Path(p).stem for p in df['name'].values]

In [16]:
import ast
import numpy as np
ast.literal_eval(df['channel_indices'][0])

[(0, 3), (8, 10), (9, 11)]

In [17]:
distances = []
min_distances = []
max_distannces = []
mean_distances = []

for i, row in df.iterrows(): 
    s = Structure.from_file(row['name'])
    channel_indices = ast.literal_eval(row['channel_indices'])
    
    distances_in_mof = []
    
    for channel_pair in channel_indices:
        d = s.get_distance(channel_pair[0], channel_pair[1])
        distances_in_mof.append(d)
        
    distances.append(distances_in_mof)
    min_distances.append(np.min(distances_in_mof))
    max_distannces.append(np.max(distances_in_mof))
    mean_distances.append(np.mean(distances_in_mof))

        



In [19]:
df['min_distances'] = min_distances

In [21]:
df['max_distances'] = max_distannces

In [22]:
df['mean_distances'] = mean_distances

In [30]:
df.to_csv('core_data.csv', index=False)

In [7]:
s  = Structure.from_file(df['name'][0])

In [4]:
len(df)

1305

... which is maybe a bit over one-tenth of the full CoRE-ASR database.

In [5]:
len(df[df['clashing']==True]['name'].values)

0

In [6]:
len(df[df['unbound']==True]['name'].values)

133

## Drop those with _charged or _ion in name

In [7]:
df_cleaned = df[[(('ion' not in v) and ('charged' not in v)) for v in  df['name'].values]]

In [8]:
len(df_cleaned)

1254

## We do not have patience for too many f electrons

In [9]:
df_cleaned['is_lanthanid'] = (df_cleaned['metal_mendeelev_number'] >= 57) & (df_cleaned['metal_mendeelev_number'] <= 71)
df_cleaned['is_actinid'] = (df_cleaned['metal_mendeelev_number'] >= 89) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [10]:
df_clean = df_cleaned[(~df_cleaned['is_lanthanid']) & (~df_cleaned['is_actinid'])] 

## Now, we use a quite crude heuristic, we require that 'C' and 'H' is in the MOF

From my experience, the constraint for having H in the structure is actually more restrictive. 


Also, let's just read and write the structures with pymatgen to make sure that this does not hinder our workflow too much.

In [26]:
import numpy as np

In [27]:
for i, row in df_clean.iterrows():
    s = Structure.from_file(row['name'])
    if (6 in s.atomic_numbers) and (1 in s.atomic_numbers):
        if not any(57 < np.array(s.atomic_numbers)): 
            s.to('cif', os.path.join('rewritten_structures2', Path(row['name']).name))



Let's see how many we got ..

In [28]:
from glob import glob

In [29]:
all_cifs = glob('rewritten_structures2/*.cif')

In [30]:
len(all_cifs)

496

### Let's batch the structures up to make stuff a bit more controlled.

In [31]:
N = 100
import shutil
def move_files(abs_dirname):
    """Move files into subdirectories. https://gist.github.com/zupo/5849843"""

    files = [os.path.join(abs_dirname, f) for f in os.listdir(abs_dirname)]

    i = 0
    curr_subdir = None

    for f in files:
        # create new subdir if necessary
        if i % N == 0:
            subdir_name = os.path.join(abs_dirname, '{}'.format(int(i / N + 1)))
            os.mkdir(subdir_name)
            curr_subdir = subdir_name

        # move file to current dir
        f_base = os.path.basename(f)
        shutil.move(f, os.path.join(subdir_name, f_base))
        i += 1

In [32]:
move_files('rewritten_structures2')

In [2]:
ak

NameError: name 'a' is not defined