In [1]:
%matplotlib inline

In [2]:
import os
import sys

In [3]:
# Parenthesised form prints the same text on Python 2.7 and also runs on Python 3.
print(sys.version)

2.7.14 (default, Jan 17 2018, 15:13:18) 
[GCC 4.2.1 Compatible Apple LLVM 9.0.0 (clang-900.0.39.2)]


In [34]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm_notebook

In [5]:
from utils import get_xyz_data, get_size

Pymatgen will drop Py2k support from v2019.1.1. Pls consult the documentation
at https://www.pymatgen.org for more details.
  at https://www.pymatgen.org for more details.""")


In [6]:
from properties import get_element_properties
from properties import ElementExtended

In [7]:
# Root data directory (relative to the notebook's working directory).
# The batch sub-directories are read from it and the extended CSVs are written to it.
DATA_DIR = './data'

#### Get element properties from files (as a sanity check)

In [8]:
# Fetch the per-element property table via the project helper, pointing it at DATA_DIR.
elem_properties = get_element_properties(DATA_DIR)

In [9]:
# Display the table so the loaded values can be checked by eye.
elem_properties

Unnamed: 0,element,ea,ip,rs_max,rp_max,rd_max
0,In,-0.3125,-5.5374,1.09,1.39,1.94
1,Ga,-0.1081,-5.8182,0.99,1.33,2.16
2,Al,-0.2563,-5.78,1.13,1.5,3.11
3,O,-0.225633,-5.711867,1.07,1.406667,2.403333


#### Define the function that processes all .xyz files (test and train)

In [35]:
def process_xyz_files(batch='train', data_dir=None):
    """
    Process each structure's geometry.xyz file and calculate average properties.

    Every numerically named entry of ``<data_dir>/<batch>`` is treated as one
    structure id whose ``geometry.xyz`` is loaded with ``get_xyz_data``.
    Entries whose name is not purely digits (for example ``.DS_Store`` or
    ``.ipynb_checkpoints``) are skipped, because they are not structures and
    would otherwise make ``int()`` raise.

    Parameters
    ----------
    batch : str
        Name of the sub-directory to scan; this notebook uses 'train' and 'test'.
    data_dir : str, optional
        Root data directory. Defaults to the notebook-level ``DATA_DIR``.

    Returns
    -------
    pandas.DataFrame
        One row per structure, sorted by 'id', with the columns
        'id', 'avg_elec', 'avg_mass', 'o_cnt', 'avg_ea', 'avg_ip',
        'avg_rs_max', 'avg_rp_max', 'avg_rd_max'.
        'avg_mass' is averaged over all atoms; the other averages exclude
        the rows whose 'element' is 'O'.
    """
    if data_dir is None:
        data_dir = DATA_DIR
    batch_dir = os.path.join(data_dir, batch)

    # os.listdir returns everything in the folder, including hidden files;
    # keep only the entries that can be structure ids.
    structure_names = [name for name in os.listdir(batch_dir) if name.isdigit()]

    columns = ['id', 'avg_elec', 'avg_mass', 'o_cnt',
               'avg_ea', 'avg_ip',
               'avg_rs_max', 'avg_rp_max', 'avg_rd_max']
    summary_result = []

    for name in tqdm_notebook(structure_names):
        # Load .xyz data (the lattice vectors are not used for these averages)
        fname = os.path.join(batch_dir, name, 'geometry.xyz')
        pos, _latvec, natoms = get_xyz_data(fname)

        # Properties including O
        avg_mass = pos['atomic_mass'].mean()

        # Properties excluding O; a separate name keeps the full table intact
        non_oxygen = pos[pos['element'] != 'O']

        # NOTE(review): presumably the number of O atoms -- confirm in utils.get_size
        o_cnt = get_size(natoms, 'O')

        summary_result.append([
            int(name),
            non_oxygen['electroneg'].mean(),
            avg_mass,
            o_cnt,
            non_oxygen['ea'].mean(),
            non_oxygen['ip'].mean(),
            non_oxygen['rs_max'].mean(),
            non_oxygen['rp_max'].mean(),
            non_oxygen['rd_max'].mean(),
        ])

    summary_result_df = pd.DataFrame(summary_result, columns=columns)
    return summary_result_df.sort_values('id')

#### Run the processing for the train and test sets

In [36]:
# Build one feature table per batch; each call scans the matching sub-directory of DATA_DIR.
data_train = process_xyz_files(batch='train')
data_test = process_xyz_files(batch='test')







In [18]:
# Preview the first rows of the training feature table.
data_train.head()

Unnamed: 0,id,avg_elec,avg_mass,o_cnt,avg_ea,avg_ip,avg_rs_max,avg_rp_max,avg_rd_max
1067,1,1.685,26.803475,48,-0.200725,-5.794325,1.0775,1.43625,2.75375
2327,2,1.685,26.803475,48,-0.200725,-5.794325,1.0775,1.43625,2.75375
2219,3,1.6475,23.597865,24,-0.228512,-5.787162,1.10375,1.468125,2.931875
2201,4,1.6525,29.175902,18,-0.27035,-5.71935,1.12,1.4725,2.8175
2342,5,1.79875,44.25309,48,-0.18475,-5.7129,1.0275,1.3525,2.0775


In [19]:
# Preview the first rows of the test feature table.
data_test.head()

Unnamed: 0,id,avg_elec,avg_mass,o_cnt,avg_ea,avg_ip,avg_rs_max,avg_rp_max,avg_rd_max
260,1,1.762188,40.483793,48,-0.20615,-5.714512,1.050625,1.3825,2.2625
578,2,1.66,24.666402,48,-0.21925,-5.78955,1.095,1.4575,2.8725
543,3,1.671667,29.09745,18,-0.240967,-5.745933,1.1,1.453333,2.756667
539,4,1.6975,27.872011,48,-0.191462,-5.796712,1.06875,1.425625,2.694375
583,5,1.763125,39.920105,48,-0.199762,-5.723288,1.0475,1.380625,2.269375


#### Write out extended test and train data files

In [20]:
# Write the training features to DATA_DIR/train_ext.csv; index=False leaves the row index out of the file.
data_train.to_csv(os.path.join(DATA_DIR, 'train_ext.csv'), index=False)

In [21]:
# Write the test features to DATA_DIR/test_ext.csv; index=False leaves the row index out of the file.
data_test.to_csv(os.path.join(DATA_DIR, 'test_ext.csv'), index=False)