# Converting data

Three general data types can be distinguished:

*   ```pandas.DataFrame```
*   ```numpy.array```
*   ```list```

It may be necessary to convert to and from any of these, depending on the selected featurization method. However, most of the time the selected featurization method will convert from `list` to either `numpy.array` or `pandas.DataFrame`.

In [None]:
import pandas as pd
import numpy as np

## Dataframes
Converting to pandas DataFrames

In [None]:
# NOTE: the `...` entries below stand in for further arrays / column names.

# 1D arrays -> DataFrame (stack as rows, transpose so each array is a column)
Dataframe = pd.DataFrame(
    np.vstack((array_1, array_2, ...)).T,
    columns=['Column 1', 'Column 2', '...'],
)

# 2D array -> DataFrame
Dataframe = pd.DataFrame(array_2D, columns=['Column 1', 'Column 2', '...'])

# List -> DataFrame
Dataframe = pd.DataFrame(list_1, columns=['Column 1', 'Column 2', '...'])

# Pymatgen -> DataFrame
# CAREFUL: Only 1 entry can be converted at a time
Dataframe = Structure.as_dataframe()

## Arrays
Converting to numpy arrays

In [None]:
# DataFrame -> nD array
Array = Dataframe.to_numpy()

# List -> array
Array = np.array(list_1)

# Featurization

## CBFV
*Composition-Based Feature Vectors*

Input Format: chemical formula `pandas.DataFrame`

Output Format: `pandas.DataFrame`

Uses a lookup table to generate compositional vectors based on "chemical" formulae.

In [None]:
!pip install CBFV
from CBFV import composition
import pandas as pd

Function that generates the CBFV features and targets (which CBFV returns separately as X and y) and combines them into a single `pandas.DataFrame`

In [None]:
def gen_CBFV(df,form_col,target_col):
    """Generate CBFV features for df and return them indexed by the target.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is copied, so the caller's frame is not modified.
    form_col : str
        Name of the column in df that holds the chemical formulae.
    target_col : str
        Name of the column in df that holds the target values.

    Returns
    -------
    pandas.DataFrame
        The feature columns produced by ``composition.generate_features``
        (element property set 'oliynyk'), with the target values as index
        under the name target_col.
    """
    # generate_features is handed the data under the column names
    # 'formula' and 'target', so rename the caller's columns accordingly.
    renamed = df.copy().rename(
        columns={form_col: 'formula', target_col: 'target'}
    )

    X, y, formulae, skipped = composition.generate_features(
        renamed,
        elem_prop='oliynyk',
        drop_duplicates=False,
        extend_features=False,
        sum_feat=True,
    )
    print(f'\n the skipped elements are: {skipped[:15]}...')

    # Re-attach the targets to the features and use them as the index.
    df_CBFV = X.copy()
    df_CBFV[target_col] = y
    return df_CBFV.set_index(target_col)

Identify chemical formula column

In [None]:
# Name of the column that holds the chemical formulae
parse_form_str = 'COMP_PARSED FORMULA'

# CBFV features (composition-based feature vector):
# copy the formula column and move the index into a regular column
df_temp = df_import[parse_form_str].copy().reset_index()

df_temp.head()

Execute the CBFV function

In [None]:
# Build the CBFV feature table, indexed by the 'group' column
feature_vectors = gen_CBFV(
    df=df_temp,
    form_col=parse_form_str,
    target_col='group',
)

## Dscribe

### SOAP
*Smooth Overlap of Atomic Positions*

Input Format: **ASE** - `list`

Output Format: `numpy.array`

Generates large vectors to provide coordinates of the Atomic Positions (~150'000+)

In [None]:
!pip install dscribe
from dscribe.descriptors import SOAP
import numpy as np

function to generate SOAP vector

In [None]:
def gen_SOAP(structures, per=False, rc=5, nm=8, lm=8):
    """Compute SOAP descriptors for a collection of structures.

    Parameters
    ----------
    structures : iterable
        Structures exposing ``get_chemical_symbols()``. The collection is
        iterated once to gather the species and is then passed to
        ``SOAP.create``.
    per : bool
        Forwarded to SOAP as ``periodic``.
    rc, nm, lm
        Forwarded to SOAP as ``rcut``, ``nmax`` and ``lmax``.

    Returns
    -------
    The result of ``SOAP.create(structures, n_jobs=1)`` with
    ``average="outer"`` and ``sparse=False``.
    """
    # Every chemical symbol that occurs in at least one structure
    species = {
        symbol
        for structure in structures
        for symbol in structure.get_chemical_symbols()
    }

    # NOTE(review): newer dscribe releases appear to have renamed these
    # keywords to r_cut / n_max / l_max -- confirm against the installed
    # dscribe version.
    descriptor = SOAP(
        species=species,
        periodic=per,
        rcut=rc,
        nmax=nm,
        lmax=lm,
        average="outer",
        sparse=False,
    )

    return descriptor.create(structures, n_jobs=1)

generate feature vectors

In [None]:
# SOAP descriptors with the default gen_SOAP settings
feature_vectors = gen_SOAP(structures=structures)

### CM
*Coulomb Matrix*

Input Format: `ase.structure`

Output Format: `numpy.array`

Generates vectors which describe coulombic interactions (~4000+)

In [None]:
!pip install dscribe
from dscribe.descriptors import CoulombMatrix
import numpy as np

function to generate CM vector

In [None]:
def gen_CM(structures, an=(1, 8), rc=6.0, nm=8, lm=6, n_atoms_max=64):
    """Compute Coulomb-matrix descriptors for a collection of structures.

    Parameters
    ----------
    structures : iterable
        Structures passed unchanged to ``CoulombMatrix.create``.
    an, rc, nm, lm
        Unused. They are kept only so that existing calls keep working;
        the Coulomb matrix descriptor is not configured by them here.
        The default of ``an`` is a tuple so that no mutable object is
        shared between calls.
    n_atoms_max : int
        Forwarded to ``CoulombMatrix``; defaults to the previously
        hard-coded value of 64. Presumably this must be at least the atom
        count of the largest structure -- confirm in the dscribe docs.

    Returns
    -------
    The result of ``CoulombMatrix.create(structures, n_jobs=1)``.
    """
    # The species set and the rcut/nmax/lmax locals computed by the earlier
    # version were never used; they are dropped so `structures` is not
    # needlessly iterated before featurization.
    cm = CoulombMatrix(n_atoms_max=n_atoms_max)
    return cm.create(structures, n_jobs=1)

generate feature vectors

In [None]:
# Coulomb-matrix descriptors with the default gen_CM settings
feature_vectors = gen_CM(structures=structures)

## Matminer


### GSF
*Global Symmetry Features*

Input Format: `pymatgen.structure`

Output Format: `list` of feature vectors, one per structure (convert with `np.array` if an array is needed)

Generates vectors which describe basic symmetry features (~5)

In [None]:
import pandas as pd
import os
import pymatgen
from pymatgen.core import Structure
from matminer.featurizers.structure.symmetry import GlobalSymmetryFeatures

function to generate GSF vector

In [None]:
def gen_GSF(structures):
    """Compute global symmetry features for each structure.

    Parameters
    ----------
    structures : iterable
        Structures accepted by ``GlobalSymmetryFeatures.featurize``.

    Returns
    -------
    list
        One entry per structure, in input order, each being the return
        value of ``featurize`` for that structure.
    """
    # desired_features=None leaves the feature selection to matminer.
    # The feature names expected here are: 'spacegroup_num',
    # 'crystal_system', 'crystal_system_int', 'is_centrosymmetric',
    # 'n_symmetry_ops'.
    gsf = GlobalSymmetryFeatures(desired_features=None)

    return [gsf.featurize(structure) for structure in structures]

generate feature vectors

In [None]:
# Global symmetry features for every structure
feature_vectors = gen_GSF(structures=structures)

### JarvisCFID
*Jarvis Classical Force-Field Inspired Descriptors*

Input Format: `pymatgen.structure`

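Output Format: `list` of feature vectors, one per successfully featurized structure (convert with `np.array` if an array is needed)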

Generates vectors which describe a composite of many features (~1500)

In [None]:
import pandas as pd
import os
import pymatgen
from pymatgen.core import Structure
from matminer.featurizers.structure import JarvisCFID

function to generate CFID vector

In [None]:
def gen_CFID(structures):
    """Compute JarvisCFID descriptors for each structure.

    Structures whose featurization raises an exception are skipped and
    counted; the count is printed at the end. Because failures are
    skipped, the returned list can be shorter than ``structures`` and its
    positions then no longer line up with the input.

    Parameters
    ----------
    structures : iterable
        Structures accepted by ``JarvisCFID.featurize``.

    Returns
    -------
    list
        The feature vectors of all structures that were featurized
        successfully, in input order.
    """
    jarvis = JarvisCFID()

    array_cfid = []
    n_failed = 0
    for structure in structures:
        try:
            cfid_feature = jarvis.featurize(structure)
        except Exception:
            # Best-effort: skip structures that cannot be featurized.
            # `Exception` (not a bare except) so that KeyboardInterrupt
            # and SystemExit still stop a long-running loop.
            n_failed += 1
        else:
            array_cfid.append(cfid_feature)
    print(f"{n_failed} Operations failed!")
    return array_cfid

generate feature vectors

In [None]:
# This is the JarvisCFID section, so call gen_CFID here (the earlier
# gen_GSF call looks like a copy-paste from the GSF section).
feature_vectors = gen_CFID(structures)

# Data Cleanup

### Clean up unknown values

Drop the rows of a `pandas.DataFrame` that contain NaN values which were created during feature generation

In [None]:
# Keep only the rows without any NaN entry
feat_dataframe = feat_dataframe.dropna(axis="index", how='any')

Drop the rows of a `numpy.array` that contain NaN values which were created during feature generation

In [None]:
# Keep only the rows in which every entry is a non-NaN value
feat_array = feat_array[(~np.isnan(feat_array)).all(axis=1)]

### restrict features and CTE to overlapping entries

In [None]:
# NOTE: material_group_select is defined in the following cell; run that
# cell before this one.
print(df_import.shape, feat_CBFV.shape)
df_labeled = material_group_select(df=df_import, group_list=feat_CBFV.index)

Comparing two sets of data and adjusting them to the same length

In [None]:
def material_group_select(df,group_list):
    """Return a copy of df restricted to the rows labelled in group_list.

    Rows are selected by index label via ``.loc``. The number of rows
    removed by the restriction is reported through ``compare_rows``.
    """
    selected = df.copy().loc[group_list]

    print('By restricting dataframe to selected material list:')
    compare_rows(df, selected)
    return selected

def compare_rows(df_before,df_after):
    """Print how many rows df_after has fewer than df_before.

    Only the first entry of each argument's ``shape`` is read.
    """
    rows_before = df_before.shape[0]
    rows_after = df_after.shape[0]
    print(f'\t you deleted {rows_before - rows_after} entries')
