# Get dataset

In [99]:
from pymatgen.ext.matproj import MPRester
from matminer.featurizers.conversions import StrToComposition
import scipy
from matminer.featurizers.composition import ElementProperty, ElementFraction, Meredig, Stoichiometry
import sklearn
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.metrics import mean_squared_error as mse

#from materials_project_api_key import api_key as api_key
MP_API_KEY="zJTA8MUZaSc3lxtI"

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import scipy

pd.set_option('display.max_columns', None)

import sys
from atomic_number import atomic_number as an

In [161]:
!jupyter nbconvert --to script Main.ipynb

[NbConvertApp] Converting notebook Main.ipynb to script
[NbConvertApp] Writing 6431 bytes to Main.py


In [100]:
properties = ['material_id',
 'formula','spacegroup.symbol',
 'structure',
'decomposes_to',
'elasticity',
 'elements',
 'formula_anonymous',
 'chemsys',
'task_ids',
 'last_updated',
 'created_at',
 "band_gap",
'energy_per_atom',
'formation_energy_per_atom',
'e_above_hull',
 'nsites',
 'nelements',
 'volume',
 'density']

elements = ["Zn"]
data = []
for e in elements:
    criteria = {"elements": {"$all": [e]},'band_gap': {'$gt': 0.00}}
    with MPRester(MP_API_KEY) as m:  
        temp = m.query(criteria=criteria, properties=properties)
        data.append(temp)
data = [item for sublist in data for item in sublist]
data = pd.DataFrame(data)

100%|██████████| 2687/2687 [00:22<00:00, 116.93it/s]


In [101]:
# Get atom counts
crystal_atom_counts = {}
for row in tqdm(data.formula):
    for key in row.keys():
        crystal_atom_counts[key] = crystal_atom_counts[key] + 1 if key in crystal_atom_counts.keys() else 1
crystal_atom_counts = pd.Series({ key:crystal_atom_counts[key] for key in sorted(crystal_atom_counts.keys()) })

# Only choose structures with elements that are well represented in the dataset
n_crystals_cutoff = (len(data)*0.04)
elements = [key for key in crystal_atom_counts.keys() if crystal_atom_counts[key] >= n_crystals_cutoff]
print(f"Allowed Elements: {elements}")
print(f"Number of Elements: {len(elements)}")

# Lets now filter the structures
mask = [row[1].name for row in data.iterrows() if np.all(np.isin(list(row[1].formula.keys()), elements))]
filtered = data.loc[mask]

100%|██████████| 2687/2687 [00:00<00:00, 293599.09it/s]


Allowed Elements: ['As', 'Ba', 'C', 'Cu', 'F', 'Fe', 'H', 'K', 'Li', 'Mn', 'N', 'Na', 'O', 'P', 'S', 'Se', 'Si', 'Sn', 'V', 'Zn']
Number of Elements: 20


# Featurize

In [102]:
import numpy as np
import sys

In [103]:
from get_custom_features3 import get_custom_features as get_custom_features

In [104]:
X_desc_custom = get_custom_features(filtered.structure.values,filtered)

In [105]:
columns=np.array(X_desc_custom.columns)
X_desc_custom=np.array(X_desc_custom)
X_desc_custom=pd.DataFrame(X_desc_custom)
X_desc_custom.columns = columns

In [110]:
pip install openpyxl

Note: you may need to restart the kernel to use updated packages.


In [111]:
from get_features_formula1 import get_features_formula

In [112]:
X_desc_form= get_features_formula(filtered.formula,filtered['spacegroup.symbol'].values)
X_desc_form = X_desc_form.loc[(X_desc_form!=0).any(1), (X_desc_form!=0).any(0)]

  X_desc_form = X_desc_form.loc[(X_desc_form!=0).any(1), (X_desc_form!=0).any(0)]


In [116]:
X_desc_form_cus=pd.concat((X_desc_custom,X_desc_form),
    axis=1)

In [117]:
from get_features_structure1 import get_features_structure

In [118]:
X_desc_struc = get_features_structure(filtered.structure.values)
X_desc_struc = X_desc_struc.loc[(X_desc_struc!=0).any(1), (X_desc_struc!=0).any(0)]

  X_desc_struc = X_desc_struc.loc[(X_desc_struc!=0).any(1), (X_desc_struc!=0).any(0)]


In [119]:
X_desc_struc_cus=pd.concat([X_desc_custom,X_desc_struc],axis=1)

In [120]:
semiconductors = StrToComposition().featurize_dataframe(filtered, "formula")

StrToComposition:   0%|          | 0/1014 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
ep_feat_mag = ElementProperty.from_preset(preset_name="magpie")
X_desc_mag = ep_feat_mag.featurize_dataframe(semiconductors, col_id="composition")
X_desc_mag = X_desc_mag.loc[(X_desc_mag!=0).any(1), (X_desc_mag!=0).any(0)]
X_desc_mag = X_desc_mag.dropna(how='any',axis=1)

In [None]:
ep_feat_mat = ElementProperty.from_preset(preset_name="matminer")
X_desc_mat = ep_feat_mat.featurize_dataframe(semiconductors, col_id="composition")
X_desc_mat = X_desc_mat.loc[(X_desc_mat!=0).any(1), (X_desc_mat!=0).any(0)]
X_desc_mat = X_desc_mat.dropna(how='any',axis=1)

In [None]:
ep_feat_deml = ElementProperty.from_preset(preset_name="deml")
X_desc_deml = ep_feat_deml.featurize_dataframe(semiconductors, col_id="composition")
X_desc_deml = X_desc_deml.loc[(X_desc_deml!=0).any(1), (X_desc_deml!=0).any(0)]
X_desc_deml = X_desc_deml.dropna(how='any',axis=1)

In [None]:
ep_feat_deml = ElementProperty.from_preset(preset_name="deml")
X_desc_deml = ep_feat_deml.featurize_dataframe(semiconductors, col_id="composition")
X_desc_deml = X_desc_deml.loc[(X_desc_deml!=0).any(1), (X_desc_deml!=0).any(0)]
X_desc_deml = X_desc_deml.dropna(how='any',axis=1)

In [None]:
ep_feat_schol = ElementProperty.from_preset(preset_name="matscholar_el")
X_desc_schol = ep_feat_schol.featurize_dataframe(semiconductors, col_id="composition")
X_desc_schol = X_desc_schol.loc[(X_desc_schol!=0).any(1), (X_desc_schol!=0).any(0)]
X_desc_schol = X_desc_schol.dropna(how='any',axis=1)

In [None]:
ep_feat_megnet = ElementProperty.from_preset(preset_name="megnet_el")
X_desc_megnet = ep_feat_megnet.featurize_dataframe(semiconductors, col_id="composition")
X_desc_megnet = X_desc_megnet.loc[(X_desc_megnet!=0).any(1), (X_desc_megnet!=0).any(0)]
X_desc_megnet = X_desc_megnet.dropna(how='any',axis=1)

In [132]:
filtered.structure.values[0]

Structure Summary
Lattice
    abc : 3.205217815824846 3.205217815824846 5.516927
 angles : 90.0 90.0 119.99999619840808
 volume : 49.08434295009371
      A : 1.602609 -2.7758 0.0
      B : 1.602609 2.7758 0.0
      C : 0.0 0.0 5.516927
    pbc : True True True
PeriodicSite: Zn (0.0000, 0.0000, 2.7648) [0.0000, 0.0000, 0.5011]
PeriodicSite: Zn (0.0000, 0.0000, 0.0063) [0.0000, 0.0000, 0.0011]
PeriodicSite: O (1.6026, -0.9253, 4.8883) [0.6667, 0.3333, 0.8861]
PeriodicSite: O (1.6026, 0.9253, 2.1298) [0.3333, 0.6667, 0.3861]

In [154]:
type(filtered.structure.values[0])
filtered.structure.values[0]

Structure Summary
Lattice
    abc : 3.205217815824846 3.205217815824846 5.516927
 angles : 90.0 90.0 119.99999619840808
 volume : 49.08434295009371
      A : 1.602609 -2.7758 0.0
      B : 1.602609 2.7758 0.0
      C : 0.0 0.0 5.516927
    pbc : True True True
PeriodicSite: Zn (0.0000, 0.0000, 2.7648) [0.0000, 0.0000, 0.5011]
PeriodicSite: Zn (0.0000, 0.0000, 0.0063) [0.0000, 0.0000, 0.0011]
PeriodicSite: O (1.6026, -0.9253, 4.8883) [0.6667, 0.3333, 0.8861]
PeriodicSite: O (1.6026, 0.9253, 2.1298) [0.3333, 0.6667, 0.3861]

In [158]:
from pymatgen.analysis.graphs import StructureGraph
sg = StructureGraph(filtered.structure.values[0],graph_data=None)
sg
#sg.draw_graph_to_file = ('graph_test.png')

AttributeError: 'NoneType' object has no attribute 'get'

In [126]:
def draw_graphs(structures):
    from pymatgen.analysis.graphs import StructureGraph
    number=0
    for i in structures:
        sg = StructureGraph(i)
        draw_graph_to_file = ('graph'+str(number)+'.png')
        number=number+1

    

Structure Summary
Lattice
    abc : 3.205217815824846 3.205217815824846 5.516927
 angles : 90.0 90.0 119.99999619840808
 volume : 49.08434295009371
      A : 1.602609 -2.7758 0.0
      B : 1.602609 2.7758 0.0
      C : 0.0 0.0 5.516927
    pbc : True True True
PeriodicSite: Zn (0.0000, 0.0000, 2.7648) [0.0000, 0.0000, 0.5011]
PeriodicSite: Zn (0.0000, 0.0000, 0.0063) [0.0000, 0.0000, 0.0011]
PeriodicSite: O (1.6026, -0.9253, 4.8883) [0.6667, 0.3333, 0.8861]
PeriodicSite: O (1.6026, 0.9253, 2.1298) [0.3333, 0.6667, 0.3861]

# Feature importances

In [None]:
from feature_importances_rf_reg import feature_importances_rf_reg

In [None]:
rf_reg_formula = feature_importances_rf_reg(X_desc_form_cus.iloc[:,13:],filtered.band_gap.values,500,0.25)

In [None]:
rf_reg_struc = feature_importances_reg(X_desc_struc_cus.iloc[:,13:],filtered.band_gap.values,500,0.25)

# Train models