# Get dataset

In [342]:
from pymatgen.ext.matproj import MPRester
from matminer.featurizers.conversions import StrToComposition
import scipy
from matminer.featurizers.composition import ElementProperty, ElementFraction, Meredig, Stoichiometry
import sklearn
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.metrics import mean_squared_error as mse

#from materials_project_api_key import api_key as api_key
import os

# Never commit credentials to a notebook: the key is read from the MP_API_KEY
# environment variable. The key that used to be hard-coded here has been
# exposed and should be revoked/regenerated on the Materials Project dashboard.
# When the variable is unset this is None; MPRester(None) is then expected to
# fall back to pymatgen's own PMG_MAPI_KEY setting (confirm for your version).
MP_API_KEY = os.environ.get("MP_API_KEY")

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import scipy

# Remove the column limit so wide DataFrames are displayed without elided columns.
pd.set_option('display.max_columns', None)

import sys
from atomic_number import atomic_number as an

In [343]:
# Fields requested from the Materials Project for every matching entry.
properties = [
    'material_id',
    'formula',
    'spacegroup.symbol',
    'structure',
    'decomposes_to',
    'elasticity',
    'elements',
    'formula_anonymous',
    'chemsys',
    'task_ids',
    'last_updated',
    'created_at',
    "band_gap",
    'energy_per_atom',
    'formation_energy_per_atom',
    'e_above_hull',
    'nsites',
    'nelements',
    'volume',
    'density',
]

# Each listed element is queried separately and the results are concatenated.
elements = ["Zn"]

# Raw records are gathered in a flat list; `data` only ever names the DataFrame.
records = []
# One REST session serves all queries; the context manager closes it afterwards.
with MPRester(MP_API_KEY) as m:
    for e in elements:
        # Entries that contain element `e` and have a band gap strictly above zero.
        criteria = {"elements": {"$all": [e]}, 'band_gap': {'$gt': 0.00}}
        records.extend(m.query(criteria=criteria, properties=properties))
data = pd.DataFrame(records)

100%|██████████| 2687/2687 [00:28<00:00, 95.91it/s] 


In [344]:
# Count, for every element symbol, how many entries have it in their formula.
crystal_atom_counts = {}
for formula in tqdm(data.formula):
    for symbol in formula.keys():
        crystal_atom_counts[symbol] = crystal_atom_counts.get(symbol, 0) + 1
# Expose the counts as a Series ordered alphabetically by element symbol.
crystal_atom_counts = pd.Series(dict(sorted(crystal_atom_counts.items())))

# Only choose structures with elements that are well represented in the dataset:
# an element is kept if it occurs in at least this fraction of all entries.
MIN_ELEMENT_FRACTION = 0.04
n_crystals_cutoff = len(data) * MIN_ELEMENT_FRACTION
elements = [symbol for symbol, count in crystal_atom_counts.items() if count >= n_crystals_cutoff]
print(f"Allowed Elements: {elements}")
print(f"Number of Elements: {len(elements)}")

# Keep the index labels of the entries made up exclusively of allowed elements.
# Iterating the single `formula` column avoids building a Series per row, which
# is what DataFrame.iterrows() does.
allowed_symbols = set(elements)
mask = [label for label, formula in data.formula.items() if allowed_symbols.issuperset(formula.keys())]
filtered = data.loc[mask]

100%|██████████| 2687/2687 [00:00<00:00, 669762.57it/s]


Allowed Elements: ['As', 'Ba', 'C', 'Cu', 'F', 'Fe', 'H', 'K', 'Li', 'Mn', 'N', 'Na', 'O', 'P', 'S', 'Se', 'Si', 'Sn', 'V', 'Zn']
Number of Elements: 20


# Featurize

In [345]:
import numpy as np
import sys

In [346]:
from get_features import get_custom_features

custom_raw = get_custom_features(filtered.structure.values, filtered)
# Rebuild the frame from its raw array: the column labels are kept, while the
# row index is replaced by the default 0..n-1 index of a freshly built frame.
columns = np.array(custom_raw.columns)
X_desc_custom = pd.DataFrame(np.array(custom_raw), columns=columns)

In [347]:
from get_features import get_features_formula

X_desc_form = get_features_formula(filtered.formula, filtered['spacegroup.symbol'].values)
# Drop rows and columns that are zero everywhere. `axis` is passed by keyword:
# the positional form (`.any(1)`) triggers a deprecation warning and is rejected
# by pandas 2.
nonzero_form = X_desc_form != 0
X_desc_form = X_desc_form.loc[nonzero_form.any(axis=1), nonzero_form.any(axis=0)]
# NOTE(review): pd.concat(axis=1) aligns on the row index, so any row dropped
# above shows up as NaN in the formula columns — confirm both frames share the
# same index.
X_desc_form_cus = pd.concat((X_desc_custom, X_desc_form), axis=1)

  X_desc_form = X_desc_form.loc[(X_desc_form!=0).any(1), (X_desc_form!=0).any(0)]


In [348]:
from get_features import get_features_structure

X_desc_struc = get_features_structure(filtered.structure.values)
# Drop rows and columns that are zero everywhere. `axis` is passed by keyword:
# the positional form (`.any(1)`) triggers a deprecation warning and is rejected
# by pandas 2.
nonzero_struc = X_desc_struc != 0
X_desc_struc = X_desc_struc.loc[nonzero_struc.any(axis=1), nonzero_struc.any(axis=0)]
# NOTE(review): pd.concat(axis=1) aligns on the row index, so any row dropped
# above shows up as NaN in the structure columns — confirm both frames share
# the same index.
X_desc_struc_cus = pd.concat([X_desc_custom, X_desc_struc], axis=1)

  X_desc_struc = X_desc_struc.loc[(X_desc_struc!=0).any(1), (X_desc_struc!=0).any(0)]


In [349]:
# Run matminer's StrToComposition over the "formula" column; the cells below
# read the result from the "composition" column of the returned frame.
formula_to_composition = StrToComposition()
semiconductors = formula_to_composition.featurize_dataframe(X_desc_custom, "formula")

StrToComposition:   0%|          | 0/1014 [00:00<?, ?it/s]

In [350]:
from get_features import get_features_mag

# Featurize the "composition" column with the project-local helper. The progress
# bar shows it runs matminer's ElementProperty featurizer (presumably the Magpie
# preset — confirm in get_features.py).
X_desc_mag = get_features_mag(semiconductors, "composition")
X_desc_mag

ElementProperty:   0%|          | 0/1014 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
from get_features import get_features_mat

# Featurize the "composition" column of `semiconductors` with the project-local
# get_features_mat helper and display the result.
X_desc_mat = get_features_mat(semiconductors, "composition")
X_desc_mat

In [None]:
from get_features import get_features_deml

# Featurize the "composition" column of `semiconductors` with the project-local
# get_features_deml helper and display the result.
X_desc_deml = get_features_deml(semiconductors, "composition")
X_desc_deml

In [None]:
from get_features import get_features_schol

# Featurize the "composition" column of `semiconductors` with the project-local
# get_features_schol helper and display the result.
X_desc_schol = get_features_schol(semiconductors, "composition")
X_desc_schol

In [None]:
from get_features import get_features_megnet

# Featurize the "composition" column of `semiconductors` with the project-local
# get_features_megnet helper and display the result.
X_desc_megnet = get_features_megnet(semiconductors, "composition")
X_desc_megnet

In [None]:
# Peek at the first entry of the "composition" column.
semiconductors["composition"].iloc[0]

In [None]:

from matminer.featurizers.composition import IonProperty

# Use the IonProperty featurizer itself. The previous code called the unit-test
# method IonFeaturesTest.test_ionic, which is not a featurizer, so the
# featurize_dataframe call below could not work.
# IonProperty(fast=True) is available if the default exhaustive oxidation-state
# search turns out to be too slow.
ift = IonProperty()
X_desc_ift = ift.featurize_dataframe(semiconductors, col_id='composition')
X_desc_ift

In [None]:
# Peek at the first entry of the "structure" column of the filtered dataset.
filtered["structure"].iloc[0]

In [None]:
from matminer.featurizers.structure.bonding import BagofBonds
from matminer.featurizers.structure.matrix import CoulombMatrix, SineCoulombMatrix, OrbitalFieldMatrix

# BagofBonds is configured with a Coulomb-matrix featurizer *instance*; the
# structures are not a constructor argument. flatten=False keeps the matrix
# form that the bag construction works on.
bob_c = BagofBonds(coulomb_matrix=CoulombMatrix(flatten=False))
# The structures are passed to fit(), which has to be called (not merely
# referenced) before the featurizer can be used. `new` holds whatever fit()
# returns — presumably the fitted featurizer itself, per the usual convention.
new = bob_c.fit(semiconductors.structure.values)
new

In [None]:
from matminer.featurizers.structure.bonding import GlobalInstabilityIndex

# `citations` is an instance method: build the featurizer and call it, so the
# cell shows the citation entries rather than the function object.
neww = GlobalInstabilityIndex().citations()
neww

In [None]:
# Keep the first entry of the "structure" column around for interactive inspection.
aaa = semiconductors["structure"].iloc[0]

In [None]:
from pymatgen.analysis.graphs import StructureGraph

# NOTE(review): pymatgen's documentation points to the StructureGraph factory
# classmethods (empty graph / local-environment strategy) for manual
# construction — confirm that graph_data=None is accepted by the installed version.
first_structure = filtered.structure.values[0]
sg = StructureGraph(first_structure, graph_data=None)
sg
# To render the graph, call the method: sg.draw_graph_to_file('graph_test.png')

In [None]:
def draw_graphs(structures):
    """Draw one graph image per structure.

    The structure at position N of ``structures`` is written to ``graph<N>.png``
    in the current working directory (N starts at 0).

    Args:
        structures: iterable of structures accepted by ``StructureGraph``.

    Returns:
        None. The images are produced as a side effect.
    """
    from pymatgen.analysis.graphs import StructureGraph

    for number, structure in enumerate(structures):
        # NOTE(review): pymatgen's documentation points to the StructureGraph
        # factory classmethods for manual construction — confirm that the bare
        # constructor yields the graph (and bonds) you want to draw.
        sg = StructureGraph(structure)
        # Call the drawing method. The previous code assigned the file name to
        # a local variable named draw_graph_to_file, so nothing was ever drawn.
        sg.draw_graph_to_file('graph' + str(number) + '.png')

    

# Feature importances

In [None]:
from feature_importances import feature_importances_rf_reg

In [None]:
# Column 13 onwards of the combined custom + formula frame is passed as the
# feature matrix, with the band gaps of the filtered entries as the target.
# NOTE(review): 500 and 0.25 are positional settings of the project-local helper
# (presumably the number of trees and the test fraction) — confirm in
# feature_importances.py.
formula_features = X_desc_form_cus.iloc[:, 13:]
band_gaps = filtered.band_gap.values
rf_reg_formula = feature_importances_rf_reg(formula_features, band_gaps, 500, 0.25)

In [None]:
# Same analysis as for the formula features, on the custom + structure frame.
# The helper imported in this notebook is feature_importances_rf_reg; the
# previously used name feature_importances_reg is never imported and raised a
# NameError.
rf_reg_struc = feature_importances_rf_reg(X_desc_struc_cus.iloc[:, 13:], filtered.band_gap.values, 500, 0.25)

# Train models

In [364]:
# (rows, columns) of the feature matrix, i.e. column 13 onwards of the combined frame.
X_desc_form_cus.iloc[:, 13:].shape

(1014, 223)

In [374]:
from get_models import simple_nn

# Feature matrix: column 13 onwards of the combined custom + formula frame.
nn_features = X_desc_form_cus.iloc[:, 13:].values
# NOTE(review): the target is read from `semiconductors` here, whereas the
# feature-importance cells read it from `filtered` — confirm both hold the same
# band gaps in the same row order.
nn_target = semiconductors.band_gap.values
simple_nn(nn_features, nn_target)

-528.2949340984816