# Point Colab to our files

In [None]:
!git clone https://github.com/icomse/5th_workshop_MachineLearning.git
import os
os.chdir('5th_workshop_MachineLearning/data')
data_dir = os.getcwd()

Cloning into '5th_workshop_MachineLearning'...
remote: Enumerating objects: 82, done.[K
remote: Counting objects: 100% (82/82), done.[K
remote: Compressing objects: 100% (64/64), done.[K
remote: Total 82 (delta 26), reused 46 (delta 12), pack-reused 0[K
Unpacking objects: 100% (82/82), 8.05 MiB | 6.54 MiB/s, done.


# We'll use pymatgen and matminer in this module, so we need to install them

In [None]:
!pip install matminer

Collecting matminer
  Downloading matminer-0.9.0-py3-none-any.whl (1.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
Collecting pymongo (from matminer)
  Downloading pymongo-4.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (648 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m648.9/648.9 kB[0m [31m40.3 MB/s[0m eta [36m0:00:00[0m
Collecting monty (from matminer)
  Downloading monty-2023.5.8-py3-none-any.whl (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pymatgen (from matminer)
  Downloading pymatgen-2023.6.28.tar.gz (9.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m86.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backen

# Import some Python modules

In [None]:
import pandas as pd
import numpy as np
import os
import json
import matplotlib.pyplot as plt

from matminer.featurizers.composition.composite import ElementProperty
from pymatgen.core.composition import Composition

# Apply the style sheet shipped with the workshop data to all subsequent figures
style_sheet = os.path.join(data_dir, 'configs', 'plot_style.mplstyle')
plt.style.use(style_sheet)

# New dataset: (Al, Ga, In)2O3 alloys
- candidate transparent conducting oxides
- [Kaggle competition](https://www.kaggle.com/c/nomad2018-predict-transparent-conductors)
- [Paper on competition](https://www.nature.com/articles/s41524-019-0239-3)

In [None]:
# Load the training table of the NOMAD dataset and preview its first rows
train_csv = os.path.join(data_dir, 'nomad', 'nomad_train.csv')
df = pd.read_csv(train_csv)
df.head()

Unnamed: 0,id,spacegroup,number_of_total_atoms,percent_atom_al,percent_atom_ga,percent_atom_in,lattice_vector_1_ang,lattice_vector_2_ang,lattice_vector_3_ang,lattice_angle_alpha_degree,lattice_angle_beta_degree,lattice_angle_gamma_degree,formation_energy_ev_natom,bandgap_energy_ev
0,1,33,80.0,0.625,0.375,0.0,9.9523,8.5513,9.1775,90.0026,90.0023,90.0017,0.068,3.4387
1,2,194,80.0,0.625,0.375,0.0,6.184,6.1838,23.6287,90.0186,89.998,120.0025,0.249,2.921
2,3,227,40.0,0.8125,0.1875,0.0,9.751,5.6595,13.963,90.9688,91.1228,30.5185,0.1821,2.7438
3,4,167,30.0,0.75,0.0,0.25,5.0036,5.0034,13.5318,89.9888,90.0119,120.0017,0.2172,3.3492
4,5,194,80.0,0.0,0.625,0.375,6.6614,6.6612,24.5813,89.996,90.0006,119.9893,0.0505,1.3793


# Rename some columns to make things easier

In [None]:
# Short aliases for the verbose original column names
column_aliases = {
    'spacegroup': 'sg',
    'number_of_total_atoms': 'Natoms',
    'percent_atom_al': 'x_Al',
    'percent_atom_ga': 'x_Ga',
    'percent_atom_in': 'x_In',
    'lattice_vector_1_ang': 'a',
    'lattice_vector_2_ang': 'b',
    'lattice_vector_3_ang': 'c',
    'lattice_angle_alpha_degree': 'alpha',
    'lattice_angle_beta_degree': 'beta',
    'lattice_angle_gamma_degree': 'gamma',
    'formation_energy_ev_natom': 'E',
    'bandgap_energy_ev': 'Eg',
}
df = df.rename(columns=column_aliases)

# Compute a couple of features that may be helpful

In [None]:
def get_volume(row):
    """Return the unit-cell volume computed from lattice parameters.

    `row` must provide the edge lengths under 'a', 'b', 'c' and the cell
    angles under 'alpha', 'beta', 'gamma'. The angles are given in degrees
    and converted to radians here. The volume is

        V = a*b*c*sqrt(1 - cos(alpha)^2 - cos(beta)^2 - cos(gamma)^2
                       + 2*cos(alpha)*cos(beta)*cos(gamma))
    """
    cos_alpha = np.cos(np.deg2rad(row['alpha']))
    cos_beta = np.cos(np.deg2rad(row['beta']))
    cos_gamma = np.cos(np.deg2rad(row['gamma']))

    # Term under the square root; it equals 1 when all angles are 90 degrees
    angular_term = (1 - cos_alpha**2 - cos_beta**2 - cos_gamma**2
                    + 2*cos_alpha*cos_beta*cos_gamma)

    return row['a']*row['b']*row['c']*np.sqrt(angular_term)
# get_volume uses only element-wise NumPy operations on the values it looks up,
# so it can be evaluated on whole columns at once instead of once per row
# through DataFrame.apply(axis=1).
df['volume'] = get_volume(df)

# Number of atoms per unit of cell volume
df['atomic_density'] = df['Natoms'] / df['volume']
df.head()

Unnamed: 0,id,sg,Natoms,x_Al,x_Ga,x_In,a,b,c,alpha,beta,gamma,E,Eg,volume,atomic_density
0,1,33,80.0,0.625,0.375,0.0,9.9523,8.5513,9.1775,90.0026,90.0023,90.0017,0.068,3.4387,781.052081,0.102426
1,2,194,80.0,0.625,0.375,0.0,6.184,6.1838,23.6287,90.0186,89.998,120.0025,0.249,2.921,782.50011,0.102236
2,3,227,40.0,0.8125,0.1875,0.0,9.751,5.6595,13.963,90.9688,91.1228,30.5185,0.1821,2.7438,391.227531,0.102242
3,4,167,30.0,0.75,0.0,0.25,5.0036,5.0034,13.5318,89.9888,90.0119,120.0017,0.2172,3.3492,293.377334,0.102257
4,5,194,80.0,0.0,0.625,0.375,6.6614,6.6612,24.5813,89.996,90.0006,119.9893,0.0505,1.3793,944.713843,0.084682


# Convert the composition fractions into chemical formulas
- this will make it easier to generate "compositional" features with `matminer`

In [None]:
def get_formula(row):
    """Build the chemical formula string for one row of cation fractions.

    Reads the fractions stored under 'x_Al', 'x_Ga' and 'x_In', doubles each
    one (two cations per formula unit), adds three oxygen atoms, and returns
    the `formula` attribute of the resulting pymatgen `Composition`.
    """
    amounts = {el: row['x_%s' % el]*2 for el in ['Al', 'Ga', 'In']}
    amounts['O'] = 3
    return Composition(amounts).formula

# Derive one formula string per row and store it in a new 'formula' column
formulas = df.apply(get_formula, axis=1)
df['formula'] = formulas
df.head()

Unnamed: 0,id,sg,Natoms,x_Al,x_Ga,x_In,a,b,c,alpha,beta,gamma,E,Eg,volume,atomic_density,formula
0,1,33,80.0,0.625,0.375,0.0,9.9523,8.5513,9.1775,90.0026,90.0023,90.0017,0.068,3.4387,781.052081,0.102426,Al1.25 Ga0.75 O3
1,2,194,80.0,0.625,0.375,0.0,6.184,6.1838,23.6287,90.0186,89.998,120.0025,0.249,2.921,782.50011,0.102236,Al1.25 Ga0.75 O3
2,3,227,40.0,0.8125,0.1875,0.0,9.751,5.6595,13.963,90.9688,91.1228,30.5185,0.1821,2.7438,391.227531,0.102242,Al1.625 Ga0.375 O3
3,4,167,30.0,0.75,0.0,0.25,5.0036,5.0034,13.5318,89.9888,90.0119,120.0017,0.2172,3.3492,293.377334,0.102257,Al1.5 In0.5 O3
4,5,194,80.0,0.0,0.625,0.375,6.6614,6.6612,24.5813,89.996,90.0006,119.9893,0.0505,1.3793,944.713843,0.084682,In0.75 Ga1.25 O3


# Now, we'll use `matminer` to compute new features

In [None]:
# Build the composition featurizer from the preset named 'matminer'
preset_name = 'matminer'
elprop = ElementProperty.from_preset(preset_name)

elprop

## Let's see what these look like

In [None]:
# Featurize one example composition; the result is a flat list of numbers
some_formula = 'Al2O3'
some_composition = Composition(some_formula)
elprop.featurize(some_composition)

[1.61,
 3.44,
 1.8299999999999998,
 2.708,
 1.2940054095713818,
 2,
 3,
 1,
 2.4,
 0.7071067811865476,
 13,
 16,
 3,
 14.8,
 2.121320343559643,
 2.0,
 2.0,
 0.0,
 2.0,
 0.0,
 15.9994,
 26.9815386,
 10.9821386,
 20.39225544,
 7.7655446759905375,
 0.6,
 1.25,
 0.65,
 0.86,
 0.45961940777125593,
 80.0,
 101.0,
 21.0,
 92.6,
 14.849242404917499,
 nan,
 nan,
 nan,
 nan,
 nan,
 317.5,
 5100.0,
 4782.5,
 2230.5,
 3381.7381810246634,
 0.02658,
 235.0,
 234.97342,
 94.01594800000001,
 166.15129868059472,
 54.8,
 933.47,
 878.6700000000001,
 406.26800000000003,
 621.3135154251837,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan,
 nan]

# What do these numbers mean?

In [None]:
feature_values = elprop.featurize(Composition(some_formula))
feature_labels = elprop.feature_labels()

# Pair each label with its value and keep only the entries that were actually
# computed. A plain truthiness test (`if v`) is wrong for this: NaN is truthy,
# so missing values would be kept, while legitimate zeros (e.g. a range of 0.0)
# would be silently dropped.
labels_to_values = {label: value
                    for label, value in zip(feature_labels, feature_values)
                    if pd.notna(value)}
labels_to_values

{'PymatgenData minimum X': 1.61,
 'PymatgenData maximum X': 3.44,
 'PymatgenData range X': 1.8299999999999998,
 'PymatgenData mean X': 2.708,
 'PymatgenData std_dev X': 1.2940054095713818,
 'PymatgenData minimum row': 2,
 'PymatgenData maximum row': 3,
 'PymatgenData range row': 1,
 'PymatgenData mean row': 2.4,
 'PymatgenData std_dev row': 0.7071067811865476,
 'PymatgenData minimum group': 13,
 'PymatgenData maximum group': 16,
 'PymatgenData range group': 3,
 'PymatgenData mean group': 14.8,
 'PymatgenData std_dev group': 2.121320343559643,
 'PymatgenData minimum block': 2.0,
 'PymatgenData maximum block': 2.0,
 'PymatgenData mean block': 2.0,
 'PymatgenData minimum atomic_mass': 15.9994,
 'PymatgenData maximum atomic_mass': 26.9815386,
 'PymatgenData range atomic_mass': 10.9821386,
 'PymatgenData mean atomic_mass': 20.39225544,
 'PymatgenData std_dev atomic_mass': 7.7655446759905375,
 'PymatgenData minimum atomic_radius': 0.6,
 'PymatgenData maximum atomic_radius': 1.25,
 'PymatgenD

# To save time, I generated these features for all formulas in this dataset and saved them as a `.json` file

In [None]:
# Precomputed features, keyed by formula string and then by feature label
features_path = os.path.join(data_dir, 'nomad', 'nomad_train_composition_features.json')
with open(features_path) as f:
    composition_features = json.load(f)

# Look up the stored features of one arbitrary formula from the table
some_formula = df['formula'].values[1234]
print(some_formula)
composition_features[some_formula]

Al1.125 In0.25 Ga0.625 O3


{'PymatgenData minimum X': 1.61,
 'PymatgenData maximum X': 3.44,
 'PymatgenData range X': 1.8299999999999998,
 'PymatgenData mean X': 2.7415,
 'PymatgenData std_dev X': 1.1346870321068756,
 'PymatgenData minimum row': 2,
 'PymatgenData maximum row': 5,
 'PymatgenData range row': 3,
 'PymatgenData mean row': 2.625,
 'PymatgenData std_dev row': 1.171787241425887,
 'PymatgenData minimum group': 13,
 'PymatgenData maximum group': 16,
 'PymatgenData range group': 3,
 'PymatgenData mean group': 14.8,
 'PymatgenData std_dev group': 1.9445260654606864,
 'PymatgenData minimum block': 2.0,
 'PymatgenData maximum block': 2.0,
 'PymatgenData mean block': 2.0,
 'PymatgenData minimum atomic_mass': 15.9994,
 'PymatgenData maximum atomic_mass': 114.818,
 'PymatgenData range atomic_mass': 98.8186,
 'PymatgenData mean atomic_mass': 30.126761185,
 'PymatgenData std_dev atomic_mass': 34.41510682280223,
 'PymatgenData minimum atomic_radius': 0.6,
 'PymatgenData maximum atomic_radius': 1.55,
 'PymatgenData

# Let's consider only the "mean" stat for each feature

In [None]:
# Keep only the feature labels whose text contains 'mean'
mean_features = list(filter(lambda label: 'mean' in label, feature_labels))
mean_features

['PymatgenData mean X',
 'PymatgenData mean row',
 'PymatgenData mean group',
 'PymatgenData mean block',
 'PymatgenData mean atomic_mass',
 'PymatgenData mean atomic_radius',
 'PymatgenData mean mendeleev_no',
 'PymatgenData mean electrical_resistivity',
 'PymatgenData mean velocity_of_sound',
 'PymatgenData mean thermal_conductivity',
 'PymatgenData mean melting_point',
 'PymatgenData mean bulk_modulus',
 'PymatgenData mean coefficient_of_linear_thermal_expansion']

# Add these to our `DataFrame`

In [None]:
def add_feature_to_df(feature, df, composition_features):
    """Add one precomputed composition feature to `df` as a new column.

    Parameters
    ----------
    feature : str
        Feature label used as the key into each formula's feature dict.
    df : pandas.DataFrame
        Table with a 'formula' column; it is modified in place.
    composition_features : dict
        Mapping formula -> {feature label -> value}.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, with the new column added. The column name is
        the last two space-separated words of `feature` joined by '_'.

    Raises
    ------
    KeyError
        If a formula or the feature label is missing from `composition_features`.
    """
    per_row_values = []
    for formula in df.formula.values:
        per_row_values.append(composition_features[formula][feature])

    column_name = '_'.join(feature.split(' ')[-2:])
    df[column_name] = per_row_values
    return df

# Add one column per "mean" feature. add_feature_to_df returns the DataFrame
# it was given, so df is simply rebound to the same, now wider, table.
for feature in mean_features:
    df = add_feature_to_df(feature, df, composition_features)

df.head()

Unnamed: 0,id,sg,Natoms,x_Al,x_Ga,x_In,a,b,c,alpha,...,mean_block,mean_atomic_mass,mean_atomic_radius,mean_mendeleev_no,mean_electrical_resistivity,mean_velocity_of_sound,mean_thermal_conductivity,mean_melting_point,mean_bulk_modulus,mean_coefficient_of_linear_thermal_expansion
0,1,33,80.0,0.625,0.375,0.0,9.9523,8.5513,9.1775,90.0026,...,2.0,26.803475,0.8675,92.75,,1876.5,63.115948,311.684,,
1,2,194,80.0,0.625,0.375,0.0,6.184,6.1838,23.6287,90.0186,...,2.0,26.803475,0.8675,92.75,,1876.5,63.115948,311.684,,
2,3,227,40.0,0.8125,0.1875,0.0,9.751,5.6595,13.963,90.9688,...,2.0,23.597865,0.86375,92.675,,2053.5,78.565948,358.976,,
3,4,167,30.0,0.75,0.0,0.25,5.0036,5.0034,13.5318,89.9888,...,2.0,29.175902,0.89,92.5,,1842.0,78.715948,355.896,,
4,5,194,80.0,0.0,0.625,0.375,6.6614,6.6612,24.5813,89.996,...,2.0,44.25309,0.9175,92.7,,1057.75,19.565948,173.07,,


# **Hands-on**: Play with this data for a minute.
- Our goal will be to predict the formation energy per atom, `E`, and the band gap, `Eg`, using the other columns in the `DataFrame`
- Do you notice anything we should take care of first?

In [None]:
# write your code here

# **Hands-on**: Determine which features are important for predicting `E` and `Eg`

## Guidelines:
- Use any model you would like
- Are the same features important for predicting both quantities?
- Can you show why you observe any disparities in importances?

In [None]:
# write your code here