In [1]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from utils import plot_series
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, LeaveOneOut
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.feature_selection import SelectKBest, VarianceThreshold, mutual_info_regression
import matminer
from matminer.featurizers.conversions import StrToComposition
from matminer.featurizers.composition import ElementProperty
import warnings
# Install a global "ignore" filter: every Python warning raised after this line is suppressed.
# NOTE(review): this is not limited to any category or module, so deprecation and
# data-quality warnings from pandas / scikit-learn / matminer are hidden too — consider
# narrowing the filter to the specific warnings that are known to be noise.
warnings.filterwarnings('ignore')

In [2]:
# Load the raw table and keep only the rows that have a Tc value
# (Tc is used as the regression target further down).
raw_data = pd.read_csv("dataset.csv")
raw_data = raw_data[raw_data['Tc'].notna()]

# Work on an explicit copy of the four columns of interest. Without .copy() the
# subset is derived from the filtered frame, and featurizers that add columns in
# place would write into that derived slice (pandas SettingWithCopy situation,
# which the global warning filter would otherwise hide).
df = raw_data[["compound","a","Tc","elements"]].copy()

# Turn the formula strings in "compound" into composition objects; the result is
# stored in the "composition" column consumed by the next featurizer.
df = StrToComposition().featurize_dataframe(df, "compound")

# Append the "magpie" preset of element-property statistics
# (the "MagpieData ..." columns) computed from each composition.
# NOTE(review): ignore_errors=True presumably leaves NaN feature values for
# compositions that fail to featurize instead of raising — verify no such rows exist
# before feeding the features to estimators that reject NaN.
ep_feat = ElementProperty.from_preset(preset_name="magpie")
df = ep_feat.featurize_dataframe(df, col_id="composition",ignore_errors=True)

# Persist the featurized table (written with pandas' default index column) and preview it.
df.to_csv('dataset_features.csv')
df.head()

StrToComposition:   0%|          | 0/147 [00:00<?, ?it/s]

ElementProperty:   0%|          | 0/147 [00:00<?, ?it/s]

Unnamed: 0,compound,a,Tc,elements,composition,MagpieData minimum Number,MagpieData maximum Number,MagpieData range Number,MagpieData mean Number,MagpieData avg_dev Number,...,MagpieData range GSmagmom,MagpieData mean GSmagmom,MagpieData avg_dev GSmagmom,MagpieData mode GSmagmom,MagpieData minimum SpaceGroupNumber,MagpieData maximum SpaceGroupNumber,MagpieData range SpaceGroupNumber,MagpieData mean SpaceGroupNumber,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber
0,MoC,4.39,14.3,BASE,"(Mo, C)",6.0,42.0,36.0,24.0,18.0,...,0.0,0.0,0.0,0.0,194.0,229.0,35.0,211.5,17.5,194.0
1,TaC,4.51,10.0,BASE,"(Ta, C)",6.0,73.0,67.0,39.5,33.5,...,0.0,0.0,0.0,0.0,194.0,229.0,35.0,211.5,17.5,194.0
2,NbC,4.5,11.2,BASE,"(Nb, C)",6.0,41.0,35.0,23.5,17.5,...,0.0,0.0,0.0,0.0,194.0,229.0,35.0,211.5,17.5,194.0
3,HfN,4.54,8.83,BASE,"(Hf, N)",7.0,72.0,65.0,39.5,32.5,...,0.0,0.0,0.0,0.0,194.0,194.0,0.0,194.0,0.0,194.0
4,NbN,4.44,15.0,BASE,"(Nb, N)",7.0,41.0,34.0,24.0,17.0,...,0.0,0.0,0.0,0.0,194.0,229.0,35.0,211.5,17.5,194.0


In [3]:
# Split the featurized frame into the feature matrix X (everything except the
# target and the identifier/object columns) and the target vector y.
excluded = ["Tc", "compound", "composition",'elements']
X = df.drop(excluded, axis=1)
y = df['Tc'].values

# threshold=0 drops only features whose variance is exactly zero, i.e. columns that
# are constant across all samples. Quasi-constant features would need threshold > 0.
var_thr = VarianceThreshold(threshold=0)
# NOTE(review): X_vt is a bare NumPy array (column names are lost), and the
# mutual-information cell that follows still ranks the unfiltered X — confirm
# whether the filtered matrix was meant to be used there.
X_vt = var_thr.fit_transform(X)

# Boolean mask aligned with X.columns: True where the feature was kept.
kept_mask = var_thr.get_support()

# a: positional indices of the kept features.
# b: positional indices of the dropped features, taken from the mask so the
#    result follows X's actual width instead of a hard-coded column count.
a = var_thr.get_support(indices=True)
b = list(np.flatnonzero(~kept_mask))

# Names of the zero-variance columns, selected with a single mask lookup.
concol = X.columns[~kept_mask].tolist()

print(f'Number of zero-variance features : {len(concol)}\n')
for features in concol:
    print(features)

Number of zero-variance features : 19

MagpieData minimum Row
MagpieData maximum NsValence
MagpieData minimum NpValence
MagpieData minimum NdValence
MagpieData minimum NfValence
MagpieData mode NfValence
MagpieData minimum NsUnfilled
MagpieData minimum NpUnfilled
MagpieData minimum NdUnfilled
MagpieData minimum NfUnfilled
MagpieData maximum NfUnfilled
MagpieData range NfUnfilled
MagpieData mean NfUnfilled
MagpieData avg_dev NfUnfilled
MagpieData mode NfUnfilled
MagpieData minimum GSbandgap
MagpieData minimum GSmagmom
MagpieData mode GSmagmom
MagpieData minimum SpaceGroupNumber


In [4]:
# Rank every column of X by its estimated mutual information with the target y.
# NOTE(review): the scores are computed on all rows of X/y; if a train/test split or
# cross-validation follows, selecting features here may leak target information — confirm.
feature_columns = X.columns

# One score per feature (fixed random_state for repeatability), stored in a
# single-column frame indexed by feature name.
mi_scores = mutual_info_regression(X, y, random_state=4)
coeff_df = pd.DataFrame({'Coefficient': mi_scores}, index=feature_columns)

# Sort once, highest score first, and reuse the ranking below.
ranked = coeff_df.sort_values(by='Coefficient', ascending=False)

# Reduced feature matrix: the ten top-scoring features, in ranked order.
X_mi = X[ranked.index[:10]]

# Display the twelve best-scoring features.
ranked.iloc[:12]

Unnamed: 0,Coefficient
MagpieData range MendeleevNumber,0.630043
MagpieData avg_dev CovalentRadius,0.593411
a,0.573869
MagpieData mean GSvolume_pa,0.559791
MagpieData range Electronegativity,0.544154
MagpieData avg_dev Number,0.498249
MagpieData mean CovalentRadius,0.493282
MagpieData avg_dev Row,0.487205
MagpieData mean Number,0.477378
MagpieData range CovalentRadius,0.459039
