In [1]:
import pandas as pd
import numpy as np
import torch
import os
from utils import data

# Location of the raw CSV tables. The first two characters of each file name
# ('Tc', 'Bs') are used as the key for that table throughout this cell.
dataset_path = './dataset/'
dataset_files = ['TcData.csv', 'BsData.csv']

# Do not truncate wide frames when they are displayed.
pd.set_option("display.max_columns", None)

# Forwarded to data.featurize_data as `load_existing_df` (it was previously
# defined here but never used; the call hard-coded True instead).
load_df = True

# Raw tables keyed by the two-letter prefix of their file name.
dfs = {}
for file in dataset_files:
    key = file[:2]
    # na_values=0 tells pandas to read cells equal to 0 as NaN.
    df = pd.read_csv(os.path.join(dataset_path, file), na_values=0)
    df = df.rename(columns={'Composition': 'formula'})
    # Ad-hoc attribute kept for backward compatibility; pandas does not carry
    # it over to derived frames, so the dict key is used below instead.
    # NOTE(review): confirm whether anything in utils reads `df.name`.
    df.name = key
    dfs[key] = df

for key in list(dfs):
    df = data.featurize_data(
        dfs[key],
        load_existing_df=load_df,
        pkl_path=os.path.join(dataset_path, f"./df_data_{key}.pkl"),
    )
    # Store the returned frame back into `dfs`. The previous loop only rebound
    # its loop variable, so a newly returned frame never reached the
    # surrogates built below. The None guard keeps the existing entry if the
    # helper works purely in place (its return contract is not visible here).
    if df is not None:
        df.name = key
        dfs[key] = df

from utils.surrogate import surrogate_model

Tc_surrogate = surrogate_model(name='Tc', df=dfs['Tc'])
Bs_surrogate = surrogate_model(name='Bs', df=dfs['Bs'])

surrogates = [Tc_surrogate, Bs_surrogate]

# Same clean-up settings for both models.
for surrogate in surrogates:
    surrogate.cleanup_df(drop_NaN=False, drop_col_with_NaN=True)

In [2]:
# Descriptive statistics for the Tc surrogate's table after cleanup_df.
# NOTE(review): the helper name suggests zeros are excluded from the
# statistics - confirm in utils.data.
data.describe_non_zero_mean(Tc_surrogate.df)

Unnamed: 0,Thickness (mu m),Tc (K)
count,220.0,220.0
mean,21.677273,599.232518
std,3.351607,73.767177
min,17.5,370.0
25%,20.0,587.0
50%,20.0,605.983795
75%,22.5,626.768258
max,30.0,769.0


In [3]:
# Descriptive statistics for the Bs surrogate's table after cleanup_df.
# NOTE(review): the helper name suggests zeros are excluded from the
# statistics - confirm in utils.data.
data.describe_non_zero_mean(Bs_surrogate.df)

Unnamed: 0,Thickness (mu m),Bs (T)
count,294.0,294.0
mean,23.423469,1.556457
std,5.14799,0.281092
min,13.0,0.25986
25%,20.0,1.473255
50%,22.0,1.65
75%,25.0,1.738235
max,76.0,1.88865


In [4]:
# Stack the Tc and Bs tables row-wise and summarise the combined frame.
data.describe_non_zero_mean(
    pd.concat(
        [Tc_surrogate.df, Bs_surrogate.df],
        axis=0,
    )
)

Unnamed: 0,Thickness (mu m),Tc (K),Bs (T)
count,514.0,220.0,294.0
mean,22.67607,599.232518,1.556457
std,4.547522,73.767177,0.281092
min,13.0,370.0,0.25986
25%,20.0,587.0,1.473255
50%,21.0,605.983795,1.65
75%,25.0,626.768258,1.738235
max,76.0,769.0,1.88865


In [51]:
# Elements handed to the project helper that filters the combined table.
subset = ['Nb', 'Cu', 'Si', 'B', 'Fe']

# NOTE(review): `df` is not used in the rest of this cell.
df = surrogates[0].df

# Row-wise stack of both tables, filtered by `subset`, then summarised.
df_subset = data.subset_df_by_elements(
    pd.concat(
        [Tc_surrogate.df, Bs_surrogate.df],
        axis=0,
    ),
    subset,
)
data.describe_non_zero_mean(df_subset)

Unnamed: 0,Thickness (mu m),Tc (K),B,Si,V,Fe,Ni,Cu,Nb,Mo,Bs (T)
count,132.0,78.0,132.0,132.0,2.0,132.0,3.0,132.0,132.0,2.0,54.0
mean,21.954545,619.232528,0.091839,0.12065,0.01,0.749189,0.0133,0.009671,0.028045,0.01,1.378643
std,2.648242,34.815406,0.017761,0.037582,0.007071,0.024254,0.0,0.001643,0.006349,0.007071,0.277504
min,20.0,541.0,0.07,0.04,0.005,0.72,0.0133,0.0021,0.004,0.005,0.68
25%,20.0,600.55796,0.075,0.0916,0.0075,0.735,0.0133,0.01,0.029412,0.0075,1.182972
50%,21.0,618.472175,0.09,0.135,0.01,0.735,0.0133,0.01,0.03,0.01,1.308785
75%,22.5,625.880637,0.092409,0.15,0.0125,0.759,0.0133,0.01,0.03,0.0125,1.61892
max,30.0,769.0,0.13,0.155,0.015,0.803922,0.0133,0.013,0.04,0.015,1.80111


In [5]:
def EF_presence(df, n_skip=4):
    """Count, per column, the rows that hold a non-zero, non-missing value.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect.
    n_skip : int, default 4
        Number of leading entries to drop from the result once columns with a
        zero count have been removed. The default reproduces the previously
        hard-coded slice.
        NOTE(review): presumably this skips the non-element columns at the
        front of the table - confirm against the frame layout.

    Returns
    -------
    pandas.Series
        Integer counts indexed by column name, restricted to columns with at
        least one counted row and with the first ``n_skip`` entries removed.
    """
    # astype(bool) alone maps NaN to True, which would count missing values
    # as present; mask them out explicitly.
    present = df.notna() & df.astype(bool)
    stat = present.sum(axis=0)
    return stat[stat > 0][n_skip:]

# Tc_surrogate is the first entry of `surrogates` (list built in the first cell).
EF_presence(Tc_surrogate.df)

Series([], dtype: int64)

In [8]:
# Bs_surrogate is the second entry of `surrogates` (list built in the first cell).
EF_presence(Bs_surrogate.df)

B     279
C     110
Al     20
Si    226
P     168
V       2
Cr      2
Fe    294
Co     20
Ni      5
Cu    178
Ga      8
Ge      7
Zr     27
Nb     67
Mo      4
Sn      3
Hf     17
dtype: int64