In [1]:
import pandas as pd
import numpy as np
import torch
import os
from utils import data

# Raw CSV inputs. The first two characters of each file name ("Tc", "Bs")
# are used below as the dataset key.
dataset_path = './dataset/'
dataset_files = ['TcData.csv', 'BsData.csv']

# Do not truncate wide frames when they are rendered in the notebook.
pd.set_option("display.max_columns", None)

# Featurized frames, keyed by the two-letter file prefix.
dfs = {}
for file in dataset_files:
    tag = file[:2]
    csv_path = os.path.join(dataset_path, file)
    cache_path = os.path.join(dataset_path, f"./df_data_{tag}.pkl")

    # na_values=0 makes pandas read zero entries as NaN.
    df = pd.read_csv(csv_path, na_values=0).rename(columns={'Composition': 'formula'})
    # NOTE(review): load_existing_df=True presumably reuses the pickle at
    # cache_path instead of recomputing the features -- confirm in utils.data.
    df = data.featurize_data(df, load_existing_df=True, pkl_path=cache_path)

    # Tag the frame with its key and register it under the same key.
    df.name = tag
    dfs[tag] = df

from utils.surrogate import surrogate_model

# Build one surrogate per target property from its featurized frame,
# in the fixed order Tc, Bs.
surrogates = [surrogate_model(name=key, df=dfs[key]) for key in ('Tc', 'Bs')]
Tc_surrogate, Bs_surrogate = surrogates

# Apply identical cleanup settings to both surrogates.
# NOTE(review): the flag names suggest rows with NaN are kept while columns
# containing NaN are dropped -- confirm against utils.surrogate.
for entry in surrogates:
    entry.cleanup_df(drop_NaN=False, drop_col_with_NaN=True)

In [2]:
# Summary table for the frame held by the Tc surrogate.
# NOTE(review): the per-column counts in the rendered table differ, so the
# helper appears to ignore zero/missing entries -- confirm in utils.data.
data.describe_non_zero_mean(Tc_surrogate.df)

Unnamed: 0,Thickness (mu m),Tc (K),B,C,Al,Si,P,Mn,Fe,Co,Ni,Cu,Ga,Zr,Nb,Mo,Sn
count,220.0,220.0,219.0,69.0,14.0,190.0,99.0,2.0,220.0,24.0,4.0,101.0,14.0,16.0,88.0,28.0,3.0
mean,21.677273,599.232518,0.088052,0.042502,0.047857,0.082934,0.053133,0.1,0.763396,0.109583,0.024975,0.009359,0.023571,0.051369,0.028205,0.020714,0.02
std,3.351607,73.767177,0.037089,0.023124,0.004258,0.05428,0.036586,0.0,0.065848,0.06721,0.02335,0.001894,0.009288,0.022386,0.010904,0.019231,0.0
min,17.5,370.0,0.01,0.01,0.04,0.01,0.01,0.1,0.51,0.005,0.0133,0.001,0.02,0.01,0.004,0.01,0.02
25%,20.0,587.0,0.075,0.02,0.05,0.03,0.01,0.1,0.735,0.06,0.0133,0.01,0.02,0.030501,0.03,0.01,0.02
50%,20.0,605.983795,0.09,0.04,0.05,0.08,0.05,0.1,0.745337,0.117,0.0133,0.01,0.02,0.07,0.03,0.01,0.02
75%,22.5,626.768258,0.1,0.06,0.05,0.135,0.09,0.1,0.83,0.1665,0.024975,0.01,0.02,0.07,0.03,0.02,0.02
max,30.0,769.0,0.235625,0.09,0.05,0.16,0.12,0.1,0.9,0.234,0.06,0.01,0.05,0.07,0.07,0.08,0.02


In [3]:
# Summary table for the frame held by the Bs surrogate.
# NOTE(review): the per-column counts in the rendered table differ, so the
# helper appears to ignore zero/missing entries -- confirm in utils.data.
data.describe_non_zero_mean(Bs_surrogate.df)

Unnamed: 0,Thickness (mu m),Bs (T),B,C,Al,Si,P,V,Cr,Fe,Co,Ni,Cu,Ga,Ge,Zr,Nb,Mo,Sn,Hf
count,294.0,294.0,279.0,110.0,20.0,226.0,168.0,2.0,2.0,294.0,20.0,5.0,178.0,8.0,7.0,27.0,67.0,4.0,3.0,17.0
mean,23.423469,1.556457,0.095298,0.030713,0.039,0.044465,0.047602,0.01,0.02,0.802945,0.0968,0.234,0.009077,0.0325,0.04,0.017848,0.030268,0.015,0.02,0.07
std,5.14799,0.281092,0.036627,0.018662,0.015183,0.038768,0.033016,0.007071,0.0,0.066226,0.076237,0.123329,0.002148,0.010351,0.0,0.013497,0.009571,0.007071,0.0,0.0
min,13.0,0.25986,0.01,0.005,0.01,0.005,0.01,0.005,0.02,0.39,0.04,0.078,0.0025,0.02,0.04,0.01,0.01,0.005,0.02,0.07
25%,20.0,1.473255,0.08,0.01125,0.03,0.02,0.03,0.0075,0.02,0.78,0.04,0.156,0.0075,0.02,0.04,0.01,0.02785,0.0125,0.02,0.07
50%,22.0,1.65,0.09,0.03,0.04,0.03,0.0325,0.01,0.02,0.828,0.05,0.234,0.008,0.04,0.04,0.01,0.029412,0.0175,0.02,0.07
75%,25.0,1.738235,0.12,0.04,0.04,0.05098,0.07,0.0125,0.02,0.833,0.1665,0.312,0.01,0.04,0.04,0.0238,0.03,0.02,0.02,0.07
max,76.0,1.88865,0.235625,0.06,0.08,0.155,0.12,0.015,0.02,0.9,0.292,0.39,0.015,0.04,0.04,0.07,0.07,0.02,0.02,0.07


In [4]:
# Same summary over both frames stacked row-wise. pd.concat defaults to an
# outer join on columns, so the result has the union of both column sets and
# a column missing from one frame is NaN for that frame's rows.
data.describe_non_zero_mean(pd.concat([Tc_surrogate.df, Bs_surrogate.df]))

Unnamed: 0,Thickness (mu m),Tc (K),B,C,Al,Si,P,V,Cr,Mn,Fe,Co,Ni,Cu,Ga,Ge,Zr,Nb,Mo,Sn,Hf,Bs (T)
count,514.0,220.0,498.0,179.0,34.0,416.0,267.0,2.0,2.0,2.0,514.0,44.0,9.0,279.0,22.0,7.0,43.0,155.0,32.0,6.0,17.0,294.0
mean,22.67607,599.232518,0.092111,0.035257,0.042647,0.062035,0.049653,0.01,0.02,0.1,0.786017,0.103773,0.1411,0.009179,0.026818,0.04,0.030321,0.029097,0.02,0.02,0.07,1.556457
std,4.547522,73.767177,0.036969,0.021229,0.012627,0.050247,0.034419,0.007071,0.0,0.0,0.068845,0.070892,0.14123,0.002061,0.010414,0.0,0.023676,0.010367,0.018184,0.0,0.0,0.281092
min,13.0,370.0,0.01,0.005,0.01,0.005,0.01,0.005,0.02,0.1,0.39,0.005,0.0133,0.001,0.02,0.04,0.01,0.004,0.005,0.02,0.07,0.25986
25%,20.0,587.0,0.075,0.02,0.0325,0.02,0.03,0.0075,0.02,0.1,0.735,0.04,0.0133,0.007843,0.02,0.04,0.01,0.029003,0.01,0.02,0.07,1.473255
50%,21.0,605.983795,0.09,0.03,0.04,0.04,0.04,0.01,0.02,0.1,0.803922,0.11625,0.078,0.01,0.02,0.04,0.0276,0.03,0.01,0.02,0.07,1.65
75%,25.0,626.768258,0.11,0.05,0.05,0.1,0.08,0.0125,0.02,0.1,0.833,0.1665,0.234,0.01,0.04,0.04,0.0333,0.03,0.02,0.02,0.07,1.738235
max,76.0,769.0,0.235625,0.09,0.08,0.16,0.12,0.015,0.02,0.1,0.9,0.292,0.39,0.015,0.05,0.04,0.07,0.07,0.08,0.02,0.07,1.88865


In [5]:
# Rebind the notebook-global `df` to the Tc surrogate's frame. It is not used
# in this cell; it is kept because later cells may read it.
df = surrogates[0].df

# Elements used to filter the combined Tc + Bs data.
subset = ['Nb', 'Cu', 'Si', 'B', 'Fe']

# Stack both frames row-wise, filter by the element list, then summarise.
# NOTE(review): in the rendered summary all five listed elements have the same
# count as the row total, so the helper appears to keep rows containing every
# listed element -- confirm in utils.data.
combined = pd.concat([Tc_surrogate.df, Bs_surrogate.df])
df_subset = data.subset_df_by_elements(combined, subset)
data.describe_non_zero_mean(df_subset)

Unnamed: 0,Thickness (mu m),Tc (K),B,Si,V,Fe,Ni,Cu,Nb,Mo,Bs (T)
count,132.0,78.0,132.0,132.0,2.0,132.0,3.0,132.0,132.0,2.0,54.0
mean,21.954545,619.232528,0.091839,0.12065,0.01,0.749189,0.0133,0.009671,0.028045,0.01,1.378643
std,2.648242,34.815406,0.017761,0.037582,0.007071,0.024254,0.0,0.001643,0.006349,0.007071,0.277504
min,20.0,541.0,0.07,0.04,0.005,0.72,0.0133,0.0021,0.004,0.005,0.68
25%,20.0,600.55796,0.075,0.0916,0.0075,0.735,0.0133,0.01,0.029412,0.0075,1.182972
50%,21.0,618.472175,0.09,0.135,0.01,0.735,0.0133,0.01,0.03,0.01,1.308785
75%,22.5,625.880637,0.092409,0.15,0.0125,0.759,0.0133,0.01,0.03,0.0125,1.61892
max,30.0,769.0,0.13,0.155,0.015,0.803922,0.0133,0.013,0.04,0.015,1.80111


In [9]:
# Pull the single 'mean' entry of the 'Bs (T)' column out of the subset summary.
data.describe_non_zero_mean(df_subset).loc['mean', 'Bs (T)']

1.3786429629629626

In [6]:
def EF_presence(df, n_leading=4):
    """Count, per column, how many rows hold a truthy value.

    Every cell is cast to ``bool`` and the ``True`` values are summed per
    column. Columns whose count is zero are dropped, and then the first
    ``n_leading`` of the *remaining* columns are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    n_leading : int, default 4
        Number of surviving leading columns to skip. The default keeps the
        previous hard-coded behaviour.
        NOTE(review): presumably these are the non-element metadata columns
        at the front of the frame -- confirm against the featurizer output.

    Returns
    -------
    pandas.Series
        Truthy-row count per remaining column, indexed by column name.

    Raises
    ------
    ValueError
        If ``n_leading`` is negative.

    Notes
    -----
    NaN casts to ``True``, so a NaN cell is counted as present. Frames that
    still contain NaN will therefore over-count.
    """
    if n_leading < 0:
        raise ValueError("n_leading must be non-negative")

    # Number of truthy cells in each column.
    counts = df.astype(bool).sum(axis=0)
    present = counts[counts > 0]

    # The skip is positional, and applied after the zero-count filter.
    return present.iloc[n_leading:]

# Per-column presence counts for the first surrogate's frame
# (surrogates[0] is Tc_surrogate in the list built in the first cell).
EF_presence(surrogates[0].df)

B     219
C      69
Al     14
Si    190
P      99
Mn      2
Fe    220
Co     24
Ni      4
Cu    101
Ga     14
Zr     16
Nb     88
Mo     28
Sn      3
dtype: int64

In [7]:
# Per-column presence counts for the second surrogate's frame
# (surrogates[1] is Bs_surrogate in the list built in the first cell).
EF_presence(surrogates[1].df)

B     279
C     110
Al     20
Si    226
P     168
V       2
Cr      2
Fe    294
Co     20
Ni      5
Cu    178
Ga      8
Ge      7
Zr     27
Nb     67
Mo      4
Sn      3
Hf     17
dtype: int64