In [2]:
!pip install numpy
!pip install scipy
!pip install seaborn
!pip install pandas
!pip install matplotlib
!pip install sklearn
!pip install pymatgen

Collecting numpy
  Downloading numpy-1.21.4-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (15.7 MB)
     |████████████████████████████████| 15.7 MB 4.8 MB/s            
[?25hInstalling collected packages: numpy
Successfully installed numpy-1.21.4
Collecting scipy
  Downloading scipy-1.7.3-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (38.1 MB)
     |████████████████████████████████| 38.1 MB 4.6 MB/s            
Installing collected packages: scipy
Successfully installed scipy-1.7.3
Collecting seaborn
  Downloading seaborn-0.11.2-py3-none-any.whl (292 kB)
     |████████████████████████████████| 292 kB 4.7 MB/s            
[?25hCollecting matplotlib>=2.2
  Downloading matplotlib-3.5.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (11.2 MB)
     |████████████████████████████████| 11.2 MB 58.2 MB/s            
[?25hCollecting pandas>=0.23
  Downloading pandas-1.3.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.3 MB)
     |█████████████████

In [3]:
# Importing Librariesb
import numpy as np
import scipy as sp
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import AdaBoostRegressor, AdaBoostClassifier, \
    GradientBoostingClassifier, GradientBoostingRegressor, \
    RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import r2_score
from pymatgen.core import Element, Composition, periodic_table
from functools import partial


In [4]:
#  --NotebookApp.iopub_data_rate_limit=1.0e10


In [None]:
# Loading training and testing data
train= pd.read_csv("train.csv",index_col=False)
test= pd.read_csv("test.csv",index_col=False)

# Querying materials and their properties based on training data
from pymatgen.ext.matproj import MPRester
mpr = MPRester("241iWwhTEOaNmC6V")

# Using material IDs provided in training data to get corresponding information from MPD
data = mpr.query(criteria={"task_id": {"$in":train["material_id"].to_list()}}, properties=["material_id","energy",
        "energy_per_atom",
        "volume",
        "formation_energy_per_atom",
        "nsites",
        "pretty_formula",                                                                                  
        "nelements",
        "density",  "band_gap"])
df = pd.DataFrame(data)
display(df)

In [None]:
# Making list of possible noble gases 
nobles = ["He","Ne", "Ar", "Kr", "Xe", "Rn", "Og"]
filt_data = df

#Iterating through data and using pandas drop function to filter out materials containing noble gas elements 
for i in nobles:
    filt_data = filt_data.drop(filt_data[filt_data['pretty_formula'].str.contains(i)].index)
    
n = len(filt_data)
print(f'There are {n} materials left')    
display(filt_data)


print(f'There are {n} materials left')    
display(filt_data)


In [None]:
filt_data

In [8]:
listA = [a for a in filt_data["pretty_formula"]]
listB = [Composition(a) for a in listA]
listC = [a.elements for a in listB]

editC = [item for sublist in listC for item in sublist]
unique_editC = set(editC)
unique_editC

UL_editC = [a for a in unique_editC]
cprops = [a.data for a in UL_editC]


In [None]:
peel=set(UL_editC)
sy_editC = [a.symbol for a in UL_editC]
sy_editC


pd.set_option('display.max_columns', None)

cprops_df = pd.DataFrame(cprops, index=sy_editC)

cprops_df.head()

In [None]:
droplist = ['Ionic radii',
    'Ionic radii hs', 
     'Ionic radii ls',
     'iupac_ordering', 
     'IUPAC ordering', 
     'NMR Quadrupole Moment', 
     'Reflectivity',
     'Refractive index', 
     'Rigidity modulus', 
     'Shannon radii',
     'Superconduction temperature',
     'Mendeleev no',       
     'Mineral hardness',
     'Molar volume',
     'Name',
     'Oxidation states',
     'ICSD oxidation states',
     'Brinell hardness',
     'Atomic orbitals', 
     'Coefficient of linear thermal expansion',
     'Atomic orbitals',
     'Electronic structure',
     'Electrical resistivity',
     'Ground level'
           ]


cprops_df = cprops_df.drop(columns=droplist)
cprops_df

In [11]:
#pd.set_option('display.max_rows', None)
cprops_df['Melting point']

H      14.01 K
Li    453.69 K
Be      1560 K
B       2349 K
C       3800 K
        ...   
Pb    600.61 K
Bi     544.4 K
Ac      1323 K
Th      2115 K
U     1405.3 K
Name: Melting point, Length: 80, dtype: object

In [11]:
cprops_df['Boiling point'] = [(a.replace('K', '', 1)) for a in cprops_df['Boiling point']]
cprops_df['Bulk modulus'] = cprops_df["Bulk modulus"].str.replace("GPa", "")
cprops_df['Critical temperature'] = cprops_df["Critical temperature"].str.replace("K", "")
cprops_df['Density of solid'] = cprops_df['Density of solid'].str.replace("no data", "NaN")
cprops_df['Density of solid'] = cprops_df["Density of solid"].str.replace("kg m<sup>-3</sup>", "")
cprops_df['Liquid range'] = [(a.replace('K', '', 1)) for a in cprops_df['Liquid range']]
cprops_df['Poissons ratio'] = [(a.replace('no data', 'NaN', 1)) for a in cprops_df['Poissons ratio']]
cprops_df['Poissons ratio'] = cprops_df['Poissons ratio'].str.replace("no data", "").astype(float)
cprops_df['Thermal conductivity'] = [float(a.replace('W m<sup>-1</sup> K<sup>-1</sup>', "", 1)) for a in cprops_df['Thermal conductivity']]
cprops_df['Velocity of sound'] = cprops_df['Velocity of sound'].str.replace("no data","NaN")
cprops_df['Velocity of sound'] = [(a.replace('m s<sup>-1</sup>', '', 1)) for a in cprops_df['Velocity of sound']]
cprops_df['Vickers hardness'] = cprops_df['Vickers hardness'].str.replace("no data", "NaN")
cprops_df['Vickers hardness'] = [(a.replace('MN m<sup>-2</sup>', '', 1)) for a in cprops_df['Vickers hardness']]
cprops_df['Youngs modulus'] = cprops_df['Youngs modulus'].str.replace("no data", "NaN")
cprops_df['Youngs modulus'] = [(a.replace('GPa', '', 1)) for a in cprops_df['Youngs modulus']]
cprops_df['Bulk modulus'] = [(a.replace('no data', 'NaN', 1)) for a in cprops_df['Bulk modulus']]
cprops_df['Bulk modulus'] = [(a.replace('liquid', '', 1)) for a in cprops_df['Bulk modulus']]
cprops_df['Bulk modulus'] = cprops_df['Bulk modulus'].str.replace(r"\(.*\)","",  regex=True).astype(float)
cprops_df 


Unnamed: 0,Atomic mass,Atomic no,Atomic radius,Atomic radius calculated,Boiling point,Bulk modulus,Common oxidation states,Critical temperature,Density of solid,Liquid range,Melting point,Poissons ratio,Thermal conductivity,Van der waals radius,Velocity of sound,Vickers hardness,X,Youngs modulus,Metallic radius,Ionization energies,Electron affinity
H,1.007940,1,0.25,0.53,20.28,,"[-1, 1]",33,,6.27,14.01 K,,0.1805,1.10,1270,,2.20,,no data,[13.598434599702],0.754598
Li,6.941000,3,1.45,1.67,1615,11.0,[1],3223,535,1161.31,453.69 K,,85.0000,1.82,6000,,0.98,4.9,1.52,"[5.391714996, 75.640097, 122.45435914]",0.618049
Be,9.012182,4,1.05,1.12,2742,130.0,[2],no data,1848,1182,1560 K,0.032,190.0000,1.53,13000,1670,1.57,287,1.12,"[9.322699, 18.21115, 153.896205, 217.7185861]",-0.520000
B,10.811000,5,0.85,0.87,4200,320.0,[3],no data,2460,1851,2349 K,,27.0000,1.92,16200,49000,2.04,,no data,"[8.298019, 25.15483, 37.93059, 259.3715, 340.2...",0.279723
C,12.010700,6,0.70,0.67,4300,33.0,"[-4, 4]",no data,2267,500,3800 K,,140.0000,1.70,18350,,2.55,,no data,"[11.260288, 24.383154, 47.88778, 64.49352, 392...",1.262114
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Pb,207.200000,82,1.80,1.54,2022,46.0,"[2, 4]",no data,11340,1421.39,600.61 K,0.440,35.0000,2.02,1260,,2.33,16,1.75,"[7.4166799, 15.032499, 31.9373, 42.33256, 68.8...",0.356721
Bi,208.980400,83,1.60,1.43,1837,31.0,[3],no data,9780,1292.6,544.4 K,0.330,8.0000,2.07,1790,,2.02,32,1.82,"[7.285516, 16.703, 25.57075, 45.37, 54.856, 88...",0.942362
Ac,227.000000,89,1.95,no data,3573,,[3],no data,10070,2250,1323 K,,12.0000,2.47,,,1.10,,1.878,"[5.380226, 11.75, 17.431, 44.8, 55.0, 67.0, 79...",0.350000
Th,232.038060,90,1.80,no data,5093,54.0,[4],no data,11724,2978,2115 K,0.270,54.0000,2.45,2490,350,1.30,79,1.798,"[6.3067, 12.1, 18.32, 28.648, 58.0, 69.1, 82.0...",1.170000


In [None]:


cprops_df['Boiling point'] = [(a.replace('K', '', 1)) for a in cprops_df['Boiling point']]
cprops_df['Bulk modulus'] = cprops_df["Bulk modulus"].str.replace("GPa", "")
cprops_df['Critical temperature'] = cprops_df["Critical temperature"].str.replace("K", "")
cprops_df['Density of solid'] = cprops_df['Density of solid'].str.replace("no data", "NaN")
cprops_df['Density of solid'] = cprops_df["Density of solid"].str.replace("kg m<sup>-3</sup>", "")
cprops_df['Liquid range'] = [(a.replace('K', '', 1)) for a in cprops_df['Liquid range']]
cprops_df['Poissons ratio'] = [(a.replace('no data', 'NaN', 1)) for a in cprops_df['Poissons ratio']]
cprops_df['Poissons ratio'] = cprops_df['Poissons ratio'].str.replace("no data", "").astype(float)
cprops_df['Thermal conductivity'] = [float(a.replace('W m<sup>-1</sup> K<sup>-1</sup>', "", 1)) for a in cprops_df['Thermal conductivity']]
cprops_df['Velocity of sound'] = cprops_df['Velocity of sound'].str.replace("no data","NaN")
cprops_df['Velocity of sound'] = [(a.replace('m s<sup>-1</sup>', '', 1)) for a in cprops_df['Velocity of sound']]
cprops_df['Vickers hardness'] = cprops_df['Vickers hardness'].str.replace("no data", "NaN")
cprops_df['Vickers hardness'] = [(a.replace('MN m<sup>-2</sup>', '', 1)) for a in cprops_df['Vickers hardness']]
cprops_df['Youngs modulus'] = cprops_df['Youngs modulus'].str.replace("no data", "NaN")
cprops_df['Youngs modulus'] = [(a.replace('GPa', '', 1)) for a in cprops_df['Youngs modulus']]



#cprops_df['Boiling point'] = [float(a.replace('K', '', 1)) for a in cprops_df['Boiling point']]
cprops_df

In [13]:
cprops_df['Bulk modulus'] = [(a.replace('no data', 'NaN', 1)) for a in cprops_df['Bulk modulus']]
cprops_df['Bulk modulus'] = [(a.replace('liquid', '', 1)) for a in cprops_df['Bulk modulus']]
cprops_df['Bulk modulus'] = cprops_df['Bulk modulus'].str.replace(r"\(.*\)","",  regex=True).astype(float)
cprops_df 
#cprops_df


Unnamed: 0,Atomic mass,Atomic no,Atomic radius,Atomic radius calculated,Boiling point,Bulk modulus,Common oxidation states,Critical temperature,Density of solid,Liquid range,Melting point,Poissons ratio,Thermal conductivity,Van der waals radius,Velocity of sound,Vickers hardness,X,Youngs modulus,Metallic radius,Ionization energies,Electron affinity
H,1.007940,1,0.25,0.53,20.28,,"[-1, 1]",33,,6.27,14.01 K,,0.1805,1.10,1270,,2.20,,no data,[13.598434599702],0.754598
Li,6.941000,3,1.45,1.67,1615,11.0,[1],3223,535,1161.31,453.69 K,,85.0000,1.82,6000,,0.98,4.9,1.52,"[5.391714996, 75.640097, 122.45435914]",0.618049
Be,9.012182,4,1.05,1.12,2742,130.0,[2],no data,1848,1182,1560 K,0.032,190.0000,1.53,13000,1670,1.57,287,1.12,"[9.322699, 18.21115, 153.896205, 217.7185861]",-0.520000
B,10.811000,5,0.85,0.87,4200,320.0,[3],no data,2460,1851,2349 K,,27.0000,1.92,16200,49000,2.04,,no data,"[8.298019, 25.15483, 37.93059, 259.3715, 340.2...",0.279723
C,12.010700,6,0.70,0.67,4300,33.0,"[-4, 4]",no data,2267,500,3800 K,,140.0000,1.70,18350,,2.55,,no data,"[11.260288, 24.383154, 47.88778, 64.49352, 392...",1.262114
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Pb,207.200000,82,1.80,1.54,2022,46.0,"[2, 4]",no data,11340,1421.39,600.61 K,0.440,35.0000,2.02,1260,,2.33,16,1.75,"[7.4166799, 15.032499, 31.9373, 42.33256, 68.8...",0.356721
Bi,208.980400,83,1.60,1.43,1837,31.0,[3],no data,9780,1292.6,544.4 K,0.330,8.0000,2.07,1790,,2.02,32,1.82,"[7.285516, 16.703, 25.57075, 45.37, 54.856, 88...",0.942362
Ac,227.000000,89,1.95,no data,3573,,[3],no data,10070,2250,1323 K,,12.0000,2.47,,,1.10,,1.878,"[5.380226, 11.75, 17.431, 44.8, 55.0, 67.0, 79...",0.350000
Th,232.038060,90,1.80,no data,5093,54.0,[4],no data,11724,2978,2115 K,0.270,54.0000,2.45,2490,350,1.30,79,1.798,"[6.3067, 12.1, 18.32, 28.648, 58.0, 69.1, 82.0...",1.170000


In [12]:
cprops_df['Melting point'] = cprops_df['Melting point'].str.replace("K", "")
cprops_df['Melting point'] = cprops_df['Melting point'].str.replace("white P", "")
cprops_df['Melting point'] = cprops_df['Melting point'].str.replace(r"\(.*\)","",  regex=True).astype(float)
#cprops_df['Melting point'] = [float(a.replace('K', '', 1)) for a in cprops_df['Melting point']]

In [13]:
cprops_df


Unnamed: 0,Atomic mass,Atomic no,Atomic radius,Atomic radius calculated,Boiling point,Bulk modulus,Common oxidation states,Critical temperature,Density of solid,Liquid range,Melting point,Poissons ratio,Thermal conductivity,Van der waals radius,Velocity of sound,Vickers hardness,X,Youngs modulus,Metallic radius,Ionization energies,Electron affinity
H,1.007940,1,0.25,0.53,20.28,,"[-1, 1]",33,,6.27,14.01,,0.1805,1.10,1270,,2.20,,no data,[13.598434599702],0.754598
Li,6.941000,3,1.45,1.67,1615,11.0,[1],3223,535,1161.31,453.69,,85.0000,1.82,6000,,0.98,4.9,1.52,"[5.391714996, 75.640097, 122.45435914]",0.618049
Be,9.012182,4,1.05,1.12,2742,130.0,[2],no data,1848,1182,1560.00,0.032,190.0000,1.53,13000,1670,1.57,287,1.12,"[9.322699, 18.21115, 153.896205, 217.7185861]",-0.520000
B,10.811000,5,0.85,0.87,4200,320.0,[3],no data,2460,1851,2349.00,,27.0000,1.92,16200,49000,2.04,,no data,"[8.298019, 25.15483, 37.93059, 259.3715, 340.2...",0.279723
C,12.010700,6,0.70,0.67,4300,33.0,"[-4, 4]",no data,2267,500,3800.00,,140.0000,1.70,18350,,2.55,,no data,"[11.260288, 24.383154, 47.88778, 64.49352, 392...",1.262114
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Pb,207.200000,82,1.80,1.54,2022,46.0,"[2, 4]",no data,11340,1421.39,600.61,0.440,35.0000,2.02,1260,,2.33,16,1.75,"[7.4166799, 15.032499, 31.9373, 42.33256, 68.8...",0.356721
Bi,208.980400,83,1.60,1.43,1837,31.0,[3],no data,9780,1292.6,544.40,0.330,8.0000,2.07,1790,,2.02,32,1.82,"[7.285516, 16.703, 25.57075, 45.37, 54.856, 88...",0.942362
Ac,227.000000,89,1.95,no data,3573,,[3],no data,10070,2250,1323.00,,12.0000,2.47,,,1.10,,1.878,"[5.380226, 11.75, 17.431, 44.8, 55.0, 67.0, 79...",0.350000
Th,232.038060,90,1.80,no data,5093,54.0,[4],no data,11724,2978,2115.00,0.270,54.0000,2.45,2490,350,1.30,79,1.798,"[6.3067, 12.1, 18.32, 28.648, 58.0, 69.1, 82.0...",1.170000


In [14]:
cprops_df['Metallic radius'] = cprops_df['Metallic radius'].astype(str)
cprops_df['Metallic radius'] = [(a.replace('no data', 'NaN', 1)) for a in cprops_df['Metallic radius']]
cprops_df['Metallic radius'] = cprops_df['Metallic radius'].astype(float)
cprops_df


Unnamed: 0,Atomic mass,Atomic no,Atomic radius,Atomic radius calculated,Boiling point,Bulk modulus,Common oxidation states,Critical temperature,Density of solid,Liquid range,Melting point,Poissons ratio,Thermal conductivity,Van der waals radius,Velocity of sound,Vickers hardness,X,Youngs modulus,Metallic radius,Ionization energies,Electron affinity
H,1.007940,1,0.25,0.53,20.28,,"[-1, 1]",33,,6.27,14.01,,0.1805,1.10,1270,,2.20,,,[13.598434599702],0.754598
Li,6.941000,3,1.45,1.67,1615,11.0,[1],3223,535,1161.31,453.69,,85.0000,1.82,6000,,0.98,4.9,1.520,"[5.391714996, 75.640097, 122.45435914]",0.618049
Be,9.012182,4,1.05,1.12,2742,130.0,[2],no data,1848,1182,1560.00,0.032,190.0000,1.53,13000,1670,1.57,287,1.120,"[9.322699, 18.21115, 153.896205, 217.7185861]",-0.520000
B,10.811000,5,0.85,0.87,4200,320.0,[3],no data,2460,1851,2349.00,,27.0000,1.92,16200,49000,2.04,,,"[8.298019, 25.15483, 37.93059, 259.3715, 340.2...",0.279723
C,12.010700,6,0.70,0.67,4300,33.0,"[-4, 4]",no data,2267,500,3800.00,,140.0000,1.70,18350,,2.55,,,"[11.260288, 24.383154, 47.88778, 64.49352, 392...",1.262114
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Pb,207.200000,82,1.80,1.54,2022,46.0,"[2, 4]",no data,11340,1421.39,600.61,0.440,35.0000,2.02,1260,,2.33,16,1.750,"[7.4166799, 15.032499, 31.9373, 42.33256, 68.8...",0.356721
Bi,208.980400,83,1.60,1.43,1837,31.0,[3],no data,9780,1292.6,544.40,0.330,8.0000,2.07,1790,,2.02,32,1.820,"[7.285516, 16.703, 25.57075, 45.37, 54.856, 88...",0.942362
Ac,227.000000,89,1.95,no data,3573,,[3],no data,10070,2250,1323.00,,12.0000,2.47,,,1.10,,1.878,"[5.380226, 11.75, 17.431, 44.8, 55.0, 67.0, 79...",0.350000
Th,232.038060,90,1.80,no data,5093,54.0,[4],no data,11724,2978,2115.00,0.270,54.0000,2.45,2490,350,1.30,79,1.798,"[6.3067, 12.1, 18.32, 28.648, 58.0, 69.1, 82.0...",1.170000


In [17]:
type(cprops_df['Metallic radius'][1])



numpy.float64

In [15]:
cprops_df['Common oxidation states'] = [len(a) for a in cprops_df['Common oxidation states']]
cprops_df['Common oxidation states'] 

#executivejojodecision 841 11:30

H     2
Li    1
Be    1
B     1
C     2
     ..
Pb    2
Bi    1
Ac    1
Th    1
U     1
Name: Common oxidation states, Length: 80, dtype: int64

In [16]:
cprops_df

Unnamed: 0,Atomic mass,Atomic no,Atomic radius,Atomic radius calculated,Boiling point,Bulk modulus,Common oxidation states,Critical temperature,Density of solid,Liquid range,Melting point,Poissons ratio,Thermal conductivity,Van der waals radius,Velocity of sound,Vickers hardness,X,Youngs modulus,Metallic radius,Ionization energies,Electron affinity
H,1.007940,1,0.25,0.53,20.28,,2,33,,6.27,14.01,,0.1805,1.10,1270,,2.20,,,[13.598434599702],0.754598
Li,6.941000,3,1.45,1.67,1615,11.0,1,3223,535,1161.31,453.69,,85.0000,1.82,6000,,0.98,4.9,1.520,"[5.391714996, 75.640097, 122.45435914]",0.618049
Be,9.012182,4,1.05,1.12,2742,130.0,1,no data,1848,1182,1560.00,0.032,190.0000,1.53,13000,1670,1.57,287,1.120,"[9.322699, 18.21115, 153.896205, 217.7185861]",-0.520000
B,10.811000,5,0.85,0.87,4200,320.0,1,no data,2460,1851,2349.00,,27.0000,1.92,16200,49000,2.04,,,"[8.298019, 25.15483, 37.93059, 259.3715, 340.2...",0.279723
C,12.010700,6,0.70,0.67,4300,33.0,2,no data,2267,500,3800.00,,140.0000,1.70,18350,,2.55,,,"[11.260288, 24.383154, 47.88778, 64.49352, 392...",1.262114
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Pb,207.200000,82,1.80,1.54,2022,46.0,2,no data,11340,1421.39,600.61,0.440,35.0000,2.02,1260,,2.33,16,1.750,"[7.4166799, 15.032499, 31.9373, 42.33256, 68.8...",0.356721
Bi,208.980400,83,1.60,1.43,1837,31.0,1,no data,9780,1292.6,544.40,0.330,8.0000,2.07,1790,,2.02,32,1.820,"[7.285516, 16.703, 25.57075, 45.37, 54.856, 88...",0.942362
Ac,227.000000,89,1.95,no data,3573,,1,no data,10070,2250,1323.00,,12.0000,2.47,,,1.10,,1.878,"[5.380226, 11.75, 17.431, 44.8, 55.0, 67.0, 79...",0.350000
Th,232.038060,90,1.80,no data,5093,54.0,1,no data,11724,2978,2115.00,0.270,54.0000,2.45,2490,350,1.30,79,1.798,"[6.3067, 12.1, 18.32, 28.648, 58.0, 69.1, 82.0...",1.170000


In [18]:
cprops_df['First Ionization Energy'] = [a[0] for a in cprops_df['Ionization energies']]

In [19]:
cprops_df

Unnamed: 0,Atomic mass,Atomic no,Atomic radius,Atomic radius calculated,Boiling point,Bulk modulus,Common oxidation states,Critical temperature,Density of solid,Liquid range,Melting point,Poissons ratio,Thermal conductivity,Van der waals radius,Velocity of sound,Vickers hardness,X,Youngs modulus,Metallic radius,Ionization energies,Electron affinity,First Ionization Energy
H,1.007940,1,0.25,0.53,20.28,,2,33,,6.27,14.01,,0.1805,1.10,1270,,2.20,,,[13.598434599702],0.754598,13.598435
Li,6.941000,3,1.45,1.67,1615,11.0,1,3223,535,1161.31,453.69,,85.0000,1.82,6000,,0.98,4.9,1.520,"[5.391714996, 75.640097, 122.45435914]",0.618049,5.391715
Be,9.012182,4,1.05,1.12,2742,130.0,1,no data,1848,1182,1560.00,0.032,190.0000,1.53,13000,1670,1.57,287,1.120,"[9.322699, 18.21115, 153.896205, 217.7185861]",-0.520000,9.322699
B,10.811000,5,0.85,0.87,4200,320.0,1,no data,2460,1851,2349.00,,27.0000,1.92,16200,49000,2.04,,,"[8.298019, 25.15483, 37.93059, 259.3715, 340.2...",0.279723,8.298019
C,12.010700,6,0.70,0.67,4300,33.0,2,no data,2267,500,3800.00,,140.0000,1.70,18350,,2.55,,,"[11.260288, 24.383154, 47.88778, 64.49352, 392...",1.262114,11.260288
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Pb,207.200000,82,1.80,1.54,2022,46.0,2,no data,11340,1421.39,600.61,0.440,35.0000,2.02,1260,,2.33,16,1.750,"[7.4166799, 15.032499, 31.9373, 42.33256, 68.8...",0.356721,7.416680
Bi,208.980400,83,1.60,1.43,1837,31.0,1,no data,9780,1292.6,544.40,0.330,8.0000,2.07,1790,,2.02,32,1.820,"[7.285516, 16.703, 25.57075, 45.37, 54.856, 88...",0.942362,7.285516
Ac,227.000000,89,1.95,no data,3573,,1,no data,10070,2250,1323.00,,12.0000,2.47,,,1.10,,1.878,"[5.380226, 11.75, 17.431, 44.8, 55.0, 67.0, 79...",0.350000,5.380226
Th,232.038060,90,1.80,no data,5093,54.0,1,no data,11724,2978,2115.00,0.270,54.0000,2.45,2490,350,1.30,79,1.798,"[6.3067, 12.1, 18.32, 28.648, 58.0, 69.1, 82.0...",1.170000,6.306700


In [20]:
cprops_df = cprops_df.drop("Ionization energies", axis=1)

In [21]:
cprops_df

Unnamed: 0,Atomic mass,Atomic no,Atomic radius,Atomic radius calculated,Boiling point,Bulk modulus,Common oxidation states,Critical temperature,Density of solid,Liquid range,Melting point,Poissons ratio,Thermal conductivity,Van der waals radius,Velocity of sound,Vickers hardness,X,Youngs modulus,Metallic radius,Electron affinity,First Ionization Energy
H,1.007940,1,0.25,0.53,20.28,,2,33,,6.27,14.01,,0.1805,1.10,1270,,2.20,,,0.754598,13.598435
Li,6.941000,3,1.45,1.67,1615,11.0,1,3223,535,1161.31,453.69,,85.0000,1.82,6000,,0.98,4.9,1.520,0.618049,5.391715
Be,9.012182,4,1.05,1.12,2742,130.0,1,no data,1848,1182,1560.00,0.032,190.0000,1.53,13000,1670,1.57,287,1.120,-0.520000,9.322699
B,10.811000,5,0.85,0.87,4200,320.0,1,no data,2460,1851,2349.00,,27.0000,1.92,16200,49000,2.04,,,0.279723,8.298019
C,12.010700,6,0.70,0.67,4300,33.0,2,no data,2267,500,3800.00,,140.0000,1.70,18350,,2.55,,,1.262114,11.260288
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Pb,207.200000,82,1.80,1.54,2022,46.0,2,no data,11340,1421.39,600.61,0.440,35.0000,2.02,1260,,2.33,16,1.750,0.356721,7.416680
Bi,208.980400,83,1.60,1.43,1837,31.0,1,no data,9780,1292.6,544.40,0.330,8.0000,2.07,1790,,2.02,32,1.820,0.942362,7.285516
Ac,227.000000,89,1.95,no data,3573,,1,no data,10070,2250,1323.00,,12.0000,2.47,,,1.10,,1.878,0.350000,5.380226
Th,232.038060,90,1.80,no data,5093,54.0,1,no data,11724,2978,2115.00,0.270,54.0000,2.45,2490,350,1.30,79,1.798,1.170000,6.306700


In [22]:
cprops_df['Critical temperature'] = [(a.replace('no data', 'NaN', 1)) for a in cprops_df['Critical temperature']]
cprops_df

Unnamed: 0,Atomic mass,Atomic no,Atomic radius,Atomic radius calculated,Boiling point,Bulk modulus,Common oxidation states,Critical temperature,Density of solid,Liquid range,Melting point,Poissons ratio,Thermal conductivity,Van der waals radius,Velocity of sound,Vickers hardness,X,Youngs modulus,Metallic radius,Electron affinity,First Ionization Energy
H,1.007940,1,0.25,0.53,20.28,,2,33,,6.27,14.01,,0.1805,1.10,1270,,2.20,,,0.754598,13.598435
Li,6.941000,3,1.45,1.67,1615,11.0,1,3223,535,1161.31,453.69,,85.0000,1.82,6000,,0.98,4.9,1.520,0.618049,5.391715
Be,9.012182,4,1.05,1.12,2742,130.0,1,,1848,1182,1560.00,0.032,190.0000,1.53,13000,1670,1.57,287,1.120,-0.520000,9.322699
B,10.811000,5,0.85,0.87,4200,320.0,1,,2460,1851,2349.00,,27.0000,1.92,16200,49000,2.04,,,0.279723,8.298019
C,12.010700,6,0.70,0.67,4300,33.0,2,,2267,500,3800.00,,140.0000,1.70,18350,,2.55,,,1.262114,11.260288
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Pb,207.200000,82,1.80,1.54,2022,46.0,2,,11340,1421.39,600.61,0.440,35.0000,2.02,1260,,2.33,16,1.750,0.356721,7.416680
Bi,208.980400,83,1.60,1.43,1837,31.0,1,,9780,1292.6,544.40,0.330,8.0000,2.07,1790,,2.02,32,1.820,0.942362,7.285516
Ac,227.000000,89,1.95,no data,3573,,1,,10070,2250,1323.00,,12.0000,2.47,,,1.10,,1.878,0.350000,5.380226
Th,232.038060,90,1.80,no data,5093,54.0,1,,11724,2978,2115.00,0.270,54.0000,2.45,2490,350,1.30,79,1.798,1.170000,6.306700


In [25]:
# OG_names = ['AtomicRadius', 'AtomicVolume', 'AtomicWeight', 'BulkModulus',
#        'BoilingT', 'Column', 'CovalentRadius', 'Density', 'ElectronAffinity',
#        'Electronegativity', 'FirstIonizationEnergy', 'HeatCapacityMass', 'Row',
#        'phi', 'SecondIonizationEnergy', 'ShearModulus', 'Superconduction temperature', 'Velocity of sound']



In [23]:
cprops_df.dtypes

Atomic mass                 float64
Atomic no                     int64
Atomic radius               float64
Atomic radius calculated     object
Boiling point                object
Bulk modulus                float64
Common oxidation states       int64
Critical temperature         object
Density of solid             object
Liquid range                 object
Melting point               float64
Poissons ratio              float64
Thermal conductivity        float64
Van der waals radius        float64
Velocity of sound            object
Vickers hardness             object
X                           float64
Youngs modulus               object
Metallic radius             float64
Electron affinity           float64
First Ionization Energy     float64
dtype: object

In [24]:
cprops_df = cprops_df.apply(pd.to_numeric, errors='coerce')

In [25]:
cprops_df.dtypes

Atomic mass                 float64
Atomic no                     int64
Atomic radius               float64
Atomic radius calculated    float64
Boiling point               float64
Bulk modulus                float64
Common oxidation states       int64
Critical temperature        float64
Density of solid            float64
Liquid range                float64
Melting point               float64
Poissons ratio              float64
Thermal conductivity        float64
Van der waals radius        float64
Velocity of sound           float64
Vickers hardness            float64
X                           float64
Youngs modulus              float64
Metallic radius             float64
Electron affinity           float64
First Ionization Energy     float64
dtype: object

In [26]:
# Computing mean values using mean function 
mvals = dict(cprops_df.mean())
mvals

{'Atomic mass': 107.06698397475,
 'Atomic no': 44.7,
 'Atomic radius': 1.48,
 'Atomic radius calculated': 1.6537837837837837,
 'Boiling point': 2696.401625,
 'Bulk modulus': 91.64411764705883,
 'Common oxidation states': 1.6625,
 'Critical temperature': 1285.5176470588235,
 'Density of solid': 8067.890410958904,
 'Liquid range': 1317.882875,
 'Melting point': 1383.99375,
 'Poissons ratio': 0.2942692307692308,
 'Thermal conductivity': 66.76124387499999,
 'Van der waals radius': 2.1265,
 'Velocity of sound': 3723.0015151515154,
 'Vickers hardness': 2108.3076923076924,
 'X': 1.7591250000000003,
 'Youngs modulus': 111.78688524590164,
 'Metallic radius': 1.6225909090909092,
 'Electron affinity': 0.7264296744125001,
 'First Ionization Energy': 7.683228095414151}

In [27]:
# Iterating through variable with averages to replace the NaN values in element_data
for key, value in mvals.items():
    cprops_df.loc[cprops_df[key].isnull(),key] = value

display(cprops_df)

Unnamed: 0,Atomic mass,Atomic no,Atomic radius,Atomic radius calculated,Boiling point,Bulk modulus,Common oxidation states,Critical temperature,Density of solid,Liquid range,Melting point,Poissons ratio,Thermal conductivity,Van der waals radius,Velocity of sound,Vickers hardness,X,Youngs modulus,Metallic radius,Electron affinity,First Ionization Energy
H,1.007940,1.0,0.25,0.530000,20.28,91.644118,2.0,33.000000,8067.890411,6.27,14.01,0.294269,0.1805,1.10,1270.000000,2108.307692,2.20,111.786885,1.622591,0.754598,13.598435
Li,6.941000,3.0,1.45,1.670000,1615.00,11.000000,1.0,3223.000000,535.000000,1161.31,453.69,0.294269,85.0000,1.82,6000.000000,2108.307692,0.98,4.900000,1.520000,0.618049,5.391715
Be,9.012182,4.0,1.05,1.120000,2742.00,130.000000,1.0,1285.517647,1848.000000,1182.00,1560.00,0.032000,190.0000,1.53,13000.000000,1670.000000,1.57,287.000000,1.120000,-0.520000,9.322699
B,10.811000,5.0,0.85,0.870000,4200.00,320.000000,1.0,1285.517647,2460.000000,1851.00,2349.00,0.294269,27.0000,1.92,16200.000000,49000.000000,2.04,111.786885,1.622591,0.279723,8.298019
C,12.010700,6.0,0.70,0.670000,4300.00,33.000000,2.0,1285.517647,2267.000000,500.00,3800.00,0.294269,140.0000,1.70,18350.000000,2108.307692,2.55,111.786885,1.622591,1.262114,11.260288
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Pb,207.200000,82.0,1.80,1.540000,2022.00,46.000000,2.0,1285.517647,11340.000000,1421.39,600.61,0.440000,35.0000,2.02,1260.000000,2108.307692,2.33,16.000000,1.750000,0.356721,7.416680
Bi,208.980400,83.0,1.60,1.430000,1837.00,31.000000,1.0,1285.517647,9780.000000,1292.60,544.40,0.330000,8.0000,2.07,1790.000000,2108.307692,2.02,32.000000,1.820000,0.942362,7.285516
Ac,227.000000,89.0,1.95,1.653784,3573.00,91.644118,1.0,1285.517647,10070.000000,2250.00,1323.00,0.294269,12.0000,2.47,3723.001515,2108.307692,1.10,111.786885,1.878000,0.350000,5.380226
Th,232.038060,90.0,1.80,1.653784,5093.00,54.000000,1.0,1285.517647,11724.000000,2978.00,2115.00,0.270000,54.0000,2.45,2490.000000,350.000000,1.30,79.000000,1.798000,1.170000,6.306700


In [31]:
# # Initializing variables
# Kp3_2 = list(filt_data["pretty_formula"])
# CFp3_2 = []
# task_id = filt_data["material_id"]

# properties = list(cprops_df.columns)
# compound_names  =  list(filt_data["pretty_formula"])
# average_dict = {}
# max_dict = {}
# min_dict = {}

# # Implementing same method as question 4 to compute max, min, and averages for all properties in data
# for elms in range(len(Kp3_2)): 
#     #Kp3_2 = filt_data["pretty_formula"]
#     #task = task_id[elms]
#     Kp3_2 = filt_data["pretty_formula"]
#     comp_eli_2 = Composition(CFp3_2)
#     CEl_2 = comp_eli_2.elements
#     q2dict = comp_eli_2.to_data_dict['unit_cell_composition']
#     row_avgs = {}
#     row_max = {}
#     row_min = {}
    
#     for col in properties:
#       overall = 0
#       allval = []
#       for key, value in q2dict.items():
#         property_ = cprops_df.loc[key, col]
#         overall += property_ * value
#         allval.append(property_)
#       num_total = sum(q2dict.values())
#       avg = overall/num_total
#       row_avgs[col + '_avg'] = avg
#       row_max[col + '_max'] = max(allval)
#       row_min[col + '_min'] = min(allval)
#     average_dict[task] = row_avgs
#     max_dict[task] = row_max
#     min_dict[task] = row_min
    

In [32]:
#jojoversion

In [28]:
filt_data_comp = filt_data
filt_data_comp['Composition'] = [Composition(c) for c in filt_data_comp["pretty_formula"]]
filt_data_comp['num_atoms'] = [c.num_atoms for c in filt_data_comp['Composition']]
filt_data_comp['volume_per_atom'] = filt_data_comp['volume']/filt_data_comp['num_atoms']
filt_data_comp
#filt_data

Unnamed: 0,material_id,energy,energy_per_atom,volume,formation_energy_per_atom,nsites,pretty_formula,nelements,density,band_gap,Composition,num_atoms,volume_per_atom
0,mp-1001034,-52.019078,-3.715648,376.145863,-0.703661,14,Mg(InSe2)2,3,5.030727,0.7432,"(Mg, In, Se)",7.0,53.735123
1,mp-1001780,-22.364406,-5.591101,78.515319,-1.699567,4,LuCuS2,3,6.400668,1.5031,"(Lu, Cu, S)",4.0,19.628830
2,mp-1001786,-23.600913,-5.900228,71.701237,-2.103385,4,LiScS2,3,2.687084,1.5296,"(Li, Sc, S)",4.0,17.925309
3,mp-1002124,-19.780759,-9.890380,32.055765,-0.298402,2,HfC,2,9.868236,0.5774,"(Hf, C)",2.0,16.027883
4,mp-1004528,-95.454964,-5.614998,324.979430,-3.162482,17,CsB3PbF12,4,3.068458,6.3125,"(Cs, B, Pb, F)",17.0,19.116437
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5614,mvc-6916,-99.385289,-7.098949,156.686110,-2.307339,14,MgMn2O4,3,4.200544,0.0516,"(Mg, Mn, O)",7.0,22.383730
5615,mvc-6928,-91.603914,-6.543137,173.941832,-2.192463,14,Ca(FeO2)2,3,4.119621,1.7463,"(Ca, Fe, O)",7.0,24.848833
5616,mvc-6946,-36.735653,-6.122609,97.529160,-1.961400,6,SnO2,2,5.131960,2.1009,"(Sn, O)",3.0,32.509720
5617,mvc-7040,-145.212422,-8.067357,418.920727,-1.999722,18,Re2O7,2,3.840256,3.4689,"(Re, O)",9.0,46.546747


In [34]:
filt_data_comp['Composition'][0].num_atoms





7.0

In [29]:
cprops_df_dict = cprops_df.to_dict()

rad_dict = cprops_df_dict['Atomic radius']

def radius_mean(composition):
    sumofradii = 0
    totnumatoms = 0
    for element, number in composition.items():
        sumofradii += (number*rad_dict[str(element)])
        totnumatoms += number
    return sumofradii/totnumatoms

atomic_radius = filt_data_comp['Composition'].apply(radius_mean)
atomic_radius

0       1.314286
1       1.275000
2       1.262500
3       1.125000
4       0.761765
          ...   
5614    0.957143
5615    1.000000
5616    0.883333
5617    0.766667
5618    0.835000
Name: Composition, Length: 5614, dtype: float64

In [30]:
#jojo version

def propertymean(property, composition):
    sumofproperty = 0
    totalnumatoms = 0
    for element, number in composition.items():
        sumofproperty += (number*cprops_df_dict[property][str(element)])
        totalnumatoms += number
    return sumofproperty/totalnumatoms

def maxofproperty(property, composition):
    propmax = None
    for element, number in composition.items():
        propertyvalue = cprops_df_dict[property][str(element)]
        if propmax:
            propmax = propertyvalue if propertyvalue > propmax else propmax
        else:
            propmax = propertyvalue
    return propmax

def minofproperty(property, composition):
    propmin = None
    for element, number in composition.items():
        propertyvalue = cprops_df_dict[property][str(element)]
        if propmin:
            propmin = propertyvalue if propertyvalue < propmin else propmin
        else:
            propmin = propertyvalue
    return propmin

In [31]:
cprops_df

Unnamed: 0,Atomic mass,Atomic no,Atomic radius,Atomic radius calculated,Boiling point,Bulk modulus,Common oxidation states,Critical temperature,Density of solid,Liquid range,Melting point,Poissons ratio,Thermal conductivity,Van der waals radius,Velocity of sound,Vickers hardness,X,Youngs modulus,Metallic radius,Electron affinity,First Ionization Energy
H,1.007940,1.0,0.25,0.530000,20.28,91.644118,2.0,33.000000,8067.890411,6.27,14.01,0.294269,0.1805,1.10,1270.000000,2108.307692,2.20,111.786885,1.622591,0.754598,13.598435
Li,6.941000,3.0,1.45,1.670000,1615.00,11.000000,1.0,3223.000000,535.000000,1161.31,453.69,0.294269,85.0000,1.82,6000.000000,2108.307692,0.98,4.900000,1.520000,0.618049,5.391715
Be,9.012182,4.0,1.05,1.120000,2742.00,130.000000,1.0,1285.517647,1848.000000,1182.00,1560.00,0.032000,190.0000,1.53,13000.000000,1670.000000,1.57,287.000000,1.120000,-0.520000,9.322699
B,10.811000,5.0,0.85,0.870000,4200.00,320.000000,1.0,1285.517647,2460.000000,1851.00,2349.00,0.294269,27.0000,1.92,16200.000000,49000.000000,2.04,111.786885,1.622591,0.279723,8.298019
C,12.010700,6.0,0.70,0.670000,4300.00,33.000000,2.0,1285.517647,2267.000000,500.00,3800.00,0.294269,140.0000,1.70,18350.000000,2108.307692,2.55,111.786885,1.622591,1.262114,11.260288
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Pb,207.200000,82.0,1.80,1.540000,2022.00,46.000000,2.0,1285.517647,11340.000000,1421.39,600.61,0.440000,35.0000,2.02,1260.000000,2108.307692,2.33,16.000000,1.750000,0.356721,7.416680
Bi,208.980400,83.0,1.60,1.430000,1837.00,31.000000,1.0,1285.517647,9780.000000,1292.60,544.40,0.330000,8.0000,2.07,1790.000000,2108.307692,2.02,32.000000,1.820000,0.942362,7.285516
Ac,227.000000,89.0,1.95,1.653784,3573.00,91.644118,1.0,1285.517647,10070.000000,2250.00,1323.00,0.294269,12.0000,2.47,3723.001515,2108.307692,1.10,111.786885,1.878000,0.350000,5.380226
Th,232.038060,90.0,1.80,1.653784,5093.00,54.000000,1.0,1285.517647,11724.000000,2978.00,2115.00,0.270000,54.0000,2.45,2490.000000,350.000000,1.30,79.000000,1.798000,1.170000,6.306700


In [32]:
avg_properties_df = pd.DataFrame()

for property in cprops_df.columns:
    individualpropertymean = partial(propertymean, property)
    averages = filt_data_comp['Composition'].apply(individualpropertymean)
    avg_properties_df[("average_" + property)] = averages
    
avg_properties_df.head()

print("Average properties Dimension: ", avg_properties_df.shape)

max_properties = pd.DataFrame()

for property in cprops_df.columns:
    individualpropertymax = partial(maxofproperty, property)
    max = filt_data_comp['Composition'].apply(individualpropertymax)
    max_properties[("max_" + property)] = max
    
min_properties = pd.DataFrame()

for property in cprops_df.columns:
    individualpropertymin = partial(minofproperty, property)
    min = filt_data_comp['Composition'].apply(individualpropertymin)
    min_properties[("min_" + property)] = min

Average properties Dimension:  (5614, 21)


In [39]:
avg_properties_df

Unnamed: 0,average_Atomic mass,average_Atomic no,average_Atomic radius,average_Atomic radius calculated,average_Boiling point,average_Bulk modulus,average_Common oxidation states,average_Critical temperature,average_Density of solid,average_Liquid range,average_Melting point,average_Poissons ratio,average_Thermal conductivity,average_Van der waals radius,average_Velocity of sound,average_Vickers hardness,average_X,average_Youngs modulus,average_Metallic radius,average_Electron affinity
0,81.397286,35.142857,1.314286,1.241429,1412.142857,37.355462,2.714286,1560.078992,5090.571429,875.214286,536.928571,0.314077,46.582857,1.884286,2918.857143,2108.307692,2.152857,15.285714,1.632909,1.204324
1,75.660750,33.000000,1.275000,1.345000,2077.685000,50.850000,2.500000,1299.758824,5670.250000,1062.812500,1014.872500,0.297135,104.102500,1.950000,3684.751136,1436.403846,2.082500,105.643443,1.564545,1.407216
2,29.006728,14.000000,1.262500,1.317500,1538.435000,20.850000,2.500000,1784.129412,1860.000000,777.332500,761.102500,0.290702,25.352500,1.892500,4292.251136,2108.307692,1.875000,75.618443,1.601545,1.240115
3,95.250350,39.000000,1.125000,1.375000,4588.000000,71.500000,1.500000,1285.517647,7788.500000,1435.000000,3153.000000,0.332135,81.500000,1.965000,10680.000000,1934.153846,1.925000,94.893443,1.601295,0.720092
4,35.324664,15.294118,0.761765,0.715882,975.668235,123.960554,1.058824,518.121799,6906.687349,470.282353,505.385882,0.302842,8.960729,1.697059,5779.942335,10383.312217,3.352941,99.676663,1.694580,2.498929
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5614,28.311241,13.428571,0.957143,0.941429,913.114286,93.082353,1.571429,639.278992,6992.794521,315.942857,597.171429,0.293659,25.100903,1.701429,2310.285714,2108.307692,2.595714,126.878220,1.524909,0.626346
5615,30.823657,14.857143,1.000000,0.997143,1197.971429,103.368067,1.285714,639.278992,7081.365949,489.942857,708.028571,0.295297,51.443760,1.781429,2128.571429,1679.648352,2.631429,127.021077,1.574338,0.882206
5616,50.236267,22.000000,0.883333,0.803333,1018.466667,80.429412,1.666667,531.572549,7815.260274,813.573333,204.893333,0.316179,22.351053,1.736667,1045.000000,2108.307692,2.946667,91.191257,1.608394,1.344760
5617,53.823311,22.888889,0.766667,0.791111,1374.377778,153.500980,1.000000,405.915033,10946.136986,563.088889,811.288889,0.295543,10.687340,1.662222,1291.388889,2184.239316,3.097778,189.834244,1.567571,1.149837


In [40]:
min_properties


Unnamed: 0,min_Atomic mass,min_Atomic no,min_Atomic radius,min_Atomic radius calculated,min_Boiling point,min_Bulk modulus,min_Common oxidation states,min_Critical temperature,min_Density of solid,min_Liquid range,min_Melting point,min_Poissons ratio,min_Thermal conductivity,min_Van der waals radius,min_Velocity of sound,min_Vickers hardness,min_X,min_Youngs modulus,min_Metallic radius,min_Electron affinity
0,24.3050,12.0,1.15,1.03,958.00,8.300000,1.0,1285.517647,1738.000000,440.00,429.75,0.290000,0.52000,1.73,1215.000000,2108.307692,1.31,10.000000,1.600000,-0.420000
1,32.0650,16.0,1.00,0.88,717.87,7.700000,1.0,1285.517647,1960.000000,329.51,388.36,0.260000,0.20500,1.80,3570.000000,369.000000,1.27,69.000000,1.278000,0.238870
2,6.9410,3.0,1.00,0.88,717.87,7.700000,1.0,1285.517647,535.000000,329.51,388.36,0.280000,0.20500,1.80,3723.001515,2108.307692,0.98,4.900000,1.520000,0.188200
3,12.0107,6.0,0.70,0.67,4300.00,33.000000,1.0,1285.517647,2267.000000,500.00,2506.00,0.294269,23.00000,1.70,3010.000000,1760.000000,1.30,78.000000,1.580000,0.178070
4,10.8110,5.0,0.50,0.42,85.03,1.600000,1.0,144.000000,1879.000000,31.50,53.53,0.294269,0.02770,1.47,1260.000000,2108.307692,0.79,1.700000,1.622591,0.279723
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5614,15.9994,8.0,0.60,0.48,90.20,45.000000,1.0,154.600000,1738.000000,35.40,54.80,0.290000,0.02658,1.52,317.500000,2108.307692,1.31,45.000000,1.292000,-0.520000
5615,15.9994,8.0,0.60,0.48,90.20,17.000000,1.0,154.600000,1550.000000,35.40,54.80,0.290000,0.02658,1.52,317.500000,608.000000,1.00,20.000000,1.277000,0.024551
5616,15.9994,8.0,0.60,0.48,90.20,58.000000,1.0,154.600000,7310.000000,35.40,54.80,0.294269,0.02658,1.52,317.500000,2108.307692,1.96,50.000000,1.580000,1.112070
5617,15.9994,8.0,0.60,0.48,90.20,91.644118,1.0,154.600000,8067.890411,35.40,54.80,0.294269,0.02658,1.52,317.500000,2108.307692,1.90,111.786885,1.375000,0.060397


In [41]:
max_properties

Unnamed: 0,max_Atomic mass,max_Atomic no,max_Atomic radius,max_Atomic radius calculated,max_Boiling point,max_Bulk modulus,max_Common oxidation states,max_Critical temperature,max_Density of solid,max_Liquid range,max_Melting point,max_Poissons ratio,max_Thermal conductivity,max_Van der waals radius,max_Velocity of sound,max_Vickers hardness,max_X,max_Youngs modulus,max_Metallic radius,max_Electron affinity
0,114.818000,49.0,1.55,1.56,2345.0,91.644118,4.0,1766.000000,7310.000000,1915.25,923.00,0.330000,160.0,1.93,4602.000000,2108.307692,2.55,45.000000,1.670000,2.020605
1,174.967000,71.0,1.75,2.17,3675.0,140.000000,4.0,1314.000000,9841.000000,1842.23,1925.00,0.340000,400.0,2.24,3723.001515,2108.307692,2.58,130.000000,1.735000,2.077105
2,44.955912,21.0,1.60,1.84,3103.0,57.000000,4.0,3223.000000,2985.000000,1289.00,1814.00,0.294269,85.0,2.15,6000.000000,2108.307692,2.58,111.786885,1.641000,2.077105
3,178.490000,72.0,1.55,2.08,4876.0,110.000000,2.0,1285.517647,13310.000000,2370.00,3800.00,0.370000,140.0,2.23,18350.000000,2108.307692,2.55,111.786885,1.622591,1.262114
4,207.200000,82.0,2.60,2.98,4200.0,320.000000,2.0,1938.000000,11340.000000,1851.00,2349.00,0.440000,36.0,3.43,16200.000000,49000.000000,3.98,111.786885,2.719000,3.401190
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5614,54.938045,25.0,1.50,1.61,2334.0,120.000000,3.0,1285.517647,8067.890411,815.00,1519.00,0.294269,160.0,2.05,5150.000000,2108.307692,3.44,198.000000,1.622591,1.461105
5615,55.845000,26.0,1.80,1.94,3134.0,170.000000,2.0,1285.517647,8067.890411,1323.00,1811.00,0.310000,200.0,2.31,4910.000000,2108.307692,3.44,211.000000,1.976000,1.461105
5616,118.710000,50.0,1.45,1.45,2875.0,91.644118,3.0,1285.517647,8067.890411,2369.92,505.08,0.360000,67.0,2.17,2500.000000,2108.307692,3.44,111.786885,1.622591,1.461105
5617,186.207000,75.0,1.35,1.88,5869.0,370.000000,1.0,1285.517647,21020.000000,2410.00,3459.00,0.300000,48.0,2.16,4700.000000,2450.000000,3.44,463.000000,1.622591,1.461105


In [33]:
design_matrix= pd.concat([filt_data_comp, avg_properties_df, min_properties, max_properties], axis=1)
design_matrix.columns

droplist2 = ['volume','energy', 'pretty_formula', 'Composition', 'average_Common oxidation states', 'min_Common oxidation states','max_Common oxidation states']
design_matrix = design_matrix.drop(columns=droplist2)
design_matrix

Unnamed: 0,material_id,energy_per_atom,formation_energy_per_atom,nsites,nelements,density,band_gap,num_atoms,volume_per_atom,average_Atomic mass,average_Atomic no,average_Atomic radius,average_Atomic radius calculated,average_Boiling point,average_Bulk modulus,average_Critical temperature,average_Density of solid,average_Liquid range,average_Melting point,average_Poissons ratio,average_Thermal conductivity,average_Van der waals radius,average_Velocity of sound,average_Vickers hardness,average_X,average_Youngs modulus,average_Metallic radius,average_Electron affinity,average_First Ionization Energy,min_Atomic mass,min_Atomic no,min_Atomic radius,min_Atomic radius calculated,min_Boiling point,min_Bulk modulus,min_Critical temperature,min_Density of solid,min_Liquid range,min_Melting point,min_Poissons ratio,min_Thermal conductivity,min_Van der waals radius,min_Velocity of sound,min_Vickers hardness,min_X,min_Youngs modulus,min_Metallic radius,min_Electron affinity,min_First Ionization Energy,max_Atomic mass,max_Atomic no,max_Atomic radius,max_Atomic radius calculated,max_Boiling point,max_Bulk modulus,max_Critical temperature,max_Density of solid,max_Liquid range,max_Melting point,max_Poissons ratio,max_Thermal conductivity,max_Van der waals radius,max_Velocity of sound,max_Vickers hardness,max_X,max_Youngs modulus,max_Metallic radius,max_Electron affinity,max_First Ionization Energy
0,mp-1001034,-3.715648,-0.703661,14,3,5.030727,0.7432,7.0,53.735123,81.397286,35.142857,1.314286,1.241429,1412.142857,37.355462,1560.078992,5090.571429,875.214286,536.928571,0.314077,46.582857,1.884286,2918.857143,2108.307692,2.152857,15.285714,1.632909,1.204324,8.318359,24.3050,12.0,1.15,1.03,958.00,8.300000,1285.517647,1738.000000,440.00,429.75,0.290000,0.52000,1.73,1215.000000,2108.307692,1.31,10.000000,1.600000,-0.420000,5.786356,114.818000,49.0,1.55,1.56,2345.0,91.644118,1766.000000,7310.000000,1915.25,923.00,0.330000,160.0,1.93,4602.000000,2108.307692,2.55,45.000000,1.670000,2.020605,9.752392
1,mp-1001780,-5.591101,-1.699567,4,3,6.400668,1.5031,4.0,19.628830,75.660750,33.000000,1.275000,1.345000,2077.685000,50.850000,1299.758824,5670.250000,1062.812500,1014.872500,0.297135,104.102500,1.950000,3684.751136,1436.403846,2.082500,105.643443,1.564545,1.407216,8.468068,32.0650,16.0,1.00,0.88,717.87,7.700000,1285.517647,1960.000000,329.51,388.36,0.260000,0.20500,1.80,3570.000000,369.000000,1.27,69.000000,1.278000,0.238870,5.425871,174.967000,71.0,1.75,2.17,3675.0,140.000000,1314.000000,9841.000000,1842.23,1925.00,0.340000,400.0,2.24,3723.001515,2108.307692,2.58,130.000000,1.735000,2.077105,10.360010
2,mp-1001786,-5.900228,-2.103385,4,3,2.687084,1.5296,4.0,17.925309,29.006728,14.000000,1.262500,1.317500,1538.435000,20.850000,1784.129412,1860.000000,777.332500,761.102500,0.290702,25.352500,1.892500,4292.251136,2108.307692,1.875000,75.618443,1.601545,1.240115,8.168306,6.9410,3.0,1.00,0.88,717.87,7.700000,1285.517647,535.000000,329.51,388.36,0.280000,0.20500,1.80,3723.001515,2108.307692,0.98,4.900000,1.520000,0.188200,5.391715,44.955912,21.0,1.60,1.84,3103.0,57.000000,3223.000000,2985.000000,1289.00,1814.00,0.294269,85.0,2.15,6000.000000,2108.307692,2.58,111.786885,1.641000,2.077105,10.360010
3,mp-1002124,-9.890380,-0.298402,2,2,9.868236,0.5774,2.0,16.027883,95.250350,39.000000,1.125000,1.375000,4588.000000,71.500000,1285.517647,7788.500000,1435.000000,3153.000000,0.332135,81.500000,1.965000,10680.000000,1934.153846,1.925000,94.893443,1.601295,0.720092,9.042679,12.0107,6.0,0.70,0.67,4300.00,33.000000,1285.517647,2267.000000,500.00,2506.00,0.294269,23.00000,1.70,3010.000000,1760.000000,1.30,78.000000,1.580000,0.178070,6.825070,178.490000,72.0,1.55,2.08,4876.0,110.000000,1285.517647,13310.000000,2370.00,3800.00,0.370000,140.0,2.23,18350.000000,2108.307692,2.55,111.786885,1.622591,1.262114,11.260288
4,mp-1004528,-5.614998,-3.162482,17,4,3.068458,6.3125,17.0,19.116437,35.324664,15.294118,0.761765,0.715882,975.668235,123.960554,518.121799,6906.687349,470.282353,505.385882,0.302842,8.960729,1.697059,5779.942335,10383.312217,3.352941,99.676663,1.694580,2.498929,14.428146,10.8110,5.0,0.50,0.42,85.03,1.600000,144.000000,1879.000000,31.50,53.53,0.294269,0.02770,1.47,1260.000000,2108.307692,0.79,1.700000,1.622591,0.279723,3.893906,207.200000,82.0,2.60,2.98,4200.0,320.000000,1938.000000,11340.000000,1851.00,2349.00,0.440000,36.0,3.43,16200.000000,49000.000000,3.98,111.786885,2.719000,3.401190,17.422820
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5614,mvc-6916,-7.098949,-2.307339,14,3,4.200544,0.0516,7.0,22.383730,28.311241,13.428571,0.957143,0.941429,913.114286,93.082353,639.278992,6992.794521,315.942857,597.171429,0.293659,25.100903,1.701429,2310.285714,2108.307692,2.595714,126.878220,1.524909,0.626346,10.998076,15.9994,8.0,0.60,0.48,90.20,45.000000,154.600000,1738.000000,35.40,54.80,0.290000,0.02658,1.52,317.500000,2108.307692,1.31,45.000000,1.292000,-0.520000,7.434038,54.938045,25.0,1.50,1.61,2334.0,120.000000,1285.517647,8067.890411,815.00,1519.00,0.294269,160.0,2.05,5150.000000,2108.307692,3.44,198.000000,1.622591,1.461105,13.618055
5615,mvc-6928,-6.543137,-2.192463,14,3,4.119621,1.7463,7.0,24.848833,30.823657,14.857143,1.000000,0.997143,1197.971429,103.368067,639.278992,7081.365949,489.942857,708.028571,0.295297,51.443760,1.781429,2128.571429,1679.648352,2.631429,127.021077,1.574338,0.882206,10.912902,15.9994,8.0,0.60,0.48,90.20,17.000000,154.600000,1550.000000,35.40,54.80,0.290000,0.02658,1.52,317.500000,608.000000,1.00,20.000000,1.277000,0.024551,6.113155,55.845000,26.0,1.80,1.94,3134.0,170.000000,1285.517647,8067.890411,1323.00,1811.00,0.310000,200.0,2.31,4910.000000,2108.307692,3.44,211.000000,1.976000,1.461105,13.618055
5616,mvc-6946,-6.122609,-1.961400,6,2,5.131960,2.1009,3.0,32.509720,50.236267,22.000000,0.883333,0.803333,1018.466667,80.429412,531.572549,7815.260274,813.573333,204.893333,0.316179,22.351053,1.736667,1045.000000,2108.307692,2.946667,91.191257,1.608394,1.344760,11.526676,15.9994,8.0,0.60,0.48,90.20,58.000000,154.600000,7310.000000,35.40,54.80,0.294269,0.02658,1.52,317.500000,2108.307692,1.96,50.000000,1.580000,1.112070,7.343918,118.710000,50.0,1.45,1.45,2875.0,91.644118,1285.517647,8067.890411,2369.92,505.08,0.360000,67.0,2.17,2500.000000,2108.307692,3.44,111.786885,1.622591,1.461105,13.618055
5617,mvc-7040,-8.067357,-1.999722,18,2,3.840256,3.4689,9.0,46.546747,53.823311,22.888889,0.766667,0.791111,1374.377778,153.500980,405.915033,10946.136986,563.088889,811.288889,0.295543,10.687340,1.662222,1291.388889,2184.239316,3.097778,189.834244,1.567571,1.149837,12.332603,15.9994,8.0,0.60,0.48,90.20,91.644118,154.600000,8067.890411,35.40,54.80,0.294269,0.02658,1.52,317.500000,2108.307692,1.90,111.786885,1.375000,0.060397,7.833520,186.207000,75.0,1.35,1.88,5869.0,370.000000,1285.517647,21020.000000,2410.00,3459.00,0.300000,48.0,2.16,4700.000000,2450.000000,3.44,463.000000,1.622591,1.461105,13.618055


In [43]:
#train[train['material_id'].isin(design_matrix['material_id'])]
# #train
# train[train.material_id.isin(design_matrix.material_id)]

# train.material_id



In [34]:

IDs = list(filt_data["material_id"])
IDs = pd.DataFrame(IDs)

train_filt = train[train.index.isin(IDs.index)] 
train_filt
# y_jojo = train_filt.drop('material_id', axis=1)
#y_jojo = y_jojo.rename(index=IDs)
#y_jojo


# targets = df.loc[:,["energy",
#         "energy_per_atom",
#         "volume",
#         "formation_energy_per_atom",
#         "nsites",
#         "nelements",
#         "density",
#         "band_gap"]]

# # x_jojo = df[targets]
# # x_jojo

# cols = ["energy",
#         "energy_per_atom",
#         "volume",
#         "formation_energy_per_atom",
#         "nsites",
#         "nelements",
#         "density",
#         "band_gap"] 
# design_matrix= df[cols]
train

Unnamed: 0,material_id,dielectric_poly_total
0,mp-555903,8.337936
1,mp-752658,14.735277
2,mp-3439,17.195305
3,mp-16135,21.593507
4,mp-36447,9.507068
...,...,...
5614,mp-643378,20.365294
5615,mp-3536,8.476483
5616,mp-760402,27.401830
5617,mp-28109,8.384639


In [35]:
train_filt

Unnamed: 0,material_id,dielectric_poly_total
0,mp-555903,8.337936
1,mp-752658,14.735277
2,mp-3439,17.195305
3,mp-16135,21.593507
4,mp-36447,9.507068
...,...,...
5609,mp-754117,13.378949
5610,mp-1539137,16.908907
5611,mp-1079559,11.776195
5612,mp-555908,6.241000


In [36]:
design_matrix

Unnamed: 0,material_id,energy_per_atom,formation_energy_per_atom,nsites,nelements,density,band_gap,num_atoms,volume_per_atom,average_Atomic mass,average_Atomic no,average_Atomic radius,average_Atomic radius calculated,average_Boiling point,average_Bulk modulus,average_Critical temperature,average_Density of solid,average_Liquid range,average_Melting point,average_Poissons ratio,average_Thermal conductivity,average_Van der waals radius,average_Velocity of sound,average_Vickers hardness,average_X,average_Youngs modulus,average_Metallic radius,average_Electron affinity,average_First Ionization Energy,min_Atomic mass,min_Atomic no,min_Atomic radius,min_Atomic radius calculated,min_Boiling point,min_Bulk modulus,min_Critical temperature,min_Density of solid,min_Liquid range,min_Melting point,min_Poissons ratio,min_Thermal conductivity,min_Van der waals radius,min_Velocity of sound,min_Vickers hardness,min_X,min_Youngs modulus,min_Metallic radius,min_Electron affinity,min_First Ionization Energy,max_Atomic mass,max_Atomic no,max_Atomic radius,max_Atomic radius calculated,max_Boiling point,max_Bulk modulus,max_Critical temperature,max_Density of solid,max_Liquid range,max_Melting point,max_Poissons ratio,max_Thermal conductivity,max_Van der waals radius,max_Velocity of sound,max_Vickers hardness,max_X,max_Youngs modulus,max_Metallic radius,max_Electron affinity,max_First Ionization Energy
0,mp-1001034,-3.715648,-0.703661,14,3,5.030727,0.7432,7.0,53.735123,81.397286,35.142857,1.314286,1.241429,1412.142857,37.355462,1560.078992,5090.571429,875.214286,536.928571,0.314077,46.582857,1.884286,2918.857143,2108.307692,2.152857,15.285714,1.632909,1.204324,8.318359,24.3050,12.0,1.15,1.03,958.00,8.300000,1285.517647,1738.000000,440.00,429.75,0.290000,0.52000,1.73,1215.000000,2108.307692,1.31,10.000000,1.600000,-0.420000,5.786356,114.818000,49.0,1.55,1.56,2345.0,91.644118,1766.000000,7310.000000,1915.25,923.00,0.330000,160.0,1.93,4602.000000,2108.307692,2.55,45.000000,1.670000,2.020605,9.752392
1,mp-1001780,-5.591101,-1.699567,4,3,6.400668,1.5031,4.0,19.628830,75.660750,33.000000,1.275000,1.345000,2077.685000,50.850000,1299.758824,5670.250000,1062.812500,1014.872500,0.297135,104.102500,1.950000,3684.751136,1436.403846,2.082500,105.643443,1.564545,1.407216,8.468068,32.0650,16.0,1.00,0.88,717.87,7.700000,1285.517647,1960.000000,329.51,388.36,0.260000,0.20500,1.80,3570.000000,369.000000,1.27,69.000000,1.278000,0.238870,5.425871,174.967000,71.0,1.75,2.17,3675.0,140.000000,1314.000000,9841.000000,1842.23,1925.00,0.340000,400.0,2.24,3723.001515,2108.307692,2.58,130.000000,1.735000,2.077105,10.360010
2,mp-1001786,-5.900228,-2.103385,4,3,2.687084,1.5296,4.0,17.925309,29.006728,14.000000,1.262500,1.317500,1538.435000,20.850000,1784.129412,1860.000000,777.332500,761.102500,0.290702,25.352500,1.892500,4292.251136,2108.307692,1.875000,75.618443,1.601545,1.240115,8.168306,6.9410,3.0,1.00,0.88,717.87,7.700000,1285.517647,535.000000,329.51,388.36,0.280000,0.20500,1.80,3723.001515,2108.307692,0.98,4.900000,1.520000,0.188200,5.391715,44.955912,21.0,1.60,1.84,3103.0,57.000000,3223.000000,2985.000000,1289.00,1814.00,0.294269,85.0,2.15,6000.000000,2108.307692,2.58,111.786885,1.641000,2.077105,10.360010
3,mp-1002124,-9.890380,-0.298402,2,2,9.868236,0.5774,2.0,16.027883,95.250350,39.000000,1.125000,1.375000,4588.000000,71.500000,1285.517647,7788.500000,1435.000000,3153.000000,0.332135,81.500000,1.965000,10680.000000,1934.153846,1.925000,94.893443,1.601295,0.720092,9.042679,12.0107,6.0,0.70,0.67,4300.00,33.000000,1285.517647,2267.000000,500.00,2506.00,0.294269,23.00000,1.70,3010.000000,1760.000000,1.30,78.000000,1.580000,0.178070,6.825070,178.490000,72.0,1.55,2.08,4876.0,110.000000,1285.517647,13310.000000,2370.00,3800.00,0.370000,140.0,2.23,18350.000000,2108.307692,2.55,111.786885,1.622591,1.262114,11.260288
4,mp-1004528,-5.614998,-3.162482,17,4,3.068458,6.3125,17.0,19.116437,35.324664,15.294118,0.761765,0.715882,975.668235,123.960554,518.121799,6906.687349,470.282353,505.385882,0.302842,8.960729,1.697059,5779.942335,10383.312217,3.352941,99.676663,1.694580,2.498929,14.428146,10.8110,5.0,0.50,0.42,85.03,1.600000,144.000000,1879.000000,31.50,53.53,0.294269,0.02770,1.47,1260.000000,2108.307692,0.79,1.700000,1.622591,0.279723,3.893906,207.200000,82.0,2.60,2.98,4200.0,320.000000,1938.000000,11340.000000,1851.00,2349.00,0.440000,36.0,3.43,16200.000000,49000.000000,3.98,111.786885,2.719000,3.401190,17.422820
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5614,mvc-6916,-7.098949,-2.307339,14,3,4.200544,0.0516,7.0,22.383730,28.311241,13.428571,0.957143,0.941429,913.114286,93.082353,639.278992,6992.794521,315.942857,597.171429,0.293659,25.100903,1.701429,2310.285714,2108.307692,2.595714,126.878220,1.524909,0.626346,10.998076,15.9994,8.0,0.60,0.48,90.20,45.000000,154.600000,1738.000000,35.40,54.80,0.290000,0.02658,1.52,317.500000,2108.307692,1.31,45.000000,1.292000,-0.520000,7.434038,54.938045,25.0,1.50,1.61,2334.0,120.000000,1285.517647,8067.890411,815.00,1519.00,0.294269,160.0,2.05,5150.000000,2108.307692,3.44,198.000000,1.622591,1.461105,13.618055
5615,mvc-6928,-6.543137,-2.192463,14,3,4.119621,1.7463,7.0,24.848833,30.823657,14.857143,1.000000,0.997143,1197.971429,103.368067,639.278992,7081.365949,489.942857,708.028571,0.295297,51.443760,1.781429,2128.571429,1679.648352,2.631429,127.021077,1.574338,0.882206,10.912902,15.9994,8.0,0.60,0.48,90.20,17.000000,154.600000,1550.000000,35.40,54.80,0.290000,0.02658,1.52,317.500000,608.000000,1.00,20.000000,1.277000,0.024551,6.113155,55.845000,26.0,1.80,1.94,3134.0,170.000000,1285.517647,8067.890411,1323.00,1811.00,0.310000,200.0,2.31,4910.000000,2108.307692,3.44,211.000000,1.976000,1.461105,13.618055
5616,mvc-6946,-6.122609,-1.961400,6,2,5.131960,2.1009,3.0,32.509720,50.236267,22.000000,0.883333,0.803333,1018.466667,80.429412,531.572549,7815.260274,813.573333,204.893333,0.316179,22.351053,1.736667,1045.000000,2108.307692,2.946667,91.191257,1.608394,1.344760,11.526676,15.9994,8.0,0.60,0.48,90.20,58.000000,154.600000,7310.000000,35.40,54.80,0.294269,0.02658,1.52,317.500000,2108.307692,1.96,50.000000,1.580000,1.112070,7.343918,118.710000,50.0,1.45,1.45,2875.0,91.644118,1285.517647,8067.890411,2369.92,505.08,0.360000,67.0,2.17,2500.000000,2108.307692,3.44,111.786885,1.622591,1.461105,13.618055
5617,mvc-7040,-8.067357,-1.999722,18,2,3.840256,3.4689,9.0,46.546747,53.823311,22.888889,0.766667,0.791111,1374.377778,153.500980,405.915033,10946.136986,563.088889,811.288889,0.295543,10.687340,1.662222,1291.388889,2184.239316,3.097778,189.834244,1.567571,1.149837,12.332603,15.9994,8.0,0.60,0.48,90.20,91.644118,154.600000,8067.890411,35.40,54.80,0.294269,0.02658,1.52,317.500000,2108.307692,1.90,111.786885,1.375000,0.060397,7.833520,186.207000,75.0,1.35,1.88,5869.0,370.000000,1285.517647,21020.000000,2410.00,3459.00,0.300000,48.0,2.16,4700.000000,2450.000000,3.44,463.000000,1.622591,1.461105,13.618055


In [47]:

train_filt.set_index('material_id')

Unnamed: 0_level_0,dielectric_poly_total
material_id,Unnamed: 1_level_1
mp-555903,8.337936
mp-752658,14.735277
mp-3439,17.195305
mp-16135,21.593507
mp-36447,9.507068
...,...
mp-754117,13.378949
mp-1539137,16.908907
mp-1079559,11.776195
mp-555908,6.241000


In [37]:
design_matrix.set_index('material_id',inplace=True)
design_matrix


Unnamed: 0_level_0,energy_per_atom,formation_energy_per_atom,nsites,nelements,density,band_gap,num_atoms,volume_per_atom,average_Atomic mass,average_Atomic no,average_Atomic radius,average_Atomic radius calculated,average_Boiling point,average_Bulk modulus,average_Critical temperature,average_Density of solid,average_Liquid range,average_Melting point,average_Poissons ratio,average_Thermal conductivity,average_Van der waals radius,average_Velocity of sound,average_Vickers hardness,average_X,average_Youngs modulus,average_Metallic radius,average_Electron affinity,average_First Ionization Energy,min_Atomic mass,min_Atomic no,min_Atomic radius,min_Atomic radius calculated,min_Boiling point,min_Bulk modulus,min_Critical temperature,min_Density of solid,min_Liquid range,min_Melting point,min_Poissons ratio,min_Thermal conductivity,min_Van der waals radius,min_Velocity of sound,min_Vickers hardness,min_X,min_Youngs modulus,min_Metallic radius,min_Electron affinity,min_First Ionization Energy,max_Atomic mass,max_Atomic no,max_Atomic radius,max_Atomic radius calculated,max_Boiling point,max_Bulk modulus,max_Critical temperature,max_Density of solid,max_Liquid range,max_Melting point,max_Poissons ratio,max_Thermal conductivity,max_Van der waals radius,max_Velocity of sound,max_Vickers hardness,max_X,max_Youngs modulus,max_Metallic radius,max_Electron affinity,max_First Ionization Energy
material_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1
mp-1001034,-3.715648,-0.703661,14,3,5.030727,0.7432,7.0,53.735123,81.397286,35.142857,1.314286,1.241429,1412.142857,37.355462,1560.078992,5090.571429,875.214286,536.928571,0.314077,46.582857,1.884286,2918.857143,2108.307692,2.152857,15.285714,1.632909,1.204324,8.318359,24.3050,12.0,1.15,1.03,958.00,8.300000,1285.517647,1738.000000,440.00,429.75,0.290000,0.52000,1.73,1215.000000,2108.307692,1.31,10.000000,1.600000,-0.420000,5.786356,114.818000,49.0,1.55,1.56,2345.0,91.644118,1766.000000,7310.000000,1915.25,923.00,0.330000,160.0,1.93,4602.000000,2108.307692,2.55,45.000000,1.670000,2.020605,9.752392
mp-1001780,-5.591101,-1.699567,4,3,6.400668,1.5031,4.0,19.628830,75.660750,33.000000,1.275000,1.345000,2077.685000,50.850000,1299.758824,5670.250000,1062.812500,1014.872500,0.297135,104.102500,1.950000,3684.751136,1436.403846,2.082500,105.643443,1.564545,1.407216,8.468068,32.0650,16.0,1.00,0.88,717.87,7.700000,1285.517647,1960.000000,329.51,388.36,0.260000,0.20500,1.80,3570.000000,369.000000,1.27,69.000000,1.278000,0.238870,5.425871,174.967000,71.0,1.75,2.17,3675.0,140.000000,1314.000000,9841.000000,1842.23,1925.00,0.340000,400.0,2.24,3723.001515,2108.307692,2.58,130.000000,1.735000,2.077105,10.360010
mp-1001786,-5.900228,-2.103385,4,3,2.687084,1.5296,4.0,17.925309,29.006728,14.000000,1.262500,1.317500,1538.435000,20.850000,1784.129412,1860.000000,777.332500,761.102500,0.290702,25.352500,1.892500,4292.251136,2108.307692,1.875000,75.618443,1.601545,1.240115,8.168306,6.9410,3.0,1.00,0.88,717.87,7.700000,1285.517647,535.000000,329.51,388.36,0.280000,0.20500,1.80,3723.001515,2108.307692,0.98,4.900000,1.520000,0.188200,5.391715,44.955912,21.0,1.60,1.84,3103.0,57.000000,3223.000000,2985.000000,1289.00,1814.00,0.294269,85.0,2.15,6000.000000,2108.307692,2.58,111.786885,1.641000,2.077105,10.360010
mp-1002124,-9.890380,-0.298402,2,2,9.868236,0.5774,2.0,16.027883,95.250350,39.000000,1.125000,1.375000,4588.000000,71.500000,1285.517647,7788.500000,1435.000000,3153.000000,0.332135,81.500000,1.965000,10680.000000,1934.153846,1.925000,94.893443,1.601295,0.720092,9.042679,12.0107,6.0,0.70,0.67,4300.00,33.000000,1285.517647,2267.000000,500.00,2506.00,0.294269,23.00000,1.70,3010.000000,1760.000000,1.30,78.000000,1.580000,0.178070,6.825070,178.490000,72.0,1.55,2.08,4876.0,110.000000,1285.517647,13310.000000,2370.00,3800.00,0.370000,140.0,2.23,18350.000000,2108.307692,2.55,111.786885,1.622591,1.262114,11.260288
mp-1004528,-5.614998,-3.162482,17,4,3.068458,6.3125,17.0,19.116437,35.324664,15.294118,0.761765,0.715882,975.668235,123.960554,518.121799,6906.687349,470.282353,505.385882,0.302842,8.960729,1.697059,5779.942335,10383.312217,3.352941,99.676663,1.694580,2.498929,14.428146,10.8110,5.0,0.50,0.42,85.03,1.600000,144.000000,1879.000000,31.50,53.53,0.294269,0.02770,1.47,1260.000000,2108.307692,0.79,1.700000,1.622591,0.279723,3.893906,207.200000,82.0,2.60,2.98,4200.0,320.000000,1938.000000,11340.000000,1851.00,2349.00,0.440000,36.0,3.43,16200.000000,49000.000000,3.98,111.786885,2.719000,3.401190,17.422820
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
mvc-6916,-7.098949,-2.307339,14,3,4.200544,0.0516,7.0,22.383730,28.311241,13.428571,0.957143,0.941429,913.114286,93.082353,639.278992,6992.794521,315.942857,597.171429,0.293659,25.100903,1.701429,2310.285714,2108.307692,2.595714,126.878220,1.524909,0.626346,10.998076,15.9994,8.0,0.60,0.48,90.20,45.000000,154.600000,1738.000000,35.40,54.80,0.290000,0.02658,1.52,317.500000,2108.307692,1.31,45.000000,1.292000,-0.520000,7.434038,54.938045,25.0,1.50,1.61,2334.0,120.000000,1285.517647,8067.890411,815.00,1519.00,0.294269,160.0,2.05,5150.000000,2108.307692,3.44,198.000000,1.622591,1.461105,13.618055
mvc-6928,-6.543137,-2.192463,14,3,4.119621,1.7463,7.0,24.848833,30.823657,14.857143,1.000000,0.997143,1197.971429,103.368067,639.278992,7081.365949,489.942857,708.028571,0.295297,51.443760,1.781429,2128.571429,1679.648352,2.631429,127.021077,1.574338,0.882206,10.912902,15.9994,8.0,0.60,0.48,90.20,17.000000,154.600000,1550.000000,35.40,54.80,0.290000,0.02658,1.52,317.500000,608.000000,1.00,20.000000,1.277000,0.024551,6.113155,55.845000,26.0,1.80,1.94,3134.0,170.000000,1285.517647,8067.890411,1323.00,1811.00,0.310000,200.0,2.31,4910.000000,2108.307692,3.44,211.000000,1.976000,1.461105,13.618055
mvc-6946,-6.122609,-1.961400,6,2,5.131960,2.1009,3.0,32.509720,50.236267,22.000000,0.883333,0.803333,1018.466667,80.429412,531.572549,7815.260274,813.573333,204.893333,0.316179,22.351053,1.736667,1045.000000,2108.307692,2.946667,91.191257,1.608394,1.344760,11.526676,15.9994,8.0,0.60,0.48,90.20,58.000000,154.600000,7310.000000,35.40,54.80,0.294269,0.02658,1.52,317.500000,2108.307692,1.96,50.000000,1.580000,1.112070,7.343918,118.710000,50.0,1.45,1.45,2875.0,91.644118,1285.517647,8067.890411,2369.92,505.08,0.360000,67.0,2.17,2500.000000,2108.307692,3.44,111.786885,1.622591,1.461105,13.618055
mvc-7040,-8.067357,-1.999722,18,2,3.840256,3.4689,9.0,46.546747,53.823311,22.888889,0.766667,0.791111,1374.377778,153.500980,405.915033,10946.136986,563.088889,811.288889,0.295543,10.687340,1.662222,1291.388889,2184.239316,3.097778,189.834244,1.567571,1.149837,12.332603,15.9994,8.0,0.60,0.48,90.20,91.644118,154.600000,8067.890411,35.40,54.80,0.294269,0.02658,1.52,317.500000,2108.307692,1.90,111.786885,1.375000,0.060397,7.833520,186.207000,75.0,1.35,1.88,5869.0,370.000000,1285.517647,21020.000000,2410.00,3459.00,0.300000,48.0,2.16,4700.000000,2450.000000,3.44,463.000000,1.622591,1.461105,13.618055


In [38]:
X_train, X_test, y_train, y_test = train_test_split(design_matrix, 
                                                    train_filt,
                                                    test_size=0.1, 
                                                    random_state=42)

# Test Data Querying

In [39]:
# Using material IDs provided in training data to get corresponding information from MPD
data = mpr.query(criteria={"task_id": {"$in":test["material_id"].to_list()}}, properties=["material_id","energy",
        "energy_per_atom",
        "volume",
        "formation_energy_per_atom",
        "nsites",
        "pretty_formula",                                                                                  
        "nelements",
        "density",  "band_gap"])
df_X = pd.DataFrame(data)
display(df_X)

0 of 1400 done 0.0%
500 of 1400 done 35.7%
1000 of 1400 done 71.4%
1400 of 1400 done 100.0%


Unnamed: 0,material_id,energy,energy_per_atom,volume,formation_energy_per_atom,nsites,pretty_formula,nelements,density,band_gap
0,mp-1000,-8.646946,-4.323473,89.091084,-1.792337,2,BaTe,2,4.937886,1.8555
1,mp-10044,-11.376365,-5.688183,27.982045,-0.019233,2,BAs,2,5.087631,1.2522
2,mp-10086,-42.877615,-7.146269,99.212472,-3.359731,6,YSF,3,4.685387,1.3137
3,mp-1008858,-16.665812,-5.555271,80.958490,-0.944922,3,NdBiPd,3,9.427722,0.0900
4,mp-1008867,-11.760729,-3.920243,38.301187,-0.701211,3,NaCuO,3,4.445388,0.0017
...,...,...,...,...,...,...,...,...,...,...
1395,mvc-5908,-73.434572,-7.343457,120.736933,-2.771448,10,Ca2CrWO6,4,5.666228,1.6101
1396,mvc-5921,-64.816637,-6.481664,143.204106,-2.354943,10,Mg2SnWO6,4,5.185052,2.2892
1397,mvc-7386,-129.945458,-6.497273,386.834593,-1.936434,20,ZnMo2O7,3,3.170408,3.7020
1398,mvc-7701,-147.825353,-5.685591,330.380852,-1.957811,26,Zn2Sn3O8,3,6.181576,1.2507


In [41]:
# Remove noble gases 
filt_data_X = df_X

# #Iterating through data and using pandas drop function to filter out materials containing noble gas elements 
# for i in nobles:
#     filt_data_X = filt_data_X.drop(filt_data_X[filt_data_X['pretty_formula'].str.contains(i)].index)
    
        
# n = len(filt_data_X)
# print(f'There are {n} materials left')    
# display(filt_data_X)


# print(f'There are {n} materials left')    
# display(filt_data_X)

In [43]:
filt_data_X
#filt_data_X['Composition']

Unnamed: 0,material_id,energy,energy_per_atom,volume,formation_energy_per_atom,nsites,pretty_formula,nelements,density,band_gap
0,mp-1000,-8.646946,-4.323473,89.091084,-1.792337,2,BaTe,2,4.937886,1.8555
1,mp-10044,-11.376365,-5.688183,27.982045,-0.019233,2,BAs,2,5.087631,1.2522
2,mp-10086,-42.877615,-7.146269,99.212472,-3.359731,6,YSF,3,4.685387,1.3137
3,mp-1008858,-16.665812,-5.555271,80.958490,-0.944922,3,NdBiPd,3,9.427722,0.0900
4,mp-1008867,-11.760729,-3.920243,38.301187,-0.701211,3,NaCuO,3,4.445388,0.0017
...,...,...,...,...,...,...,...,...,...,...
1395,mvc-5908,-73.434572,-7.343457,120.736933,-2.771448,10,Ca2CrWO6,4,5.666228,1.6101
1396,mvc-5921,-64.816637,-6.481664,143.204106,-2.354943,10,Mg2SnWO6,4,5.185052,2.2892
1397,mvc-7386,-129.945458,-6.497273,386.834593,-1.936434,20,ZnMo2O7,3,3.170408,3.7020
1398,mvc-7701,-147.825353,-5.685591,330.380852,-1.957811,26,Zn2Sn3O8,3,6.181576,1.2507


In [44]:
listA_X = [a for a in filt_data_X["pretty_formula"]]
listB_X = [Composition(a) for a in listA_X]
listC_X = [a.elements for a in listB_X]

editC_X = [item for sublist in listC_X for item in sublist]
unique_editC_X = set(editC_X)
unique_editC_X

UL_editC_X = [a for a in unique_editC_X]
cprops_X = [a.data for a in UL_editC_X]

peel=set(UL_editC_X)
sy_editC_X = [a.symbol for a in UL_editC_X]
sy_editC_X


pd.set_option('display.max_rows', None)

cprops_df_X = pd.DataFrame(cprops_X, index=sy_editC_X)

droplist2 = droplist + ['Max oxidation state', 'Min oxidation state']

droplist2

cprops_df_X = cprops_df_X.drop(columns=droplist2)
cprops_df_X

Unnamed: 0,Atomic mass,Atomic no,Atomic radius,Atomic radius calculated,Boiling point,Bulk modulus,Common oxidation states,Critical temperature,Density of solid,Liquid range,Melting point,Poissons ratio,Thermal conductivity,Van der waals radius,Velocity of sound,Vickers hardness,X,Youngs modulus,Metallic radius,Ionization energies,Electron affinity
H,1.00794,1,0.25,0.53,20.28 K,no data GPa,"[-1, 1]",33 K,no data kg m<sup>-3</sup>,6.27 K,14.01 K,no data,0.1805 W m<sup>-1</sup> K<sup>-1</sup>,1.1,1270 m s<sup>-1</sup>,no data MN m<sup>-2</sup>,2.2,no data GPa,no data,[13.598434599702],0.754598
Li,6.941,3,1.45,1.67,1615 K,11 GPa,[1],3223 K,535 kg m<sup>-3</sup>,1161.31 K,453.69 K,no data,85 W m<sup>-1</sup> K<sup>-1</sup>,1.82,6000 m s<sup>-1</sup>,no data MN m<sup>-2</sup>,0.98,4.9 GPa,1.52,"[5.391714996, 75.640097, 122.45435914]",0.618049
Be,9.012182,4,1.05,1.12,2742 K,130 GPa,[2],no data K,1848 kg m<sup>-3</sup>,1182 K,1560 K,0.032,190 W m<sup>-1</sup> K<sup>-1</sup>,1.53,13000 m s<sup>-1</sup>,1670 MN m<sup>-2</sup>,1.57,287 GPa,1.12,"[9.322699, 18.21115, 153.896205, 217.7185861]",-0.52
B,10.811,5,0.85,0.87,4200 K,320 GPa,[3],no data K,2460 kg m<sup>-3</sup>,1851 K,2349 K,no data,27 W m<sup>-1</sup> K<sup>-1</sup>,1.92,16200 m s<sup>-1</sup>,49000 MN m<sup>-2</sup>,2.04,no data GPa,no data,"[8.298019, 25.15483, 37.93059, 259.3715, 340.2...",0.279723
C,12.0107,6,0.7,0.67,4300 K,33 GPa,"[-4, 4]",no data K,2267 kg m<sup>-3</sup>,500 K,3800 K,no data,140 W m<sup>-1</sup> K<sup>-1</sup>,1.7,18350 m s<sup>-1</sup>,no data MN m<sup>-2</sup>,2.55,no data GPa,no data,"[11.260288, 24.383154, 47.88778, 64.49352, 392...",1.262114
N,14.0067,7,0.65,0.56,77.36 K,no data GPa,"[-3, 3, 5]",126.2 K,no data kg m<sup>-3</sup>,14.31 K,63.05 K,no data,0.02583 W m<sup>-1</sup> K<sup>-1</sup>,1.55,333.6 m s<sup>-1</sup>,no data MN m<sup>-2</sup>,3.04,no data GPa,no data,"[14.53413, 29.60125, 47.4453, 77.4735, 97.8901...",-0.07
O,15.9994,8,0.6,0.48,90.2 K,no data GPa,[-2],154.6 K,no data kg m<sup>-3</sup>,35.4 K,54.8 K,no data,0.02658 W m<sup>-1</sup> K<sup>-1</sup>,1.52,317.5 m s<sup>-1</sup>,no data MN m<sup>-2</sup>,3.44,no data GPa,no data,"[13.618055, 35.12112, 54.93554, 77.4135, 113.8...",1.461105
F,18.998403,9,0.5,0.42,85.03 K,no data GPa,[-1],144 K,no data kg m<sup>-3</sup>,31.5 K,53.53 K,no data,0.0277 W m<sup>-1</sup> K<sup>-1</sup>,1.47,no data m s<sup>-1</sup>,no data MN m<sup>-2</sup>,3.98,no data GPa,no data,"[17.42282, 34.97081, 62.70798, 87.175, 114.249...",3.40119
Na,22.989769,11,1.8,1.9,1156 K,6.3 GPa,[1],2573 K,968 kg m<sup>-3</sup>,785.13 K,370.87 K,no data,140 W m<sup>-1</sup> K<sup>-1</sup>,2.27,3200 m s<sup>-1</sup>,no data MN m<sup>-2</sup>,0.93,10 GPa,1.86,"[5.13907696, 47.28636, 71.62, 98.936, 138.404,...",0.547926
Mg,24.305,12,1.5,1.45,1363 K,45 GPa,[2],no data K,1738 kg m<sup>-3</sup>,440 K,923 K,0.29,160 W m<sup>-1</sup> K<sup>-1</sup>,1.73,4602 m s<sup>-1</sup>,no data MN m<sup>-2</sup>,1.31,45 GPa,1.6,"[7.646236, 15.035271, 80.1436, 109.2654, 141.3...",-0.42


In [45]:

cprops_df_X['Boiling point'] = [(a.replace('K', '', 1)) for a in cprops_df_X['Boiling point']]
cprops_df_X['Bulk modulus'] = cprops_df_X["Bulk modulus"].str.replace("GPa", "")
cprops_df_X['Critical temperature'] = cprops_df_X["Critical temperature"].str.replace("K", "")
cprops_df_X['Density of solid'] = cprops_df_X['Density of solid'].str.replace("no data", "NaN")
cprops_df_X['Density of solid'] = cprops_df_X["Density of solid"].str.replace("kg m<sup>-3</sup>", "")
cprops_df_X['Liquid range'] = [(a.replace('K', '', 1)) for a in cprops_df_X['Liquid range']]
cprops_df_X['Poissons ratio'] = [(a.replace('no data', 'NaN', 1)) for a in cprops_df_X['Poissons ratio']]
cprops_df_X['Poissons ratio'] = cprops_df_X['Poissons ratio'].str.replace("no data", "").astype(float)
cprops_df_X['Thermal conductivity'] = [float(a.replace('W m<sup>-1</sup> K<sup>-1</sup>', "", 1)) for a in cprops_df_X['Thermal conductivity']]
cprops_df_X['Velocity of sound'] = cprops_df_X['Velocity of sound'].str.replace("no data","NaN")
cprops_df_X['Velocity of sound'] = [(a.replace('m s<sup>-1</sup>', '', 1)) for a in cprops_df_X['Velocity of sound']]
cprops_df_X['Vickers hardness'] = cprops_df_X['Vickers hardness'].str.replace("no data", "NaN")
cprops_df_X['Vickers hardness'] = [(a.replace('MN m<sup>-2</sup>', '', 1)) for a in cprops_df_X['Vickers hardness']]
cprops_df_X['Youngs modulus'] = cprops_df_X['Youngs modulus'].str.replace("no data", "NaN")
cprops_df_X['Youngs modulus'] = [(a.replace('GPa', '', 1)) for a in cprops_df_X['Youngs modulus']]
cprops_df_X['Bulk modulus'] = [(a.replace('no data', 'NaN', 1)) for a in cprops_df_X['Bulk modulus']]
cprops_df_X['Bulk modulus'] = [(a.replace('liquid', '', 1)) for a in cprops_df_X['Bulk modulus']]
cprops_df_X['Bulk modulus'] = cprops_df_X['Bulk modulus'].str.replace(r"\(.*\)","",  regex=True).astype(float)
cprops_df_X['Melting point'] = cprops_df_X['Melting point'].str.replace("K", "")
cprops_df_X['Melting point'] = cprops_df_X['Melting point'].str.replace("white P", "")
cprops_df_X['Melting point'] = cprops_df_X['Melting point'].str.replace(r"\(.*\)","",  regex=True).astype(float)
cprops_df_X['Metallic radius'] = cprops_df_X['Metallic radius'].astype(str)
cprops_df_X['Metallic radius'] = [(a.replace('no data', 'NaN', 1)) for a in cprops_df_X['Metallic radius']]
cprops_df_X['Metallic radius'] = cprops_df_X['Metallic radius'].astype(float)
cprops_df_X['First Ionization Energy'] = [a[0] for a in cprops_df_X['Ionization energies']]
cprops_df_X = cprops_df_X.drop("Ionization energies", axis=1)
cprops_df_X['Critical temperature'] = [(a.replace('no data', 'NaN', 1)) for a in cprops_df_X['Critical temperature']]

# cprops_df_X = cprops_df_X.apply(pd.to_numeric, errors='coerce')

In [46]:
cprops_df_X = cprops_df_X.apply(pd.to_numeric, errors='coerce')

In [47]:
# Computing mean values using mean function 
mvals_X = dict(cprops_df_X.mean())
mvals_X

for key, value in mvals_X.items():
    cprops_df_X.loc[cprops_df_X[key].isnull(),key] = value

display(cprops_df_X)

Unnamed: 0,Atomic mass,Atomic no,Atomic radius,Atomic radius calculated,Boiling point,Bulk modulus,Common oxidation states,Critical temperature,Density of solid,Liquid range,Melting point,Poissons ratio,Thermal conductivity,Van der waals radius,Velocity of sound,Vickers hardness,X,Youngs modulus,Metallic radius,Electron affinity,First Ionization Energy
H,1.00794,1.0,0.25,0.53,20.28,92.330303,,33.0,7915.56338,6.27,14.01,0.29624,0.1805,1.1,1270.0,2153.891892,2.2,111.118644,1.621047,0.754598,13.598435
Li,6.941,3.0,1.45,1.67,1615.0,11.0,,3223.0,535.0,1161.31,453.69,0.29624,85.0,1.82,6000.0,2153.891892,0.98,4.9,1.52,0.618049,5.391715
Be,9.012182,4.0,1.05,1.12,2742.0,130.0,,1230.194444,1848.0,1182.0,1560.0,0.032,190.0,1.53,13000.0,1670.0,1.57,287.0,1.12,-0.52,9.322699
B,10.811,5.0,0.85,0.87,4200.0,320.0,,1230.194444,2460.0,1851.0,2349.0,0.29624,27.0,1.92,16200.0,49000.0,2.04,111.118644,1.621047,0.279723,8.298019
C,12.0107,6.0,0.7,0.67,4300.0,33.0,,1230.194444,2267.0,500.0,3800.0,0.29624,140.0,1.7,18350.0,2153.891892,2.55,111.118644,1.621047,1.262114,11.260288
N,14.0067,7.0,0.65,0.56,77.36,92.330303,,126.2,7915.56338,14.31,63.05,0.29624,0.02583,1.55,333.6,2153.891892,3.04,111.118644,1.621047,-0.07,14.53413
O,15.9994,8.0,0.6,0.48,90.2,92.330303,,154.6,7915.56338,35.4,54.8,0.29624,0.02658,1.52,317.5,2153.891892,3.44,111.118644,1.621047,1.461105,13.618055
F,18.998403,9.0,0.5,0.42,85.03,92.330303,,144.0,7915.56338,31.5,53.53,0.29624,0.0277,1.47,3707.278462,2153.891892,3.98,111.118644,1.621047,3.40119,17.42282
Na,22.989769,11.0,1.8,1.9,1156.0,6.3,,2573.0,968.0,785.13,370.87,0.29624,140.0,2.27,3200.0,2153.891892,0.93,10.0,1.86,0.547926,5.139077
Mg,24.305,12.0,1.5,1.45,1363.0,45.0,,1230.194444,1738.0,440.0,923.0,0.29,160.0,1.73,4602.0,2153.891892,1.31,45.0,1.6,-0.42,7.646236


In [60]:
type(cprops_df_X.iloc[49, 2])

numpy.float64

In [66]:
cprops_df_X.dtypes

Atomic mass                 float64
Atomic no                   float64
Atomic radius               float64
Atomic radius calculated    float64
Boiling point               float64
Bulk modulus                float64
Common oxidation states     float64
Critical temperature        float64
Density of solid            float64
Liquid range                float64
Melting point               float64
Poissons ratio              float64
Thermal conductivity        float64
Van der waals radius        float64
Velocity of sound           float64
Vickers hardness            float64
X                           float64
Youngs modulus              float64
Metallic radius             float64
Electron affinity           float64
First Ionization Energy     float64
dtype: object

In [None]:
filt_data_comp_X = filt_data_X
filt_data_comp_X['Composition'] = [Composition(c) for c in filt_data_comp_X["pretty_formula"]]
filt_data_comp_X['num_atoms'] = [c.num_atoms for c in filt_data_comp_X['Composition']]
filt_data_comp_X['volume_per_atom'] = filt_data_comp_X['volume']/filt_data_comp_X['num_atoms']
filt_data_comp_X

In [61]:
filt_data_comp_X

Unnamed: 0,material_id,energy,energy_per_atom,volume,formation_energy_per_atom,nsites,pretty_formula,nelements,density,band_gap,Composition,num_atoms,volume_per_atom
0,mp-1000,-8.646946,-4.323473,89.091084,-1.792337,2,BaTe,2,4.937886,1.8555,"(Ba, Te)",2.0,44.545542
1,mp-10044,-11.376365,-5.688183,27.982045,-0.01923329,2,BAs,2,5.087631,1.2522,"(B, As)",2.0,13.991023
2,mp-10086,-42.877615,-7.146269,99.212472,-3.359731,6,YSF,3,4.685387,1.3137,"(Y, S, F)",3.0,33.070824
3,mp-1008858,-16.665812,-5.555271,80.95849,-0.9449219,3,NdBiPd,3,9.427722,0.09,"(Nd, Bi, Pd)",3.0,26.986163
4,mp-1008867,-11.760729,-3.920243,38.301187,-0.7012111,3,NaCuO,3,4.445388,0.0017,"(Na, Cu, O)",3.0,12.767062
5,mp-1009087,-38.652556,-4.831569,155.051465,-0.2116146,8,BeSiAs2,3,4.004124,1.0226,"(Be, Si, As)",4.0,38.762866
6,mp-10096,-84.572613,-3.84421,570.491249,-0.7818146,22,Na3Sr3GaP4,4,3.058858,0.7467,"(Na, Sr, Ga, P)",11.0,51.862841
7,mp-1009894,-18.157914,-9.078957,33.282905,-0.1917217,2,ZrC,2,5.15055,0.5469,"(Zr, C)",2.0,16.641452
8,mp-1013911,-313.454217,-6.530296,637.920695,-2.496547,48,Na2CoP2O7,4,2.903504,3.0679,"(Na, Co, P, O)",12.0,53.160058
9,mp-1014013,-21.472932,-5.368233,534.97062,0.04505289,4,P,1,0.384568,0.8924,(P),1.0,534.97062


In [None]:
cprops_df_X

In [63]:
for a in cprops_df_X['Atomic radius']:
    try:
        a.replace('no data', 'NaN', 1)
    except:
        print("wont replace")
    
    

wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace
wont replace

In [64]:
cprops_df_X

Unnamed: 0,Atomic mass,Atomic no,Atomic radius,Atomic radius calculated,Boiling point,Bulk modulus,Common oxidation states,Critical temperature,Density of solid,Liquid range,Melting point,Poissons ratio,Thermal conductivity,Van der waals radius,Velocity of sound,Vickers hardness,X,Youngs modulus,Metallic radius,Electron affinity,First Ionization Energy
H,1.00794,1.0,0.25,0.53,20.28,92.330303,,33.0,7915.56338,6.27,14.01,0.29624,0.1805,1.1,1270.0,2153.891892,2.2,111.118644,1.621047,0.754598,13.598435
Li,6.941,3.0,1.45,1.67,1615.0,11.0,,3223.0,535.0,1161.31,453.69,0.29624,85.0,1.82,6000.0,2153.891892,0.98,4.9,1.52,0.618049,5.391715
Be,9.012182,4.0,1.05,1.12,2742.0,130.0,,1230.194444,1848.0,1182.0,1560.0,0.032,190.0,1.53,13000.0,1670.0,1.57,287.0,1.12,-0.52,9.322699
B,10.811,5.0,0.85,0.87,4200.0,320.0,,1230.194444,2460.0,1851.0,2349.0,0.29624,27.0,1.92,16200.0,49000.0,2.04,111.118644,1.621047,0.279723,8.298019
C,12.0107,6.0,0.7,0.67,4300.0,33.0,,1230.194444,2267.0,500.0,3800.0,0.29624,140.0,1.7,18350.0,2153.891892,2.55,111.118644,1.621047,1.262114,11.260288
N,14.0067,7.0,0.65,0.56,77.36,92.330303,,126.2,7915.56338,14.31,63.05,0.29624,0.02583,1.55,333.6,2153.891892,3.04,111.118644,1.621047,-0.07,14.53413
O,15.9994,8.0,0.6,0.48,90.2,92.330303,,154.6,7915.56338,35.4,54.8,0.29624,0.02658,1.52,317.5,2153.891892,3.44,111.118644,1.621047,1.461105,13.618055
F,18.998403,9.0,0.5,0.42,85.03,92.330303,,144.0,7915.56338,31.5,53.53,0.29624,0.0277,1.47,3707.278462,2153.891892,3.98,111.118644,1.621047,3.40119,17.42282
Na,22.989769,11.0,1.8,1.9,1156.0,6.3,,2573.0,968.0,785.13,370.87,0.29624,140.0,2.27,3200.0,2153.891892,0.93,10.0,1.86,0.547926,5.139077
Mg,24.305,12.0,1.5,1.45,1363.0,45.0,,1230.194444,1738.0,440.0,923.0,0.29,160.0,1.73,4602.0,2153.891892,1.31,45.0,1.6,-0.42,7.646236


In [None]:
cprops_df_dict_X



In [76]:
cprops_df_dict_X = cprops_df_X.to_dict()

rad_dict_X = cprops_df_dict_X['Atomic radius']

rad_dict_X = filt_data_comp_X["Composition"].apply(radius_mean)
atomic_radius_X

KeyError: 'Xe'

In [73]:
avg_properties_df_X = pd.DataFrame()

for property in cprops_df_X.columns:
    individualpropertymean_X = partial(propertymean, property)
    averages_X = filt_data_comp_X['Composition'].apply(individualpropertymean_X)
    avg_properties_df_X[("average_" + property)] = averages_X
    
avg_properties_df_X.head()
print("Average properties Dimension: ", avg_properties_df_X.shape)

max_properties_X = pd.DataFrame()

for property in cprops_df_X.columns:
    individualpropertymax_X = partial(maxofproperty, property)
    max = filt_data_comp_X['Composition'].apply(individualpropertymax_X)
    max_properties_X[("max_" + property)] = max
    
min_properties_X = pd.DataFrame()

for property in cprops_df.columns:
    individualpropertymin_X = partial(minofproperty, property)
    min = filt_data_comp_X['Composition'].apply(individualpropertymin_X)
    min_properties_X[("min_" + property)] = min

KeyError: 'Xe'

In [None]:
design_matrix_X= pd.concat([filt_data_comp_X, avg_properties_df_X, min_properties_X, max_properties_X], axis=1)
design_matrix_X.columns

design_matrix_X = design_matrix_X.drop(columns=droplist2)
design_matrix_X

In [180]:
design_matrix_X.set_index('material_id',inplace=True)
design_matrix_X

Unnamed: 0_level_0,energy_per_atom,formation_energy_per_atom,nsites,nelements,density,band_gap,num_atoms,volume_per_atom,average_Atomic mass,average_Atomic no,average_Atomic radius,average_Atomic radius calculated,average_Boiling point,average_Bulk modulus,average_Critical temperature,average_Density of solid,average_Liquid range,average_Melting point,average_Poissons ratio,average_Thermal conductivity,average_Van der waals radius,average_Velocity of sound,average_Vickers hardness,average_X,average_Youngs modulus,average_Metallic radius,average_Electron affinity,average_First Ionization Energy,min_Atomic mass,min_Atomic no,min_Atomic radius,min_Atomic radius calculated,min_Boiling point,min_Bulk modulus,min_Critical temperature,min_Density of solid,min_Liquid range,min_Melting point,min_Poissons ratio,min_Thermal conductivity,min_Van der waals radius,min_Velocity of sound,min_Vickers hardness,min_X,min_Youngs modulus,min_Metallic radius,min_Electron affinity,min_First Ionization Energy,max_Atomic mass,max_Atomic no,max_Atomic radius,max_Atomic radius calculated,max_Boiling point,max_Bulk modulus,max_Critical temperature,max_Density of solid,max_Liquid range,max_Melting point,max_Poissons ratio,max_Thermal conductivity,max_Van der waals radius,max_Velocity of sound,max_Vickers hardness,max_X,max_Youngs modulus,max_Metallic radius,max_Electron affinity,max_First Ionization Energy
material_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1
mp-1000,-4.323473,-1.792337,2,2,4.937886,1.8555,2.0,44.545542,132.463500,54.000000,1.775000,1.880000,1702.000000,37.300000,1285.517647,4875.000000,840.670000,861.330000,0.294269,10.500000,2.370000,2115.000000,2108.307692,1.495000,28.000000,1.929295,1.057751,7.110736,127.600000,52.0,1.40,1.23,1261.00,9.6,1285.517647,3510.0,538.34,722.66,0.294269,3.00000,2.06,1620.000000,2108.307692,0.89,13.0,1.622591,0.144626,5.211665,137.32700,56.0,2.15,2.53,2143.0,65.000000,1285.517647,6240.000000,1143.00,1000.00,0.294269,18.0,2.68,2610.000000,2108.307692,2.10,43.000000,2.236000,1.970876,9.009808
mp-10044,-5.688183,-0.019233,2,2,5.087631,1.2522,2.0,13.991023,42.866300,19.000000,1.000000,1.005000,2543.500000,171.000000,1492.758824,4093.500000,1027.000000,1719.500000,0.294269,38.500000,1.885000,9961.500758,25554.153846,2.110000,59.893443,1.622591,0.542272,9.043285,10.811000,5.0,0.85,0.87,887.00,22.0,1285.517647,2460.0,203.00,1090.00,0.294269,27.00000,1.85,3723.001515,2108.307692,2.04,8.0,1.622591,0.279723,8.298019,74.92160,33.0,1.15,1.14,4200.0,320.000000,1700.000000,5727.000000,1851.00,2349.00,0.294269,50.0,1.92,16200.000000,49000.000000,2.18,111.786885,1.622591,0.804820,9.788550
mp-10086,-7.146269,-3.359731,6,3,4.685387,1.3137,3.0,33.070824,46.656418,21.333333,1.100000,1.140000,1470.633333,46.781373,914.505882,4833.296804,723.670000,746.963333,0.276179,5.744233,1.863333,3582.001010,2108.307692,2.593333,95.857923,1.681727,1.928471,11.333363,18.998403,9.0,0.50,0.42,85.03,7.7,144.000000,1960.0,31.50,53.53,0.240000,0.02770,1.47,3300.000000,2108.307692,1.22,64.0,1.622591,0.307120,6.217260,88.90585,39.0,1.80,2.12,3609.0,91.644118,1314.000000,8067.890411,1810.00,1799.00,0.294269,17.0,2.32,3723.001515,2108.307692,3.98,111.786885,1.800000,3.401190,17.422820
mp-1008858,-5.555271,-0.944922,3,3,9.427722,0.0900,3.0,26.986163,153.214133,63.000000,1.616667,1.726667,2815.333333,81.000000,1285.517647,9534.333333,1592.183333,1223.150000,0.333333,32.333333,2.186667,2396.666667,970.769231,1.786667,64.666667,1.672333,0.533999,7.049118,106.420000,46.0,1.40,1.43,1837.00,31.0,1285.517647,6800.0,1292.60,544.40,0.280000,8.00000,2.07,1790.000000,343.000000,1.14,32.0,1.376000,0.097493,5.525000,208.98040,83.0,1.85,2.06,3373.0,180.000000,1285.517647,12023.000000,2076.00,1828.05,0.390000,72.0,2.39,3070.000000,2108.307692,2.20,121.000000,1.821000,0.942362,8.336839
mp-1008867,-3.920243,-0.701211,3,3,4.445388,0.0017,3.0,12.767062,34.178390,16.000000,1.250000,1.276667,1482.066667,79.314706,1337.705882,5985.296804,887.586667,594.480000,0.309513,180.008860,1.916667,2362.500000,1528.538462,2.090000,83.928962,1.586864,1.081605,8.827837,15.999400,8.0,0.60,0.48,90.20,6.3,154.600000,968.0,35.40,54.80,0.294269,0.02658,1.52,317.500000,369.000000,0.93,10.0,1.278000,0.547926,5.139077,63.54600,29.0,1.80,1.90,3200.0,140.000000,2573.000000,8920.000000,1842.23,1357.77,0.340000,400.0,2.27,3570.000000,2108.307692,3.44,130.000000,1.860000,1.461105,13.618055
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
mvc-5908,-7.343457,-2.771448,10,4,5.666228,1.6101,10.0,12.073693,41.198850,18.600000,0.995000,1.035000,1282.720000,105.386471,606.967059,7789.734247,439.340000,843.380000,0.287562,66.415948,1.798000,2063.900000,2135.646154,2.666000,140.072131,1.638255,1.030784,10.856518,15.999400,8.0,0.60,0.48,90.20,17.0,154.600000,1550.0,35.40,54.80,0.210000,0.02658,1.52,317.500000,1060.000000,1.00,20.0,1.285000,0.024551,6.113155,183.84000,74.0,1.80,1.94,5828.0,310.000000,1285.517647,19250.000000,2133.00,3695.00,0.310000,200.0,2.31,5940.000000,3430.000000,3.44,411.000000,1.976000,1.461105,13.618055
mvc-5921,-6.481664,-2.354943,10,4,5.185052,2.2892,10.0,14.320411,44.715640,19.600000,0.940000,0.916000,1197.020000,100.786471,606.967059,7844.334247,559.532000,637.488000,0.298562,55.715948,1.693000,1878.300000,2240.476923,2.758000,122.172131,1.592555,0.985497,11.220875,15.999400,8.0,0.60,0.48,90.20,45.0,154.600000,1738.0,35.40,54.80,0.280000,0.02658,1.52,317.500000,2108.307692,1.31,45.0,1.410000,-0.420000,7.343918,183.84000,74.0,1.50,1.93,5828.0,310.000000,1285.517647,19250.000000,2369.92,3695.00,0.360000,170.0,2.18,5174.000000,3430.000000,3.44,411.000000,1.622591,1.461105,13.618055
mvc-7386,-6.497273,-1.936434,20,3,3.170408,3.7020,10.0,38.683459,36.928480,17.000000,0.845000,0.858000,1163.540000,117.150882,493.875294,8417.523288,476.712000,686.828000,0.292988,39.818606,1.699000,1830.250000,1992.646154,3.005000,154.850820,1.550214,1.110240,11.890544,15.999400,8.0,0.60,0.48,90.20,70.0,154.600000,7140.0,35.40,54.80,0.250000,0.02658,1.52,317.500000,1530.000000,1.65,108.0,1.340000,-0.620000,7.092430,95.94000,42.0,1.45,1.90,4912.0,230.000000,1285.517647,10280.000000,2016.00,2896.00,0.310000,139.0,2.17,6190.000000,2108.307692,3.44,329.000000,1.622591,1.461105,13.618055
mvc-7701,-5.685591,-1.957811,26,3,6.181576,1.2507,13.0,25.413912,47.303323,21.076923,0.911538,0.848462,900.507692,80.550226,589.568326,7750.240253,643.661538,256.846154,0.302627,33.939434,1.745385,1341.538462,2108.307692,2.823077,96.945776,1.569287,1.060389,11.520353,15.999400,8.0,0.60,0.48,90.20,58.0,154.600000,7140.0,35.40,54.80,0.250000,0.02658,1.52,317.500000,2108.307692,1.65,50.0,1.340000,-0.620000,7.343918,118.71000,50.0,1.45,1.45,2875.0,91.644118,1285.517647,8067.890411,2369.92,692.68,0.360000,120.0,2.17,3700.000000,2108.307692,3.44,111.786885,1.622591,1.461105,13.618055


In [None]:
# Computing mean and standard devaitaion for train X and normalizing
mean_train_X = X_train.apply(np.mean, axis=0)
std_train_X = X_train.apply(np.std, axis=0)
norm_train_X = (X_train - mean_train_X) / std_train_X

# Computing mean and standard devaitaion for test X and normalizing 
mean_test_X = X_test.apply(np.mean, axis=0)
std_test_X = X_test.apply(np.std, axis=0)
norm_test_X = (X_test - mean_test_X) / std_test_X

In [None]:
# from sklearn.linear_model import LinearRegression
# linear_model = LinearRegression()
# linear_model.fit(norm_train_X, y_train['dielectric_poly_total'])
# #linear_predictions = pd.DataFrame(linear_model.predict(norm_test_X),
#                                 #  columns = ['dielectric_poly_total'])
# linear_predictions = linear_model.predict(X_test)

# linear_model.score(design_matrix, train_filt['dielectric_poly_total'])

# #Reallinear_prediction = linear_model.predict(test)

In [None]:
from sklearn.linear_model import LinearRegression
linear_model = LinearRegression()
linear_model.fit(X_train, y_train['dielectric_poly_total'])
#linear_predictions = pd.DataFrame(linear_model.predict(norm_test_X),
                                #  columns = ['dielectric_poly_total'])
linear_predictions = linear_model.predict(X_test)

linear_model.score(design_matrix, train_filt['dielectric_poly_total'])

#Reallinear_prediction = linear_model.predict(test)

In [None]:
data2 = mpr.query(criteria={"task_id": {"$in":test["material_id"].to_list()}}, properties=["material_id","energy",
        "energy_per_atom",
        "volume",
        "formation_energy_per_atom",
        "nsites",                                                                                  
        "nelements",
        "density",  "band_gap"])
#df = pd.DataFrame(data)
df2 = pd.DataFrame(data2)
df2


In [None]:
type(df2)

In [None]:
df2.set_index('material_id',inplace=True)
df2

In [184]:
from sklearn.linear_model import LinearRegression
linear_model = LinearRegression()
linear_model.fit(X_train, y_train['dielectric_poly_total'])
#linear_predictions = pd.DataFrame(linear_model.predict(norm_test_X),
                                #  columns = ['dielectric_poly_total'])
linear_predictions = linear_model.predict(X_test)

linear_model.score(design_matrix, train_filt['dielectric_poly_total']) #CLOSE WE GOT TO PREDICTING x TEST....PROBABLY USE SOME FIXING A BETTER WAY TO TELL US HOW CLOSE OUR PREDICTION IS TO ACTUAL X TEST
 

#Y_data_intitial = df2[cols]
X_data_test = design_matrix_X

#F_linear_predictions = linear_model.predict(df2)
Final_linear_predictions = linear_model.predict(X_data_test)
flp_X = pd.DataFrame(Final_linear_predictions)

d_list=[test, flp_X]


newsubs = pd.concat(d_list, axis=1)
newsubs_update = newsubs.rename(columns={0:'dielectric_poly_total'})
newsubs_update.to_csv("submission_test_3.csv", index=False)

In [None]:
type(linear_predictions)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error


MAE = mean_absolute_error(y_test['dielectric_poly_total'], 
                          linear_predictions)


RMSE = np.sqrt(mean_squared_error(y_test['dielectric_poly_total'], 
                          linear_predictions))


# from sklearn.model_selection import cross_val_predict, KFold
# from sklearn.metrics import r2_score

# # Cross- Validation
# kfold = KFold(n_splits=5, shuffle=True, random_state=42)
# #yhat_mlr = cross_val_predict(linear_model, x_jojo, targets['dielectric_poly_total'], cv=kfold)
# CV_linMAE = mean_absolute_error(targets['dielectric_poly_total'], yhat_mlr)
# CV_linRMSE = np.sqrt(mean_squared_error(targets['dielectric_poly_total'], yhat_mlr))

# print('Cross Val MAE:', CV_linMAE)
# print('Cross Val RMSE:', CV_linRMSE)
print("The MAE is:", MAE)
print("The RMSE is:", RMSE)

In [None]:
# dielects=(model.predict(test_data))

# df_submit = pd.DataFrame(list(dielects))
# df_submit.to_csv('dielectric_csv.csv')


In [None]:
# data2 = mpr.query(criteria={"task_id": {"$in":test["material_id"].to_list()}}, properties=["energy",
#         "energy_per_atom",
#         "volume",
#         "formation_energy_per_atom",
#         "nsites",
#         "unit_cell_formula",
#         "pretty_formula",
#         "is_hubbard",
#         "elements",
#         "nelements",
#         "e_above_hull",
#         "hubbards",
#         "is_compatible",
#         "spacegroup",
#         "task_ids",
#         "band_gap",
#         "density",
#         "icsd_id",
#         "icsd_ids",
#         "total_magnetization",
#         "material_id",
#         "oxide_type",
#         "tags",
#         "elasticity"])
# df = pd.DataFrame(data)
# df2 = pd.DataFrame(data2)

In [None]:
# desired_factors = [ "energy",
#         "energy_per_atom",
#         "volume",
#         "formation_energy_per_atom",
#         "nsites",
#         "nelements",
#         "density",
#         "band_gap"] 

# Y_data = df2[cols]

# #set my model to DecisionTree
# model = DecisionTreeRegressor()

# #set prediction data to factors that will predict, and set target to SalePrice
# train_data = X_data[desired_factors]
# test_data = Y_data[desired_factors]
# target = train.dielectric_poly_total

# # #fitting model with prediction data and telling it my target
# model.fit(train_data, target)

# dielects=(model.predict(test_data))


# dfhi= pd.read_csv("test.csv")

# df_submit = pd.DataFrame(list(dielects))
# df_submit.to_csv('dielectric.csv')

In [None]:
#dielects

In [None]:
#len(dielects)

In [None]:
#test

In [None]:
 #shit=pd.DataFrame(dielects)

In [None]:
#d_list=[test, shit]

In [None]:
#newsubs=pd.concat(d_list, axis=1)

In [None]:
#newsubs_update=newsubs.rename(columns)

In [None]:
#newsubs.to_csv("submission_test.csv")

In [None]:
# from sklearn import metrics
# from sklearn.model_selection import cross_val_predict, KFold
# from sklearn.metrics import r2_score
# # print('Mean absolute error:', metrics.mean_absolute_error(y_test, linear_predictions))
# # print('Mean squared error:', metrics.mean_squared_error(y_test, linear_predictions))
# # print('root mean squared error:', np.sqrt(metrics.mean_squared_error(y_test, linear_predictions)))

# # CV_linMAE = mean_absolute_error(targets['formation_energy_per_atom'], yhat_mlr)
# # CV_linRMSE = np.sqrt(mean_squared_error(targets['formation_energy_per_atom'], yhat_mlr))
# kfold = KFold(n_splits=5, shuffle=True, random_state=42)
# yhat_mlr = cross_val_predict(linear_model, design_matrix, y_jojo['dielectric_poly_total'], cv=kfold)
# metrics.mean_absolute_error(y_test, yhat_mlr)


In [None]:
# x_train, x_test, y_train, y_test = train_test_split(x_jojo, y_jojo, test_size = 0.2 )

In [None]:
# x_train.head()

In [None]:
# # Computing mean and standard devaitaion for train X and normalizing
# mean_train_X = x_train.apply(np.mean, axis=0)
# std_train_X = x_train.apply(np.std, axis=0)
# norm_train_X = (x_train - mean_train_X) / std_train_X

# # Computing mean and standard devaitaion for test X and normalizing 
# mean_test_X = x_test.apply(np.mean, axis=0)
# std_test_X = x_test.apply(np.std, axis=0)
# norm_test_X = (x_test - mean_test_X) / std_test_X


In [None]:
# # Importing functions for regression 
# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
# from sklearn.discriminant_analysis import (LinearDiscriminantAnalysis as LDA, 
#                                            QuadraticDiscriminantAnalysis as QDA)




# linear_model = LinearRegression()
# linear_model.fit(norm_train_X, y_train['dielectric_poly_total'])
# #linear_predictions = pd.DataFrame(linear_model.predict(norm_test_X),
#                                   #columns = ['dielectric_poly_total'])
# linear_predictions = linear_model.predict(norm_test_X)


# mae = mean_absolute_error(y_test, linear_predictions)

# mae





# # from sklearn.metrics import mean_squared_error, mean_absolute_error


# MAE = mean_absolute_error(y_test['dielectric_poly_total'],linear_predictions['dielectric_poly_total'])

# # RMSE = np.sqrt(mean_squared_error(y_test['dielectric_poly_total'], 
# #                           linear_predictions['dielectric_poly_total']))

# print("The MAE is:", MAE)
# # print("The RMSE is:", RMSE)


In [None]:
#mymodel= np.poly1d(np.polyfit(x_train, y_train, 4 ))

In [None]:
#r2 = r2_score(y_train, linear_predictions)
