In [1]:
# Importing Libraries
import numpy as np
import scipy as sp
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import AdaBoostRegressor, AdaBoostClassifier, \
    GradientBoostingClassifier, GradientBoostingRegressor, \
    RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import r2_score
from pymatgen.core import Element, Composition, periodic_table
from functools import partial


In [2]:
# Loading training and testing data
train= pd.read_csv("train.csv",index_col=False)
test= pd.read_csv("test.csv",index_col=False)

# Querying materials and their properties based on training data
from pymatgen.ext.matproj import MPRester
mpr = MPRester("241iWwhTEOaNmC6V")

# Using material IDs provided in training data to get corresponding information from MPD
data = mpr.query(criteria={"task_id": {"$in":train["material_id"].to_list()}}, properties=["material_id","energy",
        "energy_per_atom",
        "volume",
        "formation_energy_per_atom",
        "nsites",
        "pretty_formula",                                                                                  
        "nelements",
        "density",  "band_gap"])
df = pd.DataFrame(data)

  0%|          | 0/5619 [00:00<?, ?it/s]

In [3]:
#Making list of possible noble gases 
nobles = ["He","Ne", "Ar", "Kr", "Xe", "Rn", "Og"]
filt_data = df

#Iterating through data and using pandas drop function to filter out materials containing noble gas elements 
for i in nobles:
    filt_data = filt_data.drop(filt_data[filt_data['pretty_formula'].str.contains(i)].index)
 

In [4]:
listA = [a for a in filt_data["pretty_formula"]]
listB = [Composition(a) for a in listA]
listC = [a.elements for a in listB]

editC = [item for sublist in listC for item in sublist]
unique_editC = set(editC)
unique_editC

UL_editC = [a for a in unique_editC]
cprops = [a.data for a in UL_editC]

peel=set(UL_editC)
sy_editC = [a.symbol for a in UL_editC]
sy_editC

cprops_df = pd.DataFrame(cprops, index=sy_editC)

cprops_df.head()

Unnamed: 0,Atomic mass,Atomic no,Atomic orbitals,Atomic radius,Atomic radius calculated,Boiling point,Brinell hardness,Bulk modulus,Coefficient of linear thermal expansion,Common oxidation states,...,NMR Quadrupole Moment,Metallic radius,iupac_ordering,IUPAC ordering,Ground level,Ionization energies,Electron affinity,Ionic radii,Ionic radii hs,Ionic radii ls
H,1.00794,1,{'1s': -0.233471},0.25,0.53,20.28 K,no data MN m<sup>-2</sup>,no data GPa,no data x10<sup>-6</sup>K<sup>-1</sup>,"[-1, 1]",...,{'H-2': 2.86},no data,92,92,2S1/2,[13.598434599702],0.754598,,,
Li,6.941,3,"{'1s': -1.878564, '2s': -0.10554}",1.45,1.67,1615 K,no data MN m<sup>-2</sup>,11 GPa,46 x10<sup>-6</sup>K<sup>-1</sup>,[1],...,"{'Li-6': -0.808, 'Li-7': -40.1}",1.52,11,11,2S1/2,"[5.391714996, 75.640097, 122.45435914]",0.618049,{'1': 0.9},,
Be,9.012182,4,"{'1s': -3.856411, '2s': -0.205744}",1.05,1.12,2742 K,600 MN m<sup>-2</sup>,130 GPa,11.3 x10<sup>-6</sup>K<sup>-1</sup>,[2],...,{'Be-9': 52.88},1.12,17,17,1S0,"[9.322699, 18.21115, 153.896205, 217.7185861]",-0.52,{'2': 0.59},,
B,10.811,5,"{'1s': -6.564347, '2p': -0.136603, '2s': -0.34...",0.85,0.87,4200 K,no data MN m<sup>-2</sup>,320 GPa,6 x10<sup>-6</sup>K<sup>-1</sup>,[3],...,"{'B-10': 84.59, 'B-11': 40.59}",no data,81,81,2P°1/2,"[8.298019, 25.15483, 37.93059, 259.3715, 340.2...",0.279723,{'3': 0.41},,
C,12.0107,6,"{'1s': -9.947718, '2p': -0.199186, '2s': -0.50...",0.7,0.67,4300 K,no data MN m<sup>-2</sup>,33 GPa,7.1 x10<sup>-6</sup>K<sup>-1</sup>,"[-4, 4]",...,{'C-11': 33.27},no data,86,86,3P0,"[11.260288, 24.383154, 47.88778, 64.49352, 392...",1.262114,{'4': 0.3},,


In [5]:
droplist = ['Ionic radii',
    'Ionic radii hs', 
     'Ionic radii ls',
     'iupac_ordering', 
     'IUPAC ordering', 
     'NMR Quadrupole Moment', 
     'Reflectivity',
     'Refractive index', 
     'Rigidity modulus', 
     'Shannon radii',
     'Superconduction temperature',
     'Mendeleev no',       
     'Mineral hardness',
     'Molar volume',
     'Name',
     'Oxidation states',
     'ICSD oxidation states',
     'Brinell hardness',
     'Atomic orbitals', 
     'Coefficient of linear thermal expansion',
     'Atomic orbitals',
     'Electronic structure',
     'Electrical resistivity',
     'Ground level'
           ]


cprops_df = cprops_df.drop(columns=droplist)
cprops_df

Unnamed: 0,Atomic mass,Atomic no,Atomic radius,Atomic radius calculated,Boiling point,Bulk modulus,Common oxidation states,Critical temperature,Density of solid,Liquid range,...,Poissons ratio,Thermal conductivity,Van der waals radius,Velocity of sound,Vickers hardness,X,Youngs modulus,Metallic radius,Ionization energies,Electron affinity
H,1.007940,1,0.25,0.53,20.28 K,no data GPa,"[-1, 1]",33 K,no data kg m<sup>-3</sup>,6.27 K,...,no data,0.1805 W m<sup>-1</sup> K<sup>-1</sup>,1.10,1270 m s<sup>-1</sup>,no data MN m<sup>-2</sup>,2.20,no data GPa,no data,[13.598434599702],0.754598
Li,6.941000,3,1.45,1.67,1615 K,11 GPa,[1],3223 K,535 kg m<sup>-3</sup>,1161.31 K,...,no data,85 W m<sup>-1</sup> K<sup>-1</sup>,1.82,6000 m s<sup>-1</sup>,no data MN m<sup>-2</sup>,0.98,4.9 GPa,1.52,"[5.391714996, 75.640097, 122.45435914]",0.618049
Be,9.012182,4,1.05,1.12,2742 K,130 GPa,[2],no data K,1848 kg m<sup>-3</sup>,1182 K,...,0.032,190 W m<sup>-1</sup> K<sup>-1</sup>,1.53,13000 m s<sup>-1</sup>,1670 MN m<sup>-2</sup>,1.57,287 GPa,1.12,"[9.322699, 18.21115, 153.896205, 217.7185861]",-0.520000
B,10.811000,5,0.85,0.87,4200 K,320 GPa,[3],no data K,2460 kg m<sup>-3</sup>,1851 K,...,no data,27 W m<sup>-1</sup> K<sup>-1</sup>,1.92,16200 m s<sup>-1</sup>,49000 MN m<sup>-2</sup>,2.04,no data GPa,no data,"[8.298019, 25.15483, 37.93059, 259.3715, 340.2...",0.279723
C,12.010700,6,0.70,0.67,4300 K,33 GPa,"[-4, 4]",no data K,2267 kg m<sup>-3</sup>,500 K,...,no data,140 W m<sup>-1</sup> K<sup>-1</sup>,1.70,18350 m s<sup>-1</sup>,no data MN m<sup>-2</sup>,2.55,no data GPa,no data,"[11.260288, 24.383154, 47.88778, 64.49352, 392...",1.262114
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Pb,207.200000,82,1.80,1.54,2022 K,46 GPa,"[2, 4]",no data K,11340 kg m<sup>-3</sup>,1421.39 K,...,0.44,35 W m<sup>-1</sup> K<sup>-1</sup>,2.02,1260 m s<sup>-1</sup>,no data MN m<sup>-2</sup>,2.33,16 GPa,1.75,"[7.4166799, 15.032499, 31.9373, 42.33256, 68.8...",0.356721
Bi,208.980400,83,1.60,1.43,1837 K,31 GPa,[3],no data K,9780 kg m<sup>-3</sup>,1292.6 K,...,0.33,8 W m<sup>-1</sup> K<sup>-1</sup>,2.07,1790 m s<sup>-1</sup>,no data MN m<sup>-2</sup>,2.02,32 GPa,1.82,"[7.285516, 16.703, 25.57075, 45.37, 54.856, 88...",0.942362
Ac,227.000000,89,1.95,no data,3573 K,no data GPa,[3],no data K,10070 kg m<sup>-3</sup>,2250 K,...,no data,12 W m<sup>-1</sup> K<sup>-1</sup>,2.47,no data m s<sup>-1</sup>,no data MN m<sup>-2</sup>,1.10,no data GPa,1.878,"[5.380226, 11.75, 17.431, 44.8, 55.0, 67.0, 79...",0.350000
Th,232.038060,90,1.80,no data,5093 K,54 GPa,[4],no data K,11724 kg m<sup>-3</sup>,2978 K,...,0.27,54 W m<sup>-1</sup> K<sup>-1</sup>,2.45,2490 m s<sup>-1</sup>,350 MN m<sup>-2</sup>,1.30,79 GPa,1.798,"[6.3067, 12.1, 18.32, 28.648, 58.0, 69.1, 82.0...",1.170000


In [6]:
cprops_df['Boiling point'] = [(a.replace('K', '', 1)) for a in cprops_df['Boiling point']]
cprops_df['Bulk modulus'] = cprops_df["Bulk modulus"].str.replace("GPa", "")
cprops_df['Critical temperature'] = cprops_df["Critical temperature"].str.replace("K", "")
cprops_df['Density of solid'] = cprops_df['Density of solid'].str.replace("no data", "NaN")
cprops_df['Density of solid'] = cprops_df["Density of solid"].str.replace("kg m<sup>-3</sup>", "")
cprops_df['Liquid range'] = [(a.replace('K', '', 1)) for a in cprops_df['Liquid range']]
cprops_df['Poissons ratio'] = [(a.replace('no data', 'NaN', 1)) for a in cprops_df['Poissons ratio']]
cprops_df['Poissons ratio'] = cprops_df['Poissons ratio'].str.replace("no data", "").astype(float)
cprops_df['Thermal conductivity'] = [float(a.replace('W m<sup>-1</sup> K<sup>-1</sup>', "", 1)) for a in cprops_df['Thermal conductivity']]
cprops_df['Velocity of sound'] = cprops_df['Velocity of sound'].str.replace("no data","NaN")
cprops_df['Velocity of sound'] = [(a.replace('m s<sup>-1</sup>', '', 1)) for a in cprops_df['Velocity of sound']]
cprops_df['Vickers hardness'] = cprops_df['Vickers hardness'].str.replace("no data", "NaN")
cprops_df['Vickers hardness'] = [(a.replace('MN m<sup>-2</sup>', '', 1)) for a in cprops_df['Vickers hardness']]
cprops_df['Youngs modulus'] = cprops_df['Youngs modulus'].str.replace("no data", "NaN")
cprops_df['Youngs modulus'] = [(a.replace('GPa', '', 1)) for a in cprops_df['Youngs modulus']]
cprops_df['Bulk modulus'] = [(a.replace('no data', 'NaN', 1)) for a in cprops_df['Bulk modulus']]
cprops_df['Bulk modulus'] = [(a.replace('liquid', '', 1)) for a in cprops_df['Bulk modulus']]
cprops_df['Bulk modulus'] = cprops_df['Bulk modulus'].str.replace(r"\(.*\)","",  regex=True).astype(float)
cprops_df['Melting point'] = cprops_df['Melting point'].str.replace("K", "")
cprops_df['Melting point'] = cprops_df['Melting point'].str.replace("white P", "")
cprops_df['Melting point'] = cprops_df['Melting point'].str.replace(r"\(.*\)","",  regex=True).astype(float)
cprops_df['Metallic radius'] = cprops_df['Metallic radius'].astype(str)
cprops_df['Metallic radius'] = [(a.replace('no data', 'NaN', 1)) for a in cprops_df['Metallic radius']]
cprops_df['Metallic radius'] = cprops_df['Metallic radius'].astype(float)
cprops_df['Common oxidation states'] = [len(a) for a in cprops_df['Common oxidation states']]
cprops_df['First Ionization Energy'] = [a[0] for a in cprops_df['Ionization energies']]

cprops_df = cprops_df.drop("Ionization energies", axis=1)

cprops_df['Critical temperature'] = [(a.replace('no data', 'NaN', 1)) for a in cprops_df['Critical temperature']]


In [7]:
cprops_df = cprops_df.apply(pd.to_numeric, errors='coerce')

In [8]:
# Computing mean values using mean function 
mvals = dict(cprops_df.mean())
mvals

{'Atomic mass': 107.06698397475002,
 'Atomic no': 44.7,
 'Atomic radius': 1.479999999999999,
 'Atomic radius calculated': 1.6537837837837837,
 'Boiling point': 2696.401625,
 'Bulk modulus': 91.6441176470588,
 'Common oxidation states': 1.6625,
 'Critical temperature': 1285.5176470588235,
 'Density of solid': 8067.890410958904,
 'Liquid range': 1317.8828749999998,
 'Melting point': 1383.99375,
 'Poissons ratio': 0.2942692307692307,
 'Thermal conductivity': 66.761243875,
 'Van der waals radius': 2.1264999999999996,
 'Velocity of sound': 3723.0015151515154,
 'Vickers hardness': 2108.3076923076924,
 'X': 1.7591249999999998,
 'Youngs modulus': 111.78688524590164,
 'Metallic radius': 1.622590909090909,
 'Electron affinity': 0.7264296744124998,
 'First Ionization Energy': 7.683228095414148}

In [9]:
# Iterating through variable with averages to replace the NaN values in element_data
for key, value in mvals.items():
    cprops_df.loc[cprops_df[key].isnull(),key] = value

In [10]:
filt_data_comp = filt_data
filt_data_comp['Composition'] = [Composition(c) for c in filt_data_comp["pretty_formula"]]
filt_data_comp['num_atoms'] = [c.num_atoms for c in filt_data_comp['Composition']]
filt_data_comp['volume_per_atom'] = filt_data_comp['volume']/filt_data_comp['num_atoms']

In [11]:
cprops_df_dict = cprops_df.to_dict()

rad_dict = cprops_df_dict['Atomic radius']

def radius_mean(composition):
    sumofradii = 0
    totnumatoms = 0
    for element, number in composition.items():
        sumofradii += (number*rad_dict[str(element)])
        totnumatoms += number
    return sumofradii/totnumatoms

atomic_radius = filt_data_comp['Composition'].apply(radius_mean)
atomic_radius

0       1.314286
1       1.275000
2       1.262500
3       1.125000
4       0.761765
          ...   
5614    0.957143
5615    1.000000
5616    0.883333
5617    0.766667
5618    0.835000
Name: Composition, Length: 5614, dtype: float64

In [12]:
#jojo version

def propertymean(property, composition):
    sumofproperty = 0
    totalnumatoms = 0
    for element, number in composition.items():
        sumofproperty += (number*cprops_df_dict[property][str(element)])
        totalnumatoms += number
    return sumofproperty/totalnumatoms

def maxofproperty(property, composition):
    propmax = None
    for element, number in composition.items():
        propertyvalue = cprops_df_dict[property][str(element)]
        if propmax:
            propmax = propertyvalue if propertyvalue > propmax else propmax
        else:
            propmax = propertyvalue
    return propmax

def minofproperty(property, composition):
    propmin = None
    for element, number in composition.items():
        propertyvalue = cprops_df_dict[property][str(element)]
        if propmin:
            propmin = propertyvalue if propertyvalue < propmin else propmin
        else:
            propmin = propertyvalue
    return propmin

In [13]:
avg_properties_df = pd.DataFrame()

for property in cprops_df.columns:
    individualpropertymean = partial(propertymean, property)
    averages = filt_data_comp['Composition'].apply(individualpropertymean)
    avg_properties_df[("average_" + property)] = averages
    
avg_properties_df.head()
print("Average properties Dimension: ", avg_properties_df.shape)

max_properties = pd.DataFrame()

for property in cprops_df.columns:
    individualpropertymax = partial(maxofproperty, property)
    max = filt_data_comp['Composition'].apply(individualpropertymax)
    max_properties[("max_" + property)] = max
    
min_properties = pd.DataFrame()

for property in cprops_df.columns:
    individualpropertymin = partial(minofproperty, property)
    min = filt_data_comp['Composition'].apply(individualpropertymin)
    min_properties[("min_" + property)] = min

Average properties Dimension:  (5614, 21)


In [14]:
design_matrix= pd.concat([filt_data_comp, avg_properties_df, min_properties, max_properties], axis=1)
design_matrix.columns

droplist2 = ['volume','energy', 'pretty_formula', 'Composition', 'average_Common oxidation states', 'min_Common oxidation states','max_Common oxidation states']
design_matrix = design_matrix.drop(columns=droplist2)
design_matrix

Unnamed: 0,material_id,energy_per_atom,formation_energy_per_atom,nsites,nelements,density,band_gap,num_atoms,volume_per_atom,average_Atomic mass,...,max_Poissons ratio,max_Thermal conductivity,max_Van der waals radius,max_Velocity of sound,max_Vickers hardness,max_X,max_Youngs modulus,max_Metallic radius,max_Electron affinity,max_First Ionization Energy
0,mp-1001034,-3.715648,-0.703661,14,3,5.030727,0.7432,7.0,53.735123,81.397286,...,0.330000,160.0,1.93,4602.000000,2108.307692,2.55,45.000000,1.670000,2.020605,9.752392
1,mp-1001780,-5.591101,-1.699567,4,3,6.400668,1.5031,4.0,19.628830,75.660750,...,0.340000,400.0,2.24,3723.001515,2108.307692,2.58,130.000000,1.735000,2.077105,10.360010
2,mp-1001786,-5.900228,-2.103385,4,3,2.687084,1.5296,4.0,17.925309,29.006728,...,0.294269,85.0,2.15,6000.000000,2108.307692,2.58,111.786885,1.641000,2.077105,10.360010
3,mp-1002124,-9.890380,-0.298402,2,2,9.868236,0.5774,2.0,16.027883,95.250350,...,0.370000,140.0,2.23,18350.000000,2108.307692,2.55,111.786885,1.622591,1.262114,11.260288
4,mp-1004528,-5.614998,-3.162482,17,4,3.068458,6.3125,17.0,19.116437,35.324664,...,0.440000,36.0,3.43,16200.000000,49000.000000,3.98,111.786885,2.719000,3.401190,17.422820
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5614,mvc-6916,-7.098949,-2.307339,14,3,4.200544,0.0516,7.0,22.383730,28.311241,...,0.294269,160.0,2.05,5150.000000,2108.307692,3.44,198.000000,1.622591,1.461105,13.618055
5615,mvc-6928,-6.543137,-2.192463,14,3,4.119621,1.7463,7.0,24.848833,30.823657,...,0.310000,200.0,2.31,4910.000000,2108.307692,3.44,211.000000,1.976000,1.461105,13.618055
5616,mvc-6946,-6.122609,-1.961400,6,2,5.131960,2.1009,3.0,32.509720,50.236267,...,0.360000,67.0,2.17,2500.000000,2108.307692,3.44,111.786885,1.622591,1.461105,13.618055
5617,mvc-7040,-8.067357,-1.999722,18,2,3.840256,3.4689,9.0,46.546747,53.823311,...,0.300000,48.0,2.16,4700.000000,2450.000000,3.44,463.000000,1.622591,1.461105,13.618055


In [15]:
IDs = list(filt_data["material_id"])
IDs = pd.DataFrame(IDs)

train_filt = train[train.index.isin(IDs.index)] 
train_filt

Unnamed: 0,material_id,dielectric_poly_total
0,mp-555903,8.337936
1,mp-752658,14.735277
2,mp-3439,17.195305
3,mp-16135,21.593507
4,mp-36447,9.507068
...,...,...
5609,mp-754117,13.378949
5610,mp-1539137,16.908907
5611,mp-1079559,11.776195
5612,mp-555908,6.241000


In [16]:
train_filt.set_index('material_id')

Unnamed: 0_level_0,dielectric_poly_total
material_id,Unnamed: 1_level_1
mp-555903,8.337936
mp-752658,14.735277
mp-3439,17.195305
mp-16135,21.593507
mp-36447,9.507068
...,...
mp-754117,13.378949
mp-1539137,16.908907
mp-1079559,11.776195
mp-555908,6.241000


In [17]:
design_matrix

Unnamed: 0,material_id,energy_per_atom,formation_energy_per_atom,nsites,nelements,density,band_gap,num_atoms,volume_per_atom,average_Atomic mass,...,max_Poissons ratio,max_Thermal conductivity,max_Van der waals radius,max_Velocity of sound,max_Vickers hardness,max_X,max_Youngs modulus,max_Metallic radius,max_Electron affinity,max_First Ionization Energy
0,mp-1001034,-3.715648,-0.703661,14,3,5.030727,0.7432,7.0,53.735123,81.397286,...,0.330000,160.0,1.93,4602.000000,2108.307692,2.55,45.000000,1.670000,2.020605,9.752392
1,mp-1001780,-5.591101,-1.699567,4,3,6.400668,1.5031,4.0,19.628830,75.660750,...,0.340000,400.0,2.24,3723.001515,2108.307692,2.58,130.000000,1.735000,2.077105,10.360010
2,mp-1001786,-5.900228,-2.103385,4,3,2.687084,1.5296,4.0,17.925309,29.006728,...,0.294269,85.0,2.15,6000.000000,2108.307692,2.58,111.786885,1.641000,2.077105,10.360010
3,mp-1002124,-9.890380,-0.298402,2,2,9.868236,0.5774,2.0,16.027883,95.250350,...,0.370000,140.0,2.23,18350.000000,2108.307692,2.55,111.786885,1.622591,1.262114,11.260288
4,mp-1004528,-5.614998,-3.162482,17,4,3.068458,6.3125,17.0,19.116437,35.324664,...,0.440000,36.0,3.43,16200.000000,49000.000000,3.98,111.786885,2.719000,3.401190,17.422820
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5614,mvc-6916,-7.098949,-2.307339,14,3,4.200544,0.0516,7.0,22.383730,28.311241,...,0.294269,160.0,2.05,5150.000000,2108.307692,3.44,198.000000,1.622591,1.461105,13.618055
5615,mvc-6928,-6.543137,-2.192463,14,3,4.119621,1.7463,7.0,24.848833,30.823657,...,0.310000,200.0,2.31,4910.000000,2108.307692,3.44,211.000000,1.976000,1.461105,13.618055
5616,mvc-6946,-6.122609,-1.961400,6,2,5.131960,2.1009,3.0,32.509720,50.236267,...,0.360000,67.0,2.17,2500.000000,2108.307692,3.44,111.786885,1.622591,1.461105,13.618055
5617,mvc-7040,-8.067357,-1.999722,18,2,3.840256,3.4689,9.0,46.546747,53.823311,...,0.300000,48.0,2.16,4700.000000,2450.000000,3.44,463.000000,1.622591,1.461105,13.618055


In [18]:
design_matrix.set_index('material_id',inplace=True)
design_matrix

Unnamed: 0_level_0,energy_per_atom,formation_energy_per_atom,nsites,nelements,density,band_gap,num_atoms,volume_per_atom,average_Atomic mass,average_Atomic no,...,max_Poissons ratio,max_Thermal conductivity,max_Van der waals radius,max_Velocity of sound,max_Vickers hardness,max_X,max_Youngs modulus,max_Metallic radius,max_Electron affinity,max_First Ionization Energy
material_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
mp-1001034,-3.715648,-0.703661,14,3,5.030727,0.7432,7.0,53.735123,81.397286,35.142857,...,0.330000,160.0,1.93,4602.000000,2108.307692,2.55,45.000000,1.670000,2.020605,9.752392
mp-1001780,-5.591101,-1.699567,4,3,6.400668,1.5031,4.0,19.628830,75.660750,33.000000,...,0.340000,400.0,2.24,3723.001515,2108.307692,2.58,130.000000,1.735000,2.077105,10.360010
mp-1001786,-5.900228,-2.103385,4,3,2.687084,1.5296,4.0,17.925309,29.006728,14.000000,...,0.294269,85.0,2.15,6000.000000,2108.307692,2.58,111.786885,1.641000,2.077105,10.360010
mp-1002124,-9.890380,-0.298402,2,2,9.868236,0.5774,2.0,16.027883,95.250350,39.000000,...,0.370000,140.0,2.23,18350.000000,2108.307692,2.55,111.786885,1.622591,1.262114,11.260288
mp-1004528,-5.614998,-3.162482,17,4,3.068458,6.3125,17.0,19.116437,35.324664,15.294118,...,0.440000,36.0,3.43,16200.000000,49000.000000,3.98,111.786885,2.719000,3.401190,17.422820
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
mvc-6916,-7.098949,-2.307339,14,3,4.200544,0.0516,7.0,22.383730,28.311241,13.428571,...,0.294269,160.0,2.05,5150.000000,2108.307692,3.44,198.000000,1.622591,1.461105,13.618055
mvc-6928,-6.543137,-2.192463,14,3,4.119621,1.7463,7.0,24.848833,30.823657,14.857143,...,0.310000,200.0,2.31,4910.000000,2108.307692,3.44,211.000000,1.976000,1.461105,13.618055
mvc-6946,-6.122609,-1.961400,6,2,5.131960,2.1009,3.0,32.509720,50.236267,22.000000,...,0.360000,67.0,2.17,2500.000000,2108.307692,3.44,111.786885,1.622591,1.461105,13.618055
mvc-7040,-8.067357,-1.999722,18,2,3.840256,3.4689,9.0,46.546747,53.823311,22.888889,...,0.300000,48.0,2.16,4700.000000,2450.000000,3.44,463.000000,1.622591,1.461105,13.618055


In [19]:
X_train, X_test, y_train, y_test = train_test_split(design_matrix, 
                                                    train_filt,
                                                    test_size=0.1, 
                                                    random_state=120)

In [44]:
# Computing mean and standard devaitaion for train X and normalizing
mean_train_X = X_train.apply(np.mean, axis=0)
std_train_X = X_train.apply(np.std, axis=0)
norm_train_X = (X_train - mean_train_X) / std_train_X

# Computing mean and standard devaitaion for test X and normalizing 
mean_test_X = X_test.apply(np.mean, axis=0)
std_test_X = X_test.apply(np.std, axis=0)
norm_test_X = (X_test - mean_test_X) / std_test_X

# Test Data Querying

In [20]:
# Using material IDs provided in training data to get corresponding information from MPD
data22 = mpr.query(criteria={"task_id": {"$in":test["material_id"].to_list()}}, properties=["material_id","energy",
        "energy_per_atom",
        "volume",
        "formation_energy_per_atom",
        "nsites",
        "pretty_formula",                                                                                  
        "nelements",
        "density",  "band_gap"])
df_X = pd.DataFrame(data22)
display(df_X)

  0%|          | 0/1400 [00:00<?, ?it/s]

Unnamed: 0,material_id,energy,energy_per_atom,volume,formation_energy_per_atom,nsites,pretty_formula,nelements,density,band_gap
0,mp-1000,-8.646946,-4.323473,89.091084,-1.792337,2,BaTe,2,4.937886,1.8555
1,mp-10044,-11.376365,-5.688183,27.982045,-0.019233,2,BAs,2,5.087631,1.2522
2,mp-10086,-42.877615,-7.146269,99.212472,-3.359731,6,YSF,3,4.685387,1.3137
3,mp-1008858,-16.665812,-5.555271,80.958490,-0.944922,3,NdBiPd,3,9.427722,0.0900
4,mp-1008867,-11.760729,-3.920243,38.301187,-0.701211,3,NaCuO,3,4.445388,0.0017
...,...,...,...,...,...,...,...,...,...,...
1395,mvc-5908,-73.434572,-7.343457,120.736933,-2.771448,10,Ca2CrWO6,4,5.666228,1.6101
1396,mvc-5921,-64.816637,-6.481664,143.204106,-2.354943,10,Mg2SnWO6,4,5.185052,2.2892
1397,mvc-7386,-129.945458,-6.497273,386.834593,-1.936434,20,ZnMo2O7,3,3.170408,3.7020
1398,mvc-7701,-147.825353,-5.685591,330.380852,-1.957811,26,Zn2Sn3O8,3,6.181576,1.2507


In [21]:
filt_data_X = df_X

listA_X = [a for a in filt_data_X["pretty_formula"]]
listB_X = [Composition(a) for a in listA_X]
listC_X = [a.elements for a in listB_X]

editC_X = [item for sublist in listC_X for item in sublist]
unique_editC_X = set(editC_X)
unique_editC_X

UL_editC_X = [a for a in unique_editC_X]
cprops_X = [a.data for a in UL_editC_X]

peel=set(UL_editC_X)
sy_editC_X = [a.symbol for a in UL_editC_X]
sy_editC_X

pd.set_option('display.max_rows', None)

cprops_df_X = pd.DataFrame(cprops_X, index=sy_editC_X)

droplist2 = droplist + ['Max oxidation state', 'Min oxidation state']

droplist2

cprops_df_X = cprops_df_X.drop(columns=droplist2)
cprops_df_X

Unnamed: 0,Atomic mass,Atomic no,Atomic radius,Atomic radius calculated,Boiling point,Bulk modulus,Common oxidation states,Critical temperature,Density of solid,Liquid range,...,Poissons ratio,Thermal conductivity,Van der waals radius,Velocity of sound,Vickers hardness,X,Youngs modulus,Metallic radius,Ionization energies,Electron affinity
H,1.00794,1,0.25,0.53,20.28 K,no data GPa,"[-1, 1]",33 K,no data kg m<sup>-3</sup>,6.27 K,...,no data,0.1805 W m<sup>-1</sup> K<sup>-1</sup>,1.1,1270 m s<sup>-1</sup>,no data MN m<sup>-2</sup>,2.2,no data GPa,no data,[13.598434599702],0.754598
Li,6.941,3,1.45,1.67,1615 K,11 GPa,[1],3223 K,535 kg m<sup>-3</sup>,1161.31 K,...,no data,85 W m<sup>-1</sup> K<sup>-1</sup>,1.82,6000 m s<sup>-1</sup>,no data MN m<sup>-2</sup>,0.98,4.9 GPa,1.52,"[5.391714996, 75.640097, 122.45435914]",0.618049
Be,9.012182,4,1.05,1.12,2742 K,130 GPa,[2],no data K,1848 kg m<sup>-3</sup>,1182 K,...,0.032,190 W m<sup>-1</sup> K<sup>-1</sup>,1.53,13000 m s<sup>-1</sup>,1670 MN m<sup>-2</sup>,1.57,287 GPa,1.12,"[9.322699, 18.21115, 153.896205, 217.7185861]",-0.52
B,10.811,5,0.85,0.87,4200 K,320 GPa,[3],no data K,2460 kg m<sup>-3</sup>,1851 K,...,no data,27 W m<sup>-1</sup> K<sup>-1</sup>,1.92,16200 m s<sup>-1</sup>,49000 MN m<sup>-2</sup>,2.04,no data GPa,no data,"[8.298019, 25.15483, 37.93059, 259.3715, 340.2...",0.279723
C,12.0107,6,0.7,0.67,4300 K,33 GPa,"[-4, 4]",no data K,2267 kg m<sup>-3</sup>,500 K,...,no data,140 W m<sup>-1</sup> K<sup>-1</sup>,1.7,18350 m s<sup>-1</sup>,no data MN m<sup>-2</sup>,2.55,no data GPa,no data,"[11.260288, 24.383154, 47.88778, 64.49352, 392...",1.262114
N,14.0067,7,0.65,0.56,77.36 K,no data GPa,"[-3, 3, 5]",126.2 K,no data kg m<sup>-3</sup>,14.31 K,...,no data,0.02583 W m<sup>-1</sup> K<sup>-1</sup>,1.55,333.6 m s<sup>-1</sup>,no data MN m<sup>-2</sup>,3.04,no data GPa,no data,"[14.53413, 29.60125, 47.4453, 77.4735, 97.8901...",-0.07
O,15.9994,8,0.6,0.48,90.2 K,no data GPa,[-2],154.6 K,no data kg m<sup>-3</sup>,35.4 K,...,no data,0.02658 W m<sup>-1</sup> K<sup>-1</sup>,1.52,317.5 m s<sup>-1</sup>,no data MN m<sup>-2</sup>,3.44,no data GPa,no data,"[13.618055, 35.12112, 54.93554, 77.4135, 113.8...",1.461105
F,18.998403,9,0.5,0.42,85.03 K,no data GPa,[-1],144 K,no data kg m<sup>-3</sup>,31.5 K,...,no data,0.0277 W m<sup>-1</sup> K<sup>-1</sup>,1.47,no data m s<sup>-1</sup>,no data MN m<sup>-2</sup>,3.98,no data GPa,no data,"[17.42282, 34.97081, 62.70798, 87.175, 114.249...",3.40119
Na,22.989769,11,1.8,1.9,1156 K,6.3 GPa,[1],2573 K,968 kg m<sup>-3</sup>,785.13 K,...,no data,140 W m<sup>-1</sup> K<sup>-1</sup>,2.27,3200 m s<sup>-1</sup>,no data MN m<sup>-2</sup>,0.93,10 GPa,1.86,"[5.13907696, 47.28636, 71.62, 98.936, 138.404,...",0.547926
Mg,24.305,12,1.5,1.45,1363 K,45 GPa,[2],no data K,1738 kg m<sup>-3</sup>,440 K,...,0.29,160 W m<sup>-1</sup> K<sup>-1</sup>,1.73,4602 m s<sup>-1</sup>,no data MN m<sup>-2</sup>,1.31,45 GPa,1.6,"[7.646236, 15.035271, 80.1436, 109.2654, 141.3...",-0.42


In [22]:
cprops_df_X['Boiling point'] = [(a.replace('K', '', 1)) for a in cprops_df_X['Boiling point']]
cprops_df_X['Bulk modulus'] = cprops_df_X["Bulk modulus"].str.replace("GPa", "")
cprops_df_X['Critical temperature'] = cprops_df_X["Critical temperature"].str.replace("K", "")
cprops_df_X['Density of solid'] = cprops_df_X['Density of solid'].str.replace("no data", "NaN")
cprops_df_X['Density of solid'] = cprops_df_X["Density of solid"].str.replace("kg m<sup>-3</sup>", "")
cprops_df_X['Liquid range'] = [(a.replace('K', '', 1)) for a in cprops_df_X['Liquid range']]
cprops_df_X['Poissons ratio'] = [(a.replace('no data', 'NaN', 1)) for a in cprops_df_X['Poissons ratio']]
#cprops_df_X['Poissons ratio'] = cprops_df_X['Poissons ratio'].str.replace("no data", "").astype(float)
cprops_df_X['Thermal conductivity'] = [float(a.replace('W m<sup>-1</sup> K<sup>-1</sup>', "", 1)) for a in cprops_df_X['Thermal conductivity']]
cprops_df_X['Velocity of sound'] = cprops_df_X['Velocity of sound'].str.replace("no data","NaN")
cprops_df_X['Velocity of sound'] = [(a.replace('m s<sup>-1</sup>', '', 1)) for a in cprops_df_X['Velocity of sound']]
cprops_df_X['Vickers hardness'] = cprops_df_X['Vickers hardness'].str.replace("no data", "NaN", 1)
cprops_df_X['Vickers hardness'] = [(a.replace('MN m<sup>-2</sup>', '', 1)) for a in cprops_df_X['Vickers hardness']]
cprops_df_X['Youngs modulus'] = cprops_df_X['Youngs modulus'].str.replace("no data", "NaN", 1)
cprops_df_X['Youngs modulus'] = [(a.replace('GPa', '', 1)) for a in cprops_df_X['Youngs modulus']]
cprops_df_X['Bulk modulus'] = [(a.replace('no data', 'NaN', 1)) for a in cprops_df_X['Bulk modulus']]
cprops_df_X['Bulk modulus'] = [(a.replace('liquid', '', 1)) for a in cprops_df_X['Bulk modulus']]
cprops_df_X['Bulk modulus'] = cprops_df_X['Bulk modulus'].str.replace(r"\(.*\)","",  regex=True).astype(float)
cprops_df_X['Melting point'] = cprops_df_X['Melting point'].str.replace("K", "")
cprops_df_X['Melting point'] = cprops_df_X['Melting point'].str.replace("white P", "")
cprops_df_X['Melting point'] = cprops_df_X['Melting point'].str.replace(r"\(.*\)","",  regex=True).astype(float)
cprops_df_X['Metallic radius'] = cprops_df_X['Metallic radius'].astype(str)
cprops_df_X['Metallic radius'] = [(a.replace('no data', 'NaN', 1)) for a in cprops_df_X['Metallic radius']]
#cprops_df_X['Metallic radius'] = cprops_df_X['Metallic radius'].astype(float)
#cprops_df_X['Common oxidation states'] = [len(a) for a in cprops_df_X['Common oxidation states']]
cprops_df_X['First Ionization Energy'] = [a[0] for a in cprops_df_X['Ionization energies']]

cprops_df_X = cprops_df_X.drop("Ionization energies", axis=1)

cprops_df_X['Critical temperature'] = [(a.replace('no data', 'NaN', 1)) for a in cprops_df_X['Critical temperature']]


In [23]:
# cprops_df_X['Boiling point'] = [(a.replace('K', '', 1)) for a in cprops_df_X['Boiling point']]
# cprops_df_X['Bulk modulus'] = cprops_df_X["Bulk modulus"].str.replace("GPa", "", 1)
# cprops_df_X['Critical temperature'] = cprops_df_X["Critical temperature"].str.replace("K", "", 1)
# cprops_df_X['Density of solid'] = cprops_df_X['Density of solid'].str.replace("no data", "NaN", 1)
# cprops_df_X['Density of solid'] = cprops_df_X["Density of solid"].str.replace("kg m<sup>-3</sup>", "", 1)
# cprops_df_X['Liquid range'] = [(a.replace('K', '', 1)) for a in cprops_df_X['Liquid range']]
# cprops_df_X['Poissons ratio'] = [(a.replace('no data', 'NaN', 1)) for a in cprops_df_X['Poissons ratio']]
# cprops_df_X['Poissons ratio'] = cprops_df_X['Poissons ratio'].str.replace("no data", "").astype(float)
# cprops_df_X['Thermal conductivity'] = [float(a.replace('W m<sup>-1</sup> K<sup>-1</sup>', "", 1)) for a in cprops_df_X['Thermal conductivity']]
# cprops_df_X['Velocity of sound'] = cprops_df_X['Velocity of sound'].str.replace("no data","NaN", 1)
# cprops_df_X['Velocity of sound'] = [(a.replace('m s<sup>-1</sup>', '', 1)) for a in cprops_df_X['Velocity of sound']]
# cprops_df_X['Vickers hardness'] = cprops_df_X['Vickers hardness'].str.replace("no data", "NaN", 1)
# cprops_df_X['Vickers hardness'] = [(a.replace('MN m<sup>-2</sup>', '', 1)) for a in cprops_df_X['Vickers hardness']]
# cprops_df_X['Youngs modulus'] = cprops_df_X['Youngs modulus'].str.replace("no data", "NaN", 1)
# cprops_df_X['Youngs modulus'] = [(a.replace('GPa', '', 1)) for a in cprops_df_X['Youngs modulus']]
# cprops_df_X['Bulk modulus'] = [(a.replace('no data', 'NaN', 1)) for a in cprops_df_X['Bulk modulus']]
# cprops_df_X['Bulk modulus'] = [(a.replace('liquid', '', 1)) for a in cprops_df_X['Bulk modulus']]
# cprops_df_X['Bulk modulus'] = cprops_df_X['Bulk modulus'].str.replace(r"\(.*\)","",  regex=True).astype(float)
# cprops_df_X['Melting point'] = cprops_df_X['Melting point'].str.replace("K", "", 1)
# cprops_df_X['Melting point'] = cprops_df_X['Melting point'].str.replace("white P", "", 1)
# cprops_df_X['Melting point'] = cprops_df_X['Melting point'].str.replace(r"\(.*\)","",  regex=True).astype(float)
# cprops_df_X['Metallic radius'] = cprops_df_X['Metallic radius'].astype(str)
# cprops_df_X['Metallic radius'] = [(a.replace('no data', 'NaN', 1)) for a in cprops_df_X['Metallic radius']]
# cprops_df_X['Metallic radius'] = cprops_df_X['Metallic radius'].astype(float)
# #cprops_df_X['Common oxidation states'] = [len(a) for a in cprops_df_X['Common oxidation states']]
# cprops_df_X['First Ionization Energy'] = [a[0] for a in cprops_df_X['Ionization energies']]

# cprops_df_X = cprops_df_X.drop("Ionization energies", axis=1)

# cprops_df_X['Critical temperature'] = [(a.replace('no data', 'NaN', 1)) for a in cprops_df_X['Critical temperature']]


In [24]:
cprops_df_X

Unnamed: 0,Atomic mass,Atomic no,Atomic radius,Atomic radius calculated,Boiling point,Bulk modulus,Common oxidation states,Critical temperature,Density of solid,Liquid range,...,Poissons ratio,Thermal conductivity,Van der waals radius,Velocity of sound,Vickers hardness,X,Youngs modulus,Metallic radius,Electron affinity,First Ionization Energy
H,1.00794,1,0.25,0.53,20.28,,"[-1, 1]",33.0,,6.27,...,,0.1805,1.1,1270.0,,2.2,,,0.754598,13.598435
Li,6.941,3,1.45,1.67,1615.0,11.0,[1],3223.0,535.0,1161.31,...,,85.0,1.82,6000.0,,0.98,4.9,1.52,0.618049,5.391715
Be,9.012182,4,1.05,1.12,2742.0,130.0,[2],,1848.0,1182.0,...,0.032,190.0,1.53,13000.0,1670.0,1.57,287.0,1.12,-0.52,9.322699
B,10.811,5,0.85,0.87,4200.0,320.0,[3],,2460.0,1851.0,...,,27.0,1.92,16200.0,49000.0,2.04,,,0.279723,8.298019
C,12.0107,6,0.7,0.67,4300.0,33.0,"[-4, 4]",,2267.0,500.0,...,,140.0,1.7,18350.0,,2.55,,,1.262114,11.260288
N,14.0067,7,0.65,0.56,77.36,,"[-3, 3, 5]",126.2,,14.31,...,,0.02583,1.55,333.6,,3.04,,,-0.07,14.53413
O,15.9994,8,0.6,0.48,90.2,,[-2],154.6,,35.4,...,,0.02658,1.52,317.5,,3.44,,,1.461105,13.618055
F,18.998403,9,0.5,0.42,85.03,,[-1],144.0,,31.5,...,,0.0277,1.47,,,3.98,,,3.40119,17.42282
Na,22.989769,11,1.8,1.9,1156.0,6.3,[1],2573.0,968.0,785.13,...,,140.0,2.27,3200.0,,0.93,10.0,1.86,0.547926,5.139077
Mg,24.305,12,1.5,1.45,1363.0,45.0,[2],,1738.0,440.0,...,0.29,160.0,1.73,4602.0,,1.31,45.0,1.6,-0.42,7.646236


In [25]:
cprops_df_X['Atomic radius'] = cprops_df_X["Atomic radius"].astype(str)


In [26]:
cprops_df_X['Atomic radius'] = cprops_df_X["Atomic radius"].str.replace("no data", "NaN", 1)

In [27]:
cprops_df_X

Unnamed: 0,Atomic mass,Atomic no,Atomic radius,Atomic radius calculated,Boiling point,Bulk modulus,Common oxidation states,Critical temperature,Density of solid,Liquid range,...,Poissons ratio,Thermal conductivity,Van der waals radius,Velocity of sound,Vickers hardness,X,Youngs modulus,Metallic radius,Electron affinity,First Ionization Energy
H,1.00794,1,0.25,0.53,20.28,,"[-1, 1]",33.0,,6.27,...,,0.1805,1.1,1270.0,,2.2,,,0.754598,13.598435
Li,6.941,3,1.45,1.67,1615.0,11.0,[1],3223.0,535.0,1161.31,...,,85.0,1.82,6000.0,,0.98,4.9,1.52,0.618049,5.391715
Be,9.012182,4,1.05,1.12,2742.0,130.0,[2],,1848.0,1182.0,...,0.032,190.0,1.53,13000.0,1670.0,1.57,287.0,1.12,-0.52,9.322699
B,10.811,5,0.85,0.87,4200.0,320.0,[3],,2460.0,1851.0,...,,27.0,1.92,16200.0,49000.0,2.04,,,0.279723,8.298019
C,12.0107,6,0.7,0.67,4300.0,33.0,"[-4, 4]",,2267.0,500.0,...,,140.0,1.7,18350.0,,2.55,,,1.262114,11.260288
N,14.0067,7,0.65,0.56,77.36,,"[-3, 3, 5]",126.2,,14.31,...,,0.02583,1.55,333.6,,3.04,,,-0.07,14.53413
O,15.9994,8,0.6,0.48,90.2,,[-2],154.6,,35.4,...,,0.02658,1.52,317.5,,3.44,,,1.461105,13.618055
F,18.998403,9,0.5,0.42,85.03,,[-1],144.0,,31.5,...,,0.0277,1.47,,,3.98,,,3.40119,17.42282
Na,22.989769,11,1.8,1.9,1156.0,6.3,[1],2573.0,968.0,785.13,...,,140.0,2.27,3200.0,,0.93,10.0,1.86,0.547926,5.139077
Mg,24.305,12,1.5,1.45,1363.0,45.0,[2],,1738.0,440.0,...,0.29,160.0,1.73,4602.0,,1.31,45.0,1.6,-0.42,7.646236


In [28]:
cprops_df_X.dtypes

Atomic mass                 float64
Atomic no                     int64
Atomic radius                object
Atomic radius calculated     object
Boiling point                object
Bulk modulus                float64
Common oxidation states      object
Critical temperature         object
Density of solid             object
Liquid range                 object
Melting point               float64
Poissons ratio               object
Thermal conductivity        float64
Van der waals radius        float64
Velocity of sound            object
Vickers hardness             object
X                           float64
Youngs modulus               object
Metallic radius              object
Electron affinity           float64
First Ionization Energy     float64
dtype: object

In [29]:
cprops_df_X = cprops_df_X.apply(pd.to_numeric, errors='coerce')
cprops_df_X.dtypes

Atomic mass                 float64
Atomic no                     int64
Atomic radius               float64
Atomic radius calculated    float64
Boiling point               float64
Bulk modulus                float64
Common oxidation states     float64
Critical temperature        float64
Density of solid            float64
Liquid range                float64
Melting point               float64
Poissons ratio              float64
Thermal conductivity        float64
Van der waals radius        float64
Velocity of sound           float64
Vickers hardness            float64
X                           float64
Youngs modulus              float64
Metallic radius             float64
Electron affinity           float64
First Ionization Energy     float64
dtype: object

In [30]:
# Computing mean values using mean function 
mvals_X = dict(cprops_df_X.mean()) #calculates 11
len(mvals_X)

for key, value in mvals_X.items():
    cprops_df_X.loc[cprops_df_X[key].isnull(),key] = value

display(cprops_df_X)

Unnamed: 0,Atomic mass,Atomic no,Atomic radius,Atomic radius calculated,Boiling point,Bulk modulus,Common oxidation states,Critical temperature,Density of solid,Liquid range,...,Poissons ratio,Thermal conductivity,Van der waals radius,Velocity of sound,Vickers hardness,X,Youngs modulus,Metallic radius,Electron affinity,First Ionization Energy
H,1.00794,1.0,0.25,0.53,20.28,92.330303,,33.0,7915.56338,6.27,...,0.29624,0.1805,1.1,1270.0,2153.891892,2.2,111.118644,1.621047,0.754598,13.598435
Li,6.941,3.0,1.45,1.67,1615.0,11.0,,3223.0,535.0,1161.31,...,0.29624,85.0,1.82,6000.0,2153.891892,0.98,4.9,1.52,0.618049,5.391715
Be,9.012182,4.0,1.05,1.12,2742.0,130.0,,1230.194444,1848.0,1182.0,...,0.032,190.0,1.53,13000.0,1670.0,1.57,287.0,1.12,-0.52,9.322699
B,10.811,5.0,0.85,0.87,4200.0,320.0,,1230.194444,2460.0,1851.0,...,0.29624,27.0,1.92,16200.0,49000.0,2.04,111.118644,1.621047,0.279723,8.298019
C,12.0107,6.0,0.7,0.67,4300.0,33.0,,1230.194444,2267.0,500.0,...,0.29624,140.0,1.7,18350.0,2153.891892,2.55,111.118644,1.621047,1.262114,11.260288
N,14.0067,7.0,0.65,0.56,77.36,92.330303,,126.2,7915.56338,14.31,...,0.29624,0.02583,1.55,333.6,2153.891892,3.04,111.118644,1.621047,-0.07,14.53413
O,15.9994,8.0,0.6,0.48,90.2,92.330303,,154.6,7915.56338,35.4,...,0.29624,0.02658,1.52,317.5,2153.891892,3.44,111.118644,1.621047,1.461105,13.618055
F,18.998403,9.0,0.5,0.42,85.03,92.330303,,144.0,7915.56338,31.5,...,0.29624,0.0277,1.47,3707.278462,2153.891892,3.98,111.118644,1.621047,3.40119,17.42282
Na,22.989769,11.0,1.8,1.9,1156.0,6.3,,2573.0,968.0,785.13,...,0.29624,140.0,2.27,3200.0,2153.891892,0.93,10.0,1.86,0.547926,5.139077
Mg,24.305,12.0,1.5,1.45,1363.0,45.0,,1230.194444,1738.0,440.0,...,0.29,160.0,1.73,4602.0,2153.891892,1.31,45.0,1.6,-0.42,7.646236


In [31]:
cprops_df_X.dtypes

Atomic mass                 float64
Atomic no                   float64
Atomic radius               float64
Atomic radius calculated    float64
Boiling point               float64
Bulk modulus                float64
Common oxidation states     float64
Critical temperature        float64
Density of solid            float64
Liquid range                float64
Melting point               float64
Poissons ratio              float64
Thermal conductivity        float64
Van der waals radius        float64
Velocity of sound           float64
Vickers hardness            float64
X                           float64
Youngs modulus              float64
Metallic radius             float64
Electron affinity           float64
First Ionization Energy     float64
dtype: object

In [32]:
filt_data_comp_X = filt_data_X
filt_data_comp_X['Composition'] = [Composition(c) for c in filt_data_comp_X["pretty_formula"]]
filt_data_comp_X['num_atoms'] = [c.num_atoms for c in filt_data_comp_X['Composition']]
filt_data_comp_X['volume_per_atom'] = filt_data_comp_X['volume']/filt_data_comp_X['num_atoms']
filt_data_comp_X

Unnamed: 0,material_id,energy,energy_per_atom,volume,formation_energy_per_atom,nsites,pretty_formula,nelements,density,band_gap,Composition,num_atoms,volume_per_atom
0,mp-1000,-8.646946,-4.323473,89.091084,-1.792337,2,BaTe,2,4.937886,1.8555,"(Ba, Te)",2.0,44.545542
1,mp-10044,-11.376365,-5.688183,27.982045,-0.01923329,2,BAs,2,5.087631,1.2522,"(B, As)",2.0,13.991023
2,mp-10086,-42.877615,-7.146269,99.212472,-3.359731,6,YSF,3,4.685387,1.3137,"(Y, S, F)",3.0,33.070824
3,mp-1008858,-16.665812,-5.555271,80.95849,-0.9449219,3,NdBiPd,3,9.427722,0.09,"(Nd, Bi, Pd)",3.0,26.986163
4,mp-1008867,-11.760729,-3.920243,38.301187,-0.7012111,3,NaCuO,3,4.445388,0.0017,"(Na, Cu, O)",3.0,12.767062
5,mp-1009087,-38.652556,-4.831569,155.051465,-0.2116146,8,BeSiAs2,3,4.004124,1.0226,"(Be, Si, As)",4.0,38.762866
6,mp-10096,-84.572613,-3.84421,570.491249,-0.7818146,22,Na3Sr3GaP4,4,3.058858,0.7467,"(Na, Sr, Ga, P)",11.0,51.862841
7,mp-1009894,-18.157914,-9.078957,33.282905,-0.1917217,2,ZrC,2,5.15055,0.5469,"(Zr, C)",2.0,16.641452
8,mp-1013911,-313.454217,-6.530296,637.920695,-2.496547,48,Na2CoP2O7,4,2.903504,3.0679,"(Na, Co, P, O)",12.0,53.160058
9,mp-1014013,-21.472932,-5.368233,534.97062,0.04505289,4,P,1,0.384568,0.8924,(P),1.0,534.97062


In [33]:




cprops_df_dict_X = cprops_df_X.to_dict()

rad_dict_X = cprops_df_dict_X['Atomic radius']

# atomic_radius_X = filt_data_comp_X['Composition'].apply(radius_mean)
# atomic_radius_X

In [34]:
cprops_df_dict_X

{'Atomic mass': {'H': 1.00794,
  'Li': 6.941,
  'Be': 9.012182,
  'B': 10.811,
  'C': 12.0107,
  'N': 14.0067,
  'O': 15.9994,
  'F': 18.9984032,
  'Na': 22.98976928,
  'Mg': 24.305,
  'Al': 26.9815386,
  'Si': 28.0855,
  'P': 30.973762,
  'S': 32.065,
  'Cl': 35.453,
  'K': 39.0983,
  'Ca': 40.078,
  'Sc': 44.955912,
  'Ti': 47.867,
  'V': 50.9415,
  'Cr': 51.9961,
  'Mn': 54.938045,
  'Fe': 55.845,
  'Co': 58.933195,
  'Ni': 58.6934,
  'Cu': 63.546,
  'Zn': 65.409,
  'Ga': 69.723,
  'Ge': 72.64,
  'As': 74.9216,
  'Se': 78.96,
  'Br': 79.904,
  'Rb': 85.4678,
  'Sr': 87.62,
  'Y': 88.90585,
  'Zr': 91.224,
  'Nb': 92.90638,
  'Mo': 95.94,
  'Tc': 98.0,
  'Ru': 101.07,
  'Rh': 102.9055,
  'Pd': 106.42,
  'Ag': 107.8682,
  'Cd': 112.411,
  'In': 114.818,
  'Sn': 118.71,
  'Sb': 121.76,
  'Te': 127.6,
  'I': 126.90447,
  'Xe': 131.293,
  'Cs': 132.9054519,
  'Ba': 137.327,
  'La': 138.90547,
  'Ce': 140.116,
  'Pr': 140.90765,
  'Nd': 144.242,
  'Sm': 150.36,
  'Eu': 151.964,
  'Tb': 15

In [35]:
# cprops_df_dict_X['Xe'] = None #adding new key-value pair

In [36]:
cprops_df_dict_X = cprops_df_X.to_dict()

rad_dict_X = cprops_df_dict_X['Atomic radius']

def radius_mean_X(composition):
    sumofradii = 0
    totnumatoms = 0
    for element, number in composition.items():
        sumofradii += (number*rad_dict_X[str(element)])
        totnumatoms += number
    return sumofradii/totnumatoms

atomic_radius_X = filt_data_comp_X['Composition'].apply(radius_mean_X)
atomic_radius_X

0       1.775000
1       1.000000
2       1.100000
3       1.616667
4       1.250000
5       1.112500
6       1.518182
7       1.125000
8       0.929167
9       1.000000
10      1.050000
11      1.337500
12      1.080000
13      1.237500
14      1.325000
15      1.150000
16      1.266667
17      1.240000
18      0.941667
19      1.566667
20      0.900000
21      1.046429
22      1.211111
23      0.954167
24      0.980000
25      0.958333
26      1.160000
27      0.770000
28      0.804545
29      0.784615
30      1.250000
31      0.966667
32      1.250000
33      1.383333
34      1.657143
35      1.012500
36      1.216667
37      1.160000
38      1.000000
39      1.212500
40      1.050000
41      1.016667
42      1.130000
43      1.066667
44      1.350000
45      1.200000
46      1.241667
47      1.050000
48      0.957500
49      1.328571
50      1.087500
51      1.336585
52      1.328049
53      1.391463
54      0.916667
55      0.842857
56      1.168182
57      0.840909
58      1.4250

In [37]:
#jojo version

def propertymean_X(property, composition):
    sumofproperty = 0
    totalnumatoms = 0
    for element, number in composition.items():
        sumofproperty += (number*cprops_df_dict_X[property][str(element)])
        totalnumatoms += number
    return sumofproperty/totalnumatoms

def maxofproperty_X(property, composition):
    propmax = None
    for element, number in composition.items():
        propertyvalue = cprops_df_dict_X[property][str(element)]
        if propmax:
            propmax = propertyvalue if propertyvalue > propmax else propmax
        else:
            propmax = propertyvalue
    return propmax

def minofproperty_X(property, composition):
    propmin = None
    for element, number in composition.items():
        propertyvalue = cprops_df_dict_X[property][str(element)]
        if propmin:
            propmin = propertyvalue if propertyvalue < propmin else propmin
        else:
            propmin = propertyvalue
    return propmin

In [38]:
avg_properties_df_X = pd.DataFrame()

for property in cprops_df_X.columns:
    individualpropertymean = partial(propertymean_X, property)
    averages = filt_data_comp_X['Composition'].apply(individualpropertymean)
    avg_properties_df_X[("average_" + property)] = averages
    
avg_properties_df_X.head()
print("Average properties Dimension: ", avg_properties_df_X.shape)

max_properties_X = pd.DataFrame()

for property in cprops_df.columns:
    individualpropertymax = partial(maxofproperty_X, property)
    max = filt_data_comp_X['Composition'].apply(individualpropertymax)
    max_properties_X[("max_" + property)] = max
    
min_properties_X = pd.DataFrame()

for property in cprops_df.columns:
    individualpropertymin = partial(minofproperty_X, property)
    min = filt_data_comp_X['Composition'].apply(individualpropertymin)
    min_properties_X[("min_" + property)] = min

Average properties Dimension:  (1400, 21)


In [39]:
filt_data_comp_X

Unnamed: 0,material_id,energy,energy_per_atom,volume,formation_energy_per_atom,nsites,pretty_formula,nelements,density,band_gap,Composition,num_atoms,volume_per_atom
0,mp-1000,-8.646946,-4.323473,89.091084,-1.792337,2,BaTe,2,4.937886,1.8555,"(Ba, Te)",2.0,44.545542
1,mp-10044,-11.376365,-5.688183,27.982045,-0.01923329,2,BAs,2,5.087631,1.2522,"(B, As)",2.0,13.991023
2,mp-10086,-42.877615,-7.146269,99.212472,-3.359731,6,YSF,3,4.685387,1.3137,"(Y, S, F)",3.0,33.070824
3,mp-1008858,-16.665812,-5.555271,80.95849,-0.9449219,3,NdBiPd,3,9.427722,0.09,"(Nd, Bi, Pd)",3.0,26.986163
4,mp-1008867,-11.760729,-3.920243,38.301187,-0.7012111,3,NaCuO,3,4.445388,0.0017,"(Na, Cu, O)",3.0,12.767062
5,mp-1009087,-38.652556,-4.831569,155.051465,-0.2116146,8,BeSiAs2,3,4.004124,1.0226,"(Be, Si, As)",4.0,38.762866
6,mp-10096,-84.572613,-3.84421,570.491249,-0.7818146,22,Na3Sr3GaP4,4,3.058858,0.7467,"(Na, Sr, Ga, P)",11.0,51.862841
7,mp-1009894,-18.157914,-9.078957,33.282905,-0.1917217,2,ZrC,2,5.15055,0.5469,"(Zr, C)",2.0,16.641452
8,mp-1013911,-313.454217,-6.530296,637.920695,-2.496547,48,Na2CoP2O7,4,2.903504,3.0679,"(Na, Co, P, O)",12.0,53.160058
9,mp-1014013,-21.472932,-5.368233,534.97062,0.04505289,4,P,1,0.384568,0.8924,(P),1.0,534.97062


In [40]:
design_matrix_X= pd.concat([filt_data_comp_X, avg_properties_df_X, min_properties_X, max_properties_X], axis=1)
design_matrix_X.columns

droplist2 = ['volume','energy', 'pretty_formula', 'Composition', 'average_Common oxidation states', 'min_Common oxidation states','max_Common oxidation states']

design_matrix_X = design_matrix_X.drop(columns=droplist2)
design_matrix_X

Unnamed: 0,material_id,energy_per_atom,formation_energy_per_atom,nsites,nelements,density,band_gap,num_atoms,volume_per_atom,average_Atomic mass,...,max_Poissons ratio,max_Thermal conductivity,max_Van der waals radius,max_Velocity of sound,max_Vickers hardness,max_X,max_Youngs modulus,max_Metallic radius,max_Electron affinity,max_First Ionization Energy
0,mp-1000,-4.323473,-1.792337,2,2,4.937886,1.8555,2.0,44.545542,132.4635,...,0.29624,18.0,2.68,2610.0,2153.891892,2.1,43.0,2.236,1.970876,9.009808
1,mp-10044,-5.688183,-0.01923329,2,2,5.087631,1.2522,2.0,13.991023,42.8663,...,0.29624,50.0,1.92,16200.0,49000.0,2.18,111.118644,1.621047,0.80482,9.78855
2,mp-10086,-7.146269,-3.359731,6,3,4.685387,1.3137,3.0,33.070824,46.656418,...,0.29624,17.0,2.32,3707.278462,2153.891892,3.98,111.118644,1.8,3.40119,17.42282
3,mp-1008858,-5.555271,-0.9449219,3,3,9.427722,0.09,3.0,26.986163,153.214133,...,0.39,72.0,2.39,3070.0,2153.891892,2.2,121.0,1.821,0.942362,8.336839
4,mp-1008867,-3.920243,-0.7012111,3,3,4.445388,0.0017,3.0,12.767062,34.17839,...,0.34,400.0,2.27,3570.0,2153.891892,3.44,130.0,1.86,1.461105,13.618055
5,mp-1009087,-4.831569,-0.2116146,8,3,4.004124,1.0226,4.0,38.762866,46.73522,...,0.29624,190.0,2.1,13000.0,2153.891892,2.18,287.0,1.621047,1.389521,9.78855
6,mp-10096,-3.84421,-0.7818146,22,4,3.058858,0.7467,11.0,51.862841,47.767941,...,0.29624,140.0,2.49,3707.278462,2153.891892,2.19,111.118644,2.151,0.746607,10.486686
7,mp-1009894,-9.078957,-0.1917217,2,2,5.15055,0.5469,2.0,16.641452,51.61735,...,0.34,140.0,2.23,18350.0,2153.891892,2.55,111.118644,1.621047,1.262114,11.260288
8,mp-1013911,-6.530296,-2.496547,48,4,2.903504,3.0679,12.0,53.160058,23.238005,...,0.31,140.0,2.27,4720.0,2153.891892,3.44,209.0,1.86,1.461105,13.618055
9,mp-1014013,-5.368233,0.04505289,4,1,0.384568,0.8924,1.0,534.97062,30.973762,...,0.29624,0.236,1.8,3707.278462,2153.891892,2.19,111.118644,1.621047,0.746607,10.486686


In [41]:
design_matrix_X.set_index('material_id',inplace=True)
design_matrix_X

Unnamed: 0_level_0,energy_per_atom,formation_energy_per_atom,nsites,nelements,density,band_gap,num_atoms,volume_per_atom,average_Atomic mass,average_Atomic no,...,max_Poissons ratio,max_Thermal conductivity,max_Van der waals radius,max_Velocity of sound,max_Vickers hardness,max_X,max_Youngs modulus,max_Metallic radius,max_Electron affinity,max_First Ionization Energy
material_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
mp-1000,-4.323473,-1.792337,2,2,4.937886,1.8555,2.0,44.545542,132.4635,54.0,...,0.29624,18.0,2.68,2610.0,2153.891892,2.1,43.0,2.236,1.970876,9.009808
mp-10044,-5.688183,-0.01923329,2,2,5.087631,1.2522,2.0,13.991023,42.8663,19.0,...,0.29624,50.0,1.92,16200.0,49000.0,2.18,111.118644,1.621047,0.80482,9.78855
mp-10086,-7.146269,-3.359731,6,3,4.685387,1.3137,3.0,33.070824,46.656418,21.333333,...,0.29624,17.0,2.32,3707.278462,2153.891892,3.98,111.118644,1.8,3.40119,17.42282
mp-1008858,-5.555271,-0.9449219,3,3,9.427722,0.09,3.0,26.986163,153.214133,63.0,...,0.39,72.0,2.39,3070.0,2153.891892,2.2,121.0,1.821,0.942362,8.336839
mp-1008867,-3.920243,-0.7012111,3,3,4.445388,0.0017,3.0,12.767062,34.17839,16.0,...,0.34,400.0,2.27,3570.0,2153.891892,3.44,130.0,1.86,1.461105,13.618055
mp-1009087,-4.831569,-0.2116146,8,3,4.004124,1.0226,4.0,38.762866,46.73522,21.0,...,0.29624,190.0,2.1,13000.0,2153.891892,2.18,287.0,1.621047,1.389521,9.78855
mp-10096,-3.84421,-0.7818146,22,4,3.058858,0.7467,11.0,51.862841,47.767941,21.636364,...,0.29624,140.0,2.49,3707.278462,2153.891892,2.19,111.118644,2.151,0.746607,10.486686
mp-1009894,-9.078957,-0.1917217,2,2,5.15055,0.5469,2.0,16.641452,51.61735,23.0,...,0.34,140.0,2.23,18350.0,2153.891892,2.55,111.118644,1.621047,1.262114,11.260288
mp-1013911,-6.530296,-2.496547,48,4,2.903504,3.0679,12.0,53.160058,23.238005,11.25,...,0.31,140.0,2.27,4720.0,2153.891892,3.44,209.0,1.86,1.461105,13.618055
mp-1014013,-5.368233,0.04505289,4,1,0.384568,0.8924,1.0,534.97062,30.973762,15.0,...,0.29624,0.236,1.8,3707.278462,2153.891892,2.19,111.118644,1.621047,0.746607,10.486686


In [42]:
lo = design_matrix_X.isnull().sum()
lo

energy_per_atom                     0
formation_energy_per_atom           0
nsites                              0
nelements                           0
density                             0
band_gap                            0
num_atoms                           0
volume_per_atom                     0
average_Atomic mass                 0
average_Atomic no                   0
average_Atomic radius               0
average_Atomic radius calculated    0
average_Boiling point               0
average_Bulk modulus                0
average_Critical temperature        0
average_Density of solid            0
average_Liquid range                0
average_Melting point               0
average_Poissons ratio              0
average_Thermal conductivity        0
average_Van der waals radius        0
average_Velocity of sound           0
average_Vickers hardness            0
average_X                           0
average_Youngs modulus              0
average_Metallic radius             0
average_Elec

In [46]:
from sklearn.linear_model import LinearRegression
linear_model = LinearRegression()
linear_model.fit(norm_train_X, y_train['dielectric_poly_total'])
#linear_predictions = pd.DataFrame(linear_model.predict(norm_test_X),
                                #  columns = ['dielectric_poly_total'])
linear_predictions = linear_model.predict(X_test)

# linear_model.score(design_matrix, train_filt['dielectric_poly_total']) #CLOSE WE GOT TO PREDICTING x TEST....PROBABLY USE SOME FIXING A BETTER WAY TO TELL US HOW CLOSE OUR PREDICTION IS TO ACTUAL X TEST

In [47]:
linear_model.score?

In [48]:
X_data_test = design_matrix_X

#F_linear_predictions = linear_model.predict(df2)
Final_linear_predictions_X = linear_model.predict(X_data_test)
flp_X = pd.DataFrame(Final_linear_predictions_X)

d_list=[test, flp_X]

newsubs = pd.concat(d_list, axis=1)
newsubs_update = newsubs.rename(columns={0:'dielectric_poly_total'})
newsubs_update.to_csv("submission_test_6_12-02-2021.csv", index=False)

In [49]:
y_train.head()

Unnamed: 0,material_id,dielectric_poly_total
5,mp-29175,25.723003
886,mp-7436,12.047716
3192,mp-765958,7.014766
5364,mp-989548,12.402333
595,mp-6675,6.223076


#### SVR

In [50]:
from sklearn import svm, datasets
from sklearn.model_selection import cross_val_score
model_svr = svm.SVR(kernel='rbf', C=0.001
                    , epsilon=0.001,  tol=1e-5)
scores = -cross_val_score(model_svr, norm_train_X, y_train['dielectric_poly_total'], cv=5, scoring='neg_mean_absolute_error')

In [62]:
cross_val_score?

In [51]:
# Training score
np.mean(scores)

7.930859837414291

In [52]:
# Training score
np.mean(scores)

7.930859837414291

In [53]:
# Testing score
model_svr = svm.SVR(kernel='rbf', C=0.1, epsilon=0.1,  tol=1e-5)
model_svr.fit(norm_train_X, y_train['dielectric_poly_total'])
scores = -cross_val_score(model_svr, norm_test_X, y_test['dielectric_poly_total'], cv=5, scoring='neg_mean_absolute_error')
print(np.mean(np.abs(y_test['dielectric_poly_total'] - model_svr.predict(norm_test_X))))
np.mean(scores)

8.272759195284086


8.249384012637083

#### Random Forest Regressor

In [54]:
from sklearn.ensemble import RandomForestRegressor
model_rfr = RandomForestRegressor(n_estimators=100, max_depth=35, min_samples_split=2 , random_state=42)
scores = -cross_val_score(model_rfr, X_train, y_train['dielectric_poly_total'], cv=5, scoring='neg_mean_absolute_error')

In [55]:
# Training score
np.mean(scores)

9.76191447785529

In [56]:
# Training score
np.mean(scores)

9.76191447785529

In [57]:
# Testing score
model_rfr = RandomForestRegressor(n_estimators=100, max_depth=10, min_samples_split=4 , random_state=42)
model_rfr.fit(X_train, y_train['dielectric_poly_total'])
scores = -cross_val_score(model_rfr, X_test, y_test['dielectric_poly_total'], cv=5, scoring='neg_mean_absolute_error')
print(np.mean(np.abs(y_test['dielectric_poly_total'] - model_rfr.predict(X_test))))
np.mean(scores)

9.120111766335887


9.974078516982374

In [58]:
X_train.head()

Unnamed: 0_level_0,energy_per_atom,formation_energy_per_atom,nsites,nelements,density,band_gap,num_atoms,volume_per_atom,average_Atomic mass,average_Atomic no,...,max_Poissons ratio,max_Thermal conductivity,max_Van der waals radius,max_Velocity of sound,max_Vickers hardness,max_X,max_Youngs modulus,max_Metallic radius,max_Electron affinity,max_First Ionization Energy
material_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
mp-1006888,-5.846844,-2.216231,4,3,3.02843,2.3526,4.0,26.337598,48.033538,22.5,...,0.294269,100.0,2.75,3723.001515,2108.307692,2.58,111.786885,2.381,2.077105,10.36001
mp-1226281,-4.666992,-0.238231,10,4,4.641012,0.7251,10.0,24.887337,69.557182,30.5,...,0.37,430.0,2.11,5940.0,2108.307692,2.55,279.0,1.622591,2.020605,10.486686
mp-555229,-5.659234,-1.007094,18,4,1.237595,5.041,9.0,18.801127,7.006211,3.777778,...,0.294269,140.0,1.7,18350.0,2108.307692,3.44,111.786885,1.622591,1.461105,14.53413
mp-961675,-6.876485,-1.034626,3,3,4.894903,0.7107,3.0,15.223106,44.874358,21.333333,...,0.31,91.0,2.15,4970.0,2108.307692,2.19,200.0,1.641,1.157161,10.486686
mp-1190488,-6.908359,-3.107679,24,3,3.653744,3.0642,12.0,25.172896,27.694413,13.0,...,0.35,235.0,2.18,5100.0,2108.307692,3.44,111.786885,1.622591,1.461105,13.618055


In [60]:
# #‘neg_mean_absolute_error’

# # metrics.mean_absolute_error
# X, y = datasets.load_iris(return_X_y=True)
# clf = svm.SVC(random_state=0)
# cross_val_score(clf, X, y, cv=5, scoring='recall_macro')

# model = svm.SVC()
# cross_val_score(model, X, y, cv=5, scoring='wrong_choice')

In [61]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
import tensorflow.keras.layers as layers
import tensorflow as tf



# Creating model
model = Sequential()
model.add(layers.Dense(68, activation='relu', input_shape=(1,))) # Number of features, intput layer
model.add(layers.Dense(64, activation='relu')) # Beginning of hidden layers
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dense(16, activation='relu'))
model.add(layers.Dense(8, activation='relu')) 
model.add(layers.Dense(1)) # Output layer

# Compiling model
model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['accuracy', 'mean_absolute_error']) # Remove mean_absolute_error if you're getting error

# Fitting and gathering results
model.fit(X_train, y_train, epochs=30, batch_size=500)
results = model.evaluate(X_test, y_test)


ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type float).