In [None]:
!pip install numpy
!pip install scipy
!pip install seaborn
!pip install pandas
!pip install matplotlib
!pip install sklearn
!pip install pymatgen
!pip install tensorflow

In [None]:
# Importing Libraries
import numpy as np
import scipy as sp
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_predict, cross_val_score
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import AdaBoostRegressor, AdaBoostClassifier, \
    GradientBoostingClassifier, GradientBoostingRegressor, \
    RandomForestClassifier, RandomForestRegressor
from scipy import linalg
from sklearn.discriminant_analysis import (LinearDiscriminantAnalysis as LDA, QuadraticDiscriminantAnalysis as QDA)
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from pymatgen.core import Element, Composition, periodic_table
from functools import partial
from pymatgen.ext.matproj import MPRester
# The Materials Project API key is a credential and must not be committed with
# the notebook. Read it from the MP_API_KEY environment variable, and fall back
# to an interactive prompt when the variable is unset or empty.
import os
from getpass import getpass
mpr = MPRester(os.environ.get("MP_API_KEY") or getpass("Materials Project API key: "))
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn import svm, datasets
# import tensorflow as tf
# from tensorflow.keras.models import Sequential
# import tensorflow.keras.layers as layers
# import tensorflow as tf
from pandas.plotting import scatter_matrix


In [None]:
#loading the testing and training data into the file to be manipulated in several forms

# Read both competition files with identical options; index_col=False tells
# pandas not to use the first column as the index.
train, test = (
    pd.read_csv(csv_name, index_col=False)
    for csv_name in ("train.csv", "test.csv")
)



In [None]:
#Using pymatgen and the material ID we can query some of the data in our training set that we will use AND we will store this in a dataframe

# Fields requested from the Materials Project for every training material.
train_query_properties = [
    "material_id",
    "energy",
    "energy_per_atom",
    "volume",
    "formation_energy_per_atom",
    "nsites",
    "pretty_formula",
    "nelements",
    "density",
    "band_gap",
]

# Match on task_id against the material IDs listed in train.csv.
train_query_criteria = {"task_id": {"$in": train["material_id"].to_list()}}

base_data = mpr.query(criteria=train_query_criteria, properties=train_query_properties)
base_data_DF = pd.DataFrame(base_data)



In [None]:
#for the training data we will remove noble gases. Later on we will have to apply fixes to our code because the test set includes Xe 


#Making list of possible noble gases 
# Symbols of the noble gases to exclude from the training data.
nobles = ["He","Ne", "Ar", "Kr", "Xe", "Rn", "Og"]
NoNobles_base_data_DF = base_data_DF

# One pass per symbol: find the rows whose formula contains the symbol and
# drop them by index label. `drop` returns a new frame, so base_data_DF itself
# is not modified by this loop.
for i in nobles:
    contains_symbol = NoNobles_base_data_DF['pretty_formula'].str.contains(i)
    rows_to_remove = NoNobles_base_data_DF.index[contains_symbol]
    NoNobles_base_data_DF = NoNobles_base_data_DF.drop(index=rows_to_remove)
    
    

In [None]:
#this isn't the most efficient way to do this but why fix somthing that isnt technically broken
#I need to make an element properties dataframe similar to lab 2. How i did it in lab 2 was to get a list of unique elements in the NoNobles_base_data_DF and then making a dataframe that includes the data of each element


# Formula strings -> Composition objects -> per-material element lists.
listA = list(NoNobles_base_data_DF["pretty_formula"])
listB = list(map(Composition, listA))
listC = [comp.elements for comp in listB]

# Flatten the per-material lists and keep each element once.
editC = []
for element_group in listC:
    editC.extend(element_group)
unique_editC = set(editC)

# Freeze one iteration order so the property rows and the symbol index below
# are built from the same sequence.
UL_editC = list(unique_editC)
indiv_ELproperties_train = [element.data for element in UL_editC]

peel = set(UL_editC)
sy_editC = [element.symbol for element in UL_editC]

# One row per element (indexed by symbol), one column per key of `.data`.
indiv_ELproperties_train_DF = pd.DataFrame(indiv_ELproperties_train, index=sy_editC)

indiv_ELproperties_train_DF.head()



In [None]:
#Off top there are some properties that I don't want to use, either because there is significant lack of data for them, they are mostly words and not numbers or they are ranges, or they seem to be repeated
#there will be several droplists i will try to make this more ordered by numbering them all

# Element-property columns removed before cleaning: sparse, non-numeric,
# range-valued or redundant (see the cell comment above). Each name appears
# once; the original list repeated 'Atomic orbitals'.
droplist_1 = [
    'Ionic radii',
    'Ionic radii hs',
    'Ionic radii ls',
    'iupac_ordering',
    'IUPAC ordering',
    'NMR Quadrupole Moment',
    'Reflectivity',
    'Refractive index',
    'Rigidity modulus',
    'Shannon radii',
    'Superconduction temperature',
    'Mendeleev no',
    'Mineral hardness',
    'Molar volume',
    'Name',
    'Oxidation states',
    'ICSD oxidation states',
    'Brinell hardness',
    'Atomic orbitals',
    'Coefficient of linear thermal expansion',
    'Electronic structure',
    'Electrical resistivity',
    'Ground level',
]

# Raises KeyError if any listed column is absent, which is also what happens
# when this cell is run a second time on the already-trimmed frame.
indiv_ELproperties_train_DF = indiv_ELproperties_train_DF.drop(columns=droplist_1)

indiv_ELproperties_train_DF.head()


Initially we only queried and used the data available through pymatgen, without building any features like we did in lab 2, and that model's performance wasn't good. It was my first attempt, and it scored 173 when I submitted it to Kaggle. We talked to the TA and he said to build the features out. The `.data` attribute took us forever to find; I don't know why. But once we found it and had the values, we could at least start cleaning the data we had enough of. I did want to say that this part took us SO SO long. It was honestly very frustrating, and in the future I think it might be worth providing more examples of the data cleaning so that we can focus more on choosing features and models and less on data cleaning.

In [None]:
#Data cleaning lines. These should all work. But you can't run them twice without error. You are saving the new clean data into the dataframe so it wont be able to find anything since it is very column specific and once ran once
##there is nothing to fix. that being said the coerce to apply numeric is in a different box because i dont want to risk it making them numeric before i do a bunch of string operations. 

# Purpose: strip unit suffixes and "no data" markers from the string-valued
# element properties so they can be converted to numbers afterwards.
# The list-comprehension form calls str.replace on every entry, so it needs
# every entry in that column to be a string.

# --- unit suffixes and "no data" markers -----------------------------------
indiv_ELproperties_train_DF['Boiling point'] = [(a.replace('K', '', 1)) for a in indiv_ELproperties_train_DF['Boiling point']]
indiv_ELproperties_train_DF['Bulk modulus'] = indiv_ELproperties_train_DF["Bulk modulus"].str.replace("GPa", "")
indiv_ELproperties_train_DF['Critical temperature'] = indiv_ELproperties_train_DF["Critical temperature"].str.replace("K", "")
indiv_ELproperties_train_DF['Density of solid'] = indiv_ELproperties_train_DF['Density of solid'].str.replace("no data", "NaN")
indiv_ELproperties_train_DF['Density of solid'] = indiv_ELproperties_train_DF["Density of solid"].str.replace("kg m<sup>-3</sup>", "")
indiv_ELproperties_train_DF['Liquid range'] = [(a.replace('K', '', 1)) for a in indiv_ELproperties_train_DF['Liquid range']]
# The second 'Poissons ratio' statement finds no "no data" left to replace (the
# line above already rewrote it to "NaN"); it only performs the float cast.
indiv_ELproperties_train_DF['Poissons ratio'] = [(a.replace('no data', 'NaN', 1)) for a in indiv_ELproperties_train_DF['Poissons ratio']]
indiv_ELproperties_train_DF['Poissons ratio'] = indiv_ELproperties_train_DF['Poissons ratio'].str.replace("no data", "").astype(float)
# float() is applied directly here, so every entry must parse once the unit is removed.
indiv_ELproperties_train_DF['Thermal conductivity'] = [float(a.replace('W m<sup>-1</sup> K<sup>-1</sup>', "", 1)) for a in indiv_ELproperties_train_DF['Thermal conductivity']]
indiv_ELproperties_train_DF['Velocity of sound'] = indiv_ELproperties_train_DF['Velocity of sound'].str.replace("no data","NaN")
indiv_ELproperties_train_DF['Velocity of sound'] = [(a.replace('m s<sup>-1</sup>', '', 1)) for a in indiv_ELproperties_train_DF['Velocity of sound']]
indiv_ELproperties_train_DF['Vickers hardness'] = indiv_ELproperties_train_DF['Vickers hardness'].str.replace("no data", "NaN")
indiv_ELproperties_train_DF['Vickers hardness'] = [(a.replace('MN m<sup>-2</sup>', '', 1)) for a in indiv_ELproperties_train_DF['Vickers hardness']]
indiv_ELproperties_train_DF['Youngs modulus'] = indiv_ELproperties_train_DF['Youngs modulus'].str.replace("no data", "NaN")
indiv_ELproperties_train_DF['Youngs modulus'] = [(a.replace('GPa', '', 1)) for a in indiv_ELproperties_train_DF['Youngs modulus']]
# --- Bulk modulus: also drop the word "liquid" and any parenthesised text, then cast
indiv_ELproperties_train_DF['Bulk modulus'] = [(a.replace('no data', 'NaN', 1)) for a in indiv_ELproperties_train_DF['Bulk modulus']]
indiv_ELproperties_train_DF['Bulk modulus'] = [(a.replace('liquid', '', 1)) for a in indiv_ELproperties_train_DF['Bulk modulus']]
indiv_ELproperties_train_DF['Bulk modulus'] = indiv_ELproperties_train_DF['Bulk modulus'].str.replace(r"\(.*\)","",  regex=True).astype(float)
# --- Melting point: remove unit, the "white P" annotation and parenthesised text, then cast
indiv_ELproperties_train_DF['Melting point'] = indiv_ELproperties_train_DF['Melting point'].str.replace("K", "")
indiv_ELproperties_train_DF['Melting point'] = indiv_ELproperties_train_DF['Melting point'].str.replace("white P", "")
indiv_ELproperties_train_DF['Melting point'] = indiv_ELproperties_train_DF['Melting point'].str.replace(r"\(.*\)","",  regex=True).astype(float)
# --- Metallic radius: cast to str first so .replace can be called on every entry
indiv_ELproperties_train_DF['Metallic radius'] = indiv_ELproperties_train_DF['Metallic radius'].astype(str)
indiv_ELproperties_train_DF['Metallic radius'] = [(a.replace('no data', 'NaN', 1)) for a in indiv_ELproperties_train_DF['Metallic radius']]
indiv_ELproperties_train_DF['Metallic radius'] = indiv_ELproperties_train_DF['Metallic radius'].astype(float)
# --- sequence-valued columns reduced to a single number ----------------------
# 'Common oxidation states' becomes the number of entries; only the first entry
# of 'Ionization energies' is kept, as 'First Ionization Energy'.
indiv_ELproperties_train_DF['Common oxidation states'] = [len(a) for a in indiv_ELproperties_train_DF['Common oxidation states']]
indiv_ELproperties_train_DF['First Ionization Energy'] = [a[0] for a in indiv_ELproperties_train_DF['Ionization energies']]

# Dropping this column is why the cell cannot be run twice: the statement above
# would fail to find 'Ionization energies' on a second run.
indiv_ELproperties_train_DF = indiv_ELproperties_train_DF.drop("Ionization energies", axis=1)

indiv_ELproperties_train_DF['Critical temperature'] = [(a.replace('no data', 'NaN', 1)) for a in indiv_ELproperties_train_DF['Critical temperature']]


In [None]:
# Convert every column to a numeric dtype. With errors='coerce', any value that
# cannot be parsed as a number becomes NaN instead of raising.
indiv_ELproperties_train_DF = indiv_ELproperties_train_DF.apply(pd.to_numeric, errors='coerce')

In [None]:
#We need to compute the mean values of each column so that way we can place the means of each column in the spaces where we previously made sure there was NAN

#means
# Column name -> mean of that column (pandas skips NaN when averaging).
mean_col_vals = dict(indiv_ELproperties_train_DF.mean())


# Fill the gaps of each column with that column's own mean. A column whose
# mean is itself NaN (no numeric data at all) is left unchanged.
for key, value in mean_col_vals.items():
    indiv_ELproperties_train_DF[key] = indiv_ELproperties_train_DF[key].fillna(value)



In [None]:
#We talked with the TA about extensive vs intensive properties and we needed to remove volume and energy as a result. though we could keep volume/atom. however we would have to build that first. as it is just easier to build
##that once we cleaned the other data. we did that here

#NoNobles_base_data_DF



# Same object as NoNobles_base_data_DF (no copy is made), so the columns added
# below show up under both names.
NoNobles_basedata_DF_wcomp = NoNobles_base_data_DF

# Parsed composition of each material's pretty_formula.
NoNobles_basedata_DF_wcomp['Composition'] = NoNobles_basedata_DF_wcomp["pretty_formula"].map(Composition)

# Atom count of that parsed formula.
# NOTE(review): `pretty_formula` looks like a reduced formula while `volume` is
# presumably the volume of the whole calculated cell; if so, `nsites` is the
# matching divisor for a per-atom volume. Confirm, and change the training and
# test cells together so both feature sets stay consistent.
NoNobles_basedata_DF_wcomp['num_atoms'] = NoNobles_basedata_DF_wcomp['Composition'].map(lambda comp: comp.num_atoms)
NoNobles_basedata_DF_wcomp['volume_per_atom'] = NoNobles_basedata_DF_wcomp['volume'].div(NoNobles_basedata_DF_wcomp['num_atoms'])

# One row per material ID, now carrying the Composition objects that the
# per-element feature builders below consume.
NoNobles_basedata_DF_wcomp



In [None]:
#this was the best way i figured out to make the additional properties. this is how i did it in lab 2. radius mean function was removed beceause it is never added to a dataframe and is just a tester to make sure they work


# Nested lookup table {property name: {element symbol: value}} used by the
# aggregation functions below (DataFrame.to_dict with its default orientation).
indiv_ELproperties_train_DF_dict = indiv_ELproperties_train_DF.to_dict()

#my functions
def propertymean(property, composition):
    """Return the amount-weighted mean of an element property over a composition.

    property: column name in ``indiv_ELproperties_train_DF_dict``.
    composition: mapping of element -> amount; each key is looked up by ``str(key)``.

    Raises ZeroDivisionError when the composition is empty (or its amounts sum
    to zero) and KeyError when the property or an element is missing from the table.
    """
    weighted_values = []
    atom_counts = []
    for element, number in composition.items():
        weighted_values.append(number * indiv_ELproperties_train_DF_dict[property][str(element)])
        atom_counts.append(number)
    return sum(weighted_values) / sum(atom_counts)

def maxofproperty(property, composition):
    """Return the largest value of an element property among a composition's elements.

    property: column name in ``indiv_ELproperties_train_DF_dict``.
    composition: mapping of element -> amount; amounts are ignored and each key
        is looked up by ``str(key)``.

    Returns None for an empty composition. Raises KeyError when the property or
    an element is missing from the table.
    """
    propmax = None
    for element, number in composition.items():
        propertyvalue = indiv_ELproperties_train_DF_dict[property][str(element)]
        # Compare against None explicitly: a truthiness test would treat a
        # running maximum of 0 as "not set yet" and let a smaller (negative)
        # value overwrite it.
        if propmax is None or propertyvalue > propmax:
            propmax = propertyvalue
    return propmax

def minofproperty(property, composition):
    """Return the smallest value of an element property among a composition's elements.

    property: column name in ``indiv_ELproperties_train_DF_dict``.
    composition: mapping of element -> amount; amounts are ignored and each key
        is looked up by ``str(key)``.

    Returns None for an empty composition. Raises KeyError when the property or
    an element is missing from the table.
    """
    propmin = None
    for element, number in composition.items():
        propertyvalue = indiv_ELproperties_train_DF_dict[property][str(element)]
        # Compare against None explicitly: a truthiness test would treat a
        # running minimum of 0 as "not set yet" and let a larger value
        # overwrite it.
        if propmin is None or propertyvalue < propmin:
            propmin = propertyvalue
    return propmin



#assigning the values of those functions to a dataframe

# Build one feature column per (aggregate, element property) pair for every
# training material. Loop variables are deliberately NOT called `max`, `min`
# or `property`: binding those names at notebook scope replaces the Python
# builtins for every later cell.
train_element_properties = indiv_ELproperties_train_DF.columns
train_compositions = NoNobles_basedata_DF_wcomp['Composition']

# Amount-weighted mean of each property.
avg_properties_df = pd.DataFrame({
    "average_" + property_name: train_compositions.apply(partial(propertymean, property_name))
    for property_name in train_element_properties
})
print("Average properties Dimension: ", avg_properties_df.shape)

# Largest value of each property among the elements present.
max_properties = pd.DataFrame({
    "max_" + property_name: train_compositions.apply(partial(maxofproperty, property_name))
    for property_name in train_element_properties
})

# Smallest value of each property among the elements present.
min_properties = pd.DataFrame({
    "min_" + property_name: train_compositions.apply(partial(minofproperty, property_name))
    for property_name in train_element_properties
})


In [None]:
#Now we need to take all theses dataframes we have made with the min/max/average properties and put them in a single matrix that the models will be performed on


# Place the queried columns and the three aggregate-feature frames side by side
# (column-wise concat, aligned on the row index).
training_feature_blocks = [NoNobles_basedata_DF_wcomp, avg_properties_df, min_properties, max_properties]
ALL_Features_Matrix = pd.concat(training_feature_blocks, axis=1)

# Second drop list: columns that were needed to build features but must not
# reach the model (extensive quantities, non-numeric helpers, and the
# oxidation-state counts that could not be cleaned properly).
droplist_2 = [
    'volume',
    'energy',
    'pretty_formula',
    'Composition',
    'average_Common oxidation states',
    'min_Common oxidation states',
    'max_Common oxidation states',
]

# model_matrix_1 is the training feature matrix that is split later on.
model_matrix_1 = ALL_Features_Matrix.drop(droplist_2, axis=1)
model_matrix_1


In [None]:
#We removed the noble gases from the feature matrix. so we have to make sure that the part of the training set that is just MPIDS and dielectric polytotals has those removed as well
#While typing this i thought ... I removed noble gases from my training set. will that matter if one is included in my dataset.... something to think about for the next project. 

#list of the ids and a dataframe of them
# IDs of the materials that survived the noble-gas filter, kept as a one-column frame.
MPIDs = pd.DataFrame(list(NoNobles_base_data_DF["material_id"]))

# Index the feature matrix by material_id. The guard makes the cell re-runnable:
# once the column has become the index there is nothing left to move.
if 'material_id' in model_matrix_1.columns:
    model_matrix_1 = model_matrix_1.set_index('material_id')
model_matrix_1 = model_matrix_1[~model_matrix_1.index.duplicated()]

# Pick the target rows by material_id VALUE. Comparing train.index with
# MPIDs.index (both 0..n-1 position counters) would simply keep the first
# len(MPIDs) rows of train.csv, whichever materials those happen to be.
keep_rows = train['material_id'].isin(NoNobles_base_data_DF['material_id'])
# drop=False keeps 'material_id' available as a column as well as the index;
# set_index returns a new frame, so its result has to be kept.
training_targets = train[keep_rows].set_index('material_id', drop=False)
training_targets = training_targets[~training_targets.index.duplicated()]

# Put features and targets in the same row order (train_test_split pairs rows
# by position), keeping only materials present on both sides.
shared_ids = model_matrix_1.index.intersection(training_targets.index)
model_matrix_1 = model_matrix_1.loc[shared_ids]
NoNobles_training_set = training_targets.loc[shared_ids]

assert model_matrix_1.index.equals(NoNobles_training_set.index), "features and targets are misaligned"


In [None]:
#just in case i want to check what is in the NoNobles_training_set dataframe
#NoNobles_training_set

In [None]:
#just in case i want to check what is in the NoNobles_training_set dataframe
#model_matrix_1

Here is where you either run your model or start to do feature selection. For simplicity, I will do the feature selection here and then build drop lists according to the different features I would like to drop.
Ultimately I didn't really like any of my models, so it really doesn't matter which one I choose. I have no clue how people got better than 7ish... which Stephanie and I ultimately achieved working together, even though I only submitted my 8 on Kaggle. So I will do the feature selection first and then just do the splits for each model that I ultimately worked with.


Some other things we tried that didn't really improve the model accuracy were changing the test size and the random state.

With regards to feature selection.

I tried a heat map (too many properties to see, so I wound up just printing the correlations to a dataframe and looking at them there).
I tried a scatter_matrix (it never finished running, so I assume it was just too much data; essentially the same problem).

I like printing the correlations to a dataframe and coloring them to determine which ones are important. (It was brought up that this is low-tech and could lead to missing some, but the other approaches seemed like more work than they offered in results. Additionally, I never figured out exactly how to apply two colors to the same map. You can ignore the entries that are exactly 1, obviously.)



In [None]:
#Getting the correlations
# Pairwise correlation between every pair of feature columns (DataFrame.corr
# with its default method); the diagonal is each feature against itself.
model_matrix_1_corr = model_matrix_1.corr()

In [None]:
#how to make the positive correlatins red

def color_negative_red(val):
    """Return the CSS colour rule for one correlation cell.

    Despite the name, red marks strong POSITIVE correlations: values strictly
    between 0.85 and 1.00. Everything else, including exactly 1.00, is black.
    """
    is_strong_positive = 0.85 < val < 1.00
    return 'color: ' + ('red' if is_strong_positive else 'black')


model_matrix_1_corr.style.applymap(color_negative_red)


In [None]:
#how to make the negative correlatins blue

def color_negative_blue(val):
    """Return the CSS colour rule for one correlation cell.

    Blue marks strong negative correlations: values strictly between -1.00 and
    -0.85. Everything else, including exactly -1.00, is black.
    """
    is_strong_negative = -1.00 < val < -0.85
    return 'color: ' + ('blue' if is_strong_negative else 'black')

model_matrix_1_corr.style.applymap(color_negative_blue)

In [None]:
#need this block because you need to be able to see which ones are correlated without having to go through the dataframe which is kinda tedius

lookthru = model_matrix_1_corr.columns

# feature -> list of OTHER features whose correlation with it exceeds 0.85.
# Features with no such partner are left out of the dict.
pos_correlated_ones_dict = {}

for col in lookthru:
    above_threshold = model_matrix_1_corr.index[model_matrix_1_corr[col] > 0.85]
    # The feature always correlates perfectly with itself; leave it out.
    var = [name for name in above_threshold if name != col]
    if var:
        pos_correlated_ones_dict[col] = var
print(pos_correlated_ones_dict)


This is the subset of positively correlated features that you might remove:

{'average_Atomic mass': ['average_Atomic no'],  
'average_Atomic radius': ['average_Atomic radius calculated', 'average_Van der waals radius'],
'average_Boiling point': ['average_Liquid range', 'average_Melting point'],
'average_Melting point': ['average_Boiling point'], 
'average_X': ['average_First Ionization Energy'], 
'average_Electron affinity': ['max_Electron affinity'], 
 'min_Atomic mass': ['min_Atomic no'], 
 'min_Atomic radius': ['min_Atomic radius calculated', 'min_Critical temperature', 'min_Van der waals radius'], 
 'min_Liquid range': ['min_Boiling point'], 
 'min_Melting point': ['min_Boiling point'],
 'min_Van der waals radius': ['min_Atomic radius'], 
 'min_First Ionization Energy': ['min_X'], 
  'max_Atomic no': ['max_Atomic mass'], 
  'max_Atomic radius calculated': ['max_Atomic radius', 'max_Van der waals radius', 'max_Metallic radius'], 
  
  
  This is a subset of the negatively correlated features that you might remove:
  
  'average_Atomic radius': ['average_First Ionization Energy']
  

In [None]:
#need this block because you need to be able to see which ones are correlated without having to go through the dataframe which is kinda tedius

lookthru2 = model_matrix_1_corr.columns

# feature -> list of other features whose correlation with it is below -0.85.
# Features with no such partner are left out of the dict.
neg_correlated_ones_dict = {}

for col in lookthru2:
    below_threshold = model_matrix_1_corr.index[model_matrix_1_corr[col] < -0.85]
    var2 = [name for name in below_threshold if name != col]
    if var2:
        neg_correlated_ones_dict[col] = var2
print(neg_correlated_ones_dict)


In [None]:
# I dont really believe you need to run this. But I'll leave it in to show you i tried it. 
# this is the heat map to display the correlations but there are just too many here to sus out and i didnt know a better way than the one above. 

# 14 x 10 inch canvas, set at creation time; the heatmap is drawn on this axes.
fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(model_matrix_1_corr, ax=ax)

In [None]:
#This one i tried but i really want to leave this hashtagged because it took soooooo long to run and never completed but again leaving it in so that you know i tried


# scatter_matrix(model_matrix_1, figsize=(20,20))



In [None]:
#Feature Selection: this is a list of the features that that I will drop from model_matrix_1 based on the feature selection

# Features flagged as redundant by the correlation review above.
# NOTE(review): this list is only defined here; none of the cells visible in
# this part of the notebook drops these columns from model_matrix_1 or
# model_matrix_2 — confirm whether it is applied further down.
droplist_3 = ['max_Van der waals radius','min_Liquid range','max_Atomic radius', 'max_Atomic radius calculated', 'max_Metallic radius']



Test Data Querying

For the sake of saving time, and because we could submit our files to Kaggle to check whether we were right, we skip straight to the test data here. Ideally we would have kept training the model until we got very close to accurately predicting the Ytest_train data. But ultimately we just trained a model, predicted the values for the actual test set, and determined how close we were on Kaggle. So there is no real point in doing the actual model priming here when it is not something that we actually did. We can just query the test data now, try to predict the values, and see how close we get.


In [None]:
# Use the material IDs listed in test.csv to fetch the same Materials Project
# fields that were requested for the training set.
test_query_properties = [
    "material_id",
    "energy",
    "energy_per_atom",
    "volume",
    "formation_energy_per_atom",
    "nsites",
    "pretty_formula",
    "nelements",
    "density",
    "band_gap",
]
test_query_criteria = {"task_id": {"$in": test["material_id"].to_list()}}

base_data_test = mpr.query(criteria=test_query_criteria, properties=test_query_properties)
base_data_test_DF = pd.DataFrame(base_data_test)
#display(base_data_test_DF)

In [None]:
#Here is where for the Training data we removed the Noble Gases but here we will not be doing that so im not sure we actually need to call this anytype of filters



# Formula strings -> Composition objects -> per-material element lists.
listA_X = list(base_data_test_DF["pretty_formula"])
listB_X = list(map(Composition, listA_X))
listC_X = [comp.elements for comp in listB_X]

# Flatten and de-duplicate the elements that occur in the test set.
editC_X = []
for element_group in listC_X:
    editC_X.extend(element_group)
unique_editC_X = set(editC_X)

# Freeze one iteration order so property rows and symbol index line up.
UL_editC_X = list(unique_editC_X)
indiv_ELproperties_test = [element.data for element in UL_editC_X]

peel = set(UL_editC_X)
sy_editC_X = [element.symbol for element in UL_editC_X]

#pd.set_option('display.max_rows', None)

# One row per element (indexed by symbol), one column per key of `.data`.
indiv_ELproperties_test_DF = pd.DataFrame(indiv_ELproperties_test, index=sy_editC_X)

# droplist_4 = the training drop list plus two extra columns that appear here
# because xenon is part of the test set.
droplist_4 = droplist_1 + ['Max oxidation state', 'Min oxidation state']

#droplist_4

indiv_ELproperties_test_DF = indiv_ELproperties_test_DF.drop(droplist_4, axis=1)
#indiv_ELproperties_test_DF

In [None]:
#Data cleaning of the test data set

# Purpose: strip unit suffixes and "no data" markers from the string-valued
# element properties of the test set, mirroring the training-set cleaning.
# The list-comprehension form calls str.replace on every entry, so it needs
# every entry in that column to be a string.

# --- unit suffixes and "no data" markers -----------------------------------
indiv_ELproperties_test_DF['Boiling point'] = [(a.replace('K', '', 1)) for a in indiv_ELproperties_test_DF['Boiling point']]
indiv_ELproperties_test_DF['Bulk modulus'] = indiv_ELproperties_test_DF["Bulk modulus"].str.replace("GPa", "")
indiv_ELproperties_test_DF['Critical temperature'] = indiv_ELproperties_test_DF["Critical temperature"].str.replace("K", "")
indiv_ELproperties_test_DF['Density of solid'] = indiv_ELproperties_test_DF['Density of solid'].str.replace("no data", "NaN")
indiv_ELproperties_test_DF['Density of solid'] = indiv_ELproperties_test_DF["Density of solid"].str.replace("kg m<sup>-3</sup>", "")
indiv_ELproperties_test_DF['Liquid range'] = [(a.replace('K', '', 1)) for a in indiv_ELproperties_test_DF['Liquid range']]
# The second 'Poissons ratio' statement finds no "no data" left to replace (the
# line above already rewrote it to "NaN"); it only performs the float cast.
indiv_ELproperties_test_DF['Poissons ratio'] = [(a.replace('no data', 'NaN', 1)) for a in indiv_ELproperties_test_DF['Poissons ratio']]
indiv_ELproperties_test_DF['Poissons ratio'] = indiv_ELproperties_test_DF['Poissons ratio'].str.replace("no data", "").astype(float)
# float() is applied directly here, so every entry must parse once the unit is removed.
indiv_ELproperties_test_DF['Thermal conductivity'] = [float(a.replace('W m<sup>-1</sup> K<sup>-1</sup>', "", 1)) for a in indiv_ELproperties_test_DF['Thermal conductivity']]
indiv_ELproperties_test_DF['Velocity of sound'] = indiv_ELproperties_test_DF['Velocity of sound'].str.replace("no data","NaN")
indiv_ELproperties_test_DF['Velocity of sound'] = [(a.replace('m s<sup>-1</sup>', '', 1)) for a in indiv_ELproperties_test_DF['Velocity of sound']]
# Unlike the training cell, the two .str.replace calls below pass a third
# positional argument (1), limiting the replacement to one occurrence per entry.
indiv_ELproperties_test_DF['Vickers hardness'] = indiv_ELproperties_test_DF['Vickers hardness'].str.replace("no data", "NaN", 1)
indiv_ELproperties_test_DF['Vickers hardness'] = [(a.replace('MN m<sup>-2</sup>', '', 1)) for a in indiv_ELproperties_test_DF['Vickers hardness']]
indiv_ELproperties_test_DF['Youngs modulus'] = indiv_ELproperties_test_DF['Youngs modulus'].str.replace("no data", "NaN", 1)
indiv_ELproperties_test_DF['Youngs modulus'] = [(a.replace('GPa', '', 1)) for a in indiv_ELproperties_test_DF['Youngs modulus']]
# --- Bulk modulus: also drop the word "liquid" and any parenthesised text, then cast
indiv_ELproperties_test_DF['Bulk modulus'] = [(a.replace('no data', 'NaN', 1)) for a in indiv_ELproperties_test_DF['Bulk modulus']]
indiv_ELproperties_test_DF['Bulk modulus'] = [(a.replace('liquid', '', 1)) for a in indiv_ELproperties_test_DF['Bulk modulus']]
indiv_ELproperties_test_DF['Bulk modulus'] = indiv_ELproperties_test_DF['Bulk modulus'].str.replace(r"\(.*\)","",  regex=True).astype(float)
# --- Melting point: remove unit, the "white P" annotation and parenthesised text, then cast
indiv_ELproperties_test_DF['Melting point'] = indiv_ELproperties_test_DF['Melting point'].str.replace("K", "")
indiv_ELproperties_test_DF['Melting point'] = indiv_ELproperties_test_DF['Melting point'].str.replace("white P", "")
indiv_ELproperties_test_DF['Melting point'] = indiv_ELproperties_test_DF['Melting point'].str.replace(r"\(.*\)","",  regex=True).astype(float)
# --- Metallic radius: cast to str first so .replace can be called on every entry
indiv_ELproperties_test_DF['Metallic radius'] = indiv_ELproperties_test_DF['Metallic radius'].astype(str)
indiv_ELproperties_test_DF['Metallic radius'] = [(a.replace('no data', 'NaN', 1)) for a in indiv_ELproperties_test_DF['Metallic radius']]
indiv_ELproperties_test_DF['Metallic radius'] = indiv_ELproperties_test_DF['Metallic radius'].astype(float)
# The oxidation-state count used for the training set is disabled here, so
# 'Common oxidation states' keeps its raw values in the test frame.
#indiv_ELproperties_test_DF['Common oxidation states'] = [len(a) for a in indiv_ELproperties_test_DF['Common oxidation states']]
# Keep only the first entry of 'Ionization energies'.
indiv_ELproperties_test_DF['First Ionization Energy'] = [a[0] for a in indiv_ELproperties_test_DF['Ionization energies']]

# Dropping this column is why the cell cannot be run twice: the statement above
# would fail to find 'Ionization energies' on a second run.
indiv_ELproperties_test_DF = indiv_ELproperties_test_DF.drop("Ionization energies", axis=1)

indiv_ELproperties_test_DF['Critical temperature'] = [(a.replace('no data', 'NaN', 1)) for a in indiv_ELproperties_test_DF['Critical temperature']]

#this code works but if for any reason there is an error you have to # out certain ones that cannot be run twice




In [None]:
#indiv_ELproperties_test_DF['First Ionization Energy'] = [a[0] for a in indiv_ELproperties_test_DF['Ionization energies']]

#indiv_ELproperties_test_DF = indiv_ELproperties_test_DF.drop("Ionization energies", axis=1)

# Same replacement as the last statement of the cleaning cell above; when that
# cell has already run there is no 'no data' text left, so this changes nothing.
indiv_ELproperties_test_DF['Critical temperature'] = [(a.replace('no data', 'NaN', 1)) for a in indiv_ELproperties_test_DF['Critical temperature']]

In [None]:
# Cast 'Atomic radius' to strings so the .str accessor in the next cell can be
# applied to every entry.
indiv_ELproperties_test_DF['Atomic radius'] = indiv_ELproperties_test_DF["Atomic radius"].astype(str)

In [None]:
# Rewrite the "no data" marker as "NaN" (at most one occurrence per entry) so
# the later numeric conversion yields a missing value.
indiv_ELproperties_test_DF['Atomic radius'] = indiv_ELproperties_test_DF["Atomic radius"].str.replace("no data", "NaN", 1)

In [None]:
indiv_ELproperties_test_DF

In [None]:
# Convert every column to a numeric dtype. With errors='coerce', any value that
# cannot be parsed as a number becomes NaN instead of raising.
indiv_ELproperties_test_DF = indiv_ELproperties_test_DF.apply(pd.to_numeric, errors='coerce')

In [None]:
indiv_ELproperties_test_DF.dtypes

In [None]:
#We need to compute the mean values of each column so that way we can place the means of each column in the spaces where we previously made sure there was NAN

#means
# Column name -> mean of that column over the test-set elements (NaN skipped).
# NOTE(review): these means come from the test elements, while the training
# table was filled with its own means, so the same element can receive a
# different fill value in the two tables — confirm this is intended.
mean_col_vals_test = dict(indiv_ELproperties_test_DF.mean())


# Fill the gaps of each column with that column's own mean. A column whose
# mean is itself NaN (no numeric data at all) is left unchanged.
for key, value in mean_col_vals_test.items():
    indiv_ELproperties_test_DF[key] = indiv_ELproperties_test_DF[key].fillna(value)

In [None]:
indiv_ELproperties_test_DF.dtypes

In [None]:
#Adding some columns that we need 

# Same object as base_data_test_DF (no copy is made), so the columns added
# below show up under both names.
base_data_test_DF_wcomp = base_data_test_DF

# Parsed composition of each material's pretty_formula.
base_data_test_DF_wcomp['Composition'] = base_data_test_DF_wcomp["pretty_formula"].map(Composition)

# Atom count of that parsed formula.
# NOTE(review): `pretty_formula` looks like a reduced formula while `volume` is
# presumably the volume of the whole calculated cell; if so, `nsites` is the
# matching divisor for a per-atom volume. Confirm, and change the training and
# test cells together so both feature sets stay consistent.
base_data_test_DF_wcomp['num_atoms'] = base_data_test_DF_wcomp['Composition'].map(lambda comp: comp.num_atoms)
base_data_test_DF_wcomp['volume_per_atom'] = base_data_test_DF_wcomp['volume'].div(base_data_test_DF_wcomp['num_atoms'])
#base_data_test_DF_wcomp

In [None]:


# Nested lookup table {property name: {element symbol: value}} used by the
# *_X aggregation functions below (DataFrame.to_dict with its default orientation).
indiv_ELproperties_test_DF_dict = indiv_ELproperties_test_DF.to_dict()


#my functions
def propertymean_X(property, composition):
    """Return the amount-weighted mean of an element property over a composition.

    Test-set counterpart of ``propertymean``; values come from
    ``indiv_ELproperties_test_DF_dict``.

    property: column name in the lookup table.
    composition: mapping of element -> amount; each key is looked up by ``str(key)``.

    Raises ZeroDivisionError when the composition is empty (or its amounts sum
    to zero) and KeyError when the property or an element is missing from the table.
    """
    weighted_values = []
    atom_counts = []
    for element, number in composition.items():
        weighted_values.append(number * indiv_ELproperties_test_DF_dict[property][str(element)])
        atom_counts.append(number)
    return sum(weighted_values) / sum(atom_counts)

def maxofproperty_X(property, composition):
    """Return the largest value of an element property among a composition's elements.

    Test-set counterpart of ``maxofproperty``; values come from
    ``indiv_ELproperties_test_DF_dict``.

    property: column name in the lookup table.
    composition: mapping of element -> amount; amounts are ignored and each key
        is looked up by ``str(key)``.

    Returns None for an empty composition. Raises KeyError when the property or
    an element is missing from the table.
    """
    propmax = None
    for element, number in composition.items():
        propertyvalue = indiv_ELproperties_test_DF_dict[property][str(element)]
        # Compare against None explicitly: a truthiness test would treat a
        # running maximum of 0 as "not set yet" and let a smaller (negative)
        # value overwrite it.
        if propmax is None or propertyvalue > propmax:
            propmax = propertyvalue
    return propmax

def minofproperty_X(property, composition):
    """Return the smallest value of an element property among a composition's elements.

    Test-set counterpart of ``minofproperty``; values come from
    ``indiv_ELproperties_test_DF_dict``.

    property: column name in the lookup table.
    composition: mapping of element -> amount; amounts are ignored and each key
        is looked up by ``str(key)``.

    Returns None for an empty composition. Raises KeyError when the property or
    an element is missing from the table.
    """
    propmin = None
    for element, number in composition.items():
        propertyvalue = indiv_ELproperties_test_DF_dict[property][str(element)]
        # Compare against None explicitly: a truthiness test would treat a
        # running minimum of 0 as "not set yet" and let a larger value
        # overwrite it.
        if propmin is None or propertyvalue < propmin:
            propmin = propertyvalue
    return propmin


#assigning the values of those functions to a dataframe

# Build one feature column per (aggregate, element property) pair for every
# test material. Loop variables are deliberately NOT called `max`, `min` or
# `property`: binding those names at notebook scope replaces the Python
# builtins for every later cell.
test_element_properties = indiv_ELproperties_test_DF.columns
test_compositions = base_data_test_DF_wcomp['Composition']

# Amount-weighted mean of each property.
avg_properties_df_X = pd.DataFrame({
    "average_" + property_name: test_compositions.apply(partial(propertymean_X, property_name))
    for property_name in test_element_properties
})
print("Average properties Dimension: ", avg_properties_df_X.shape)

# Largest value of each property among the elements present.
max_properties_X = pd.DataFrame({
    "max_" + property_name: test_compositions.apply(partial(maxofproperty_X, property_name))
    for property_name in test_element_properties
})

# Smallest value of each property among the elements present.
min_properties_X = pd.DataFrame({
    "min_" + property_name: test_compositions.apply(partial(minofproperty_X, property_name))
    for property_name in test_element_properties
})



In [None]:
base_data_test_DF_wcomp

In [None]:
#model_matrix_2 is our feature space but for the test data

# Place the queried test columns and the three aggregate-feature frames side by
# side (column-wise concat, aligned on the row index).
test_feature_blocks = [base_data_test_DF_wcomp, avg_properties_df_X, min_properties_X, max_properties_X]
ALL_Features_Matrix_test = pd.concat(test_feature_blocks, axis=1)

# Reuse the training drop list (droplist_2) unchanged so both matrices lose the
# same columns.
model_matrix_2 = ALL_Features_Matrix_test.drop(droplist_2, axis=1)
#model_matrix_2

In [None]:
#model_matrix_2

In [None]:
# Move 'material_id' from a column to the row index, modifying model_matrix_2
# in place. Running this cell twice raises, because the column is gone after
# the first run.
model_matrix_2.set_index('material_id',inplace=True)


In [None]:
# testfornulls = model_matrix_2.isnull().sum()
# testfornulls

In [None]:
#Train Test Split based on the training data

# Hold out 10% of the training materials (fixed random_state for repeatability).
# train_test_split pairs the rows of its inputs by position, so model_matrix_1
# and NoNobles_training_set must list the same materials in the same order.
# NOTE(review): verify that ordering before trusting the scores below.
Xtrain_train, Xtest_train, Ytrain_train, Ytest_train = train_test_split(model_matrix_1, NoNobles_training_set,test_size=0.1, random_state=120)

In [None]:
NoNobles_training_set

In [None]:
#Normalizing Data

# In some cases not normallizing the data provided better results. Is it always required to normalize data??

# Computing mean and standard devaitaion for train X and normalizing
# Column-wise mean and standard deviation of the TRAINING split; these are the
# only statistics used for scaling.
mean_Xtrain = Xtrain_train.apply(np.mean, axis=0)
std_Xtrain = Xtrain_train.apply(np.std, axis=0)
norm_Xtrain = (Xtrain_train - mean_Xtrain) / std_Xtrain

# Statistics of the hold-out split are still computed so they can be inspected,
# but the hold-out features are scaled with the training mean/std. A model fit
# on norm_Xtrain expects inputs on that scale; standardising the hold-out rows
# with their own statistics would put them on a different one.
mean_Xtest = Xtest_train.apply(np.mean, axis=0)
std_Xtest = Xtest_train.apply(np.std, axis=0)
norm_Xtest = (Xtest_train - mean_Xtrain) / std_Xtrain




In [None]:
norm_Xtrain

Now we can run our models and provide results.

Linear model 1 is trained on all the features with normalized data.

In [None]:
linear_model_1 = LinearRegression()

linear_model_1.fit(norm_Xtrain, Ytrain_train['dielectric_poly_total'])
#linear_model_1_predictions_traintest = linear_model_1.predict(norm_Xtrain)

# The model was fit on standardised features, so the Kaggle test features must
# be put on the same scale (training mean/std) and in the same column order
# before predicting. Feeding the raw model_matrix_2 gives meaningless outputs.
norm_model_matrix_2 = ((model_matrix_2 - mean_Xtrain) / std_Xtrain)[norm_Xtrain.columns]
linear_model_1_predictions_test = linear_model_1.predict(norm_model_matrix_2)

# 5-fold cross-validated mean absolute error on the training split (sign
# flipped because the scorer returns the negated error).
linear_model_1_score = -cross_val_score(linear_model_1, norm_Xtrain, Ytrain_train['dielectric_poly_total'], cv=5, scoring='neg_mean_absolute_error')


# Predictions come out in the row order of model_matrix_2, which is not
# guaranteed to match the row order of test.csv. Key them by material_id and
# look each test row up by its ID instead of pasting the columns together by
# position.
DF_linear_model_1 = pd.DataFrame(linear_model_1_predictions_test, index=model_matrix_2.index)
predictions_by_id_1 = DF_linear_model_1[0]
predictions_by_id_1 = predictions_by_id_1[~predictions_by_id_1.index.duplicated()]
linear_model_1_list = [test, test['material_id'].map(predictions_by_id_1).rename(0)]

subs = pd.concat(linear_model_1_list, axis=1)
submissions = subs.rename(columns={0:'dielectric_poly_total'})
submissions.to_csv("linreg-1-attempt-1.csv", index=False)


In [None]:
# Mean of the 5 cross-validation MAE values for linear model 1.
linear_model_1_score.mean()

In [None]:
# Inspect the training feature matrix that was passed to train_test_split.
model_matrix_1

Linear model 2 is fit on all the features, using non-normalized data.

In [None]:
# Linear regression on all features, fit on the raw (non-normalized) split.
linear_model_2 = LinearRegression()
linear_model_2.fit(Xtrain_train, Ytrain_train['dielectric_poly_total'])

# Predictions for the held-out split and for the test feature matrix.
linear_model_2_predictions_traintest = linear_model_2.predict(Xtest_train)
linear_model_2_predictions_test = linear_model_2.predict(model_matrix_2)

# 5-fold cross-validated mean absolute error (sign flipped back to positive).
linear_model_2_score = -cross_val_score(
    linear_model_2,
    Xtrain_train,
    Ytrain_train['dielectric_poly_total'],
    scoring='neg_mean_absolute_error',
    cv=5,
)

# Submission file: predictions placed next to `test`, then column 0 renamed.
DF_linear_model_2 = pd.DataFrame(linear_model_2_predictions_test)
linear_model_2_list = [test, DF_linear_model_2]
subs2 = pd.concat(linear_model_2_list, axis=1)
submissions2 = subs2.rename(columns={0: 'dielectric_poly_total'})
# "attempt-4": earlier attempts accidentally concatenated linear model 1's list
# into subs2.
submissions2.to_csv("linreg-2-attempt-4.csv", index=False)


In [None]:
# Mean of the 5 cross-validation MAE values for linear model 2.
linear_model_2_score.mean()

basically the same

Even though the scores are the same, if you print the next two cells you will see that the predicted values are very different.

In [None]:
# Inspect linear model 1's predictions joined to `test` (prediction column
# still named 0 at this point).
subs

ridge_model_1 uses normalized data. I tried both normalized and non-normalized data; they are equally bad.

In [None]:
# Ridge regression on the standardized training split.
# max_iter has to be a positive integer (or None); the original value of
# -10000 is not a valid iteration limit, so it is given as +10000 here.
ridgemodel_1 = Ridge(alpha=0.1, max_iter=10000)
ridgemodel_1.fit(norm_Xtrain, Ytrain_train['dielectric_poly_total'])

# Predictions for the (standardized) held-out split.
ridgemodelprediction_1 = ridgemodel_1.predict(norm_Xtest)

# The model was fit on standardized features, so the test feature matrix must
# be standardized with the same training mean/std before predicting; the raw
# model_matrix_2 is on a different scale. The column selection only restores
# the training column order after the arithmetic alignment.
norm_model_matrix_2 = ((model_matrix_2 - mean_Xtrain) / std_Xtrain)[norm_Xtrain.columns]
ridgemodelprediction_1_test = ridgemodel_1.predict(norm_model_matrix_2)

# 5-fold cross-validated mean absolute error (sign flipped back to positive).
ridge_model_1_score = -cross_val_score(ridgemodel_1, norm_Xtrain, Ytrain_train['dielectric_poly_total'], cv=5, scoring='neg_mean_absolute_error')

DF_ridge_model_1 = pd.DataFrame(ridgemodelprediction_1_test)
ridge_model_1_list = [test, DF_ridge_model_1]

# Same contents as ridge_model_1_list; both names are kept because the
# original cell defined both.
d_list_ridge = [test, DF_ridge_model_1]

# NOTE(review): this concat pairs rows by position, so it assumes the rows of
# model_matrix_2 are in the same order as the rows of `test` - confirm.
subs_ridge = pd.concat(d_list_ridge, axis=1)
submissions_ridge = subs_ridge.rename(columns={0:'dielectric_poly_total'})
submissions_ridge.to_csv("ridge-1-attempt-1.csv", index=False)

In [None]:
# Mean of the 5 cross-validation MAE values for the ridge model.
ridge_model_1_score.mean()

In [None]:
# Inspect the ridge predictions joined to `test` (prediction column still
# named 0 at this point).
subs_ridge

not good predicted values

In [None]:
# logreg = LogisticRegression(penalty='none',solver='lbfgs', max_iter=1000)
# logregmodel_1 = logreg.fit(norm_Xtrain, Ytrain_train['dielectric_poly_total'])

# logregmodel_1_score = -cross_val_score(logregmodel_1, norm_train_X, y_train['dielectric_poly_total'], cv=5, scoring='neg_mean_absolute_error')

# logreg_prediction_1_test = logregmodel_1.predict(model_matrix_2)

# DF_logregmodel_1 = pd.DataFrame(logreg_prediction_1_test)

# d_list_logreg = [test, DF_logregmodel_1]


# subs_logreg = pd.concat(d_list_logreg, axis=1)
# submissions_logreg = subs_logreg.rename(columns={0:'dielectric_poly_total'})
# submissions_logreg.to_csv("logreg-1-attempt-1.csv", index=False)


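Got an error for the logreg model (most likely because `LogisticRegression` is a classifier and the target `dielectric_poly_total` is a continuous value).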

going to try the best model i have at this point with a major feature reduction


In [None]:
# Properties whose average / max / min columns are removed in the feature
# reduction, in addition to everything already in droplist_2 and 'band_gap'.
reduced_properties = ['Atomic mass', 'Atomic radius calculated', 'Van der waals radius',
                      'Liquid range', 'Melting point', 'First Ionization Energy']

# Generating the names from (statistic, property) pairs keeps the three groups
# in sync. The hand-written list had 'min_Atomic radius calculated' in the
# max_ row, so 'max_Atomic radius calculated' was never dropped.
droplist_5 = (
    droplist_2
    + [stat + "_" + prop for stat in ('average', 'max', 'min') for prop in reduced_properties]
    + ['band_gap']
)

droplist_5

In [None]:
# Put the base training frame and the average / min / max property frames
# side by side in a single matrix that the models are trained on.
training_feature_frames = [NoNobles_basedata_DF_wcomp, avg_properties_df, min_properties, max_properties]
ALL_Features_Matrix = pd.concat(training_feature_frames, axis=1)

# droplist_5 is used here instead of droplist_2 (the list of helper columns
# that were only needed to build other features, or could not be cleaned),
# so that model_matrix_3 only holds the features left after the feature
# reduction. model_matrix_3 is the reduced training feature matrix that is
# split later.
model_matrix_3 = ALL_Features_Matrix.drop(droplist_5, axis=1)
model_matrix_3

In [None]:
# model_matrix_4 is the feature space for the test data after the significant
# feature reduction.
reduced_test_feature_frames = [base_data_test_DF_wcomp, avg_properties_df_X, min_properties_X, max_properties_X]
ALL_Features_Matrix_test = pd.concat(reduced_test_feature_frames, axis=1)

# Drop the same droplist_5 columns that were dropped from the training matrix.
model_matrix_4 = ALL_Features_Matrix_test.drop(droplist_5, axis=1)
model_matrix_4

In [None]:
# Index both reduced feature matrices by material_id.
# The guard keeps this cell re-runnable: once 'material_id' has become the
# index it is no longer a column, and a second in-place set_index would raise
# KeyError.
for reduced_matrix in (model_matrix_3, model_matrix_4):
    if 'material_id' in reduced_matrix.columns:
        reduced_matrix.set_index('material_id', inplace=True)

In [None]:
# Split the reduced training matrix with the same 10% hold-out and the same
# seed as the first split.
Xtrain_2, Xtest_2, Ytrain_2, Ytest_2 = train_test_split(
    model_matrix_3,
    NoNobles_training_set,
    random_state=120,
    test_size=0.1,
)

In [None]:
# Linear regression on the reduced feature set (non-normalized).
linear_model_3 = LinearRegression()
linear_model_3.fit(Xtrain_2, Ytrain_2['dielectric_poly_total'])

# Predictions for the held-out split and for the reduced test feature matrix.
linear_model_3_predictions_traintest = linear_model_3.predict(Xtest_2)
linear_model_3_predictions_test = linear_model_3.predict(model_matrix_4)

# 5-fold cross-validated mean absolute error (sign flipped back to positive).
linear_model_3_score = -cross_val_score(
    linear_model_3,
    Xtrain_2,
    Ytrain_2['dielectric_poly_total'],
    scoring='neg_mean_absolute_error',
    cv=5,
)

# Submission file: predictions placed next to `test`, then column 0 renamed.
DF_linear_model_3 = pd.DataFrame(linear_model_3_predictions_test)
linear_model_3_list = [test, DF_linear_model_3]
subs3 = pd.concat(linear_model_3_list, axis=1)
submissions3 = subs3.rename(columns={0: 'dielectric_poly_total'})
submissions3.to_csv("linreg-3-attempt-1.csv", index=False)



In [None]:
# Mean of the 5 cross-validation MAE values for linear model 3.
linear_model_3_score.mean()

This isn't really any better either, even after reducing the features that were highly correlated from 69 down to 51 and following the advice on Piazza saying we shouldn't use the band gap.

because they aren't abundantly better im going to try some other models and see if it gets any better



In [None]:
# SVR model (RBF kernel), fit on the non-normalized training split.
svr_model_1 = svm.SVR(C=0.001, epsilon=0.001, kernel='rbf', tol=1e-5)
svr_model_1.fit(Xtrain_train, Ytrain_train['dielectric_poly_total'])

# Predictions for the held-out split and for the test feature matrix.
svr_model_1_predictions_traintest = svr_model_1.predict(Xtest_train)
svr_model_1_predictions_test = svr_model_1.predict(model_matrix_2)

# 5-fold cross-validated mean absolute error (sign flipped back to positive).
svr_model_1_score = -cross_val_score(
    svr_model_1,
    Xtrain_train,
    Ytrain_train['dielectric_poly_total'],
    scoring='neg_mean_absolute_error',
    cv=5,
)

# Submission file: predictions placed next to `test`, then column 0 renamed.
DF_svr_model_1 = pd.DataFrame(svr_model_1_predictions_test)
svr_model_1_list = [test, DF_svr_model_1]
subs4 = pd.concat(svr_model_1_list, axis=1)
submissions4 = subs4.rename(columns={0: 'dielectric_poly_total'})
submissions4.to_csv("svr-1-attempt-1.csv", index=False)
                      
                      

In [None]:
# Report the SVR score explicitly: a bare expression that is not the last
# statement of a cell is evaluated and then discarded, so the mean was never
# shown when it sat above the random-forest code.
print("SVR mean CV MAE:", np.mean(svr_model_1_score))

#Random Forest Regressor

# Fixed random_state so the forest is reproducible between runs.
rfr_model_1 = RandomForestRegressor(n_estimators=100, max_depth=10, min_samples_split=4 , random_state=42)
rfr_model_1.fit(Xtrain_train, Ytrain_train['dielectric_poly_total'])

# Predictions for the held-out split and for the test feature matrix.
rfr_model_1_predictions_traintest = rfr_model_1.predict(Xtest_train)
rfr_model_1_predictions_test = rfr_model_1.predict(model_matrix_2)

# 5-fold cross-validated mean absolute error (sign flipped back to positive).
rfr_model_1_score = -cross_val_score(rfr_model_1, Xtrain_train, Ytrain_train['dielectric_poly_total'], cv=5, scoring='neg_mean_absolute_error')

# Submission file: predictions placed next to `test`, then column 0 renamed.
DF_rfr_model_1 = pd.DataFrame(rfr_model_1_predictions_test)
rfr_model_1_list = [test, DF_rfr_model_1]

subs5 = pd.concat(rfr_model_1_list, axis=1)
submissions5 = subs5.rename(columns={0:'dielectric_poly_total'})
submissions5.to_csv("rfr-1-attempt-1.csv", index=False)
                      
                      

In [None]:
# Mean of the 5 cross-validation MAE values for the random forest.
rfr_model_1_score.mean()

### Everything below here is just stuff classmates tried that got them better data, which I am confused about: namely, why you would query the exact ids for the test part of the training set, and also why I wasn't able to do so.

In [None]:
# A clean, unindexed copy of the training feature matrix under its own name:
# 'material_id' stays a regular column here because set_index is not applied.
model_matrix_5 = ALL_Features_Matrix.drop(droplist_2, axis=1)
model_matrix_5


In [None]:
# material_ids that are present in the unindexed training feature matrix.
training_ids = list(model_matrix_5["material_id"])
mpids_2 = pd.DataFrame(training_ids)

# Keep only the rows of `train` whose material_id appears in model_matrix_5.
# The match has to be on the id VALUES: comparing train.index with
# mpids_2.index compares two 0..n-1 positional indexes, which simply keeps the
# first len(mpids_2) rows of `train` no matter which materials they are.
NoNobles_training_set_2 = train[train['material_id'].isin(training_ids)]


# Display a view indexed by material_id. The result is intentionally not
# assigned back: a later cell still reads 'material_id' as a regular column of
# NoNobles_training_set_2.
NoNobles_training_set_2.set_index('material_id')


In [None]:
# Display-only: set_index returns a new frame and the result is not assigned,
# so model_matrix_5 itself keeps 'material_id' as a regular column.
model_matrix_5.set_index('material_id')

In [None]:
#different style of train test split i did it like the way we did in class but others suggested this might be wrong

# Lists of the ids actually in the sets.
train_ids = list(NoNobles_training_set_2['material_id'])
test_ids = list(test['material_id'])

# Training features, looked up by id in the training feature matrix.
train_X = model_matrix_1.loc[train_ids]
# The ids from `test` are looked up in the TEST feature matrix
# (model_matrix_2, indexed by material_id); model_matrix_1 is built from the
# training data, so it is not where these ids live.
test_X = model_matrix_2.loc[test_ids]

# Targets for exactly the rows behind train_ids, in the same order. Taking the
# column from the full `train` frame instead gives a different number of rows
# than train_X whenever NoNobles_training_set_2 is a strict subset of `train`.
train_y = NoNobles_training_set_2['dielectric_poly_total']

# Fetch the reference dielectric values for the test ids only, using the same
# "task_id" / "$in" criteria form as the training-data query at the top of the
# notebook, instead of downloading every material.
test_y = mpr.query(criteria={"task_id": {"$in": test_ids}}, properties=['material_id', 'diel.poly_total'])
# The query result must be turned into a DataFrame under the SAME name: the
# original assigned it to a misspelled `tesy_y`, leaving test_y a plain list
# with no set_index method.
test_y = pd.DataFrame(test_y)
test_y.set_index('material_id', inplace = True)
test_y = test_y.loc[test_ids]
test_y = test_y['diel.poly_total']



In [None]:
# Inspect the target frame used by the earlier train_test_split calls.
NoNobles_training_set


In [None]:
# Linear regression fit on the id-based split (train_X / train_y).
linear_model_4 = LinearRegression()
linear_model_4.fit(train_X, train_y)

# Predictions for the id-based test features and for the test feature matrix.
linear_model_4_predictions_traintest = linear_model_4.predict(test_X)
linear_model_4_predictions_test = linear_model_4.predict(model_matrix_2)

# 5-fold cross-validated mean absolute error (sign flipped back to positive).
linear_model_4_score = -cross_val_score(
    linear_model_4,
    train_X,
    train_y,
    scoring='neg_mean_absolute_error',
    cv=5,
)

# Submission file: predictions placed next to `test`, then column 0 renamed.
# NOTE(review): subs4 / submissions4 are also the names used for the SVR
# submission earlier in the notebook, so this cell overwrites them.
DF_linear_model_4 = pd.DataFrame(linear_model_4_predictions_test)
linear_model_4_list = [test, DF_linear_model_4]
subs4 = pd.concat(linear_model_4_list, axis=1)
submissions4 = subs4.rename(columns={0: 'dielectric_poly_total'})
submissions4.to_csv("linreg-4-attempt-1.csv", index=False)
