In [1]:
# Importing Libraries
import numpy as np
import scipy as sp
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import AdaBoostRegressor, AdaBoostClassifier, \
    GradientBoostingClassifier, GradientBoostingRegressor, \
    RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import r2_score



In [2]:
# Loading training and testing data
train= pd.read_csv("train.csv",index_col=False)
test= pd.read_csv("test.csv",index_col=False)

# Querying materials and their properties based on training data
from pymatgen.ext.matproj import MPRester
mpr = MPRester("241iWwhTEOaNmC6V")

# Using material IDs provided in training data to get corresponding information from MPD
data = mpr.query(criteria={"task_id": {"$in":train["material_id"].to_list()}}, properties=["material_id","energy",
        "energy_per_atom",
        "volume",
        "formation_energy_per_atom",
        "nsites",
        "unit_cell_formula",
        "pretty_formula",
        "is_hubbard",
        "elements",
        "nelements",
        "e_above_hull",
        "hubbards",
        "is_compatible",
        "spacegroup",
        "task_ids",
        "band_gap",
        "density",
        "icsd_id",
        "icsd_ids",
        "total_magnetization",
        "oxide_type",
        "tags",
        "elasticity"])
df = pd.DataFrame(data)
display(df)

  0%|          | 0/5619 [00:00<?, ?it/s]

Unnamed: 0,material_id,energy,energy_per_atom,volume,formation_energy_per_atom,nsites,unit_cell_formula,pretty_formula,is_hubbard,elements,...,spacegroup,task_ids,band_gap,density,icsd_id,icsd_ids,total_magnetization,oxide_type,tags,elasticity
0,mp-1001034,-52.019078,-3.715648,376.145863,-0.703661,14,"{'Mg': 2.0, 'In': 4.0, 'Se': 8.0}",Mg(InSe2)2,False,"[In, Mg, Se]",...,"{'symprec': 0.1, 'source': 'spglib', 'symbol':...","[mp-1117061, mp-1001034, mp-1001078, mp-100111...",0.7432,5.030727,,[],1.522000e-04,,[],
1,mp-1001780,-22.364406,-5.591101,78.515319,-1.699567,4,"{'Lu': 1.0, 'Cu': 1.0, 'S': 2.0}",LuCuS2,False,"[Cu, Lu, S]",...,"{'symprec': 0.1, 'source': 'spglib', 'symbol':...","[mp-1001807, mp-1001780, mp-1064547, mp-106455...",1.5031,6.400668,,[628310],0.000000e+00,,[Lutetium copper(I) sulfide (1/1/2)],
2,mp-1001786,-23.600913,-5.900228,71.701237,-2.103385,4,"{'Li': 1.0, 'Sc': 1.0, 'S': 2.0}",LiScS2,False,"[Li, Sc, S]",...,"{'symprec': 0.1, 'source': 'spglib', 'symbol':...","[mp-1001786, mp-1065315, mp-1065324, mp-106532...",1.5296,2.687084,,[642305],7.000000e-07,,[Lithium scandium sulfide (1/1/2)],"{'G_Reuss': 35.0, 'G_VRH': 36.0, 'G_Voigt': 38..."
3,mp-1002124,-19.780759,-9.890380,32.055765,-0.298402,2,"{'Hf': 1.0, 'C': 1.0}",HfC,False,"[Hf, C]",...,"{'symprec': 0.1, 'source': 'spglib', 'symbol':...","[mp-1059944, mp-1059974, mp-1059978, mp-105992...",0.5774,9.868236,,[185992],0.000000e+00,,[Hafnium carbide (1/1)],"{'G_Reuss': 23.0, 'G_VRH': 32.0, 'G_Voigt': 41..."
4,mp-1004528,-95.454964,-5.614998,324.979430,-3.162482,17,"{'Cs': 1.0, 'B': 3.0, 'Pb': 1.0, 'F': 12.0}",CsB3PbF12,False,"[B, Cs, F, Pb]",...,"{'symprec': 0.1, 'source': 'spglib', 'symbol':...","[mp-1005525, mp-1004528, mp-1004763, mp-100620...",6.3125,3.068458,,[],1.900000e-06,,[],
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5614,mvc-6916,-99.385289,-7.098949,156.686110,-2.307339,14,"{'Mg': 2.0, 'Mn': 4.0, 'O': 8.0}",MgMn2O4,True,"[Mg, Mn, O]",...,"{'symprec': 0.1, 'source': 'spglib', 'symbol':...","[mp-1041914, mp-1041933, mp-1041952, mvc-6916,...",0.0516,4.200544,,[],8.000000e+00,oxide,[],
5615,mvc-6928,-91.603914,-6.543137,173.941832,-2.192463,14,"{'Ca': 2.0, 'Fe': 4.0, 'O': 8.0}",Ca(FeO2)2,True,"[Ca, Fe, O]",...,"{'symprec': 0.1, 'source': 'spglib', 'symbol':...","[mp-1041946, mvc-6928, mp-1041959, mp-1041965,...",1.7463,4.119621,,[],1.000000e+01,oxide,[],
5616,mvc-6946,-36.735653,-6.122609,97.529160,-1.961400,6,"{'Sn': 2.0, 'O': 4.0}",SnO2,False,"[O, Sn]",...,"{'symprec': 0.1, 'source': 'spglib', 'symbol':...","[mp-1041984, mp-1041987, mvc-6946, mp-1041986,...",2.1009,5.131960,,[],1.445000e-05,oxide,[],
5617,mvc-7040,-145.212422,-8.067357,418.920727,-1.999722,18,"{'Re': 4.0, 'O': 14.0}",Re2O7,False,"[O, Re]",...,"{'symprec': 0.1, 'source': 'spglib', 'symbol':...","[mvc-7040, mp-1042110, mp-1042121, mp-1042129,...",3.4689,3.840256,,[],0.000000e+00,oxide,[],


In [3]:
y_jojo = train
y_jojo



targets = df.loc[:,["energy",
        "energy_per_atom",
        "volume",
        "formation_energy_per_atom",
        "nsites",
        "nelements",
        "density",
        "band_gap"]]

# x_jojo = df[targets]
# x_jojo

cols = ["energy",
        "energy_per_atom",
        "volume",
        "formation_energy_per_atom",
        "nsites",
        "nelements",
        "density",
        "band_gap"] 
design_matrix= df[cols]

In [4]:
y_jojo

Unnamed: 0,material_id,dielectric_poly_total
0,mp-555903,8.337936
1,mp-752658,14.735277
2,mp-3439,17.195305
3,mp-16135,21.593507
4,mp-36447,9.507068
...,...,...
5614,mp-643378,20.365294
5615,mp-3536,8.476483
5616,mp-760402,27.401830
5617,mp-28109,8.384639


In [5]:
X_train, X_test, y_train, y_test = train_test_split(design_matrix, 
                                                    y_jojo,
                                                    test_size=0.1, 
                                                    random_state=42)

In [6]:
# Computing mean and standard devaitaion for train X and normalizing
mean_train_X = X_train.apply(np.mean, axis=0)
std_train_X = X_train.apply(np.std, axis=0)
norm_train_X = (X_train - mean_train_X) / std_train_X

# Computing mean and standard devaitaion for test X and normalizing 
mean_test_X = X_test.apply(np.mean, axis=0)
std_test_X = X_test.apply(np.std, axis=0)
norm_test_X = (X_test - mean_test_X) / std_test_X

In [9]:
from sklearn.linear_model import LinearRegression
linear_model = LinearRegression()
linear_model.fit(norm_train_X, y_train['dielectric_poly_total'])
#linear_predictions = pd.DataFrame(linear_model.predict(norm_test_X),
                                #  columns = ['dielectric_poly_total'])
linear_predictions = linear_model.predict(X_test)

linear_model.score(design_matrix, y_jojo['dielectric_poly_total'])

#Reallinear_prediction = linear_model.predict(test)

-144.81337817686486

In [11]:
type(linear_predictions)

numpy.ndarray

In [13]:
from sklearn.metrics import mean_squared_error, mean_absolute_error


MAE = mean_absolute_error(y_test['dielectric_poly_total'], 
                          linear_predictions)


RMSE = np.sqrt(mean_squared_error(y_test['dielectric_poly_total'], 
                          linear_predictions))


# from sklearn.model_selection import cross_val_predict, KFold
# from sklearn.metrics import r2_score

# # Cross- Validation
# kfold = KFold(n_splits=5, shuffle=True, random_state=42)
# #yhat_mlr = cross_val_predict(linear_model, x_jojo, targets['dielectric_poly_total'], cv=kfold)
# CV_linMAE = mean_absolute_error(targets['dielectric_poly_total'], yhat_mlr)
# CV_linRMSE = np.sqrt(mean_squared_error(targets['dielectric_poly_total'], yhat_mlr))

# print('Cross Val MAE:', CV_linMAE)
# print('Cross Val RMSE:', CV_linRMSE)
print("The MAE is:", MAE)
print("The RMSE is:", RMSE)

The MAE is: 127.51149686913918
The RMSE is: 168.38988025427608


In [87]:
dielects=(model.predict(test_data))

df_submit = pd.DataFrame(list(dielects))
df_submit.to_csv('dielectric_csv.csv')


NameError: name 'model' is not defined

In [95]:
data2 = mpr.query(criteria={"task_id": {"$in":test["material_id"].to_list()}}, properties=["energy",
        "energy_per_atom",
        "volume",
        "formation_energy_per_atom",
        "nsites",
        "unit_cell_formula",
        "pretty_formula",
        "is_hubbard",
        "elements",
        "nelements",
        "e_above_hull",
        "hubbards",
        "is_compatible",
        "spacegroup",
        "task_ids",
        "band_gap",
        "density",
        "icsd_id",
        "icsd_ids",
        "total_magnetization",
        "material_id",
        "oxide_type",
        "tags",
        "elasticity"])
df = pd.DataFrame(data)
df2 = pd.DataFrame(data2)

  0%|          | 0/1400 [00:00<?, ?it/s]

In [97]:
desired_factors = [ "energy",
        "energy_per_atom",
        "volume",
        "formation_energy_per_atom",
        "nsites",
        "nelements",
        "density",
        "band_gap"] 

Y_data = df2[cols]

#set my model to DecisionTree
model = DecisionTreeRegressor()

#set prediction data to factors that will predict, and set target to SalePrice
train_data = X_data[desired_factors]
test_data = Y_data[desired_factors]
target = train.dielectric_poly_total

# #fitting model with prediction data and telling it my target
model.fit(train_data, target)

dielects=(model.predict(test_data))


dfhi= pd.read_csv("test.csv")

df_submit = pd.DataFrame(list(dielects))
df_submit.to_csv('dielectric.csv')

In [98]:
dielects

array([11.03863267,  5.28496733, 10.53338133, ...,  9.69750167,
        5.61765533, 65.746     ])

In [100]:
len(dielects)

1400

In [101]:
test

Unnamed: 0,material_id
0,mp-1223893
1,mp-16988
2,mp-730
3,mp-756202
4,mp-580941
...,...
1395,mp-34293
1396,mp-752975
1397,mp-8962
1398,mp-20782


In [103]:
 shit=pd.DataFrame(dielects)

In [106]:
d_list=[test, shit]

In [107]:
newsubs=pd.concat(d_list, axis=1)

In [None]:
newsubs_update=newsubs.rename(columns)

In [109]:
newsubs.to_csv("submission_test.csv")

In [79]:
# from sklearn import metrics
# from sklearn.model_selection import cross_val_predict, KFold
# from sklearn.metrics import r2_score
# # print('Mean absolute error:', metrics.mean_absolute_error(y_test, linear_predictions))
# # print('Mean squared error:', metrics.mean_squared_error(y_test, linear_predictions))
# # print('root mean squared error:', np.sqrt(metrics.mean_squared_error(y_test, linear_predictions)))

# # CV_linMAE = mean_absolute_error(targets['formation_energy_per_atom'], yhat_mlr)
# # CV_linRMSE = np.sqrt(mean_squared_error(targets['formation_energy_per_atom'], yhat_mlr))
# kfold = KFold(n_splits=5, shuffle=True, random_state=42)
# yhat_mlr = cross_val_predict(linear_model, design_matrix, y_jojo['dielectric_poly_total'], cv=kfold)
# metrics.mean_absolute_error(y_test, yhat_mlr)


In [80]:
# x_train, x_test, y_train, y_test = train_test_split(x_jojo, y_jojo, test_size = 0.2 )

In [81]:
# x_train.head()

In [82]:
# # Computing mean and standard devaitaion for train X and normalizing
# mean_train_X = x_train.apply(np.mean, axis=0)
# std_train_X = x_train.apply(np.std, axis=0)
# norm_train_X = (x_train - mean_train_X) / std_train_X

# # Computing mean and standard devaitaion for test X and normalizing 
# mean_test_X = x_test.apply(np.mean, axis=0)
# std_test_X = x_test.apply(np.std, axis=0)
# norm_test_X = (x_test - mean_test_X) / std_test_X


In [83]:
# # Importing functions for regression 
# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
# from sklearn.discriminant_analysis import (LinearDiscriminantAnalysis as LDA, 
#                                            QuadraticDiscriminantAnalysis as QDA)




# linear_model = LinearRegression()
# linear_model.fit(norm_train_X, y_train['dielectric_poly_total'])
# #linear_predictions = pd.DataFrame(linear_model.predict(norm_test_X),
#                                   #columns = ['dielectric_poly_total'])
# linear_predictions = linear_model.predict(norm_test_X)


# mae = mean_absolute_error(y_test, linear_predictions)

# mae





# # from sklearn.metrics import mean_squared_error, mean_absolute_error


# MAE = mean_absolute_error(y_test['dielectric_poly_total'],linear_predictions['dielectric_poly_total'])

# # RMSE = np.sqrt(mean_squared_error(y_test['dielectric_poly_total'], 
# #                           linear_predictions['dielectric_poly_total']))

# print("The MAE is:", MAE)
# # print("The RMSE is:", RMSE)


In [84]:
#mymodel= np.poly1d(np.polyfit(x_train, y_train, 4 ))

In [85]:
#r2 = r2_score(y_train, linear_predictions)
