In [2]:
# Load in libraries
from time import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from tensorflow.keras import initializers
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

In [7]:
# Load in the data.
# Everything is read as strings first because the first row holds the column names.
# The file is opened in a `with` block so the handle is closed as soon as parsing is done
# (the previous bare `open(...)` call left it open for the lifetime of the kernel).
with open("40192_2013_16_MOESM1_ESM..csv", "r") as csv_file:
    raw_data = np.loadtxt(csv_file, delimiter = ',', dtype = str, skiprows = 0)

# Get the labels (column names) from the header row
labels = raw_data[0]

# Create a dataframe from the remaining rows
df = pd.DataFrame(raw_data[1:], columns = labels)

# Drop Sl. No. since it is just an indexing column
df = df.drop(["Sl. No."], axis = 1)

# Convert the columns from strings to numeric values
df = df.apply(pd.to_numeric)

df_labels = df.columns.to_list()
print(df_labels)

df.head()

['NT', 'THT', 'THt', 'THQCr', 'CT', 'Ct', 'DT', 'Dt', 'QmT', 'TT', 'Tt', 'TCr', 'C', 'Si', 'Mn', 'P', 'S', 'Ni', 'Cr', 'Cu', 'Mo', 'RedRatio', 'dA', 'dB', 'dC', 'Fatigue']


Unnamed: 0,NT,THT,THt,THQCr,CT,Ct,DT,Dt,QmT,TT,...,S,Ni,Cr,Cu,Mo,RedRatio,dA,dB,dC,Fatigue
0,885,30,0,0,30,0.0,30.0,0.0,30,30,...,0.022,0.01,0.02,0.01,0.0,825,0.07,0.02,0.04,232
1,885,30,0,0,30,0.0,30.0,0.0,30,30,...,0.017,0.08,0.12,0.08,0.0,610,0.11,0.0,0.04,235
2,885,30,0,0,30,0.0,30.0,0.0,30,30,...,0.015,0.02,0.03,0.01,0.0,1270,0.07,0.02,0.0,235
3,885,30,0,0,30,0.0,30.0,0.0,30,30,...,0.024,0.01,0.02,0.01,0.0,1740,0.06,0.0,0.0,241
4,885,30,0,0,30,0.0,30.0,0.0,30,30,...,0.022,0.01,0.02,0.02,0.0,825,0.04,0.02,0.0,225


In [None]:
import random

# --- SEPARATE FEATURES, TARGETS AND SAMPLE IDS ---
# "Fatigue" is the quantity to be predicted; every other column is an input feature.
# The previous version left the target inside the feature matrix, dropped the first
# feature column by treating it as an id, and used `all_labels` before defining it
# (NameError on a fresh kernel).
TARGET_COLUMN = "Fatigue"
TEST_FRACTION = 0.10  # share of the rows held out for testing

all_values = df.drop(columns=[TARGET_COLUMN]).to_numpy(dtype=float)
all_labels = df[TARGET_COLUMN].to_numpy(dtype=float)
# The dataframe has no id column of its own, so the row index identifies each sample.
id_values = df.index.to_numpy()

print("Shape of Values:", all_values.shape)
print("Shape of Labels:", all_labels.shape)

# --- ORDERING ---
# To shuffle the dataset, uncomment the two lines below and comment out the np.arange line
# (random.shuffle reorders the index list in place). We keep the original order here to
# ensure consistent results for every run.
#order = list(range(0, len(df)))
#random.shuffle(order)
order = np.arange(len(df))
some_values = all_values[order]
some_labels = all_labels[order]
id_values = id_values[order]
some_materials = id_values

# --- TRAIN / TEST SPLIT ---
# The last `length` rows form the test set. Slicing from an explicit split index
# (instead of `[-length:]`) keeps the test set empty when `length` is 0; `[-0:]`
# would return the whole array.
length = int(np.rint(TEST_FRACTION * len(df)))
split_index = len(df) - length

# Training Set
train_labels = some_labels[:split_index]
train_values = some_values[:split_index]

# Testing Set
test_labels = some_labels[split_index:]
test_values = some_values[split_index:]

# --- NORMALIZATION ---
# Statistics come from the training set only, so no information from the test set
# leaks into the scaling.
mean = np.mean(train_values, axis = 0) # mean
std = np.std(train_values, axis = 0) # standard deviation
# A column that is constant over the training set has std == 0; dividing by it would
# produce NaN/inf, so such columns are only centred.
std[std == 0] = 1.0

train_values = (train_values - mean) / std # input scaling
test_values = (test_values - mean) / std # input scaling

# These are used for labels in the plots
# Marked in red: the samples of the test set
labeled_materials = id_values[split_index:]
# Marked in blue: all samples
materials = id_values

print("Sample entry from training set")
print(train_values[0]) # print a sample entry from the training set

# Average and standard deviation of the target for reference.
# The variable names are kept as `yield_*` in case other cells refer to them.
yield_avg = np.mean(all_labels)
yield_std = np.std(all_labels)
print(yield_avg)
print(yield_std)

# Distribution of the target values
# NOTE(review): the unit is taken from the original axis label (MPa) - confirm against the dataset description.
plt.hist(all_labels)
plt.xlabel('Fatigue Strength (MPa)')
plt.ylabel('Frequency')
plt.show()

In [None]:
# DEFINITION OF THE MODEL
# `tf`, `initializers`, `Sequential` and `Dense` are expected to come from the notebook's import cell.

# The weights of our neural network will be initialized in a random manner, using a seed allows for reproducibility
kernel_init = initializers.RandomNormal(seed=0)

# In a sequential model, the first layer must specify the input shape the model will expect;
# in this case the value is train_values.shape[1], i.e. the number of attributes (properties)
# per sample. It is read from the data instead of being hard-coded.
n_features = train_values.shape[1]

model = Sequential([
    Dense(16, activation='relu', input_shape=(n_features, ), kernel_initializer=kernel_init),
    Dense(64, activation='relu', kernel_initializer=kernel_init),
    Dense(1, kernel_initializer=kernel_init),  # single linear output for regression
])

print(n_features)

# DEFINITION OF THE OPTIMIZER

# Root Mean Squared Propagation with a learning rate of 0.004.
# `tf.train.RMSPropOptimizer` only exists in the TensorFlow 1.x namespace; the Keras
# optimizer below is the equivalent that TensorFlow 2.x provides.
optimizer = tf.keras.optimizers.RMSprop(0.004)

# This line matches the optimizer to the model and states which metrics will evaluate the model's accuracy
model.compile(loss='mse', optimizer=optimizer, metrics=['mae'])
model.summary()