In [1]:
# Using PyTorch for NN
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

import pandas as pd
import numpy as np
import inputparser

In [2]:
# Load the dataset through the project-local `inputparser` helper, using its default arguments.
# NOTE(review): presumably returns (feature vectors, target values, target -> molecule-name
# mapping), judging by how the three results are used in the later cells; confirm in inputparser.
input_vector, y_test, y_dict = inputparser.make_input()

In [3]:
# x-test / y-test hold 225 data points; the first one is set aside for verification
x_verification = [input_vector[0]]
y_verification = y_test[0]

# Every sample after the held-out one feeds the train/validation split
x_input = input_vector[1:]
y_input = list(y_test[1:])

In [4]:
# Seed numpy and torch so repeated runs start from the same random state
np.random.seed(44)
torch.manual_seed(44)

# Build tensors from the targets and features prepared in the previous cell
Y = torch.tensor(y_input)
X = torch.tensor(x_input.tolist())

# Hold out 20% of the samples for validation (the split has its own fixed seed)
X_train, X_val, Y_train, Y_val = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

# Standardise the features: statistics are fitted on the training split only
# and then applied to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)
# Network definition
class NeuralNet(nn.Module):
    """Feed-forward regressor: one ReLU hidden layer followed by a single output unit."""

    def __init__(self, input_size, hidden_size):
        """Create the two linear layers.

        Args:
            input_size: number of input features per sample.
            hidden_size: number of units in the hidden layer.
        """
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return fc2(relu(fc1(x))); the last dimension of the result has size 1."""
        hidden = torch.relu(self.fc1(x))
        return self.fc2(hidden)

# Initialize
input_size = len(x_input[0])
# Optimal hidden layer size
hidden_size = 290
model = NeuralNet(input_size, hidden_size)

# loss
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Single source of truth for the epoch budget: drives both the loop and the progress log
num_epochs = 141

# Convert the scaled splits to float32 tensors once instead of on every epoch
X_train_t = torch.tensor(X_train, dtype=torch.float32)
X_val_t = torch.tensor(X_val, dtype=torch.float32)

# Training loop (full-batch gradient descent)
for epoch in range(num_epochs):
    # forward prop; squeeze(-1) drops only the size-1 output dimension, so a
    # single-sample batch keeps its batch axis
    outputs = model(X_train_t)
    loss = criterion(outputs.squeeze(-1), Y_train)
    # back prop
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if (epoch+1) % 100 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')
print("Final Epoch Count:", num_epochs)

# Evaluate both splits without tracking gradients. val_outputs keeps the raw
# model output because the results cell below reads it.
with torch.no_grad():
    train_outputs = model(X_train_t)
    val_outputs = model(X_val_t)

# sklearn's documented argument order is (y_true, y_pred)
train_mse = mean_squared_error(Y_train.numpy(), train_outputs.squeeze(-1).numpy())
print("Training MSE:", train_mse)

val_mse = mean_squared_error(Y_val.numpy(), val_outputs.squeeze(-1).numpy())
print("Validation MSE:", val_mse)



Epoch [100/141], Loss: 0.0056
Final Epoch Count: 141
Training MSE: 0.0036206632
Validation MSE: 0.017644415


In [5]:
# print results
# reshape(-1) flattens to one value per validation sample, also when the
# validation split contains a single sample
val_unsorted = val_outputs.reshape(-1).detach().numpy()
y_true = Y_val.reshape(-1).detach().numpy()
y_diff = abs(val_unsorted - y_true)
# Relative error against the magnitude of the target, so it is non-negative
# for targets of either sign
rel_error = y_diff / abs(y_true)

# Recover the name for every validation target by float comparison against the
# keys of y_dict (tolerance 1e-6). The lookup walks y_true in order, so row k of
# the name column belongs to row k of the prediction columns, and exactly one
# entry is produced per row.
tolerance = 0.000001
y_val_mols = []
for y in y_true:
    closest = min(y_dict, key=lambda target: abs(target - y), default=None)
    matched = closest is not None and abs(closest - y) < tolerance
    # None marks a target that has no key within the tolerance
    y_val_mols.append(y_dict[closest] if matched else None)

unsorted_results = {'name':y_val_mols, 'predicted y':val_unsorted, 'true y': y_true, 'abs error': y_diff, 'rel error': rel_error}
unsorted_df = pd.DataFrame.from_dict(unsorted_results)
print(unsorted_df)
#unsorted_df.to_csv('results_unsorted.csv', index=False)

                      name  predicted y     true y  abs error  rel error
0                 methanol    -0.822219  -0.797949   0.024269   0.030415
1   beryllium_monofluoride    -0.679704  -0.690841   0.011137   0.016121
2    phosphorus_tribromide    -1.737570  -1.668961   0.068609   0.041109
3     methanethiol_radical    -2.862629  -2.931822   0.069193   0.023601
4            cyano_radical    -1.064783  -1.052541   0.012242   0.011631
5           dimethyl_ether    -3.052216  -3.074626   0.022410   0.007289
6     aluminum_trichloride    -1.366799  -1.438742   0.071943   0.050004
7   phosphorus_trichloride    -1.482355  -1.612581   0.130226   0.080756
8                 ethylene    -0.528812  -0.474373   0.054439   0.114760
9                  pentane    -0.641362  -0.671736   0.030374   0.045217
10       isopropyl_alcohol    -4.941698  -5.074177   0.132479   0.026108
11           glyoxal_trans    -0.660431  -0.719291   0.058860   0.081830
12     oxygen_monofluoride    -0.292599  -0.239406 

In [6]:
# Using Sorted CM dataset
# Same loader as before, now called with sort=True. This rebinds input_vector, y_test and
# y_dict, so every later cell operates on the sorted variant.
# NOTE(review): what "sorted" means for the features is defined in inputparser; confirm there.
input_vector, y_test, y_dict = inputparser.make_input(sort = True)

In [7]:
# x-test / y-test hold 225 data points; the first one is set aside for verification
x_verification = [input_vector[0]]
y_verification = y_test[0]

# Every sample after the held-out one feeds the train/validation split
x_input = input_vector[1:]
y_input = list(y_test[1:])

In [8]:
# Seed numpy and torch so repeated runs start from the same random state
np.random.seed(12)
torch.manual_seed(12)

# Build tensors from the targets and features prepared in the previous cell
Y = torch.tensor(y_input)
X = torch.tensor(x_input.tolist())

# Hold out 20% of the samples for validation (the split has its own fixed seed)
X_train, X_val, Y_train, Y_val = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

# Standardise the features: statistics are fitted on the training split only
# and then applied to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

# Network definition (declared again in this cell with the same two-layer structure)
class NeuralNet(nn.Module):
    """Feed-forward regressor: one ReLU hidden layer followed by a single output unit."""

    def __init__(self, input_size, hidden_size):
        """Create the two linear layers.

        Args:
            input_size: number of input features per sample.
            hidden_size: number of units in the hidden layer.
        """
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return fc2(relu(fc1(x))); the last dimension of the result has size 1."""
        hidden = torch.relu(self.fc1(x))
        return self.fc2(hidden)

# Initialize
input_size = len(x_input[0])
# Optimal hidden layer size
hidden_size = 290
model = NeuralNet(input_size, hidden_size)

# loss
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Single source of truth for the epoch budget: drives both the loop and the
# progress log, so the printed total always matches the number of epochs run
num_epochs = 202

# Convert the scaled splits to float32 tensors once instead of on every epoch
X_train_t = torch.tensor(X_train, dtype=torch.float32)
X_val_t = torch.tensor(X_val, dtype=torch.float32)

# Training loop (full-batch gradient descent)
for epoch in range(num_epochs):
    # forward prop; squeeze(-1) drops only the size-1 output dimension, so a
    # single-sample batch keeps its batch axis
    outputs = model(X_train_t)
    loss = criterion(outputs.squeeze(-1), Y_train)
    # back prop
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if (epoch+1) % 100 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')
print("Final Epoch Count:", num_epochs)

# Evaluate both splits without tracking gradients. val_outputs keeps the raw
# model output because the results cell below reads it.
with torch.no_grad():
    train_outputs = model(X_train_t)
    val_outputs = model(X_val_t)

# sklearn's documented argument order is (y_true, y_pred)
train_mse = mean_squared_error(Y_train.numpy(), train_outputs.squeeze(-1).numpy())
print("Training MSE:", train_mse)

val_mse = mean_squared_error(Y_val.numpy(), val_outputs.squeeze(-1).numpy())
print("Validation MSE:", val_mse)

Epoch [100/141], Loss: 0.0049
Epoch [200/141], Loss: 0.0030
Final Epoch Count: 202
Training MSE: 0.0029975642
Validation MSE: 0.04498981


In [9]:
# print results
# reshape(-1) flattens to one value per validation sample, also when the
# validation split contains a single sample
val_sorted = val_outputs.reshape(-1).detach().numpy()
y_true = Y_val.reshape(-1).detach().numpy()
y_diff = abs(val_sorted - y_true)
# Relative error against the magnitude of the target, so it is non-negative
# for targets of either sign
rel_error = y_diff / abs(y_true)

# Recover the name for every validation target by float comparison against the
# keys of y_dict (tolerance 1e-6). The lookup walks y_true in order, so row k of
# the name column belongs to row k of the prediction columns, and exactly one
# entry is produced per row.
tolerance = 0.000001
y_val_mols = []
for y in y_true:
    closest = min(y_dict, key=lambda target: abs(target - y), default=None)
    matched = closest is not None and abs(closest - y) < tolerance
    # None marks a target that has no key within the tolerance
    y_val_mols.append(y_dict[closest] if matched else None)

sorted_results = {'name':y_val_mols, 'predicted y':val_sorted, 'true y': y_true, 'abs error': y_diff, 'rel error': rel_error}
sorted_df = pd.DataFrame.from_dict(sorted_results)
print(sorted_df)
#sorted_df.to_csv('results_sorted.csv', index=False)

                      name  predicted y     true y  abs error  rel error
0                 methanol    -0.788658  -0.797949   0.009291   0.011644
1   beryllium_monofluoride    -0.684747  -0.690841   0.006094   0.008821
2    phosphorus_tribromide    -1.735474  -1.668961   0.066513   0.039853
3     methanethiol_radical    -2.463213  -2.931822   0.468609   0.159836
4            cyano_radical    -1.047070  -1.052541   0.005472   0.005199
5           dimethyl_ether    -3.480864  -3.074626   0.406238   0.132126
6     aluminum_trichloride    -1.371336  -1.438742   0.067406   0.046851
7   phosphorus_trichloride    -1.535325  -1.612581   0.077256   0.047908
8                 ethylene    -0.536838  -0.474373   0.062465   0.131678
9                  pentane    -0.635204  -0.671736   0.036532   0.054384
10       isopropyl_alcohol    -4.576627  -5.074177   0.497550   0.098055
11           glyoxal_trans    -0.665090  -0.719291   0.054201   0.075354
12     oxygen_monofluoride    -0.328010  -0.239406 

In [10]:
# Seed numpy and torch so repeated runs start from the same random state
np.random.seed(2)
torch.manual_seed(2)

# Build tensors from x_input / y_input; this cell does not rebuild them, so it
# uses whatever the most recent hold-out cell produced
Y = torch.tensor(y_input)
X = torch.tensor(x_input.tolist())

# Hold out 20% of the samples for validation (the split has its own fixed seed)
X_train, X_val, Y_train, Y_val = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

# Standardise the features: statistics are fitted on the training split only
# and then applied to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

# Network definition (declared again in this cell with the same two-layer structure)
class NeuralNet(nn.Module):
    """Feed-forward regressor: one ReLU hidden layer followed by a single output unit."""

    def __init__(self, input_size, hidden_size):
        """Create the two linear layers.

        Args:
            input_size: number of input features per sample.
            hidden_size: number of units in the hidden layer.
        """
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return fc2(relu(fc1(x))); the last dimension of the result has size 1."""
        hidden = torch.relu(self.fc1(x))
        return self.fc2(hidden)

# Initialize
input_size = len(x_input[0])
# Hidden layer width for this run (smaller than the 290 units used in the earlier cells)
hidden_size = 64
model = NeuralNet(input_size, hidden_size)

# loss
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Single source of truth for the epoch budget: drives both the loop and the
# progress log, so the printed total always matches the number of epochs run
num_epochs = 30

# Convert the scaled splits to float32 tensors once instead of on every epoch
X_train_t = torch.tensor(X_train, dtype=torch.float32)
X_val_t = torch.tensor(X_val, dtype=torch.float32)

# Training loop (full-batch gradient descent)
for epoch in range(num_epochs):
    # forward prop; squeeze(-1) drops only the size-1 output dimension, so a
    # single-sample batch keeps its batch axis
    outputs = model(X_train_t)
    loss = criterion(outputs.squeeze(-1), Y_train)
    # back prop
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # With a 30-epoch budget this every-100-epochs progress line never fires
    if (epoch+1) % 100 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')
print("Final Epoch Count:", num_epochs)

# Evaluate both splits without tracking gradients. val_outputs keeps the raw
# model output because the results cell below reads it.
with torch.no_grad():
    train_outputs = model(X_train_t)
    val_outputs = model(X_val_t)

# sklearn's documented argument order is (y_true, y_pred)
train_mse = mean_squared_error(Y_train.numpy(), train_outputs.squeeze(-1).numpy())
print("Training MSE:", train_mse)

val_mse = mean_squared_error(Y_val.numpy(), val_outputs.squeeze(-1).numpy())
print("Validation MSE:", val_mse)

Final Epoch Count: 30
Training MSE: 0.0943756
Validation MSE: 1.2631232


In [11]:
# print results
# reshape(-1) flattens to one value per validation sample, also when the
# validation split contains a single sample
val_sorted_naive = val_outputs.reshape(-1).detach().numpy()
y_true = Y_val.reshape(-1).detach().numpy()
y_diff = abs(val_sorted_naive - y_true)
# Relative error against the magnitude of the target, so it is non-negative
# for targets of either sign
rel_error = y_diff / abs(y_true)

# Recover the name for every validation target by float comparison against the
# keys of y_dict (tolerance 1e-6). The lookup walks y_true in order, so row k of
# the name column belongs to row k of the prediction columns, and exactly one
# entry is produced per row.
tolerance = 0.000001
y_val_mols = []
for y in y_true:
    closest = min(y_dict, key=lambda target: abs(target - y), default=None)
    matched = closest is not None and abs(closest - y) < tolerance
    # None marks a target that has no key within the tolerance
    y_val_mols.append(y_dict[closest] if matched else None)

sorted_naive_results = {'name':y_val_mols, 'predicted y':val_sorted_naive, 'true y': y_true, 'abs error': y_diff, 'rel error': rel_error}
sorted_naive_df = pd.DataFrame.from_dict(sorted_naive_results)
print(sorted_naive_df)
#sorted_naive_df.to_csv('results_sorted_naive.csv', index=False)

                      name  predicted y     true y  abs error  rel error
0                 methanol    -1.065532  -0.797949   0.267583   0.335339
1   beryllium_monofluoride    -0.944828  -0.690841   0.253987   0.367650
2    phosphorus_tribromide    -1.802481  -1.668961   0.133520   0.080002
3     methanethiol_radical    -9.620527  -2.931822   6.688705   2.281415
4            cyano_radical    -1.156311  -1.052541   0.103770   0.098590
5           dimethyl_ether    -3.175776  -3.074626   0.101150   0.032898
6     aluminum_trichloride    -1.117206  -1.438742   0.321536   0.223484
7   phosphorus_trichloride    -1.418085  -1.612581   0.194496   0.120612
8                 ethylene    -0.830213  -0.474373   0.355840   0.750127
9                  pentane    -0.921955  -0.671736   0.250219   0.372496
10       isopropyl_alcohol    -5.219787  -5.074177   0.145610   0.028696
11           glyoxal_trans    -0.871379  -0.719291   0.152087   0.211440
12     oxygen_monofluoride    -0.674495  -0.239406 