In [1]:
# Using PyTorch for NN
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

import pandas as pd
import numpy as np
import inputparser

In [2]:
input_vector, y_test, y_dict = inputparser.make_input()

In [3]:
# x-test, y-test are 225 data points, remove first element for verification
x_verification = []
x_verification.append(input_vector[0])
y_verification = y_test[0]

x_input = input_vector[1:]
y_input = []
for i in y_test[1:]:
    y_input.append(i)

In [4]:
# Random seeding
torch.manual_seed(44)
np.random.seed(44)

# Input data
X = torch.tensor(x_input.tolist())
Y = torch.tensor(y_input)

# Split data
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.2, random_state=42)

# Normalize
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

# NN
class NeuralNet(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(NeuralNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, 1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Initialize
input_size = len(x_input[0])
# Optimal hidden layer size
hidden_size = 290  
model = NeuralNet(input_size, hidden_size)

# loss
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Training loop
for epoch in range(141):
    # forward prop
    outputs = model(torch.tensor(X_train, dtype=torch.float32))
    loss = criterion(outputs.squeeze(), Y_train)
    # back prop
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if (epoch+1) % 100 == 0:
        print(f'Epoch [{epoch+1}/141], Loss: {loss.item():.4f}')
print("Final Epoch Count:", epoch + 1)

with torch.no_grad():
    train_outputs = model(torch.tensor(X_train, dtype=torch.float32))
    train_mse = mean_squared_error(train_outputs.squeeze().detach().numpy(), Y_train)
print("Training MSE:", train_mse)

with torch.no_grad():
    val_outputs = model(torch.tensor(X_val, dtype=torch.float32))
    val_mse = mean_squared_error(val_outputs.squeeze().detach().numpy(), Y_val)
print("Validation MSE:", val_mse)



Epoch [100/141], Loss: 0.0056
Final Epoch Count: 141
Training MSE: 0.0036206632
Validation MSE: 0.017644415


In [5]:
val_unsorted = val_outputs.squeeze().detach().numpy()
print(val_unsorted)

[ -0.82221854  -0.6797038   -1.7375702   -2.862629    -1.0647833
  -3.052216    -1.3667988   -1.4823546   -0.5288122   -0.6413616
  -4.941698    -0.66043144  -0.29259875  -0.6784127   -1.1588546
  -4.5891337   -0.31316614  -1.888687    -0.8486231   -1.0634212
  -1.6879255   -1.6509457   -1.4356289   -0.45398113  -0.6838446
  -4.386206    -1.2435158   -1.3835038   -0.66985786  -1.7862955
  -1.0461179   -1.2789257  -10.880285    -1.3222715   -1.483077
  -0.85174894  -1.541234    -1.2242509   -0.9786508   -0.923383
  -0.9160499   -1.7520908   -2.5522738   -1.5270575   -1.2532095 ]


In [6]:
y_true = Y_val.squeeze().detach().numpy()
print(y_true)

[ -0.79794914  -0.6908408   -1.668961    -2.9318223   -1.0525415
  -3.0746264   -1.438742    -1.6125808   -0.4743731   -0.67173576
  -5.074177    -0.7192913   -0.23940559  -0.6822562   -1.1188079
  -4.89845     -0.21544072  -1.9638656   -0.95251924  -1.0548779
  -1.9152862   -1.5653359   -1.4664245   -0.3569649   -0.85329396
  -4.3623357   -1.4673803   -1.6451893   -0.655875    -1.8348596
  -0.6766137   -1.515031   -10.788043    -1.2504039   -1.4635166
  -0.82925224  -1.4858009   -1.6150858   -0.99036634  -0.90275496
  -0.95012754  -1.6095264   -2.610244    -1.5121139   -1.2108301 ]


In [7]:
# perform float comparison for 1.e-7 precision
y_val_mols = []
for i,j in y_dict.items():
    for y in y_true:
        if abs(y-i) < 0.0000001:
            #print(j,y)
            y_val_mols.append(j)

In [8]:
print(y_val_mols)

['methanol', 'beryllium_monofluoride', 'methanethiol_radical', 'cyano_radical', 'dimethyl_ether', 'ethylene', 'pentane', 'isopropyl_alcohol', 'glyoxal_trans', 'oxygen_monofluoride', 'acetamide', 'monosodium_sulfide', 'hydrogen_cyanide', 'pyridine', 'ethylamine', 'boron_trifluoride', 'pyrrole', 'cyclopentane', 'trifluoroacetic_acid', 'nitric_oxide', 'sulfur_monoxide', 'beryllium_monochloride', 'allene', 'methane', 'nonane', 'sodium_lithium', 'silicon_monoxide', 'phosphorus_mononitride', 'formic_acid', 'beryllium_dihydride', 'water', 'oxygen_diatomic', 'octane', 'dilithium_oxide', 'nitrogen_dioxide', 'sodium_diatomic', 'lithium_diatomic', 'acetyl_fluoride', 'propyne', 'ethyl_radical', 'silylene_s']


In [9]:
# Using Sorted CM dataset
input_vector, y_test, y_dict = inputparser.make_input(sort = True)

In [10]:
# x-test, y-test are 225 data points, remove first element for verification
x_verification = []
x_verification.append(input_vector[0])
y_verification = y_test[0]

x_input = input_vector[1:]
y_input = []
for i in y_test[1:]:
    y_input.append(i)

In [11]:
# Random seeding
torch.manual_seed(12)
np.random.seed(12)

# Input data
X = torch.tensor(x_input.tolist())
Y = torch.tensor(y_input)

# Split data
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.2, random_state=42)

# Normalize
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

# NN
class NeuralNet(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(NeuralNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, 1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Initialize
input_size = len(x_input[0])
# Optimal hidden layer size
hidden_size = 290  
model = NeuralNet(input_size, hidden_size)

# loss
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Training loop
for epoch in range(202):
    # forward prop
    outputs = model(torch.tensor(X_train, dtype=torch.float32))
    loss = criterion(outputs.squeeze(), Y_train)
    # back prop
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if (epoch+1) % 100 == 0:
        print(f'Epoch [{epoch+1}/141], Loss: {loss.item():.4f}')
print("Final Epoch Count:", epoch + 1)

with torch.no_grad():
    train_outputs = model(torch.tensor(X_train, dtype=torch.float32))
    train_mse = mean_squared_error(train_outputs.squeeze().detach().numpy(), Y_train)
print("Training MSE:", train_mse)

with torch.no_grad():
    val_outputs = model(torch.tensor(X_val, dtype=torch.float32))
    val_mse = mean_squared_error(val_outputs.squeeze().detach().numpy(), Y_val)
print("Validation MSE:", val_mse)

Epoch [100/141], Loss: 0.0049
Epoch [200/141], Loss: 0.0030
Final Epoch Count: 202
Training MSE: 0.0029975642
Validation MSE: 0.04498981


In [12]:
val_sorted = val_outputs.squeeze().detach().numpy()
print(val_sorted)

[ -0.7886578   -0.68474704  -1.7354742   -2.463213    -1.0470697
  -3.480864    -1.3713362   -1.5353248   -0.5368378   -0.63520384
  -4.576627    -0.66509014  -0.32801032  -0.68283397  -1.1394969
  -4.4988217   -0.34017283  -1.8551025   -0.8937009   -1.2088629
  -1.6239921   -1.6117038   -1.3389056   -0.4497335   -0.69593036
  -4.327557    -1.2987224   -1.0023984   -0.67456734  -1.7765346
  -1.1127595   -1.6085879  -10.573343    -1.2619056   -1.4872944
  -0.8481041   -1.5142056   -1.1206747   -0.97156376  -0.9293307
  -0.89673233  -1.6305561   -2.8635108   -1.4864188   -1.2638127 ]


In [13]:
y_true = Y_val.squeeze().detach().numpy()
print(y_true)

[ -0.79794914  -0.6908408   -1.668961    -2.9318223   -1.0525415
  -3.0746264   -1.438742    -1.6125808   -0.4743731   -0.67173576
  -5.074177    -0.7192913   -0.23940559  -0.6822562   -1.1188079
  -4.89845     -0.21544072  -1.9638656   -0.95251924  -1.0548779
  -1.9152862   -1.5653359   -1.4664245   -0.3569649   -0.85329396
  -4.3623357   -1.4673803   -1.6451893   -0.655875    -1.8348596
  -0.6766137   -1.515031   -10.788043    -1.2504039   -1.4635166
  -0.82925224  -1.4858009   -1.6150858   -0.99036634  -0.90275496
  -0.95012754  -1.6095264   -2.610244    -1.5121139   -1.2108301 ]


In [14]:
# perform float comparison for 1.e-7 precision
y_val_mols = []
for i,j in y_dict.items():
    for y in y_true:
        if abs(y-i) < 0.0000001:
            #print(j,y)
            y_val_mols.append(j)

In [15]:
print(y_val_mols)

['methanol', 'beryllium_monofluoride', 'methanethiol_radical', 'cyano_radical', 'dimethyl_ether', 'ethylene', 'pentane', 'isopropyl_alcohol', 'glyoxal_trans', 'oxygen_monofluoride', 'acetamide', 'monosodium_sulfide', 'hydrogen_cyanide', 'pyridine', 'ethylamine', 'boron_trifluoride', 'pyrrole', 'cyclopentane', 'trifluoroacetic_acid', 'nitric_oxide', 'sulfur_monoxide', 'beryllium_monochloride', 'allene', 'methane', 'nonane', 'sodium_lithium', 'silicon_monoxide', 'phosphorus_mononitride', 'formic_acid', 'beryllium_dihydride', 'water', 'oxygen_diatomic', 'octane', 'dilithium_oxide', 'nitrogen_dioxide', 'sodium_diatomic', 'lithium_diatomic', 'acetyl_fluoride', 'propyne', 'ethyl_radical', 'silylene_s']


In [16]:
# Random seeding
torch.manual_seed(2)
np.random.seed(2)

# Input data
X = torch.tensor(x_input.tolist())
Y = torch.tensor(y_input)

# Split data
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.2, random_state=42)

# Normalize
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

# NN
class NeuralNet(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(NeuralNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, 1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Initialize
input_size = len(x_input[0])
# Optimal hidden layer size
hidden_size = 64 
model = NeuralNet(input_size, hidden_size)

# loss
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Training loop
for epoch in range(30):
    # forward prop
    outputs = model(torch.tensor(X_train, dtype=torch.float32))
    loss = criterion(outputs.squeeze(), Y_train)
    # back prop
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if (epoch+1) % 100 == 0:
        print(f'Epoch [{epoch+1}/141], Loss: {loss.item():.4f}')
print("Final Epoch Count:", epoch + 1)

with torch.no_grad():
    train_outputs = model(torch.tensor(X_train, dtype=torch.float32))
    train_mse = mean_squared_error(train_outputs.squeeze().detach().numpy(), Y_train)
print("Training MSE:", train_mse)

with torch.no_grad():
    val_outputs = model(torch.tensor(X_val, dtype=torch.float32))
    val_mse = mean_squared_error(val_outputs.squeeze().detach().numpy(), Y_val)
print("Validation MSE:", val_mse)

Final Epoch Count: 30
Training MSE: 0.0943756
Validation MSE: 1.2631232


In [17]:
val_naive = val_outputs.squeeze().detach().numpy()
print(val_naive)

[ -1.0655322   -0.9448282   -1.8024814   -9.620527    -1.1563112
  -3.1757765   -1.1172059   -1.4180846   -0.8302133   -0.92195475
  -5.2197866   -0.87137854  -0.67449534  -0.9587719   -1.1880789
  -4.707701    -0.68522745  -1.8700943   -1.1303685   -1.0225812
  -1.3315139   -1.4616244   -1.3798357   -0.79458785  -0.8376734
  -4.1339726   -1.433498    -1.575537    -1.0029141   -1.7710959
  -1.2004615   -1.875249   -10.574104    -1.5081534   -1.5837296
  -1.1004125   -1.4000089   -1.0742829   -1.2072896   -1.1695821
  -1.1724806   -1.7149622   -5.612506    -1.2187872   -1.4694252 ]


In [18]:
y_true = Y_val.squeeze().detach().numpy()
print(y_true)

[ -0.79794914  -0.6908408   -1.668961    -2.9318223   -1.0525415
  -3.0746264   -1.438742    -1.6125808   -0.4743731   -0.67173576
  -5.074177    -0.7192913   -0.23940559  -0.6822562   -1.1188079
  -4.89845     -0.21544072  -1.9638656   -0.95251924  -1.0548779
  -1.9152862   -1.5653359   -1.4664245   -0.3569649   -0.85329396
  -4.3623357   -1.4673803   -1.6451893   -0.655875    -1.8348596
  -0.6766137   -1.515031   -10.788043    -1.2504039   -1.4635166
  -0.82925224  -1.4858009   -1.6150858   -0.99036634  -0.90275496
  -0.95012754  -1.6095264   -2.610244    -1.5121139   -1.2108301 ]


In [19]:
# perform float comparison for 1.e-7 precision
y_val_mols = []
for i,j in y_dict.items():
    for y in y_true:
        if abs(y-i) < 0.0000001:
            #print(j,y)
            y_val_mols.append(j)

In [20]:
print(y_val_mols)

['methanol', 'beryllium_monofluoride', 'methanethiol_radical', 'cyano_radical', 'dimethyl_ether', 'ethylene', 'pentane', 'isopropyl_alcohol', 'glyoxal_trans', 'oxygen_monofluoride', 'acetamide', 'monosodium_sulfide', 'hydrogen_cyanide', 'pyridine', 'ethylamine', 'boron_trifluoride', 'pyrrole', 'cyclopentane', 'trifluoroacetic_acid', 'nitric_oxide', 'sulfur_monoxide', 'beryllium_monochloride', 'allene', 'methane', 'nonane', 'sodium_lithium', 'silicon_monoxide', 'phosphorus_mononitride', 'formic_acid', 'beryllium_dihydride', 'water', 'oxygen_diatomic', 'octane', 'dilithium_oxide', 'nitrogen_dioxide', 'sodium_diatomic', 'lithium_diatomic', 'acetyl_fluoride', 'propyne', 'ethyl_radical', 'silylene_s']
