### Problem 1 (50 points) 

Vapor-liquid equilibria data are correlated using two adjustable parameters $A_{12}$ and $A_{21}$ per binary
mixture. For low pressures, the equilibrium relation can be formulated as:

$$
\begin{aligned}
p = & x_1\exp\left(A_{12}\left(\frac{A_{21}x_2}{A_{12}x_1+A_{21}x_2}\right)^2\right)p_{water}^{sat}\\
& + x_2\exp\left(A_{21}\left(\frac{A_{12}x_1}{A_{12}x_1+A_{21}x_2}\right)^2\right)p_{1,4 dioxane}^{sat}.
\end{aligned}
$$

Here the saturation pressures are given by the Antoine equation

$$
\log_{10}(p^{sat}) = a_1 - \frac{a_2}{T + a_3},
$$

where $T = 20$($^{\circ}{\rm C}$) and $a_{1,2,3}$ for a water - 1,4 dioxane
system is given below.

|             | $a_1$     | $a_2$      | $a_3$     |
|:------------|:--------|:---------|:--------|
| Water       | 8.07131 | 1730.63  | 233.426 |
| 1,4 dioxane | 7.43155 | 1554.679 | 240.337 |


The following table lists the measured data. Recall that in a binary system $x_1 + x_2 = 1$.

|$x_1$ | 0.0 | 0.1 | 0.2 | 0.3 | 0.4 | 0.5 | 0.6 | 0.7 | 0.8 | 0.9 | 1.0 |
|:-----|:--------|:---------|:--------|:-----|:-----|:-----|:-----|:-----|:-----|:-----|:-----|
|$p$| 28.1 | 34.4 | 36.7 | 36.9 | 36.8 | 36.7 | 36.5 | 35.4 | 32.9 | 27.7 | 17.5 |

Estimate $A_{12}$ and $A_{21}$ using data from the above table: 

1. Formulate the least square problem; 
2. Since the model is nonlinear, the problem does not have an analytical solution. Therefore, solve it using the gradient descent or Newton's method implemented in HW1; 
3. Compare your optimized model with the data. Does your model fit well with the data?

---

### Problem 2 (50 points) 

Solve the following problem using Bayesian Optimization:
$$
    \min_{x_1, x_2} \quad \left(4-2.1x_1^2 + \frac{x_1^4}{3}\right)x_1^2 + x_1x_2 + \left(-4 + 4x_2^2\right)x_2^2,
$$
for $x_1 \in [-3,3]$ and $x_2 \in [-2,2]$. A tutorial on Bayesian Optimization can be found [here](https://thuijskens.github.io/2016/12/29/bayesian-optimisation/).





In [134]:
# A simple example of using PyTorch for gradient descent

import torch as t
from torch.autograd import Variable

# Define a variable, make sure requires_grad=True so that PyTorch can take gradient with respect to this variable
x = Variable(t.tensor([1.0, 0.0]), requires_grad=True)

# Define a loss
loss = (x[0] - 1)**2 + (x[1] - 2)**2

# Take gradient
loss.backward()

# Check the gradient. numpy() turns the variable from a PyTorch tensor to a numpy array.
x.grad.numpy()

array([ 0., -4.], dtype=float32)

In [42]:
# Let's examine the gradient at a different x.
x.data = t.tensor([2.0, 1.0])
loss = (x[0] - 1)**2 + (x[1] - 2)**2
loss.backward()
x.grad.numpy()

array([ 2., -6.], dtype=float32)

In [169]:
# Here is a code for gradient descent without line search

import torch as t
from torch.autograd import Variable

x = Variable(t.tensor([1.0, 0.0]), requires_grad=True)

# Fix the step size
a = 0.01

# Start gradient descent
for i in range(1000):  # TODO: change the termination criterion
    loss = (x[0] - 1)**2 + (x[1] - 2)**2
    loss.backward()
    
    # no_grad() specifies that the operations within this context are not part of the computational graph, i.e., we don't need the gradient descent algorithm itself to be differentiable with respect to x
    with t.no_grad():
        x -= a * x.grad
        
        # need to clear the gradient at every step, or otherwise it will accumulate...
        x.grad.zero_()
        
print(x.data.numpy())
print(loss.data.numpy())

[1.        1.9999971]
8.185452e-12


In [370]:
# Formulate least square problem
print("The function to be minimized is (p(x1,A12,A21) - p_i)**2 with respect to A12 and A21")
# Use pytorch to calculate gradient

import torch as t
from torch.autograd import Variable
import math
A = Variable(t.tensor([5.0,6.0]), requires_grad=True)
T = 20
a1_w = 8.07131
a2_w = 1730.63
a3_w = 233.426
a1_d = 7.43155
a2_d = 1554.679
a3_d = 240.337
p_satw = pow(10,(a1_w - (a2_w)/(T+a3_w)))

p_satd = pow(10,(a1_d - (a2_d)/(T+a3_d)))

x1 = [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]
p = [28.1,34.4,36.7,36.9,36.8,36.7,36.5,35.4,32.9,27.7,17.5]
#d = lambda x1,A12,A21: x1*t.exp(A12*((A21*(1-x1))/(A12*x1 + A21*(1-x1)))**2)*p_satw + (1-x1)*t.exp(A21*((A12*x1)/(A12*x1+A21*(1-x1)))**2)*p_satd
pd = lambda x1,A12,A21: x1*t.exp(A12*((A21*(1-x1))/(A12*x1 + A21*(1-x1)))**2)*p_satw + (1-x1)*t.exp(A21*((A12*x1)/(A12*x1+A21*(1-x1)))**2)*p_satd 
for i in range(len(x1)):    
    loss = t.norm(p[i] - pd(x1[i],A[0],A[1]))**2
    loss.backward()

A.grad.numpy()

array([109826.984, 150164.86 ], dtype=float32)

In [None]:
# Implement gradient descent to solve for A12 and A21
# Wasn't able to solve because A.grad.numpy()[1] and [0] are too large so never meets error criteria, wasn't sure how to fix
import torch as t
from torch.autograd import Variable
import math
A = Variable(t.tensor([5.0,6.0]), requires_grad=True)
T = 20
a1_w = 8.07131
a2_w = 1730.63
a3_w = 233.426
a1_d = 7.43155
a2_d = 1554.679
a3_d = 240.337
p_satw = pow(10,(a1_w - (a2_w)/(T+a3_w)))

p_satd = pow(10,(a1_d - (a2_d)/(T+a3_d)))

x1 = [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]
p = [28.1,34.4,36.7,36.9,36.8,36.7,36.5,35.4,32.9,27.7,17.5]
#d = lambda x1,A12,A21: x1*t.exp(A12*((A21*(1-x1))/(A12*x1 + A21*(1-x1)))**2)*p_satw + (1-x1)*t.exp(A21*((A12*x1)/(A12*x1+A21*(1-x1)))**2)*p_satd
pd = lambda x1,A12,A21: x1*t.exp(A12*((A21*(1-x1))/(A12*x1 + A21*(1-x1)))**2)*p_satw + (1-x1)*t.exp(A21*((A12*x1)/(A12*x1+A21*(1-x1)))**2)*p_satd 
eps = 1e-7
A12_0 = 5.0
A21_0 = 6.0
k1 = 0
k2 = 0
soln_1 = [A12_0]
soln_2 = [A21_0]
A12 = soln_1[k1]
A21 = soln_2[k2]
error = 1
def line_search(A12,A21,x1):
    a = 1
    phi1 = lambda a, A12, A21, x1: pd(x1,A12,A21) - a*0.8*A.grad.numpy()[0]**2
    phi2 = lambda a, A12, A21, x1: pd(x1,A12,A21) - a*0.8*A.grad.numpy()[1]**2
    while phi1(a,A12,A21,x1)<pd(x1,A12-a*A.grad.numpy()[0],A21-a*A.grad.numpy()[1]) and phi2(a,A12,A21,x1)<pd(x1,A12-a*A.grad.numpy()[0],A21-a*A.grad.numpy()[1]):
        a = 0.5*a
    return a
   
    #loss = t.norm(p[i] - pd(x1[i],A[0],A[1]))**2
    #loss.backward()
    #A.grad.numpy()
    #error = math.sqrt(A.grad.numpy()[0]**2 + A.grad.numpy()[1]**2)
while error >= eps:
    for i in range(len(x1)): 
        # calculate loss
        loss = t.norm(p[i] - pd(x1[i],A[0],A[1]))**2
        # calculate derivatives
        loss.backward()
        # check gradient
        A.grad.numpy()
        # create a with the gradient
    a = line_search(A[0],A[1],x1[i])
    # create new value for A12 with a and gradient
    A12 = A12 - a*A.grad.numpy()[0]
    # create new value for A21 with a and gradient
    A21 = A21 - a*A.grad.numpy()[1]
    # save solutions
    soln_1.append(A12)
    soln_2.append(A21)
    # calculate error
    error = math.sqrt(A.grad.numpy()[0]**2 + A.grad.numpy()[1]**2)
    # Reset initial guesses based off line search values
    A = Variable(t.tensor([A12,A21]), requires_grad=True)   
soln_1
        

In [380]:
# do gradient descent without line search
import torch as t
from torch.autograd import Variable
import math
A = Variable(t.tensor([5.0,6.0]), requires_grad=True)
T = 20
a1_w = 8.07131
a2_w = 1730.63
a3_w = 233.426
a1_d = 7.43155
a2_d = 1554.679
a3_d = 240.337
p_satw = pow(10,(a1_w - (a2_w)/(T+a3_w)))

p_satd = pow(10,(a1_d - (a2_d)/(T+a3_d)))

x1 = [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]
p = [28.1,34.4,36.7,36.9,36.8,36.7,36.5,35.4,32.9,27.7,17.5]
#d = lambda x1,A12,A21: x1*t.exp(A12*((A21*(1-x1))/(A12*x1 + A21*(1-x1)))**2)*p_satw + (1-x1)*t.exp(A21*((A12*x1)/(A12*x1+A21*(1-x1)))**2)*p_satd
pd = lambda x1,A12,A21: x1*t.exp(A12*((A21*(1-x1))/(A12*x1 + A21*(1-x1)))**2)*p_satw + (1-x1)*t.exp(A21*((A12*x1)/(A12*x1+A21*(1-x1)))**2)*p_satd 
for i in range(len(x1)):    
    loss = t.norm(p[i] - pd(x1[i],A[0],A[1]))**2
    loss.backward()

A.grad.numpy()

a = 0.01

for j in range(11):
    for i in range(len(x1)):
        loss = t.norm(p[i] - pd(x1[i],A[0],A[1]))**2
    loss.backward()
    
    with t.no_grad():
        A -= a * A.grad
        
        A.grad.zero_()
    
print(A.data.numpy())
print(loss.data.numpy())
pdl = []
error = []
pd1 = lambda x1,A12,A21: x1*math.exp(A12*((A21*(1-x1))/(A12*x1 + A21*(1-x1)))**2)*p_satw + (1-x1)*math.exp(A21*((A12*x1)/(A12*x1+A21*(1-x1)))**2)*p_satd
for i in range(len(x1)):
    pdl.append(pd1(x1[i],A.data.numpy()[0],A.data.numpy()[1]))
    error.append((p[i] - pd1(x1[i],A.data.numpy()[0],A.data.numpy()[1]))**2)
pdl
error

# Here the error between the values calculated by using the formula and the values from the given points is very large, if the gradient descent method had worked, would have repeateed this same process

[-1093.2698 -1495.6486]
0.00071549066


[0.5243201255884974,
 1182.974207894897,
 1346.8899999999996,
 1361.61,
 1354.2399999999998,
 1346.89,
 1332.25,
 1253.1599999999999,
 1082.4099999999999,
 767.2899952834708,
 0.0007154509784028096]

In [372]:
# utilized code written by Thomas Huijskens for Bayesian Optimization

import numpy as np
import sklearn.gaussian_process as gp

from scipy.stats import norm
from scipy.optimize import minimize
#value = [np.linspace(-3,3, num=5), np.linspace(-2,2, num=5)]
bounds = np.array([[-3,3], [-2,2]])
n_iters = 2
sample_loss = lambda value: (4 - 2.1*value[0]**2 + (value[0]**4)/3)*(value[0]**2) + value[0]*value[1] + (-4+4*(value[1]**2))*(value[1]**2)
x0 = None
n_pre_samples = 4
gp_params = None
random_search = False
alpha = 1e-5
epsilon = 1e-7
#def bayesian_optimisation(n_iters, sample_loss, bounds, x0=None, n_pre_samples=5,
                          #gp_params=None, random_search=False, alpha=1e-5, epsilon=1e-7):
    
x_list = []
y_list = []

n_params = bounds.shape[0]

if x0 is None:
    for params in np.random.uniform(bounds[:, 0], bounds[:, 1], (n_pre_samples, bounds.shape[0])):
        x_list.append(params)
        y_list.append(sample_loss(params))
else:
    for params in x0:
        x_list.append(params)
        y_list.append(sample_loss(params))

xp = np.array(x_list)
yp = np.array(y_list)

    # Create the GP
if gp_params is not None:
    model = gp.GaussianProcessRegressor(**gp_params)
else:
    kernel = gp.kernels.Matern()
    model = gp.GaussianProcessRegressor(kernel=kernel,
                                            alpha=alpha,
                                            n_restarts_optimizer=10,
                                            normalize_y=True)

for n in range(n_iters):

    model.fit(xp, yp)

        # Sample next hyperparameter
    if random_search:
        x_random = np.random.uniform(bounds[:, 0], bounds[:, 1], size=(random_search, n_params))
        ei = -1 * expected_improvement(x_random, model, yp, greater_is_better=True, n_params=n_params)
        next_sample = x_random[np.argmax(ei), :]
    else:
        next_sample = sample_next_hyperparameter(expected_improvement, model, yp, greater_is_better=True, bounds=bounds, n_restarts=100)

        # Duplicates will break the GP. In case of a duplicate, we will randomly sample a next query point.
    if np.any(np.abs(next_sample - xp) <= epsilon):
        next_sample = np.random.uniform(bounds[:, 0], bounds[:, 1], bounds.shape[0])

        # Sample loss for new set of parameters
    cv_score = sample_loss(next_sample)

        # Update lists
    x_list.append(next_sample)
    y_list.append(cv_score)

        # Update xp and yp
    xp = np.array(x_list)
    yp = np.array(y_list)

xp, yp
# note xp = x1 and yp = x2

(array([[-0.93813997, -1.05126187],
        [-0.62037868,  1.2353595 ],
        [ 2.47124112, -1.32560989],
        [-1.04428165, -0.7245698 ],
        [ 3.        , -2.        ],
        [ 1.85248226,  0.84747227]]),
 array([  3.57209653,   3.69266144,  24.07589448,   2.05614108,
        150.9       ,   3.22760183]))