In [17]:
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import (RBF, Matern)

# Driver cell: load the diffraction data, build peak-intensity features, fit a
# Gaussian process regressor and report hold-out errors.
# NOTE: this cell calls read_data, get_input and get_labels, so the cells that
# define them must have been executed first.

# set parameters:
base_dir = "/Users/akanane/Documents/Research/ML_diffraction/data/AuPtSL20x20x7-1"
dirs = ['0A','2B','2C','2D','2E','2F','2G','2H','2I','2J']  # 2A conv C does not exist

ndata = 102        # number of diffraction files read per directory
test_size = 0.2    # fraction of samples held out for testing
alpha = 1e-7       # value added to the kernel diagonal during fitting
sigma = 20.0       # initial kernel length scale (rebound to the predictive std below)
seed = 0           # fixes the split and the optimizer restarts so reruns give the same numbers

# read everything in...
difr, qgrid = read_data(base_dir, dirs, ndata)

# x array contains the input: (ntrain, train_vector)
x = get_input(qgrid, difr)
y = get_labels(base_dir, dirs, x.shape[0])

# split into training and test sets for the Gaussian process regression model:
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=test_size, random_state=seed)

#kernel = RBF(length_scale=sigma, length_scale_bounds=(1e-5, 1e5))
# NOTE(review): scikit-learn documents nu values outside {0.5, 1.5, 2.5, inf} as
# considerably more expensive to evaluate; confirm that 5/3 (rather than 5/2)
# is really the intended smoothness.
kernel = Matern(length_scale=sigma, length_scale_bounds=(1e-5, 1e5), nu=5./3)
gp = GaussianProcessRegressor(kernel=kernel, alpha=alpha, n_restarts_optimizer=20,
                              random_state=seed)

gp.fit(x_train, y_train)
# From here on `sigma` holds the predictive standard deviation, not the length scale.
y_pred, sigma = gp.predict(x_test, return_std=True)

# Per-sample errors are averaged over the label components (axis 1), then
# summed over the test samples; the division by the sample count happens when printing.
abs_err = np.abs(y_pred - y_test)
mae = np.sum(np.mean(abs_err, axis=1))
rmsd = np.sum(np.sqrt(np.mean(np.square(abs_err), axis=1)))

print (" MAE ", mae/x_test.shape[0])
print (" RMSD ", rmsd/x_test.shape[0])

 MAE  1.7458007359879582
 RMSD  1.8929715288352893


  " state: %s" % convergence_dict)
  " state: %s" % convergence_dict)


In [1]:
def read_data(base_dir, dirs, ndata):
    """Read diffraction intensities inside two fixed Q windows.

    For every directory tag ``d`` in ``dirs`` and every ``n`` in
    ``range(ndata)`` the file
    ``base_dir/AuPtSLxrd<d>20x20x7-C/AuPtSLxrd<d>20x20x7-<n>C`` is parsed as
    whitespace-separated ``Q intensity`` rows.  Rows whose Q lies in
    [25, 29) or [51, 58) are kept; everything else is ignored.

    Kept intensities are stored consecutively in file order (they are not
    binned by their Q value), so a file with fewer in-window rows than grid
    points leaves trailing zeros.

    Parameters
    ----------
    base_dir : str
        Root directory of the data set.
    dirs : sequence of str
        Directory tags, e.g. ``'0A'``.
    ndata : int
        Number of files to read per directory.

    Returns
    -------
    difr : numpy.ndarray, shape (len(dirs), ndata, NQgrid)
        Intensities divided by 1e10.
    qgrid : numpy.ndarray, shape (NQgrid,)
        Low-Q grid followed by high-Q grid (endpoints excluded); only needed
        for plotting.
    """

    scaling_factor = 1e10

    # Q grid is only needed if we want to plot diffraction
    # pattern, it is not used in the GPR model
    # take low Q features with Q=[25,29]
    # and high Q features with Q=[51,58]
    qminL = 25.0
    qmaxL = 29.0
    qminH = 51.0
    qmaxH = 58.0
    dq = 0.01

    # round() before int(): a quotient that lands just below an integer
    # because of floating-point error must not be truncated one short.
    NQgridL = int(round((qmaxL - qminL)/dq))
    NQgridH = int(round((qmaxH - qminH)/dq))
    NQgrid = NQgridL + NQgridH

    qgridL = np.linspace(qminL, qmaxL, NQgridL, False)
    qgridH = np.linspace(qminH, qmaxH, NQgridH, False)
    qgrid = np.concatenate((qgridL, qgridH), axis=0)

    ndir = len(dirs)
    difr = np.zeros((ndir,ndata,NQgrid))

    # read xrd files
    for ind_d, d in enumerate(dirs):
        for n in range(ndata):
            filename = base_dir + "/AuPtSLxrd" + d + "20x20x7-C/AuPtSLxrd" + d + "20x20x7-"+str(n)+"C"
            # `with` guarantees the handle is closed even if a row fails to parse
            with open(filename,"r") as f:
                m = 0
                for line in f:
                    line2 = line.split()
                    if not line2:
                        # tolerate blank lines (e.g. a trailing newline)
                        continue
                    qval = float(line2[0])
                    ints = float(line2[1])
                    # the two windows are disjoint, so one combined test suffices
                    if (qminL <= qval < qmaxL) or (qminH <= qval < qmaxH):
                        difr[ind_d, n, m] = ints
                        m += 1

    difr /= scaling_factor

    return difr, qgrid

In [2]:
import matplotlib.pyplot as plt
import os
from os import path

def plot_compare(x, y, x1, y1, label="curve 1", label1="curve 2"):
    """Overlay two curves on one set of axes and show the figure.

    Parameters
    ----------
    x, y : array-like
        First curve, drawn as a solid green line.
    x1, y1 : array-like
        Second curve, drawn as a dotted red line.
    label, label1 : str, optional
        Legend entries for the first and second curve.  Without labelled
        artists the legend would be empty and matplotlib would warn.
    """

    fontsize=16

    fig = plt.figure()
    ax = fig.add_subplot(111)

    ax.plot(x, y, color='g', label=label)
    ax.plot(x1, y1, color='r', ls=':', label=label1)
    ax.set_xlabel("Q",fontsize=fontsize)
    ax.set_ylabel("Intensity",fontsize=fontsize)
    ax.legend(loc='upper right',fontsize=fontsize)
    # tight_layout only accounts for artists that exist when it is called,
    # so it has to come after the labels and legend are in place.
    fig.tight_layout()

    plt.show()

In [3]:
import matplotlib.pyplot as plt
import os
from os import path

def plot_difference(x, y, y1):
    """Plot the pointwise difference ``y - y1`` against ``x`` and show the figure.

    Parameters
    ----------
    x : array-like
        Abscissa values (labelled "Q" on the plot).
    y, y1 : array-like
        Curves to subtract; they must support ``y - y1`` (e.g. numpy arrays).
    """

    fontsize=16

    fig = plt.figure()
    ax = fig.add_subplot(111)

    ax.plot(x, y-y1, color='g')
    ax.set_xlabel("Q",fontsize=fontsize)
    ax.set_ylabel("Intensity",fontsize=fontsize)
    # tight_layout only accounts for artists that exist when it is called,
    # so it has to come after the axis labels are set.
    fig.tight_layout()

    plt.show()

In [4]:
import matplotlib.pyplot as plt
import os
from os import path

def plot_single(x, y):
    """Plot a single curve ``y`` against ``x`` as a green line and show the figure.

    Parameters
    ----------
    x : array-like
        Abscissa values (labelled "Q" on the plot).
    y : array-like
        Ordinate values (labelled "Intensity" on the plot).
    """

    fontsize=16

    fig = plt.figure()
    ax = fig.add_subplot(111)

    ax.plot(x, y, color='g')
    ax.set_xlabel("Q",fontsize=fontsize)
    ax.set_ylabel("Intensity",fontsize=fontsize)
    # tight_layout only accounts for artists that exist when it is called,
    # so it has to come after the axis labels are set.
    fig.tight_layout()

    plt.show()

In [5]:
def peak_positions(x, y, cutoff=0.05):
    """Locate strict local maxima of ``y`` that exceed ``cutoff``.

    A sample ``i`` is a peak when ``y[i-1] < y[i] > y[i+1]`` and
    ``y[i] > cutoff``.  The first and last samples are never peaks because
    they lack one neighbour (previously the first sample was compared against
    the *last* one through negative indexing).  Every sample is judged at its
    own index, so repeated values are neither double-counted nor missed.

    Parameters
    ----------
    x : array-like, 1-D
        Positions belonging to ``y``.
    y : array-like, 1-D
        Signal values.
    cutoff : float, optional
        Minimum value a peak must exceed.

    Returns
    -------
    peaks : list
        Peak positions in ascending order.
    ints : list
        Peak heights in descending order.

    Notes
    -----
    The two lists are sorted independently, so ``peaks[i]`` and ``ints[i]``
    do not in general refer to the same peak.  This ordering is kept on
    purpose: get_input builds its feature vector from ``ints`` as returned.
    """

    x = np.asarray(x)
    y = np.asarray(y)

    # fewer than three samples means there is no interior point to test
    if y.size < 3:
        return [], []

    centre = y[1:-1]
    is_peak = (y[:-2] < centre) & (y[2:] < centre) & (centre > cutoff)
    idx = np.flatnonzero(is_peak) + 1   # +1 maps back to indices into y

    peaks = sorted(x[idx])
    ints = sorted(y[idx], reverse=True)

    return peaks, ints

In [6]:
def get_input(qgrid, difr, times=range(0, 60, 10)):
    """
        Calculate input by concatenating vectors of intensities
        at some number of time points

        For every sample ``n`` (first axis of ``difr``) the peak intensities
        returned by peak_positions for ``difr[n, t, :]`` are concatenated over
        all ``t`` in ``times``.

        Parameters
        ----------
        qgrid : array-like
            Q grid passed through to peak_positions.
        difr : numpy.ndarray, indexed as [sample, time, q]
            Diffraction intensities.
        times : iterable of int, optional
            Indices along the second axis of ``difr`` to use.  The default
            reproduces the previously hard-coded 0, 10, ..., 50.

        Returns
        -------
        numpy.ndarray, shape (nsamples, nfeatures)
            One feature row per sample.  There is no longer a cap of 200
            features per row.

        Raises
        ------
        AssertionError
            If two samples yield feature vectors of different length.
    """

    ntrain = difr.shape[0]

    rows = []
    for n in range(ntrain):
        allints = []
        for t in times:
            peaks, ints = peak_positions(qgrid, difr[n,t,:])
            allints.extend(ints)
        if rows:
            # every sample must produce the same number of features as the first
            assert len(rows[0]) == len(allints), " different length of input data for different samples"
        rows.append(allints)

    if not rows:
        # no samples at all: return an empty 2-D array instead of failing
        return np.zeros((0, 0))

    return np.array(rows, dtype=float)

In [7]:
def get_labels(base_dir, dirs, ndata):
    """Read the label vector for every directory tag.

    For each tag ``d`` in ``dirs`` the file
    ``base_dir/AuPtSLxy<d>20x20x7/AuPtSLxy<d>20x20x7-0`` is parsed as
    whitespace-separated two-column rows.  Only the first 7 non-blank rows
    are used: the first column fills entries 0..6 of the label row and the
    second column fills entries 7..13.

    Parameters
    ----------
    base_dir : str
        Root directory of the data set.
    dirs : sequence of str
        Directory tags; tag number ``i`` fills row ``i`` of the result.
    ndata : int
        Number of rows in the result; must be at least ``len(dirs)``.

    Returns
    -------
    numpy.ndarray, shape (ndata, 14)
        Label matrix; rows without a corresponding tag stay zero.
    """

    nlayers = 14
    half = nlayers // 2   # rows read per file; also the offset of the second column

    y = np.zeros((ndata, nlayers))

    for ind_d, d in enumerate(dirs):
        filename = base_dir + "/AuPtSLxy" + d + "20x20x7/AuPtSLxy" + d + "20x20x7-0"
        # `with` guarantees the handle is closed even if a row fails to parse
        with open(filename,"r") as f:
            m = 0
            for line in f:
                line2 = line.split()
                if not line2:
                    # tolerate blank lines
                    continue
                v1 = float(line2[0])
                v2 = float(line2[1])
                y[ind_d, m] = v1
                y[ind_d, m+half] = v2
                m += 1
                if m == half:
                    break

    return y