#### 新实验数据生成-V4-LC
#### 拉丁超立方采样-预测还原概率筛选-最远点采样筛选-按总体积换算
目标是补充数据点，提高还原产物的概率
最后更新：2022.04.05 戴以恒

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import time
import joblib
# Read the clock once so that both timestamp strings describe the same instant
# (two separate time.localtime() calls could straddle a second boundary).
_now = time.localtime()
# Compact form; appended to the output directory name further down.
c_time = time.strftime("%Y%m%d_%H%M%S", _now)
# Human-readable form of the same instant.
c_time_m = time.strftime("%Y-%m-%d %H:%M:%S", _now)

In [None]:
# Parameters
# ======== System Setup ========
# Base name of the output directory; a run timestamp is appended before it is created.
DIR = 'New_Points V4'
# Original note: drawing one phase diagram per (c1, c2) condition takes EPOCH rounds, so
# choose these parameters sensibly to keep the run time from exploding.
# In this file EPOCH is the number of random train/test resamplings per classifier.
EPOCH = 100

# Hypercube bounds as [low, high] for c1, c2, c3; c4 and c5 are computed from them.
C_LIMIT = [[0.05, 1.6], [0.05, 2.2], [3.0, 50.0]]

NUM_GENERATE = 150  # number of points pre-generated per Latin-hypercube batch
NUM_BATCH = 30  # number of Latin-hypercube sampling batches
# Number of samples kept in the end; increasing it lowers the nearest-neighbour
# Euclidean distance inside the final data set.
NUM_FINAL = 72

# Multiplier used to derive c5 from c1 (c5 = c1 * C5_PRODUCT_TIME).
# Original note: 2.601 corresponds to 5:1 water addition, 1.949 to 5:0.75.
# NOTE(review): the current value 1.986 matches neither documented ratio - confirm.
C5_PRODUCT_TIME = 1.986
# Stock-solution concentrations for c1..c4.  Original note: c1 is 3.56 (5:1); 3.72 (5:0.75).
# NOTE(review): the original comment gave the unit as "M, i.e. mmol/L", which is
# self-contradictory; the report file written at the end labels these values "mM" - confirm.
C0 = [1.99, 1.6553, 100.0, 60.0]

V_TOTAL = 1.2    # total volume, mL

RED_STANDARD = 0.5   # mean predicted reduction probability a candidate must exceed
YF_STANDARD = 0.4    # mean predicted yellow-fluorescence probability a candidate must exceed
# ======== Fit Data Input ========
# Training data for the reduction classifier: features, labels and column titles.
INPUT_X_C = 'Features_499_5_c.csv'
INPUT_Y_C = 'Values_Red_499_R.csv'
INPUT_TITLE = 'Title_5_c.csv'
# Fraction of the reduction data used for training in each resampling round.
TRAIN_TEST_SPLIT_1 = 0.85
# Training data for the yellow-fluorescence classifier: features and labels.
INPUT_X_P = 'Features_68_5_c.csv'
INPUT_Y_P = 'Values_YF_68.csv'
# Fraction of the yellow-fluorescence data used for training in each resampling round.
TRAIN_TEST_SPLIT_2 = 0.75

In [None]:
import os
from pathlib import Path
# Append the run timestamp to the output directory name exactly once, so that
# re-running this cell does not produce 'name_<ts>_<ts>' and a second directory.
_stamp = '_'+c_time
if not DIR.endswith(_stamp):
    DIR += _stamp
# exist_ok=True: re-running the cell in the same session must not raise.
os.makedirs(DIR, exist_ok=True)

In [None]:
# Load the reduction data set: feature matrix, column titles and labels.
X_C = np.loadtxt(INPUT_X_C, delimiter=',')
# comments='!' replaces the default comment marker; presumably so that a '#'
# inside a column title is kept - confirm against the title file.
title = np.loadtxt(INPUT_TITLE, delimiter=',', comments='!', dtype='str')
y_c = np.loadtxt(INPUT_Y_C, dtype=float, delimiter=',')
print(f'X: {X_C.shape}    y: {y_c.shape}')
# Load the yellow-fluorescence data set: feature matrix and labels.
X_P = np.loadtxt(INPUT_X_P, delimiter=',')
y_pp = np.loadtxt(INPUT_Y_P, dtype=float, delimiter=',')

In [None]:
from sklearn import model_selection
from sklearn.svm import SVC
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
import joblib

In [None]:
# Latin hypercube sampling.
# Each batch splits every one of the three free axes (c1, c2, c3) into
# NUM_GENERATE equal-width strata, visits the strata in a random order and
# draws one uniform point inside each stratum.
data = []
for _ in range(NUM_BATCH):
    m = np.zeros((NUM_GENERATE, 4))
    for dim in range(3):
        low, high = C_LIMIT[dim][0], C_LIMIT[dim][1]
        strata = np.random.choice(NUM_GENERATE, NUM_GENERATE, replace=False)
        width = (high - low)/NUM_GENERATE
        # One jitter value per point, drawn in the same order as a per-point loop.
        m[:, dim] = np.random.rand(NUM_GENERATE)*width + low + strata*width
    # Fraction left over once c1, c2, c3 are expressed relative to the stock
    # concentrations; a negative value means the point is not feasible.
    remainder = 1-m[:, 0]/C0[0]-m[:, 1]/C0[1]-m[:, 2]/C0[2]
    for row, frac in zip(m, remainder):
        if not frac >= 0:
            continue
        # c4 is scaled from the leftover fraction, c5 is proportional to c1.
        data.append([row[0], row[1], row[2], C0[3]*frac, row[0]*C5_PRODUCT_TIME])
print(len(data))
X_p_C = np.array(data)

In [None]:
# Predict whether each candidate point is reduced.
# An SVC with fixed hyper-parameters is refitted EPOCH times on random
# train/test splits of the reduction data; the class-1 probabilities predicted
# for the candidates are averaged over all rounds and thresholded.
clf = SVC(kernel='rbf', gamma=np.exp(-5.315672748087822), C=15.651302789809343, verbose=1, max_iter=-1, cache_size=40960, probability=True)
paras = clf.get_params()
point = round(X_C.shape[0]*TRAIN_TEST_SPLIT_1)
acc_list = []
y_p_m = []
for _ in range(EPOCH):
    # Fresh random split: the first `point` shuffled rows train, the rest test.
    permutation = np.random.permutation(y_c.shape[0])
    train_idx, test_idx = permutation[:point], permutation[point:]
    X_train, y_train = X_C[train_idx, :], y_c[train_idx]
    X_test, y_test = X_C[test_idx, :], y_c[test_idx]
    # New, unfitted estimator carrying the same hyper-parameters as `clf`.
    clf_new = SVC(**paras)
    # Fit the model.
    clf_new.fit(X_train, y_train)
    # Hold-out accuracy in percent.
    y_pred = clf_new.predict(X_test)
    hits = sum(1 for guess, truth in zip(y_pred, y_test) if guess == truth)
    acc = hits*100/X_test.shape[0]
    acc_list.append(acc)
    # Column 1 of predict_proba is used as the probability of interest
    # (assumes the labels are 0/1 with 1 meaning "reduced" - TODO confirm).
    y_p = clf_new.predict_proba(X_p_C)
    y_p_m.append(y_p[:, 1].flatten().tolist())
# Rows become candidates, columns become resampling rounds.
y_p_m = np.array(y_p_m).transpose()
y_p_c = np.zeros((X_p_C.shape[0], 1))
y_p_c[:, 0] = [np.mean(y_p_m[i, :]) for i in range(X_p_C.shape[0])]
del y_p_m
# Keep only the candidates whose mean probability exceeds the threshold.
data_nnn = [X_p_C[i, :].flatten().tolist() for i in range(y_p_c.shape[0]) if y_p_c[i, 0] > RED_STANDARD]
print(len(data_nnn))

In [None]:
# Sanity check: show the shape of the yellow-fluorescence label array loaded from INPUT_Y_P.
print(y_pp.shape)

In [None]:
# Predict whether each remaining candidate gives yellow fluorescence.
# Same resampling scheme as the reduction screen: an SVC with fixed
# hyper-parameters is refitted EPOCH times on random train/test splits, the
# class-1 probabilities for the candidates are averaged and thresholded.
X_p_t1 = np.array(data_nnn)
clf = SVC(kernel='rbf', gamma=np.exp(-6.964045492214664), C=13.589042049663163, verbose=1, max_iter=-1, cache_size=40960, probability=True)
paras = clf.get_params()
point = round(X_P.shape[0]*TRAIN_TEST_SPLIT_2)
acc_list = []
y_p_m = []
for _ in range(EPOCH):
    # Fresh random split: the first `point` shuffled rows train, the rest test.
    permutation1 = np.random.permutation(y_pp.shape[0])
    train_idx, test_idx = permutation1[:point], permutation1[point:]
    X_train, y_train = X_P[train_idx, :], y_pp[train_idx]
    X_test, y_test = X_P[test_idx, :], y_pp[test_idx]
    # New, unfitted estimator carrying the same hyper-parameters as `clf`.
    clf_new = SVC(**paras)
    # Fit the model.
    clf_new.fit(X_train, y_train)
    # Hold-out accuracy in percent.
    y_pred = clf_new.predict(X_test)
    hits = sum(1 for guess, truth in zip(y_pred, y_test) if guess == truth)
    acc = hits*100/X_test.shape[0]
    acc_list.append(acc)
    # Column 1 of predict_proba is used as the probability of interest
    # (assumes the labels are 0/1 with 1 meaning "yellow fluorescence" - TODO confirm).
    y_p = clf_new.predict_proba(X_p_t1)
    y_p_m.append(y_p[:, 1].flatten().tolist())
# Rows become candidates, columns become resampling rounds.
y_p_m = np.array(y_p_m).transpose()
y_p_t1 = np.zeros((X_p_t1.shape[0], 1))
y_p_t1[:, 0] = [np.mean(y_p_m[i, :]) for i in range(X_p_t1.shape[0])]
del y_p_m
# Keep only the candidates whose mean probability exceeds the threshold.
data_next = [X_p_t1[i, :].flatten().tolist() for i in range(y_p_t1.shape[0]) if y_p_t1[i, 0] > YF_STANDARD]
print(len(data_next))

In [None]:
data_next = np.array(data_next)
# Fail early with a clear message when the screening left nothing to sample.
if data_next.ndim != 2 or data_next.shape[0] == 0:
    raise ValueError('No candidate points passed the probability screening; '
                     'nothing to normalise.')
# Second normalisation: rescale every column to [0, 1] (min-max) so that all
# coordinates carry the same weight in the Euclidean distances computed later.
data_nnn = data_next.copy()
col_min = data_nnn.min(axis=0)
col_span = data_nnn.max(axis=0) - col_min
# A constant column has zero span; dividing by it would give NaN and poison
# every distance.  Use 1 instead so that such a column normalises to all zeros.
col_span[col_span == 0] = 1.0
dd = (data_nnn - col_min)/col_span

In [None]:
# Farthest-point sampling based on Euclidean distance in the normalised space `dd`.
# Start from one random candidate, then repeatedly add the candidate whose
# distance to its nearest already-selected point is largest.
n_candidates = data_nnn.shape[0]
if n_candidates < NUM_FINAL:
    # Without this check the search would find no new point and keep
    # re-appending index 0, silently duplicating experiments.
    raise ValueError('Only '+str(n_candidates)+' candidate points are available, but NUM_FINAL='
                     + str(NUM_FINAL)+' distinct points were requested.')
# Pairwise distance matrix, filled one vectorised row at a time (diagonal is 0).
d_m_2 = np.zeros((n_candidates, n_candidates))
for i in range(n_candidates):
    d_m_2[i, :] = np.sqrt(np.sum((dd - dd[i, :])**2, axis=1))
final_idx = [np.random.randint(n_candidates)]
# For every candidate: distance to the nearest selected point.  Selected points
# are set to -inf so that they can never be picked again.
min_dis_to_set = d_m_2[:, final_idx[0]].copy()
min_dis_to_set[final_idx[0]] = -np.inf
for _ in range(NUM_FINAL-1):
    # argmax returns the lowest index among ties, like a first-strictly-greater scan.
    new_id = int(np.argmax(min_dis_to_set))
    final_idx.append(new_id)
    # Update the nearest-selected distances with the newly added point.
    min_dis_to_set = np.minimum(min_dis_to_set, d_m_2[:, new_id])
    min_dis_to_set[new_id] = -np.inf
final_data = data_nnn[final_idx, :]

In [None]:
# Convert the first four target concentrations of every selected point into
# the stock-solution volume to dispense: volume = c * V_TOTAL / C0.
V_data = np.array([
    [final_data[row, col]*V_TOTAL/C0[col] for col in range(4)]
    for row in range(final_data.shape[0])
])
# Total volume needed of each of the four stock solutions.
print(*(np.sum(V_data[:, col]) for col in range(4)))

In [None]:
# Write the final data set.
# Files produced inside DIR:
#   *_ML-Exp-1.csv / *_ML-Exp-2.csv  dispensing volumes, split after the first 36 rows
#   *_Exp.txt                        per-reagent concentration and total volume
#   *_Exp-Analysis.csv               index, volumes, concentrations and an empty result column
# The date tag is computed once so that all file names agree even across midnight.
date_tag = time.strftime("%Y%m%d", time.localtime())
out_dir = Path('.', DIR)
# Row count is taken from the data instead of a hard-coded 72, so the export
# keeps working when NUM_FINAL is changed.
n_points = final_data.shape[0]
# NOTE(review): 36 is presumably the number of samples per experiment batch - confirm.
rows_first_file = 36

save_name = out_dir / ('ExpData_'+date_tag+'_ML-Exp-1.csv')
np.savetxt(save_name, V_data[:rows_first_file, :], fmt='%.3f', delimiter=',')
save_name = out_dir / ('ExpData_'+date_tag+'_ML-Exp-2.csv')
np.savetxt(save_name, V_data[rows_first_file:, :], fmt='%.3f', delimiter=',')

save_name = out_dir / ('ExpData_'+date_tag+'_Exp.txt')
reagent_labels = ['ZrT(0.5ML, 0.375+H2O)', 'HAuCl4(2.00mL-->30mL)', 'NEt3', 'VC']
# The context manager closes the file even if a write fails; the explicit encoding
# keeps the degree-Celsius sign writable whatever the platform default is.
with open(save_name, 'w', encoding='utf-8') as f1:
    for k, label in enumerate(reagent_labels):
        f1.write('Reagent '+str(k+1)+': '+label+'   C:'+str(round(C0[k], 2))+'mM   V:'+str(round(np.sum(V_data[:, k]), 3))+'mL\n')
    f1.write('Oven: 80.0℃    10.0hour(s)\n')

# Analysis table: 1-based index, volumes, concentrations, and a zero-filled
# 'Reduction' column.
id_list = np.arange(1, n_points+1).reshape(n_points, 1)
out = np.hstack((id_list, V_data, final_data, np.zeros((n_points, 1))))
title = np.array(['Index', 'ZrT(mL)', 'HAuCl4(mL)', 'NEt3(mL)', 'VC(mL)', 'ZrT(mM)', 'HAuCl4(mM)', 'NEt3(mM)', 'VC(mM)', 'H2O(M)', 'Reduction']).reshape(1, 11)
# Stacking the header on top converts the numeric table to strings, hence fmt='%s'.
out = np.vstack((title, out))
save_name = out_dir / ('ExpData_'+date_tag+'_Exp-Analysis.csv')
np.savetxt(save_name, out, fmt='%s', delimiter=',')