In [None]:
import os
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from collections import Counter

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.metrics import roc_auc_score, average_precision_score, mean_absolute_error, accuracy_score, f1_score
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, LeaveOneOut
from sklearn.decomposition import PCA
import statsmodels.api as sm

import warnings
warnings.filterwarnings('ignore')


In [None]:
def read_data(fname, thresh, return_slope=False):
    """Load one Excel workbook and turn it into model inputs.

    The workbook must contain the columns ``'x1 (um)'``, ``'x2 (um)'``,
    ``'y1 (um)'``, ``'y2 (um)'`` and ``'Adj. R-Square'``.  The ``'Slope'``
    column is only required (and only read) when ``return_slope`` is true.

    Parameters
    ----------
    fname : str or path-like
        Workbook passed straight to ``pandas.read_excel``.
    thresh : float
        A row is labelled 1.0 when its ``'Adj. R-Square'`` is strictly
        greater than ``thresh`` and 0.0 otherwise.
    return_slope : bool, default False
        When true, also return the raw ``'Slope'`` column.

    Returns
    -------
    X : numpy.ndarray, shape (n_rows, 4)
        Feature columns in the order ``x1, y1/y2, x2/x1, y2``.
    Y : numpy.ndarray of float, shape (n_rows,)
        Binary labels (0.0 / 1.0) derived from ``thresh``.
    Z : numpy.ndarray, shape (n_rows,)
        Values of the ``'Slope'`` column; only returned when
        ``return_slope`` is true.

    Raises
    ------
    KeyError
        If a required column is missing from the workbook.
    """
    raw = pd.read_excel(fname)

    x1 = raw['x1 (um)']
    x2 = raw['x2 (um)']
    y1 = raw['y1 (um)']
    y2 = raw['y2 (um)']

    # Build the design matrix in the exact column order the models are
    # trained on; the ratios are element-wise per row.
    features = pd.DataFrame({
        'x1': x1,
        'y1/y2': y1 / y2,
        'x2/x1': x2 / x1,
        'y2': y2,
    })
    X = features.values

    # Convert R-square to 0/1.  A missing (NaN) R-square compares False and
    # therefore becomes 0.0.
    Y = np.array(raw['Adj. R-Square'].values > thresh, dtype=float)

    if not return_slope:
        # Do not touch 'Slope' here: files that lack the column are still
        # perfectly usable for classification.
        return X, Y

    Z = raw['Slope'].values
    return X, Y, Z

def read_train(fname_list, i_round, thresh, return_slope=False):
    """Stack the data of the first ``i_round`` workbooks in ``fname_list``.

    Each workbook is loaded with ``read_data(fname, thresh, return_slope)``
    and the per-file arrays are concatenated along the row axis, keeping
    file order.

    Returns ``(X, Y)``, or ``(X, Y, Z)`` when ``return_slope`` is true.
    """
    per_file = [
        read_data(fname_list[idx], thresh, return_slope)
        for idx in range(i_round)
    ]
    # read_data yields three arrays per file when the slope is requested,
    # two otherwise.
    n_outputs = 3 if return_slope else 2
    return tuple(
        np.concatenate([chunk[k] for chunk in per_file], axis=0)
        for k in range(n_outputs)
    )

def get_rand_x():
    """Draw one random feature vector ``[x1, y1/y2, x2/x1, y2]``.

    Sampling is sequential, each bound depending on the previous draw:
    ``x1`` in [10, 100), ``x2`` in [x1, 140), ``y1`` in [10, min(4*x1, 100))
    and ``y2`` in [10, min(y1, 100)).  Uses NumPy's global random state.
    """
    # Keep the draw order x1 -> x2 -> y1 -> y2 so seeded runs are unchanged.
    x_small = np.random.uniform(10, 100)
    x_large = np.random.uniform(x_small, 140)
    y_cap = min(4 * x_small, 100)
    y_large = np.random.uniform(10, y_cap)
    y_small = np.random.uniform(10, min(y_large, 100))

    return np.array([
        x_small,
        y_large / y_small,
        x_large / x_small,
        y_small,
    ])

# ======================================================
# Configuration
# ======================================================
# Two hand-picked feature vectors, in the same column order the models are
# trained on (x1, y1/y2, x2/x1, y2).  They are scored by the two final models.
# NOTE(review): presumably points on the border of the design space - confirm.
boarder_points = np.array([[10,4,14,10], [12.5,5,11.2,10]])
thresh = 0.995
# Please replace the following filenames to be your own files
# Both lists start with the same first-round workbook.
iteration_fname_list = ['Iteration-1st.xlsx', 'Iteration_2nd.xlsx', 'Iteration_3rd.xlsx', 'Iteration_4th.xlsx', 'Iteration_5th.xlsx']
random_fname_list = ['Iteration-1st.xlsx', 'Random-2nd.xlsx', 'Random-3rd.xlsx', 'Random-4th.xlsx', 'Random-5th.xlsx']
n_test_syn = 1000000
# Fixed seed so the synthetic candidate set (and the ranked CSVs built from
# it) is the same on every Restart & Run All.
SEED = 0


def _fit_logistic(fname_list, n_files, thresh, label, return_slope=False):
    """Fit a logistic regression on the first ``n_files`` workbooks.

    Prints the training class balance and the fitted coefficients.
    Returns ``(model, train)`` where ``train`` is the tuple produced by
    ``read_train`` (``(X, Y)`` or ``(X, Y, Z)``).
    """
    train = read_train(fname_list, n_files, thresh, return_slope)
    print(Counter(train[1]))
    model = LogisticRegression()
    model.fit(train[0], train[1])
    print('model in {} round {}'.format(label, n_files), model.coef_, model.intercept_)
    return model, train


def _save_test_probs(prob_columns, out_csv):
    """Write the given probability vectors as the columns of ``out_csv``."""
    prob_df = pd.DataFrame(np.array(prob_columns).T)
    prob_df.to_csv(out_csv, index=False, float_format='%.6f')
    return prob_df


def _run_rounds(fname_list, label, thresh, X_test, out_csv):
    """Refit on a growing prefix of ``fname_list`` (1 file, 2 files, ...).

    One test-set probability column per round is written to ``out_csv``.
    Returns ``(last_model, res_df, prob_df)``.
    """
    rounds = []
    prob_columns = []
    model = None
    for n_files in range(1, len(fname_list) + 1):
        model, _ = _fit_logistic(fname_list, n_files, thresh, label)
        prob_columns.append(model.predict_proba(X_test)[:,1])
        rounds.append([n_files, thresh])
    res_df = pd.DataFrame(rounds, columns=['round', 'thresh'])
    print(res_df)
    return model, res_df, _save_test_probs(prob_columns, out_csv)


def _save_ranked_candidates(X_syn, prob, out_csv, slope=None):
    """Write candidates sorted by descending ``prob`` to ``out_csv``.

    Columns are the four features, ``prob`` and, when ``slope`` is given,
    ``slope``.  Returns the sorted DataFrame.
    """
    columns = ['x1', 'y1_d_y2', 'x2_d_x1', 'y2', 'prob']
    parts = [X_syn, np.expand_dims(prob,1)]
    if slope is not None:
        parts.append(np.expand_dims(slope,1))
        columns.append('slope')
    table = np.concatenate(parts, axis=1)
    ranked = pd.DataFrame(table[np.argsort(prob)[::-1]], columns=columns)
    ranked.to_csv(out_csv, index=False, float_format='%.6f')
    return ranked


# ======================================================
# Held-out test set (labels use the initial threshold)
X_test, Y_test = read_data('test.xlsx', thresh)
print('Test labels: ', Counter(Y_test))

# ======================================================
# method 1: random
print('random')
m, res_df, all_Y_test_pred_prob = _run_rounds(
    random_fname_list, 'random', thresh, X_test, 'test_probs_random.csv')

# ======================================================
# method 2: active
print('active')
m, res_df, all_Y_test_pred_prob = _run_rounds(
    iteration_fname_list, 'iter', thresh, X_test, 'test_probs_iter.csv')

# ======================================================
# Synthetic candidate designs scored by the final models below
np.random.seed(SEED)
X_test_syn = np.zeros((n_test_syn, 4))
for i in range(n_test_syn):
    X_test_syn[i] = get_rand_x()

# ======================================================
# method 1 (final): random, all rounds, stricter threshold
print('final random')
thresh = 0.999
# Explicit instead of relying on the loop variable leaking out of a loop.
i_round = len(random_fname_list)

# NOTE(review): Y_test was labelled with the earlier 0.995 threshold and is
# not recomputed here, so this count does not reflect thresh = 0.999.
print('Test labels: ', Counter(Y_test))
m, (X_train, Y_train) = _fit_logistic(random_fname_list, i_round, thresh, 'random')
print('boarder_points', m.predict_proba(boarder_points)[:,1])
Y_test_pred_prob = m.predict_proba(X_test)[:,1]

res_df = pd.DataFrame([[i_round, thresh]], columns=['round', 'thresh'])
print(res_df)
all_Y_test_pred_prob = _save_test_probs([Y_test_pred_prob], 'test_probs_random_final.csv')

Y_pred_syn = m.predict_proba(X_test_syn)[:,1]
out_df = _save_ranked_candidates(X_test_syn, Y_pred_syn, 'final_random.csv')

# ======================================================
# method 2 (final): active, all rounds, stricter threshold, plus slope model
print('final active')
thresh = 0.999
i_round = len(iteration_fname_list)

# NOTE(review): same caveat as above - Y_test still uses the 0.995 labels.
print('Test labels: ', Counter(Y_test))
m, (X_train, Y_train, Z_train) = _fit_logistic(
    iteration_fname_list, i_round, thresh, 'iter', return_slope=True)
print('boarder_points', m.predict_proba(boarder_points)[:,1])
Y_test_pred_prob = m.predict_proba(X_test)[:,1]

# Linear model of the raw 'Slope' column on the same four features.
m_reg = LinearRegression()
m_reg.fit(X_train, Z_train)

res_df = pd.DataFrame([[i_round, thresh]], columns=['round', 'thresh'])
print(res_df)
all_Y_test_pred_prob = _save_test_probs([Y_test_pred_prob], 'test_probs_iter_final.csv')

Y_pred_syn = m.predict_proba(X_test_syn)[:,1]
Z_pred_syn = m_reg.predict(X_test_syn)
out_df = _save_ranked_candidates(X_test_syn, Y_pred_syn, 'final_active.csv', slope=Z_pred_syn)

