In [19]:
import glob
import joblib
from tensorflow.keras.models import load_model
from tensorflow.keras.losses import MeanSquaredError
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from keras.metrics import MeanSquaredError

In [None]:
# Pairing: load the input and reference tables and expose the pieces the pairing step reads.
input_file = 'Data-files/transfer/heteronuclear-57-15features-degree2.xlsx'
reference_file = 'Data-files/transfer/homonuclear-159-15features-degree2.xlsx'

data, reference_data = (pd.read_excel(path) for path in (input_file, reference_file))

# Input table: 'Clusters' labels, 'lg(k1)' targets, and every column from position 2 onward as features.
sample_ids = data.loc[:, 'Clusters']
targets = data.loc[:, 'lg(k1)']
features = data.iloc[:, 2:]

# Reference table: the three named leading columns followed by every column from position 3 onward,
# so feature columns start at position 3 in `reference_samples`.
leading_columns = ['ID', 'Clusters', 'lg(k1)']
trailing_columns = reference_data.columns[3:].tolist()
reference_samples = reference_data[leading_columns + trailing_columns]

def calculate_differences(reference_sample_id):
    """Build the table of differences between every input sample and one reference sample.

    Reads the module-level ``reference_samples``, ``sample_ids``, ``targets`` and
    ``features`` objects defined at the top of this cell.

    Parameters
    ----------
    reference_sample_id
        Value to look up in the ``'ID'`` column of ``reference_samples``.

    Returns
    -------
    pandas.DataFrame
        One row per input sample, with columns ``'Sample Pair'``
        (``"<sample> vs <reference id>"``), ``'Target Difference'`` (input target minus
        the reference ``'lg(k1)'``) and one column per feature (input minus reference).

    Raises
    ------
    ValueError
        If ``reference_sample_id`` does not match exactly one row of ``reference_samples``.
    """
    matches = reference_samples[reference_samples['ID'] == reference_sample_id]
    if len(matches) != 1:
        # Zero matches used to surface as a bare IndexError; several matches as a broadcast error.
        raise ValueError(
            f"Expected exactly one reference row with ID {reference_sample_id!r}, "
            f"found {len(matches)}."
        )

    # Slice the one-row frame (not a Series) so the numeric feature dtype is preserved.
    reference_features = matches.iloc[:, 3:].to_numpy().ravel()
    reference_target = matches['lg(k1)'].to_numpy()[0]

    # Subtraction is positional: feature columns must be in the same order in both tables.
    feature_diffs = features.to_numpy() - reference_features
    target_diffs = targets.to_numpy() - reference_target
    # Iterating the Series walks it positionally, independent of its index labels.
    pair_labels = [f"{sample_id} vs {reference_sample_id}" for sample_id in sample_ids]

    return pd.concat(
        [
            pd.DataFrame({'Sample Pair': pair_labels}),
            pd.DataFrame({'Target Difference': target_diffs}),
            pd.DataFrame(feature_diffs, columns=features.columns),
        ],
        axis=1,
    )

# Write one pairwise-difference workbook per reference ID.
for ref_id in reference_samples['ID']:
    pair_table = calculate_differences(ref_id)
    pair_table.to_excel(f"Data-files/transfer/hetero-homo-{ref_id}.xlsx", index=False)

In [None]:
#predict
# Load the trained model and the scaler saved alongside it.
# NOTE(review): `MeanSquaredError` is imported twice at the top of the notebook
# (tensorflow.keras.losses, then keras.metrics); the later import is the one bound here —
# confirm that is the class the saved model expects for 'mse'.
loaded_model = load_model('trained-DNN/trained_model.keras', custom_objects={'mse': MeanSquaredError()})
scaler = joblib.load('trained-DNN/scaler_model.pkl')
reference_data = pd.read_excel('Data-files/transfer/homonuclear-159-15features-degree2.xlsx')
# Baseline per reference ID. Looked up by name so it is the same 'lg(k1)' column that the
# pairing step subtracted when it built the 'Target Difference' values.
baseline_values = dict(zip(reference_data['ID'], reference_data['lg(k1)']))

all_predictions = []     # one 1-D array of baseline-adjusted predictions per reference file
all_sample_names = None  # sample order taken from the first file, then enforced on the rest

for expt_id in range(1, 160):
    file_path = f"Data-files/transfer/hetero-homo-{expt_id}.xlsx"

    if expt_id not in baseline_values:
        print(f"Warning: No baseline value found for {expt_id}. Skipping this file.")
        continue
    baseline_value = baseline_values[expt_id]

    new_data = pd.read_excel(file_path)
    features = new_data.iloc[:, 2:]  # columns 0 and 1 are 'Sample Pair' and 'Target Difference'
    features_scaled = scaler.transform(features)

    predictions = loaded_model.predict(features_scaled)
    # The model predicts a difference relative to the reference; add the baseline back and
    # flatten to one float64 value per row.
    predictions_adjusted = np.asarray(predictions + baseline_value, dtype=float).reshape(-1)

    # "<sample> vs <reference id>": split on the last separator so a sample name that itself
    # contains ' vs ' is kept intact.
    sample_names = [pair.rsplit(' vs ', 1)[0] for pair in new_data['Sample Pair']]
    if len(predictions_adjusted) != len(sample_names):
        raise ValueError(
            f"{file_path}: got {len(predictions_adjusted)} predictions for {len(sample_names)} samples."
        )
    if all_sample_names is None:
        all_sample_names = sample_names
    elif sample_names != all_sample_names:
        # Predictions are aligned by row position, so every file must list the same samples
        # in the same order.
        raise ValueError(f"{file_path} lists different samples (or a different order) than the first file.")

    all_predictions.append(predictions_adjusted)

if not all_predictions:
    raise ValueError("No pairwise files were processed; nothing to write.")

# Rows are samples, columns are the processed reference files in loop order (0, 1, 2, ...).
predictions_df = pd.DataFrame(np.column_stack(all_predictions), index=all_sample_names)
output_file = 'Data-files/transfer/hetero-pred.xlsx'
predictions_df.to_excel(output_file, index=True)

In [None]:
#prediction error
# For every predicted sample: compare the mean prediction with the tabulated target,
# plot the prediction distribution, and collect the error summary.
pred_file = 'Data-files/transfer/hetero-pred.xlsx'
transfer_file = 'Data-files/transfer/heteronuclear-57-15features-degree2.xlsx'

pred_data = pd.read_excel(pred_file, index_col=0)
transfer_data = pd.read_excel(transfer_file)

sample_names = pred_data.index.tolist()
error_data = []

for sample_name in sample_names:
    # First column of the transfer table holds the sample name, second column the target.
    target_value_row = transfer_data[transfer_data.iloc[:, 0] == sample_name]
    if target_value_row.empty:
        print(f"Sample {sample_name} not found in transfer data.")
        continue

    target_value = target_value_row.iloc[0, 1]
    # 0.477121255 is approximately log10(3) — presumably a factor-of-3 band around the
    # target; TODO confirm the intended tolerance.
    target_value_up = target_value + 0.477121255
    target_value_down = target_value - 0.477121255

    prediction_values = pred_data.loc[sample_name].values
    mean_val = np.mean(prediction_values)
    std_dev = np.std(prediction_values)

    # Signed errors against the target and both shifted targets; error_min is the smallest magnitude.
    error_pred = mean_val - target_value
    error_up = mean_val - target_value_up
    error_down = mean_val - target_value_down
    error_min = np.abs([error_pred, error_up, error_down]).min()

    fig, ax = plt.subplots(figsize=(6, 3))
    ax.hist(prediction_values, bins=30, alpha=0.7, color='blue', label='Predicted Value')
    dashed = dict(linestyle='dashed', linewidth=1)
    ax.axvline(mean_val, color='red', label=f'Mean: {mean_val:.4f}', **dashed)
    ax.axvline(target_value, color='green', label=f'Target Value: {target_value:.4f}', **dashed)
    ax.axvline(mean_val + std_dev, color='orange', label=f'Mean + 1 Std Dev: {mean_val + std_dev:.4f}', **dashed)
    ax.axvline(mean_val - std_dev, color='orange', label=f'Mean - 1 Std Dev: {mean_val - std_dev:.4f}', **dashed)
    ax.set_title(f'Prediction Distribution for {sample_name}')
    ax.set_xlabel('Predicted Value')
    ax.set_ylabel('Frequency')
    # Legend sits outside the axes, to the right of the plot.
    ax.legend(loc='upper left', bbox_to_anchor=(1.05, 1))
    ax.grid(True)
    fig.tight_layout()
    plt.show()

    error_data.append([sample_name, mean_val, target_value, error_pred, error_up, error_down, error_min, std_dev])

error_columns = ['Sample Name', 'Predicted Mean', 'Target Value', 'Error_pred', 'Error_up','Error_down','Error_min','Std Dev']
error_df = pd.DataFrame(error_data, columns=error_columns)
error_df.to_excel('Data-files/transfer/pred_error-all.xlsx', index=False)

In [None]:
#plot
# Parity plot of mean predicted vs. target values with a symmetric error band.
# NOTE: this rebinds `data`, replacing the heteronuclear table loaded in the pairing cell.
data = pd.read_excel('Data-files/transfer/pred_error-all.xlsx')

# Columns 1 and 2 of the error table are 'Predicted Mean' and 'Target Value'.
predicted_values = data.iloc[:, 1].values
actual_values = data.iloc[:, 2].values

#  y = x, spanning the combined range of both series
lowest = min(predicted_values.min(), actual_values.min())
highest = max(predicted_values.max(), actual_values.max())
x = np.linspace(lowest, highest, 100)
y = x
# error bound: one offset drives both the plotted lines and their legend text, so the
# labels cannot drift away from what is drawn.
# NOTE(review): the previous labels quoted 0.1139 / 0.1549 while the lines were drawn at
# +/-0.70 — confirm 0.70 is the intended band.
error_bound = 0.70
upper_bound = y + error_bound
lower_bound = y - error_bound

plt.figure(figsize=(8, 6))
plt.scatter(predicted_values, actual_values, color='blue', label='Predicted vs Actual', alpha=0.6)
plt.plot(x, y, 'k--', label='y = x', linewidth=1)
plt.plot(x, upper_bound, 'r--', label=f'Upper Bound (y = x + {error_bound:.2f})')
plt.plot(x, lower_bound, 'g--', label=f'Lower Bound (y = x - {error_bound:.2f})')
plt.xlim(-15,-10)
plt.ylim(-15,-10)
plt.xlabel('Predicted lgk1 Values')
plt.ylabel('Actual lgk1 Values')
# The labels above were never rendered without an explicit legend call.
plt.legend()
plt.show()