# load python modules

In [None]:
import pandas as pd

In [None]:
import numpy as np

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score, balanced_accuracy_score

In [None]:
import seaborn as sns

In [None]:
import matplotlib.pyplot as plt

In [None]:
import statsmodels.api as sm

# read in input files

## phenotype

In [None]:
# Load the space-delimited ADSP phenotype table for the 80% training split.
# Later cells read the 'ID' and 'AD' columns from this frame.
adsp_pheno_sample = pd.read_csv(
    'input/ADSP_phenotype.keep_quest_comb.80%_train.txt',
    sep=' ',
)
# Row count, as a quick sanity check on the loaded cohort.
print(adsp_pheno_sample.shape[0])
adsp_pheno_sample.head()

# define number of samples and features

In [None]:
# Take the sample count from the loaded phenotype table rather than a
# hard-coded literal (previously 13698). The simulated feature matrices are
# concatenated column-wise with `adsp_pheno_sample`, so they must have exactly
# one row per phenotype record; a stale literal would silently misalign the
# concat and introduce NaN rows.
n_samples = len(adsp_pheno_sample.index)
n_samples

In [None]:
# Number of simulated feature columns generated for every control dataset.
n_features_2722 = 18_717
n_features_2722

# create negative control

In [None]:
# Negative control: every entry is an independent standard-normal draw, generated
# without reference to the phenotype.
# NOTE(review): the global NumPy RNG is not seeded in the visible cells, so the
# simulated values differ from run to run.
neg_control_2722_x = np.random.standard_normal(size=(n_samples, n_features_2722))

In [None]:
# Wrap the noise matrix in a DataFrame with columns feature_0 ... feature_{n-1}.
neg_control_2722_x_df = pd.DataFrame(
    data=neg_control_2722_x,
    columns=list(map('feature_{}'.format, range(n_features_2722))),
)
print(neg_control_2722_x_df.shape)

In [None]:
# Place the phenotype columns and the simulated features side by side
# (aligned on the row index).
neg_control_2722 = pd.concat(
    [adsp_pheno_sample, neg_control_2722_x_df],
    axis='columns',
)
print(neg_control_2722.shape)
neg_control_2722.head()

# test negative control

In [None]:
# Print the global minimum and maximum over all simulated feature values.
# The feature-only frame is built once and released afterwards so it does not
# linger in notebook memory.
_neg_feature_values = neg_control_2722.drop(columns=['ID', 'AD'])
print(_neg_feature_values.min().min())
print(_neg_feature_values.max().max())
del _neg_feature_values

In [None]:
# Design matrix: everything except the sample identifier and the AD label.
x_2722 = neg_control_2722.drop(['ID', 'AD'], axis=1)

In [None]:
# Outcome vector: the AD label column.
y_2722 = neg_control_2722.loc[:, 'AD']

In [None]:
# 70/30 split, stratified on the AD label, with a fixed seed for the split.
x_train_2722, x_test_2722, y_train_2722, y_test_2722 = train_test_split(
    x_2722,
    y_2722,
    test_size=0.3,
    random_state=7,
    stratify=y_2722,
)

In [None]:
# Fit a default-configured logistic regression on the training split and
# predict hard class labels for the held-out split.
clf_2722 = LogisticRegression().fit(x_train_2722, y_train_2722)
y_pred_2722 = clf_2722.predict(x_test_2722)

In [None]:
# Evaluate the negative-control classifier on the held-out split.
#
# AUROC and AUPRC are ranking metrics and must be computed from continuous
# scores. Feeding them thresholded 0/1 predictions collapses the curve to a
# single operating point (AUROC then merely equals balanced accuracy), so the
# predicted probability of the case class (label 1) is used instead.
# F1 and balanced accuracy are threshold metrics and keep using hard labels.
_case_col_2722 = list(clf_2722.classes_).index(1)
y_score_2722 = clf_2722.predict_proba(x_test_2722)[:, _case_col_2722]

auroc_neg_2722 = roc_auc_score(y_test_2722, y_score_2722)
print(auroc_neg_2722)
auprc_neg_2722 = average_precision_score(y_test_2722, y_score_2722)
print(auprc_neg_2722)
f1_neg_2722 = f1_score(y_test_2722, y_pred_2722)
print(f1_neg_2722)
bal_acc_neg_2722 = balanced_accuracy_score(y_test_2722, y_pred_2722)
print(bal_acc_neg_2722)

# create positive control

In [None]:
# Column indices that carry signal in the 2-feature positive control.
signal_features_2 = list(range(2))

In [None]:
# Column indices that carry signal in the 3-feature positive control.
signal_features_3 = list(range(3))

In [None]:
# Column indices that carry signal in the 4-feature positive control.
signal_features_4 = list(range(4))

In [None]:
# Column indices that carry signal in the 5-feature positive control.
signal_features_5 = list(range(5))

## 2 features with signal

In [None]:
# Positive control with signal in features 0-1: start from N(0, 1) noise in
# every column, then redraw the signal columns for rows where y_2722 == 1 from
# a normal distribution whose mean is shifted by the signal strength.
signal_strength_2722_2f = 2.41

pos_control_2722_2f_x = np.random.normal(0.0, 1.0, (n_samples, n_features_2722))

# The case mask and case count are the same for every signal column.
_is_case = y_2722 == 1
_n_cases = _is_case.sum()
for signal_col in signal_features_2:
    pos_control_2722_2f_x[_is_case, signal_col] = np.random.normal(signal_strength_2722_2f, 1.0, _n_cases)

pos_control_2722_2f_x_df = pd.DataFrame(
    pos_control_2722_2f_x,
    columns=['feature_' + str(i) for i in range(n_features_2722)],
)
pos_control_2722_2f = pd.concat([adsp_pheno_sample, pos_control_2722_2f_x_df], axis='columns')
print(pos_control_2722_2f.shape)

## 3 features with signal

In [None]:
# Positive control with signal in features 0-2: start from N(0, 1) noise in
# every column, then redraw the signal columns for rows where y_2722 == 1 from
# a normal distribution whose mean is shifted by the signal strength.
signal_strength_2722_3f = 2.1

pos_control_2722_3f_x = np.random.normal(0.0, 1.0, (n_samples, n_features_2722))

# The case mask and case count are the same for every signal column.
_is_case = y_2722 == 1
_n_cases = _is_case.sum()
for signal_col in signal_features_3:
    pos_control_2722_3f_x[_is_case, signal_col] = np.random.normal(signal_strength_2722_3f, 1.0, _n_cases)

pos_control_2722_3f_x_df = pd.DataFrame(
    pos_control_2722_3f_x,
    columns=['feature_' + str(i) for i in range(n_features_2722)],
)
pos_control_2722_3f = pd.concat([adsp_pheno_sample, pos_control_2722_3f_x_df], axis='columns')
print(pos_control_2722_3f.shape)

## 4 features with signal

In [None]:
# Positive control with signal in features 0-3: start from N(0, 1) noise in
# every column, then redraw the signal columns for rows where y_2722 == 1 from
# a normal distribution whose mean is shifted by the signal strength.
signal_strength_2722_4f = 1.8

pos_control_2722_4f_x = np.random.normal(0.0, 1.0, (n_samples, n_features_2722))

# The case mask and case count are the same for every signal column.
_is_case = y_2722 == 1
_n_cases = _is_case.sum()
for signal_col in signal_features_4:
    pos_control_2722_4f_x[_is_case, signal_col] = np.random.normal(signal_strength_2722_4f, 1.0, _n_cases)

pos_control_2722_4f_x_df = pd.DataFrame(
    pos_control_2722_4f_x,
    columns=['feature_' + str(i) for i in range(n_features_2722)],
)
pos_control_2722_4f = pd.concat([adsp_pheno_sample, pos_control_2722_4f_x_df], axis='columns')
print(pos_control_2722_4f.shape)

## 5 features with signal

In [None]:
# Positive control with signal in features 0-4: start from N(0, 1) noise in
# every column, then redraw the signal columns for rows where y_2722 == 1 from
# a normal distribution whose mean is shifted by the signal strength.
signal_strength_2722_5f = 1.7

pos_control_2722_5f_x = np.random.normal(0.0, 1.0, (n_samples, n_features_2722))

# The case mask and case count are the same for every signal column.
_is_case = y_2722 == 1
_n_cases = _is_case.sum()
for signal_col in signal_features_5:
    pos_control_2722_5f_x[_is_case, signal_col] = np.random.normal(signal_strength_2722_5f, 1.0, _n_cases)

pos_control_2722_5f_x_df = pd.DataFrame(
    pos_control_2722_5f_x,
    columns=['feature_' + str(i) for i in range(n_features_2722)],
)
pos_control_2722_5f = pd.concat([adsp_pheno_sample, pos_control_2722_5f_x_df], axis='columns')
print(pos_control_2722_5f.shape)

# test positive control

## 2 features with signal

In [None]:
# Evaluate a logistic regression on the positive control with signal in
# features 0-1.

# Build the design matrix once and reuse it for the range check and the split.
x_2722_2f = pos_control_2722_2f.drop(columns=['ID', 'AD'])
y_2722_2f = pos_control_2722_2f['AD']

# Global minimum / maximum over all simulated feature values.
print(x_2722_2f.min().min())
print(x_2722_2f.max().max())
print(' ')

x_train_2722_2f, x_test_2722_2f, y_train_2722_2f, y_test_2722_2f = train_test_split(
    x_2722_2f, y_2722_2f, stratify=y_2722_2f, test_size=0.3, random_state=7
)
clf_2722_2f = LogisticRegression()
clf_2722_2f.fit(x_train_2722_2f, y_train_2722_2f)
y_pred_2722_2f = clf_2722_2f.predict(x_test_2722_2f)

# AUROC / AUPRC are ranking metrics: score them with the predicted probability
# of the case class (label 1), not with thresholded labels, which would reduce
# the curve to a single operating point.
_case_col_2722_2f = list(clf_2722_2f.classes_).index(1)
y_score_2722_2f = clf_2722_2f.predict_proba(x_test_2722_2f)[:, _case_col_2722_2f]

auroc_pos_2722_2f = roc_auc_score(y_test_2722_2f, y_score_2722_2f)
print(auroc_pos_2722_2f)
auprc_pos_2722_2f = average_precision_score(y_test_2722_2f, y_score_2722_2f)
print(auprc_pos_2722_2f)
# F1 and balanced accuracy are threshold metrics and use the hard labels.
f1_pos_2722_2f = f1_score(y_test_2722_2f, y_pred_2722_2f)
print(f1_pos_2722_2f)
bal_acc_pos_2722_2f = balanced_accuracy_score(y_test_2722_2f, y_pred_2722_2f)
print(bal_acc_pos_2722_2f)

## 3 features with signal

In [None]:
# Evaluate a logistic regression on the positive control with signal in
# features 0-2.

# Build the design matrix once and reuse it for the range check and the split.
x_2722_3f = pos_control_2722_3f.drop(columns=['ID', 'AD'])
y_2722_3f = pos_control_2722_3f['AD']

# Global minimum / maximum over all simulated feature values.
print(x_2722_3f.min().min())
print(x_2722_3f.max().max())
print(' ')

x_train_2722_3f, x_test_2722_3f, y_train_2722_3f, y_test_2722_3f = train_test_split(
    x_2722_3f, y_2722_3f, stratify=y_2722_3f, test_size=0.3, random_state=7
)
clf_2722_3f = LogisticRegression()
clf_2722_3f.fit(x_train_2722_3f, y_train_2722_3f)
y_pred_2722_3f = clf_2722_3f.predict(x_test_2722_3f)

# AUROC / AUPRC are ranking metrics: score them with the predicted probability
# of the case class (label 1), not with thresholded labels, which would reduce
# the curve to a single operating point.
_case_col_2722_3f = list(clf_2722_3f.classes_).index(1)
y_score_2722_3f = clf_2722_3f.predict_proba(x_test_2722_3f)[:, _case_col_2722_3f]

auroc_pos_2722_3f = roc_auc_score(y_test_2722_3f, y_score_2722_3f)
print(auroc_pos_2722_3f)
auprc_pos_2722_3f = average_precision_score(y_test_2722_3f, y_score_2722_3f)
print(auprc_pos_2722_3f)
# F1 and balanced accuracy are threshold metrics and use the hard labels.
f1_pos_2722_3f = f1_score(y_test_2722_3f, y_pred_2722_3f)
print(f1_pos_2722_3f)
bal_acc_pos_2722_3f = balanced_accuracy_score(y_test_2722_3f, y_pred_2722_3f)
print(bal_acc_pos_2722_3f)

## 4 features with signal

In [None]:
# Evaluate a logistic regression on the positive control with signal in
# features 0-3.

# Build the design matrix once and reuse it for the range check and the split.
x_2722_4f = pos_control_2722_4f.drop(columns=['ID', 'AD'])
y_2722_4f = pos_control_2722_4f['AD']

# Global minimum / maximum over all simulated feature values.
print(x_2722_4f.min().min())
print(x_2722_4f.max().max())
print(' ')

x_train_2722_4f, x_test_2722_4f, y_train_2722_4f, y_test_2722_4f = train_test_split(
    x_2722_4f, y_2722_4f, stratify=y_2722_4f, test_size=0.3, random_state=7
)
clf_2722_4f = LogisticRegression()
clf_2722_4f.fit(x_train_2722_4f, y_train_2722_4f)
y_pred_2722_4f = clf_2722_4f.predict(x_test_2722_4f)

# AUROC / AUPRC are ranking metrics: score them with the predicted probability
# of the case class (label 1), not with thresholded labels, which would reduce
# the curve to a single operating point.
_case_col_2722_4f = list(clf_2722_4f.classes_).index(1)
y_score_2722_4f = clf_2722_4f.predict_proba(x_test_2722_4f)[:, _case_col_2722_4f]

auroc_pos_2722_4f = roc_auc_score(y_test_2722_4f, y_score_2722_4f)
print(auroc_pos_2722_4f)
auprc_pos_2722_4f = average_precision_score(y_test_2722_4f, y_score_2722_4f)
print(auprc_pos_2722_4f)
# F1 and balanced accuracy are threshold metrics and use the hard labels.
f1_pos_2722_4f = f1_score(y_test_2722_4f, y_pred_2722_4f)
print(f1_pos_2722_4f)
bal_acc_pos_2722_4f = balanced_accuracy_score(y_test_2722_4f, y_pred_2722_4f)
print(bal_acc_pos_2722_4f)

## 5 features with signal

In [None]:
# Evaluate a logistic regression on the positive control with signal in
# features 0-4.

# Build the design matrix once and reuse it for the range check and the split.
x_2722_5f = pos_control_2722_5f.drop(columns=['ID', 'AD'])
y_2722_5f = pos_control_2722_5f['AD']

# Global minimum / maximum over all simulated feature values.
print(x_2722_5f.min().min())
print(x_2722_5f.max().max())
print(' ')

x_train_2722_5f, x_test_2722_5f, y_train_2722_5f, y_test_2722_5f = train_test_split(
    x_2722_5f, y_2722_5f, stratify=y_2722_5f, test_size=0.3, random_state=7
)
clf_2722_5f = LogisticRegression()
clf_2722_5f.fit(x_train_2722_5f, y_train_2722_5f)
y_pred_2722_5f = clf_2722_5f.predict(x_test_2722_5f)

# AUROC / AUPRC are ranking metrics: score them with the predicted probability
# of the case class (label 1), not with thresholded labels, which would reduce
# the curve to a single operating point.
_case_col_2722_5f = list(clf_2722_5f.classes_).index(1)
y_score_2722_5f = clf_2722_5f.predict_proba(x_test_2722_5f)[:, _case_col_2722_5f]

auroc_pos_2722_5f = roc_auc_score(y_test_2722_5f, y_score_2722_5f)
print(auroc_pos_2722_5f)
auprc_pos_2722_5f = average_precision_score(y_test_2722_5f, y_score_2722_5f)
print(auprc_pos_2722_5f)
# F1 and balanced accuracy are threshold metrics and use the hard labels.
f1_pos_2722_5f = f1_score(y_test_2722_5f, y_pred_2722_5f)
print(f1_pos_2722_5f)
bal_acc_pos_2722_5f = balanced_accuracy_score(y_test_2722_5f, y_pred_2722_5f)
print(bal_acc_pos_2722_5f)

# remove AD column

## negative control

In [None]:
# Export copy of the negative control without the AD label column.
neg_control_2722_export = neg_control_2722.drop('AD', axis=1)

## positive control

In [None]:
# Export copy of the 2-signal-feature positive control without the AD label.
pos_control_2722_2f_export = pos_control_2722_2f.drop('AD', axis=1)

In [None]:
# Export copy of the 3-signal-feature positive control without the AD label.
pos_control_2722_3f_export = pos_control_2722_3f.drop('AD', axis=1)

In [None]:
# Export copy of the 4-signal-feature positive control without the AD label.
pos_control_2722_4f_export = pos_control_2722_4f.drop('AD', axis=1)

In [None]:
# Export copy of the 5-signal-feature positive control without the AD label.
pos_control_2722_5f_export = pos_control_2722_5f.drop('AD', axis=1)

# assess correlation between features

In [None]:
# Pairwise correlation of features 0-4 in the negative control, drawn as an
# annotated heatmap and saved to disk before being shown.
neg_control_2722_export_sub = neg_control_2722_export[[f'feature_{i}' for i in range(5)]]
sns.heatmap(
    neg_control_2722_export_sub.corr(numeric_only=True),
    annot=True,
    cmap='coolwarm',
    linewidths=0.5,
    linecolor='black',
)
plt.title('Negative Control 2722 Features')
plt.savefig('simulated_datasets/plots/ADSP.simulated.2722_features.80%_train.negative_control.features_0-4_correlation.heatmap.png')
plt.show()

In [None]:
# Pairwise correlation of the two signal features (0-1) in the positive
# control, drawn as an annotated heatmap (shown only, not saved).
pos_control_2722_2f_export_sub = pos_control_2722_2f_export[[f'feature_{i}' for i in range(2)]]
sns.heatmap(
    pos_control_2722_2f_export_sub.corr(numeric_only=True),
    annot=True,
    cmap='coolwarm',
    linewidths=0.5,
    linecolor='black',
)
plt.title('Positive 2722 Features: Features 0-1 with Signal')
plt.show()

In [None]:
# Pairwise correlation of the three signal features (0-2) in the positive
# control, drawn as an annotated heatmap (shown only, not saved).
pos_control_2722_3f_export_sub = pos_control_2722_3f_export[[f'feature_{i}' for i in range(3)]]
sns.heatmap(
    pos_control_2722_3f_export_sub.corr(numeric_only=True),
    annot=True,
    cmap='coolwarm',
    linewidths=0.5,
    linecolor='black',
)
plt.title('Positive 2722 Features: Features 0-2 with Signal')
plt.show()

In [None]:
# Pairwise correlation of the four signal features (0-3) in the positive
# control, drawn as an annotated heatmap (shown only, not saved).
pos_control_2722_4f_export_sub = pos_control_2722_4f_export[[f'feature_{i}' for i in range(4)]]
sns.heatmap(
    pos_control_2722_4f_export_sub.corr(numeric_only=True),
    annot=True,
    cmap='coolwarm',
    linewidths=0.5,
    linecolor='black',
)
plt.title('Positive 2722 Features: Features 0-3 with Signal')
plt.show()

In [None]:
# Pairwise correlation of the five signal features (0-4) in the positive
# control, drawn as an annotated heatmap and saved to disk before being shown.
pos_control_2722_5f_export_sub = pos_control_2722_5f_export[[f'feature_{i}' for i in range(5)]]
sns.heatmap(
    pos_control_2722_5f_export_sub.corr(numeric_only=True),
    annot=True,
    cmap='coolwarm',
    linewidths=0.5,
    linecolor='black',
)
plt.title('Positive 2722 Features: Features 0-4 with Signal')
plt.savefig('simulated_datasets/plots/ADSP.simulated.2722_features.features_0-4_signal.80%_train.positive_control.features_0-4_correlation.heatmap.png')
plt.show()

# run univariate and multivariate regressions

## negative control

In [None]:
# Multivariable logistic regression of AD on all negative-control features.
#
# statsmodels does not add an intercept automatically, and `llr_pvalue` is a
# likelihood-ratio test against the intercept-only model, so the constant must
# be part of the design matrix for that test to compare nested models.
# NOTE(review): if the number of feature columns exceeds the number of rows,
# the unpenalized MLE is not identifiable and this fit may fail or separate
# perfectly; confirm this cell runs at the current dimensions.

# No defensive copy is needed: the frame is only read, and drop() / add_constant()
# already return new objects.
df = neg_control_2722

x = sm.add_constant(df.drop(columns=['ID', 'AD']))
y = df['AD']
result = sm.Logit(y, x).fit()

# result.summary() prints the full coefficient table if needed.
print(result.llr_pvalue)

## positive control

In [None]:
# Render floats in DataFrame output in scientific notation with three decimals
# (useful for the small p-values displayed below).
pd.options.display.float_format = '{:.3e}'.format

### 0-1 with signal

In [None]:
# Multivariable logistic regression of AD on all features of the positive
# control with signal in features 0-1.
#
# statsmodels does not add an intercept automatically, and `llr_pvalue` is a
# likelihood-ratio test against the intercept-only model, so the constant must
# be part of the design matrix. Shifting feature means for cases also implies a
# non-zero intercept, so omitting it biases the coefficients.
# NOTE(review): if the number of feature columns exceeds the number of rows,
# the unpenalized MLE is not identifiable and this fit may fail or separate
# perfectly; confirm this cell runs at the current dimensions.

# No defensive copy is needed: the frame is only read, and drop() / add_constant()
# already return new objects.
df = pos_control_2722_2f

x = sm.add_constant(df.drop(columns=['ID', 'AD']))
y = df['AD']
result = sm.Logit(y, x).fit()

print(result.llr_pvalue)
# Per-coefficient p-values below 0.01, most significant first. The intercept
# row ('const') may appear alongside the features.
result_df = pd.DataFrame({'pval': result.pvalues})
result_df[result_df['pval'] < 0.01].sort_values(by='pval')

### 0-2 with signal

In [None]:
# Multivariable logistic regression of AD on all features of the positive
# control with signal in features 0-2.
#
# statsmodels does not add an intercept automatically, and `llr_pvalue` is a
# likelihood-ratio test against the intercept-only model, so the constant must
# be part of the design matrix. Shifting feature means for cases also implies a
# non-zero intercept, so omitting it biases the coefficients.
# NOTE(review): if the number of feature columns exceeds the number of rows,
# the unpenalized MLE is not identifiable and this fit may fail or separate
# perfectly; confirm this cell runs at the current dimensions.

# No defensive copy is needed: the frame is only read, and drop() / add_constant()
# already return new objects.
df = pos_control_2722_3f

x = sm.add_constant(df.drop(columns=['ID', 'AD']))
y = df['AD']
result = sm.Logit(y, x).fit()

print(result.llr_pvalue)
# Per-coefficient p-values below 0.01, most significant first. The intercept
# row ('const') may appear alongside the features.
result_df = pd.DataFrame({'pval': result.pvalues})
result_df[result_df['pval'] < 0.01].sort_values(by='pval')

### 0-3 with signal

In [None]:
# Multivariable logistic regression of AD on all features of the positive
# control with signal in features 0-3.
#
# statsmodels does not add an intercept automatically, and `llr_pvalue` is a
# likelihood-ratio test against the intercept-only model, so the constant must
# be part of the design matrix. Shifting feature means for cases also implies a
# non-zero intercept, so omitting it biases the coefficients.
# NOTE(review): if the number of feature columns exceeds the number of rows,
# the unpenalized MLE is not identifiable and this fit may fail or separate
# perfectly; confirm this cell runs at the current dimensions.

# No defensive copy is needed: the frame is only read, and drop() / add_constant()
# already return new objects.
df = pos_control_2722_4f

x = sm.add_constant(df.drop(columns=['ID', 'AD']))
y = df['AD']
result = sm.Logit(y, x).fit()

print(result.llr_pvalue)
# Per-coefficient p-values below 0.01, most significant first. The intercept
# row ('const') may appear alongside the features.
result_df = pd.DataFrame({'pval': result.pvalues})
result_df[result_df['pval'] < 0.01].sort_values(by='pval')

### 0-4 with signal

In [None]:
# Multivariable logistic regression of AD on all features of the positive
# control with signal in features 0-4.
#
# statsmodels does not add an intercept automatically, and `llr_pvalue` is a
# likelihood-ratio test against the intercept-only model, so the constant must
# be part of the design matrix. Shifting feature means for cases also implies a
# non-zero intercept, so omitting it biases the coefficients.
# NOTE(review): if the number of feature columns exceeds the number of rows,
# the unpenalized MLE is not identifiable and this fit may fail or separate
# perfectly; confirm this cell runs at the current dimensions.

# No defensive copy is needed: the frame is only read, and drop() / add_constant()
# already return new objects.
df = pos_control_2722_5f

x = sm.add_constant(df.drop(columns=['ID', 'AD']))
y = df['AD']
result = sm.Logit(y, x).fit()

print(result.llr_pvalue)
# Per-coefficient p-values below 0.01, most significant first. The intercept
# row ('const') may appear alongside the features.
result_df = pd.DataFrame({'pval': result.pvalues})
result_df[result_df['pval'] < 0.01].sort_values(by='pval')

# make eval metrics df

In [None]:
# One row per simulated dataset: a display label followed by the four
# logistic-regression metrics in the order AUROC, AUPRC, F1, balanced accuracy.
neg_2722_eval = [
    'Negative Control- 18717 Features',
    auroc_neg_2722, auprc_neg_2722, f1_neg_2722, bal_acc_neg_2722,
]
pos_2722_2f_eval = [
    'Positive Control- 18717 Features, 0-1 with signal',
    auroc_pos_2722_2f, auprc_pos_2722_2f, f1_pos_2722_2f, bal_acc_pos_2722_2f,
]
pos_2722_3f_eval = [
    'Positive Control- 18717 Features, 0-2 with signal',
    auroc_pos_2722_3f, auprc_pos_2722_3f, f1_pos_2722_3f, bal_acc_pos_2722_3f,
]
pos_2722_4f_eval = [
    'Positive Control- 18717 Features, 0-3 with signal',
    auroc_pos_2722_4f, auprc_pos_2722_4f, f1_pos_2722_4f, bal_acc_pos_2722_4f,
]
pos_2722_5f_eval = [
    'Positive Control- 18717 Features, 0-4 with signal',
    auroc_pos_2722_5f, auprc_pos_2722_5f, f1_pos_2722_5f, bal_acc_pos_2722_5f,
]

In [None]:
# Assemble the per-dataset metric rows into a single summary table.
eval_df = pd.DataFrame(
    data=[
        neg_2722_eval,
        pos_2722_2f_eval,
        pos_2722_3f_eval,
        pos_2722_4f_eval,
        pos_2722_5f_eval,
    ],
    columns=['Simulated Data', 'LR AUROC', 'LR AUPRC', 'LR F1', 'LR BA'],
)
eval_df

# export

## negative controls

In [None]:
# Write the negative control (AD column removed) as a space-delimited text
# file without the row index; missing values are written as 'NaN'.
neg_control_2722_export.to_csv(
    path_or_buf='simulated_datasets/ADSP.simulated.18717_features.keep_quest_comb.80%_train.negative_control.txt',
    sep=' ',
    na_rep='NaN',
    index=None,
)

## positive controls

In [None]:
# Write the positive control with signal in features 0-1 (AD column removed)
# as a space-delimited text file without the row index; missing values are
# written as 'NaN'.
pos_control_2722_2f_export.to_csv(
    path_or_buf='simulated_datasets/ADSP.simulated.18717_features.features_0-1_signal.keep_quest_comb.80%_train.positive_control.txt',
    sep=' ',
    na_rep='NaN',
    index=None,
)

In [None]:
# Write the positive control with signal in features 0-2 (AD column removed)
# as a space-delimited text file without the row index; missing values are
# written as 'NaN'.
pos_control_2722_3f_export.to_csv(
    path_or_buf='simulated_datasets/ADSP.simulated.18717_features.features_0-2_signal.keep_quest_comb.80%_train.positive_control.txt',
    sep=' ',
    na_rep='NaN',
    index=None,
)

In [None]:
# Write the positive control with signal in features 0-3 (AD column removed)
# as a space-delimited text file without the row index; missing values are
# written as 'NaN'.
pos_control_2722_4f_export.to_csv(
    path_or_buf='simulated_datasets/ADSP.simulated.18717_features.features_0-3_signal.keep_quest_comb.80%_train.positive_control.txt',
    sep=' ',
    na_rep='NaN',
    index=None,
)

In [None]:
# Write the positive control with signal in features 0-4 (AD column removed)
# as a space-delimited text file without the row index; missing values are
# written as 'NaN'.
pos_control_2722_5f_export.to_csv(
    path_or_buf='simulated_datasets/ADSP.simulated.18717_features.features_0-4_signal.keep_quest_comb.80%_train.positive_control.txt',
    sep=' ',
    na_rep='NaN',
    index=None,
)

## eval df

In [None]:
# Write the metric summary table as a tab-delimited text file without the row
# index; missing values are written as 'NaN'.
eval_df.to_csv(
    path_or_buf='simulated_datasets/ADSP.simulated.keep_quest_comb.80%_train.eval_metrics.txt',
    sep='\t',
    na_rep='NaN',
    index=None,
)