In [None]:
import numpy as np
import pandas as pd
# import spektral
np.random.seed(0)
import os 

# Folder that holds the clinical tables and the log-transformed expression
# matrices for the KICH / KIRP / KIRC cohorts (see the listing below).
# NOTE(review): absolute '/content/drive/...' path — presumably a Colab Drive mount; confirm before running elsewhere.
path = '/content/drive/My Drive/IIITH/GCN_KEGG/GCN_Dataset/CSV'
# Last expression of the cell: shows the files available in that folder.
os.listdir(path)

['Clinical_KICH_81_tumors.csv',
 'Clinical_KIRP_290_tumors.csv',
 'Clinical_KIRC_518_tumors.csv',
 'KICH_81_tumors_log_transformed.csv',
 'KIRP_290_tumors_log_transformed.csv',
 'KIRC_518_tumors_log_transformed.csv',
 '.DS_Store']

In [None]:
def load_dataset(path, filename, transpose=True):
    """Read ``filename`` from ``path`` as CSV and optionally return it transposed.

    The column called ``"Unnamed: 0"`` (what pandas names a header-less first
    column) is renamed to ``"pid"``.

    Parameters
    ----------
    path : str
        Directory containing the CSV file.
    filename : str
        Name of the CSV file inside ``path``.
    transpose : bool, default True
        When False the frame is returned as read (with the ``pid`` column).
        When True the ``pid`` values are cast to ``str`` and become the column
        labels of the transposed frame; the original column headers become
        the row index.

    Returns
    -------
    pandas.DataFrame
    """
    csv_file = os.path.join(path, filename)
    frame = pd.read_csv(csv_file).rename(columns={"Unnamed: 0": "pid"})

    # Nothing more to do when the caller wants the table as stored on disk.
    if not transpose:
        return frame

    flipped = frame.astype({"pid": str}).T
    # After transposing, the first row carries the former "pid" values;
    # promote it to the header and remove it from the data rows.
    header_row = flipped.iloc[0]
    flipped = flipped[1:]
    flipped.columns = header_row
    return flipped

In [None]:
def tsne_plot(x1, y1, name="graph.png", labels=("Early stage", "Late stage")):
    """Embed samples in 2-D with t-SNE and scatter-plot them coloured by class.

    Parameters
    ----------
    x1 : array-like, one row per sample
        Feature matrix handed to ``TSNE.fit_transform``.
    y1 : array-like, one entry per sample
        Binary labels. Samples whose label equals 0 are drawn in green,
        samples whose label equals 1 in red. Lists are accepted (they are
        converted to an array so the element-wise comparison works).
    name : str, default "graph.png"
        File name meant for saving the figure. Currently unused because
        saving is disabled (see the commented ``savefig`` call).
    labels : tuple of two str, default ("Early stage", "Late stage")
        Legend entries for class 0 and class 1 respectively.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # A plain Python list compared with ``== 0`` yields a single bool, which
    # would silently select no points; force an array first.
    y1 = np.asarray(y1)

    tsne = TSNE(n_components=2, random_state=0)
    X_t = tsne.fit_transform(x1)

    plt.figure(figsize=(12, 8))
    for class_value, colour, legend_text in zip((0, 1), ('g', 'r'), labels):
        mask = y1 == class_value
        # linewidth must be numeric, not the string '1'.
        plt.scatter(X_t[mask, 0], X_t[mask, 1], marker='o', color=colour,
                    linewidth=1, alpha=0.8, label=legend_text)

    plt.legend(loc='best')
    # plt.savefig(name)
    plt.show()

In [None]:
def get_features_matrix(pathway, dataframe):
    """Return the sorted gene identifiers of a pathway that are columns of ``dataframe``.

    Despite the name, the result is a list of column labels, not a matrix.

    Parameters
    ----------
    pathway : pandas.DataFrame
        Edge table with ``'from'`` and ``'to'`` columns. The first four
        characters of every entry are discarded (presumably a ``'hsa:'``
        prefix — TODO confirm against the KEGG CSVs).
    dataframe : pandas.DataFrame
        Frame whose ``columns`` define which gene identifiers are available.

    Returns
    -------
    list of str
        Unique identifiers found in the pathway and in ``dataframe.columns``,
        in ascending (string) order.
    """
    available = dataframe.columns

    # Collect both endpoints of every edge, without the 4-character prefix.
    endpoint_genes = set()
    for _, edge in pathway.iterrows():
        endpoint_genes.update((edge['from'][4:], edge['to'][4:]))

    # Keep only genes that actually exist as columns, then order them.
    return sorted(gene for gene in endpoint_genes if gene in available)

In [None]:
# Folder with one CSV per KEGG pathway; later cells read every file in it
# whose name contains 'hsa'.
kegg_pathways_path = '/content/drive/My Drive/IIITH/GCN_KEGG/KEGG_csv'

In [None]:
# Load the KIRC expression matrix (transposed by load_dataset, so rows are the
# original column headers) and the matching clinical table (kept as stored).
df_kirc = load_dataset(path, 'KIRC_518_tumors_log_transformed.csv', transpose=True)
patient_data_kirc = load_dataset(path, 'Clinical_KIRC_518_tumors.csv', transpose=False)

stage_column = 'ajcc_pathologic_tumor_stage'

# Patients without a usable tumour stage are removed from both tables.
pid_kirc_drop1 = patient_data_kirc[patient_data_kirc[stage_column] == '[Not Available]'].pid
pid_kirc_drop2 = patient_data_kirc[patient_data_kirc[stage_column] == '[Discrepancy]'].pid
patient_data_kirc = patient_data_kirc.drop(index=pid_kirc_drop1.index.union(pid_kirc_drop2.index))
df_kirc = df_kirc.drop(index=pd.concat([pid_kirc_drop1, pid_kirc_drop2]))

# Binary target: Stage I/II -> 0, Stage III/IV -> 1.
stage_to_label = {'Stage I': 0, 'Stage II': 0, 'Stage III': 1, 'Stage IV': 1}

# One stage per patient id; when a pid occurs more than once the first record
# wins, which is what the former ``.values[0]`` lookup did.
stage_by_pid = patient_data_kirc.drop_duplicates(subset='pid').set_index('pid')[stage_column]

y_kirc = []
for pid in df_kirc.index:
    if pid not in stage_by_pid.index:
        raise KeyError("No clinical record found for patient {!r}".format(pid))
    stage = stage_by_pid.loc[pid]
    if stage not in stage_to_label:
        # Skipping the patient here would leave y_kirc shorter than df_kirc and
        # silently shift every following label, so fail loudly instead.
        raise ValueError(
            "Unexpected tumour stage {!r} for patient {!r}".format(stage, pid)
        )
    y_kirc.append(stage_to_label[stage])



In [None]:
# features_to_use = set()
# data_subset = data_train
# data_subset = data_subset.assign(y=pd.Series(y_train).values)
# data_subset = data_subset.apply(pd.to_numeric) 
# corrMatrix = data_subset.corr()
# features_to_use.update(corrMatrix[corrMatrix['y']<-0.3].index.tolist())
# features_to_use.update(corrMatrix[corrMatrix['y']>0.3].index.tolist())

In [None]:
# # features_to_use.remove('y')
# features_to_use = {'100129583',
#  '100289341',
#  '10643',
#  '11065',
#  '11162',
#  '148808',
#  '1748',
#  '196047',
#  '201161',
#  '2018',
#  '26275',
#  '3131',
#  '36',
#  '3706',
#  '4306',
#  '5047',
#  '51054',
#  '53833',
#  '55165',
#  '5522',
#  '55325',
#  '55521',
#  '6262',
#  '6716',
#  '7691',
#  '7923',
#  '79944',
#  '81796',
#  '829',
#  '84866',
#  '9603'}

In [None]:
# data = df_kirc[list(features_to_use)]
# # y = y_kirc
# y_kirc = np.asarray(y_kirc)
# data = data.assign(y=pd.Series(y_kirc).values)
# data = data.apply(pd.to_numeric)

In [None]:
from sklearn.model_selection import train_test_split
# data_train, data_test, y_train, y_test = train_test_split(data, y_kirc, test_size=0.1, random_state=0, stratify=y_kirc)

In [None]:
import tensorflow as tf
from keras.layers import Input, Dense, BatchNormalization
from keras.models import Model, Sequential
from keras import regularizers
from sklearn.linear_model import LogisticRegression
from sklearn.svm import NuSVC, SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.manifold import TSNE
from sklearn import preprocessing 
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")

Using TensorFlow backend.
  import pandas.util.testing as tm


In [None]:
# X = data_train.drop(['y'], axis=1)
# y = data_train['y']
# y = y.values

# ## define the model
# input_layer = Input(shape=(X.shape[1],))
# encoded = Dense(200, activation='tanh', activity_regularizer=regularizers.l2(10e-5))(input_layer)
# encoded = Dense(100, activation='tanh')(encoded)
# encoded = Dense(50, activation='relu')(encoded)
# decoded = Dense(50, activation='tanh')(encoded)
# decoded = Dense(100, activation='tanh')(decoded)
# decoded = Dense(200, activation='tanh')(decoded)
# output_layer = Dense(X.shape[1], activation='relu')(decoded)

# autoencoder = Model(input_layer, output_layer)
# autoencoder.compile(optimizer="rmsprop", loss="mse")

In [None]:
# test = data_test.drop(['y'], axis=1)
# scaler = preprocessing.MinMaxScaler()
# scaler.fit(X.values)
# X_scale = scaler.transform(X.values)
# test_x_scale = scaler.transform(test.values)

# x_early, x_late = X_scale[y == 0], X_scale[y == 1]
# autoencoder.fit(x_early, x_early, epochs = 20, validation_split=0.1, batch_size=8)

In [None]:
# hidden_representation = Sequential()
# hidden_representation.add(autoencoder.layers[0])
# hidden_representation.add(autoencoder.layers[1])
# hidden_representation.add(autoencoder.layers[2])
# hidden_representation.add(autoencoder.layers[3])

In [None]:
# autoencoder.summary()

In [None]:
# early_hid_rep = hidden_representation.predict(x_early)
# late_hid_rep = hidden_representation.predict(x_late)

# rep_x = np.append(early_hid_rep, late_hid_rep, axis = 0)
# y_n = np.zeros(early_hid_rep.shape[0])
# y_f = np.ones(late_hid_rep.shape[0])
# rep_y = np.append(y_n, y_f)

In [None]:
# tsne_plot(rep_x, rep_y, "latent_representation_train.png")

In [None]:
# train_x, val_x, train_y, val_y = train_test_split(rep_x, rep_y, test_size=0.1, random_state=0, stratify=rep_y)
# clf1 = LogisticRegression().fit(train_x, train_y)
# pred_y = clf1.predict(val_x)

# print(classification_report(val_y, pred_y))
# print(accuracy_score(val_y, pred_y))

In [None]:
# test_rep_x = hidden_representation.predict(test_x_scale)

# pred_y_test1 = clf1.predict(test_rep_x)

# print(accuracy_score(y_test, pred_y_test1))

In [None]:
from sklearn.model_selection import cross_val_score

# For every KEGG pathway file: restrict the KIRC expression data to the
# pathway's genes, train an autoencoder on the early-stage training samples,
# and score a logistic regression on the encoder output.
#
# lda_scores : hold-out accuracy per evaluated pathway (the name is historical;
#              the classifier is a logistic regression).
# files_used : pathway files whose mean 10-fold CV accuracy exceeds the threshold.
lda_scores = []
files_used = []
VAL_SCORE_THRESHOLD = 0.78

files_to_use = os.listdir(kegg_pathways_path)
files_to_use.sort()

# Loop-invariant: convert the labels to an array once instead of per pathway.
y_kirc = np.asarray(y_kirc)

for file in files_to_use:
    if 'hsa' not in file:
        continue
    pathway = pd.read_csv(os.path.join(kegg_pathways_path, file))
    pathway.rename(columns={"Unnamed: 0": "idx"}, inplace=True)

    features_to_use = get_features_matrix(pathway, df_kirc)

    # Pathways with fewer than 5 measured genes are skipped.
    if len(features_to_use) < 5:
        continue

    # Attach the label as column 'y' so features and labels are split together.
    data = df_kirc[features_to_use]
    data = data.assign(y=y_kirc)
    data = data.apply(pd.to_numeric)
    data_train, data_test, y_train, y_test = train_test_split(
        data, y_kirc, test_size=0.2, random_state=0, stratify=y_kirc)

    X = data_train.drop(['y'], axis=1)
    y = data_train['y'].values

    ## define the model
    input_layer = Input(shape=(X.shape[1],))
    # NOTE: 10e-5 evaluates to 1e-4.
    encoded = Dense(128, activation='tanh', activity_regularizer=regularizers.l2(10e-5))(input_layer)
    encoded = Dense(128, activation='tanh')(encoded)
    encoded = Dense(64, activation='relu')(encoded)
    decoded = Dense(64, activation='tanh')(encoded)
    decoded = Dense(128, activation='tanh')(decoded)
    decoded = Dense(128, activation='tanh')(decoded)
    output_layer = Dense(X.shape[1], activation='relu')(decoded)

    autoencoder = Model(input_layer, output_layer)
    autoencoder.compile(optimizer="adadelta", loss="mse")

    # Min-max scaling is fitted on the training split only and reused for the test split.
    test = data_test.drop(['y'], axis=1)
    scaler = preprocessing.MinMaxScaler()
    scaler.fit(X.values)
    X_scale = scaler.transform(X.values)
    test_x_scale = scaler.transform(test.values)

    # The autoencoder only ever sees early-stage (label 0) training samples.
    x_early, x_late = X_scale[y == 0], X_scale[y == 1]
    autoencoder.fit(x_early, x_early, epochs=15, validation_split=0.0, batch_size=4, verbose=0)

    # Encoder = input layer plus the three encoding Dense layers (layers 0..3).
    hidden_representation = Sequential()
    for layer in autoencoder.layers[:4]:
        hidden_representation.add(layer)

    early_hid_rep = hidden_representation.predict(x_early)
    late_hid_rep = hidden_representation.predict(x_late)

    # Encoded training set: early samples (label 0) followed by late samples (label 1).
    rep_x = np.concatenate([early_hid_rep, late_hid_rep], axis=0)
    y_n = np.zeros(early_hid_rep.shape[0])
    y_f = np.ones(late_hid_rep.shape[0])
    rep_y = np.concatenate([y_n, y_f])

    # Mean 10-fold CV accuracy on the encoded training data ...
    clf1 = LogisticRegression(max_iter=1500, random_state=0)
    scores = cross_val_score(clf1, rep_x, rep_y, cv=10)
    val_score = np.mean(scores)

    # ... then refit on all encoded training data and score the held-out split.
    clf1 = LogisticRegression(max_iter=1500, random_state=0).fit(rep_x, rep_y)

    test_rep_x = hidden_representation.predict(test_x_scale)
    pred_y_test1 = clf1.predict(test_rep_x)
    score = accuracy_score(y_test, pred_y_test1)

    lda_scores.append(score)
    print(file, val_score, score)

    # Release Keras state before the next pathway's model is built.
    tf.keras.backend.clear_session()

    if val_score > VAL_SCORE_THRESHOLD:
        files_used.append(file)
		

		# gcn_pathway_output = eclf3.predict_proba(X)
		# filename_output_csv = os.path.join("/content/drive/My Drive/IIITH/GCN_KEGG/KIRC_Stage_TSNE_PCA_Plots/GCN_Pathway_output_scores",file)
		# np.savetxt(filename_output_csv,gcn_pathway_output)    
	

	



hsa00010 .csv 0.6481416957026712 0.6796116504854369
hsa00051 .csv 0.6529616724738675 0.6601941747572816
hsa00250 .csv 0.6891986062717769 0.7475728155339806
hsa00260 .csv 0.6358885017421602 0.6601941747572816
hsa00280 .csv 0.6822299651567943 0.7087378640776699
hsa00830 .csv 0.6770615563298489 0.6699029126213593
hsa00860 .csv 0.6795005807200928 0.6796116504854369
hsa01522 .csv 0.6702671312427408 0.7378640776699029
hsa03320 .csv 0.6628339140534262 0.6699029126213593


KeyboardInterrupt: ignored

In [None]:

# lda_scores holds one hold-out accuracy per evaluated pathway, in the order
# the pathways were processed.
fig, ax = plt.subplots()
ax.plot(lda_scores)
ax.set(xlabel="Evaluated pathway (evaluation order)",
       ylabel="Hold-out accuracy",
       title="Hold-out accuracy per pathway")
plt.show()

In [None]:
from sklearn.model_selection import cross_val_score

# Same pipeline as the per-pathway loop, but run once on the union of the
# genes of a hand-picked set of pathways.
features_to_use = set()

lda_scores = []
files_used = []

# Not used below; kept so the name stays defined for other cells.
files_to_use = os.listdir(kegg_pathways_path)
files_to_use.sort()

selected_files = [
    'hsa04622 .csv',
    'hsa00250 .csv',
]

for file in selected_files:
    pathway = pd.read_csv(os.path.join(kegg_pathways_path, file))
    pathway.rename(columns={"Unnamed: 0": "idx"}, inplace=True)

    features_to_use.update(get_features_matrix(pathway, df_kirc))

print(len(features_to_use))

# A set of strings has no stable iteration order across interpreter runs, so
# sort it: the column order (and therefore the trained model) is reproducible.
data = df_kirc[sorted(features_to_use)]
y_kirc = np.asarray(y_kirc)
# Attach the label as column 'y' so features and labels are split together.
data = data.assign(y=y_kirc)
data = data.apply(pd.to_numeric)
data_train, data_test, y_train, y_test = train_test_split(
    data, y_kirc, test_size=0.2, random_state=0, stratify=y_kirc)

X = data_train.drop(['y'], axis=1)
y = data_train['y'].values

## define the model
input_layer = Input(shape=(X.shape[1],))
# NOTE: 10e-5 evaluates to 1e-4.
encoded = Dense(100, activation='tanh', activity_regularizer=regularizers.l2(10e-5))(input_layer)
encoded = Dense(50, activation='relu', activity_regularizer=regularizers.l2(10e-5))(encoded)
decoded = Dense(50, activation='tanh', activity_regularizer=regularizers.l2(10e-5))(encoded)
decoded = Dense(100, activation='tanh', activity_regularizer=regularizers.l2(10e-5))(decoded)
output_layer = Dense(X.shape[1], activation='relu')(decoded)

autoencoder = Model(input_layer, output_layer)
autoencoder.compile(optimizer="adadelta", loss="mse")

# Min-max scaling is fitted on the training split only and reused for the test split.
test = data_test.drop(['y'], axis=1)
scaler = preprocessing.MinMaxScaler()
scaler.fit(X.values)
X_scale = scaler.transform(X.values)
test_x_scale = scaler.transform(test.values)

# The autoencoder only ever sees early-stage (label 0) training samples.
x_early, x_late = X_scale[y == 0], X_scale[y == 1]
autoencoder.fit(x_early, x_early, epochs=20, validation_split=0.0, batch_size=4, verbose=0)

# Encoder = input layer plus the two encoding Dense layers (layers 0..2).
hidden_representation = Sequential()
for layer in autoencoder.layers[:3]:
    hidden_representation.add(layer)

early_hid_rep = hidden_representation.predict(x_early)
late_hid_rep = hidden_representation.predict(x_late)

# Encoded training set: early samples (label 0) followed by late samples (label 1).
rep_x = np.concatenate([early_hid_rep, late_hid_rep], axis=0)
y_n = np.zeros(early_hid_rep.shape[0])
y_f = np.ones(late_hid_rep.shape[0])
rep_y = np.concatenate([y_n, y_f])

# Same classifier settings as the per-pathway loop (max_iter=1500, random_state=0).
clf1 = LogisticRegression(max_iter=1500, random_state=0)
scores = cross_val_score(clf1, rep_x, rep_y, cv=10)
val_score = np.mean(scores)

# Refit on all encoded training data and score the held-out split.
clf1 = LogisticRegression(max_iter=1500, random_state=0).fit(rep_x, rep_y)

test_rep_x = hidden_representation.predict(test_x_scale)
pred_y_test1 = clf1.predict(test_rep_x)
score = accuracy_score(y_test, pred_y_test1)

lda_scores.append(score)
# Report every pathway that contributed features, not just the last one read.
print(' + '.join(selected_files), val_score, score)
tf.keras.backend.clear_session()
  

  # gcn_pathway_output = eclf3.predict_proba(X)
  # filename_output_csv = os.path.join("/content/drive/My Drive/IIITH/GCN_KEGG/KIRC_Stage_TSNE_PCA_Plots/GCN_Pathway_output_scores",file)
  # np.savetxt(filename_output_csv,gcn_pathway_output)    






102


In [None]:
from sklearn.metrics import roc_auc_score
# Class probabilities of the most recently fitted classifier on the encoded
# test split; clf1, test_rep_x and y_test come from the cell that ran last.
y_pred1 = clf1.predict_proba(test_rep_x)
# Column 1 is the probability of the second class in clf1.classes_
# (presumably label 1, i.e. late stage, since rep_y holds 0s and 1s — confirm).
print(roc_auc_score(y_test, y_pred1[:,1]))