In [None]:
import os

import numpy as np
import pandas as pd

# Seed NumPy's global RNG so NumPy-driven randomness in later cells is repeatable.
np.random.seed(0)

# Directory holding the clinical and log-transformed expression CSV files.
# NOTE(review): looks like a Colab Google Drive mount point - adjust when running elsewhere.
path = '/content/drive/My Drive/IIITH/GCN_KEGG/GCN_Dataset/CSV'

# Display the files available in the dataset directory.
os.listdir(path)

['Clinical_KICH_81_tumors.csv',
 'Clinical_KIRP_290_tumors.csv',
 'Clinical_KIRC_518_tumors.csv',
 'KICH_81_tumors_log_transformed.csv',
 'KIRP_290_tumors_log_transformed.csv',
 'KIRC_518_tumors_log_transformed.csv',
 '.DS_Store']

In [None]:
def load_dataset(path, filename, transpose=True):
    '''
        Read ``filename`` from ``path`` as a CSV and optionally transpose it.

        The CSV column called "Unnamed: 0" is renamed to "pid". When
        ``transpose`` is true the "pid" values are cast to ``str``, the frame
        is transposed, and the row that now holds the "pid" values is removed
        and used as the column labels instead. When ``transpose`` is false the
        renamed frame is returned as read.
    '''
    frame = pd.read_csv(os.path.join(path, filename)).rename(columns={"Unnamed: 0": "pid"})
    if not transpose:
        return frame

    flipped = frame.astype({"pid": str}).T
    # After transposing, the first row carries the former "pid" column.
    header_row = flipped.iloc[0]
    flipped = flipped[1:]
    flipped.columns = header_row
    return flipped

In [None]:
# Load the KIRC expression matrix (transposed by load_dataset) and the clinical table.
df_kirc = load_dataset(path, 'KIRC_518_tumors_log_transformed.csv', transpose=True)
patient_data_kirc = load_dataset(path, 'Clinical_KIRC_518_tumors.csv', transpose=False)

# Patients whose pathologic stage is '[Not Available]' or '[Discrepancy]' cannot be
# labelled: remember their ids, then remove them from both tables.
stage_column = patient_data_kirc['ajcc_pathologic_tumor_stage']
pid_kirc_drop1 = patient_data_kirc[stage_column == '[Not Available]'].pid
pid_kirc_drop2 = patient_data_kirc[stage_column == '[Discrepancy]'].pid

patient_data_kirc = patient_data_kirc.drop(pid_kirc_drop1.index).drop(pid_kirc_drop2.index)
df_kirc = df_kirc.drop(pid_kirc_drop1).drop(pid_kirc_drop2)

# Binary target: Stage I/II -> 0, Stage III/IV -> 1.
stage_to_label = {'Stage I': 0, 'Stage II': 0, 'Stage III': 1, 'Stage IV': 1}

# Build one label per row of df_kirc, in df_kirc's row order. Any row that cannot be
# labelled raises immediately; silently skipping it would leave y_kirc shorter than
# df_kirc and shift every following label onto the wrong patient.
y_kirc = []
for pid in df_kirc.index:
    stage = patient_data_kirc.loc[patient_data_kirc['pid'] == pid, 'ajcc_pathologic_tumor_stage']
    if stage.empty:
        raise KeyError('no clinical record found for pid {!r}'.format(pid))
    stage = stage.values[0]
    if stage not in stage_to_label:
        raise ValueError('unexpected tumor stage {!r} for pid {!r}'.format(stage, pid))
    y_kirc.append(stage_to_label[stage])

In [None]:
from sklearn.model_selection import train_test_split


In [None]:
# Attach the labels as a 'y' column and coerce every column to a numeric dtype.
y_kirc = np.asarray(y_kirc)
data = df_kirc.assign(y=pd.Series(y_kirc).values).apply(pd.to_numeric)

# 80/20 split, stratified on the label, with a fixed random_state.
data_train, data_test, y_train, y_test = train_test_split(
    data, y_kirc, test_size=0.2, random_state=0, stratify=y_kirc
)

# Separate the feature columns from the 'y' column of each split; the label
# arrays returned by train_test_split are replaced by the values of that column.
X_train, y_train = data_train.drop(columns=['y']), data_train['y'].values
X_test, y_test = data_test.drop(columns=['y']), data_test['y'].values

In [None]:
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest, f_classif

In [None]:
# Standardise the features using statistics learned from the training split only.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
# Apply the training mean/variance to the test split. Refitting the scaler here
# would scale the test data with its own statistics and leak test information
# into the evaluation.
X_test = sc.transform(X_test)

In [None]:
# Keep the 3000 features ranked highest by f_classif; the selector is fitted on
# the training split and then applied unchanged to both splits.
fvalue_selector = SelectKBest(score_func=f_classif, k=3000)
fvalue_selector.fit(X_train, y_train)
X_kbest = fvalue_selector.transform(X_train)
X_test_kbest = fvalue_selector.transform(X_test)

  4376  4655  4808  4814  4816  4817  4818  4823  4835  5288  7474  7661
  7662  7663  7664  7665  8121  9304  9311  9312  9313  9315  9318  9350
  9452 10121 10139 11130 11223 11958 12826 13520 14159 14160 14161 14162
 14756 14758 15139 15141 15142 15564 16567 16569 16570 16572 16575 16576
 16579 16580 16581 16606 16631 16639 16676 16698 16699 16700 16701 16702
 16704 16705 16706 16707 16708 16709 16710 16711 16712 16713 16714 16715
 16716 16717 16718 16719 16720 16721 16722 16723 16724 16725 16726 16727
 16728 16729 16730 16731 16732 16733 16734 16735 16736 16737 16738 16739
 16740 16741 16742 16743 16744 16745 16746 16747 16749 16750 16751 16752
 16753 16754 16755 16756 16758 16759 16760 16761 16762 16763 16764 16765
 16766 16767 16768 16769 16770 16771 16772 16773 16775 16776 16777 16778
 16779 16780 16781 16782 16783 16784 16785 16786 16788 16789 16790 16791
 16792 16793 16794 16796 16797 16799 16800 16801 16802 16803 16804 16805
 16806 16807 16808 16809 16810 16811 16812 16813 16

In [None]:
# Show the (rows, columns) of the selected training and test matrices side by side.
(X_kbest.shape, X_test_kbest.shape)

((412, 3000), (103, 3000))

In [None]:
# Classifiers
from sklearn.svm import NuSVC, SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB

In [None]:
# Base learners: extra-trees and random forest, both depth-limited and seeded.
clf1 = ExtraTreesClassifier(max_depth=5, random_state=0, n_estimators=200)
# max_features="sqrt" is what "auto" meant for classifiers; "auto" itself is
# rejected by recent scikit-learn releases.
clf2 = RandomForestClassifier(n_estimators=200, criterion="gini", max_depth=5,
                              max_features="sqrt", min_samples_leaf=0.005,
                              min_samples_split=0.005, n_jobs=-1, random_state=0)

# Weighted hard-voting ensemble over the two base learners.
# NOTE(review): with hard voting and weights [1, 2] the forest's vote always
# outweighs the extra-trees vote, so predictions match clf2 alone - confirm the
# weights are intended.
eclf3 = VotingClassifier(estimators=[
      ('lr', clf1), ('rf', clf2)],
      voting='hard', weights=[1, 2],
      flatten_transform=True)
# Fit the ensemble itself; previously eclf3 was rebound to clf2.fit(...), which
# discarded the VotingClassifier defined just above.
eclf3 = eclf3.fit(X_kbest, y_train)

In [None]:
# Score the fitted model on the held-out split (SelectKBest features).
eclf3.score(X=X_test_kbest, y=y_test)

0.7475728155339806

In [None]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

In [None]:
# Rebuild the feature matrices and label arrays from the split frames, replacing
# the scaled arrays produced by earlier cells.
X_train, y_train = data_train.drop(columns=['y']), data_train['y'].values
X_test, y_test = data_test.drop(columns=['y']), data_test['y'].values

# Lasso-based selector, fitted on the standardised training features; note that
# this refits the shared scaler `sc` on the full training feature set.
sel_ = SelectFromModel(Lasso(alpha=0.005))
sel_.fit(sc.fit_transform(X_train), y_train)

SelectFromModel(estimator=Lasso(alpha=0.005, copy_X=True, fit_intercept=True,
                                max_iter=1000, normalize=False, positive=False,
                                precompute=False, random_state=None,
                                selection='cyclic', tol=0.0001,
                                warm_start=False),
                max_features=None, norm_order=1, prefit=False, threshold=None)

In [None]:
# Names of the columns retained by the fitted selector.
support_mask = sel_.get_support()
selected_feat = X_train.columns[support_mask]

# Number of coefficients of the fitted Lasso that are exactly zero.
n_zero_coef = np.sum(sel_.estimator_.coef_ == 0)

print('total features: {}'.format(X_train.shape[1]))
print('selected features: {}'.format(len(selected_feat)))
print('features with coefficients shrank to zero: {}'.format(n_zero_coef))

total features: 20531
selected features: 336
features with coefficients shrank to zero: 20195


In [None]:

# Restrict both splits to the selected columns and re-read the labels from the
# 'y' column of each split frame.
X_train, X_test = data_train[selected_feat], data_test[selected_feat]
y_train, y_test = data_train['y'].values, data_test['y'].values

In [None]:
# Standardise the selected features using statistics learned from the training split only.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
# Apply the training mean/variance to the test split. Refitting the scaler here
# would scale the test data with its own statistics and leak test information
# into the evaluation.
X_test = sc.transform(X_test)

In [None]:
# Base learners: extra-trees and random forest, both depth-limited and seeded.
clf1 = ExtraTreesClassifier(max_depth=5, random_state=0, n_estimators=200)
# max_features="sqrt" is what "auto" meant for classifiers; "auto" itself is
# rejected by recent scikit-learn releases.
clf2 = RandomForestClassifier(n_estimators=100, criterion="gini", max_depth=5,
                              max_features="sqrt", min_samples_leaf=0.005,
                              min_samples_split=0.005, n_jobs=-1, random_state=0)

# Weighted hard-voting ensemble over the two base learners.
# NOTE(review): with hard voting and weights [1, 2] the forest's vote always
# outweighs the extra-trees vote - confirm the weights are intended.
eclf3 = VotingClassifier(estimators=[
      ('lr', clf1), ('rf', clf2)],
      voting='hard', weights=[1, 2],
      flatten_transform=True)
eclf3 = eclf3.fit(X_train, y_train)

# Score the fitted ensemble on the held-out split.
eclf3.score(X_test, y_test)

0.7281553398058253

In [None]:
# Set that will hold every feature strongly correlated with an earlier feature.
corr_features = set()

# Feature-to-feature Kendall rank correlation on the training split. The target
# column 'y' is left out so it can never be flagged as a redundant feature.
# NOTE(review): Kendall correlation over every column of data_train is very
# expensive - consider running this on a pre-selected subset of columns.
feature_frame = data_train.drop(columns=['y'], errors='ignore')
corr_matrix = feature_frame.corr(method='kendall')

# Optional: display a heatmap of the correlation matrix. Drawn with matplotlib
# because seaborn (`sns`) is never imported in this notebook.
plt.figure(figsize=(11, 11))
plt.imshow(corr_matrix.values, cmap='coolwarm', vmin=-1, vmax=1, aspect='auto')
plt.colorbar()

# Flag column i when its absolute correlation with any earlier column exceeds
# 0.8. Row slices of the NumPy array replace per-element .iloc lookups; NaN
# entries compare as False and are therefore never flagged.
abs_corr = corr_matrix.abs().values
for i in range(1, abs_corr.shape[0]):
    if (abs_corr[i, :i] > 0.8).any():
        corr_features.add(corr_matrix.columns[i])
            


KeyboardInterrupt: ignored

In [None]:
# Preview the training frame without the flagged columns; the result is only
# displayed, data_train itself is left unchanged.
data_train.drop(columns=list(corr_features))


In [None]:
# Preview the test frame without the flagged columns; the result is only
# displayed, data_test itself is left unchanged.
data_test.drop(columns=list(corr_features))

In [None]:
from sklearn.datasets import make_friedman1
from sklearn.feature_selection import RFE
from sklearn.svm import SVC

# Recursive feature elimination ranks features through the estimator's coef_ or
# feature_importances_. An RBF-kernel SVC exposes neither, so a linear kernel is
# used to make the ranking possible.
svc = SVC(kernel="linear")
# NOTE(review): n_features_to_select=1000 only prunes anything when X_train has
# more than 1000 columns - confirm which feature matrix is meant to be passed here.
rfe = RFE(estimator=svc, n_features_to_select=1000)
rfe.fit(X_train, y_train)

# Rank of every input feature.
rfe.ranking_