# importing needed libraries

In [1]:
import os
import sys
import time
import psutil
import pandas as pd
import numpy  as np
import concurrent.futures
import datetime


from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Downloading data from github

In [2]:
!wget https://raw.githubusercontent.com/ibabedal/BLCA-analysis/main/processing-mrna-cna.tar.gz
!tar zxf processing-mrna-cna.tar.gz

--2022-10-14 10:15:31--  https://raw.githubusercontent.com/ibabedal/BLCA-analysis/main/processing-mrna-cna.tar.gz
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 21932951 (21M) [application/octet-stream]
Saving to: ‘processing-mrna-cna.tar.gz.1’


2022-10-14 10:15:32 (173 MB/s) - ‘processing-mrna-cna.tar.gz.1’ saved [21932951/21932951]

tar: Ignoring unknown extended header keyword 'LIBARCHIVE.xattr.com.apple.quarantine'
tar: Ignoring unknown extended header keyword 'LIBARCHIVE.xattr.com.apple.macl'
tar: Ignoring unknown extended header keyword 'LIBARCHIVE.xattr.com.apple.quarantine'
tar: Ignoring unknown extended header keyword 'LIBARCHIVE.xattr.com.apple.macl'


# Start CNA processing

In [3]:
df_cna = pd.read_csv('./CNA-TMB.csv')
df_cna.head()

Unnamed: 0,SAMPLE_ID,ACAP3,ACTRT2,AGRN,ANKRD65,ATAD3A,ATAD3B,ATAD3C,AURKAIP1,B3GALT6,...,hsa-mir-361,hsa-mir-548m,hsa-mir-652,hsa-mir-220a,hsa-mir-513c,hsa-mir-513b,hsa-mir-513a-1,hsa-mir-513a-2,hsa-mir-224,TMB_CLASS
0,TCGA-2F-A9KO-01,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,0,0,0,0,0,0,0,0,0,2
1,TCGA-2F-A9KP-01,2,2,2,2,2,2,2,2,2,...,0,0,0,0,0,0,0,0,0,1
2,TCGA-2F-A9KQ-01,0,0,0,0,0,0,0,0,0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,1
3,TCGA-2F-A9KR-01,0,0,0,0,0,0,0,0,0,...,-1,-1,-1,-1,0,0,0,0,-1,2
4,TCGA-2F-A9KT-01,0,0,0,0,0,0,0,0,0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,2


In [4]:
x = df_cna.drop(['SAMPLE_ID', 'TMB_CLASS'], axis=1)
x.head()

Unnamed: 0,ACAP3,ACTRT2,AGRN,ANKRD65,ATAD3A,ATAD3B,ATAD3C,AURKAIP1,B3GALT6,C1orf159,...,hsa-mir-1321,hsa-mir-361,hsa-mir-548m,hsa-mir-652,hsa-mir-220a,hsa-mir-513c,hsa-mir-513b,hsa-mir-513a-1,hsa-mir-513a-2,hsa-mir-224
0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,0,0,0,0,0,0,0,0,0,0
1,2,2,2,2,2,2,2,2,2,2,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
3,0,0,0,0,0,0,0,0,0,0,...,-1,-1,-1,-1,-1,0,0,0,0,-1
4,0,0,0,0,0,0,0,0,0,0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


In [5]:
y = df_cna[['TMB_CLASS']]
y.head()

Unnamed: 0,TMB_CLASS
0,2
1,1
2,1
3,2
4,2


In [6]:
y_2 = df_cna['TMB_CLASS']
y_2.head()

0    2
1    1
2    1
3    2
4    2
Name: TMB_CLASS, dtype: int64

### Doing shifting and normalization

In [6]:


def do_something(x):
    '''
    do stuff
    '''
    pid  = os.getpid()
    ppid = os.getppid()
    start = time.time()
    print("PPID %s->%s Started"%(ppid,pid))
    #df['diff'] = datetime.datetime.now() - pd.to_datetime(df['Order Date'])
    for i in x.columns:
      x[i] = ((x[i] - x[i].min()) + 0.1)
    stop  = time.time()
    completed_in  = round(stop - start,2)
    return(x)


In [7]:
## parallel execution , credit to https://betterprogramming.pub/pandas-how-to-process-a-dataframe-in-parallel-make-pandas-lightning-fast-669978cf5356

logical    = False
df_results = []
num_procs  = psutil.cpu_count(logical=logical)
#if len(sys.argv) > 1:
#    num_procs = int(sys.argv[1])

big_dataframe = x.copy()
splitted_df = np.array_split(big_dataframe, num_procs)
start = time.time()
with concurrent.futures.ProcessPoolExecutor(max_workers=num_procs) as executor:
    results = [ executor.submit(do_something, x=df) for df in splitted_df ]
    for result in concurrent.futures.as_completed(results):
        try:
            df_results.append(result.result())
        except Exception as ex:
            print(str(ex))
            pass
end = time.time()
print("-------------------------------------------")
print("PPID %s Completed in %s"%(os.getpid(), round(end-start,2)))
df_results = pd.concat(df_results)

PPID 5746->5768 Started
PPID 5746->5771 Started
-------------------------------------------
PPID 5746 Completed in 120.61


In [8]:
x = df_results.copy()
x.head()

Unnamed: 0,ACAP3,ACTRT2,AGRN,ANKRD65,ATAD3A,ATAD3B,ATAD3C,AURKAIP1,B3GALT6,C1orf159,...,hsa-mir-1321,hsa-mir-361,hsa-mir-548m,hsa-mir-652,hsa-mir-220a,hsa-mir-513c,hsa-mir-513b,hsa-mir-513a-1,hsa-mir-513a-2,hsa-mir-224
204,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,...,3.1,3.1,2.1,2.1,2.1,3.1,3.1,3.1,3.1,2.1
205,2.1,2.1,2.1,2.1,2.1,2.1,2.1,2.1,2.1,2.1,...,2.1,2.1,1.1,1.1,0.1,1.1,1.1,1.1,1.1,0.1
206,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,...,2.1,2.1,1.1,1.1,1.1,2.1,2.1,2.1,2.1,1.1
207,2.1,2.1,2.1,2.1,2.1,2.1,2.1,2.1,2.1,2.1,...,3.1,3.1,2.1,2.1,2.1,3.1,3.1,3.1,3.1,2.1
208,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,...,2.1,2.1,1.1,1.1,1.1,2.1,2.1,2.1,2.1,1.1


In [9]:
## saving to file to avoid recacluation
x.to_csv('cna_features_shifted.csv',index=False)

In [10]:
x = pd.read_csv('cna_features_shifted.csv')
x.head()

Unnamed: 0,ACAP3,ACTRT2,AGRN,ANKRD65,ATAD3A,ATAD3B,ATAD3C,AURKAIP1,B3GALT6,C1orf159,...,hsa-mir-1321,hsa-mir-361,hsa-mir-548m,hsa-mir-652,hsa-mir-220a,hsa-mir-513c,hsa-mir-513b,hsa-mir-513a-1,hsa-mir-513a-2,hsa-mir-224
0,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,...,3.1,3.1,2.1,2.1,2.1,3.1,3.1,3.1,3.1,2.1
1,2.1,2.1,2.1,2.1,2.1,2.1,2.1,2.1,2.1,2.1,...,2.1,2.1,1.1,1.1,0.1,1.1,1.1,1.1,1.1,0.1
2,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,...,2.1,2.1,1.1,1.1,1.1,2.1,2.1,2.1,2.1,1.1
3,2.1,2.1,2.1,2.1,2.1,2.1,2.1,2.1,2.1,2.1,...,3.1,3.1,2.1,2.1,2.1,3.1,3.1,3.1,3.1,2.1
4,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,...,2.1,2.1,1.1,1.1,1.1,2.1,2.1,2.1,2.1,1.1


In [11]:
def do_something_2(x):
    '''
    do stuff
    '''
    pid  = os.getpid()
    ppid = os.getppid()
    start = time.time()
    print("PPID %s->%s Started"%(ppid,pid))
    #df['diff'] = datetime.datetime.now() - pd.to_datetime(df['Order Date'])
    for i in x.columns:
      sum_feature = x[i].sum()
      x[i] = x[i]/sum_feature
    stop  = time.time()
    completed_in  = round(stop - start,2)
    return(x)

In [12]:
## parallel execution , credit to https://betterprogramming.pub/pandas-how-to-process-a-dataframe-in-parallel-make-pandas-lightning-fast-669978cf5356

logical    = False
df_results = []
num_procs  = psutil.cpu_count(logical=logical)
#if len(sys.argv) > 1:
#    num_procs = int(sys.argv[1])

big_dataframe = x.copy()
splitted_df = np.array_split(big_dataframe, num_procs)
start = time.time()
with concurrent.futures.ProcessPoolExecutor(max_workers=num_procs) as executor:
    results = [ executor.submit(do_something_2, x=df) for df in splitted_df ]
    for result in concurrent.futures.as_completed(results):
        try:
            df_results.append(result.result())
        except Exception as ex:
            print(str(ex))
            pass
end = time.time()
print("-------------------------------------------")
print("PPID %s Completed in %s"%(os.getpid(), round(end-start,2)))
df_results = pd.concat(df_results)

PPID 5746->5879 Started
PPID 5746->5882 Started
-------------------------------------------
PPID 5746 Completed in 119.91


In [13]:
x = df_results.copy()
x.to_csv('cna_scaled_normailized.csv', index=False)
x.head()

Unnamed: 0,ACAP3,ACTRT2,AGRN,ANKRD65,ATAD3A,ATAD3B,ATAD3C,AURKAIP1,B3GALT6,C1orf159,...,hsa-mir-1321,hsa-mir-361,hsa-mir-548m,hsa-mir-652,hsa-mir-220a,hsa-mir-513c,hsa-mir-513b,hsa-mir-513a-1,hsa-mir-513a-2,hsa-mir-224
204,0.002521,0.002521,0.002521,0.002521,0.002521,0.002521,0.002521,0.002521,0.002521,0.002521,...,0.005179,0.005179,0.005014,0.005007,0.005014,0.004816,0.004816,0.004816,0.004816,0.004754
205,0.009395,0.009395,0.009395,0.009395,0.009395,0.009395,0.009395,0.009395,0.009395,0.009395,...,0.005179,0.005179,0.005014,0.005007,0.005014,0.004816,0.004816,0.004816,0.004816,0.004754
206,0.004812,0.004812,0.004812,0.004812,0.004812,0.004812,0.004812,0.004812,0.004812,0.004812,...,0.000471,0.000471,0.000456,0.002623,0.000456,0.000438,0.000438,0.000438,0.000438,0.000432
207,0.004812,0.004812,0.004812,0.004812,0.004812,0.004812,0.004812,0.004812,0.004812,0.004812,...,0.000471,0.000471,0.000456,0.002623,0.000456,0.004816,0.004816,0.004816,0.004816,0.000432
208,0.004812,0.004812,0.004812,0.004812,0.004812,0.004812,0.004812,0.004812,0.004812,0.004812,...,0.000471,0.000471,0.000456,0.002623,0.000456,0.000438,0.000438,0.000438,0.000438,0.000432


In [7]:
x = pd.read_csv('cna_scaled_normailized.csv')
x.head()

Unnamed: 0,ACAP3,ACTRT2,AGRN,ANKRD65,ATAD3A,ATAD3B,ATAD3C,AURKAIP1,B3GALT6,C1orf159,...,hsa-mir-1321,hsa-mir-361,hsa-mir-548m,hsa-mir-652,hsa-mir-220a,hsa-mir-513c,hsa-mir-513b,hsa-mir-513a-1,hsa-mir-513a-2,hsa-mir-224
0,0.002521,0.002521,0.002521,0.002521,0.002521,0.002521,0.002521,0.002521,0.002521,0.002521,...,0.005179,0.005179,0.005014,0.005007,0.005014,0.004816,0.004816,0.004816,0.004816,0.004754
1,0.009395,0.009395,0.009395,0.009395,0.009395,0.009395,0.009395,0.009395,0.009395,0.009395,...,0.005179,0.005179,0.005014,0.005007,0.005014,0.004816,0.004816,0.004816,0.004816,0.004754
2,0.004812,0.004812,0.004812,0.004812,0.004812,0.004812,0.004812,0.004812,0.004812,0.004812,...,0.000471,0.000471,0.000456,0.002623,0.000456,0.000438,0.000438,0.000438,0.000438,0.000432
3,0.004812,0.004812,0.004812,0.004812,0.004812,0.004812,0.004812,0.004812,0.004812,0.004812,...,0.000471,0.000471,0.000456,0.002623,0.000456,0.004816,0.004816,0.004816,0.004816,0.000432
4,0.004812,0.004812,0.004812,0.004812,0.004812,0.004812,0.004812,0.004812,0.004812,0.004812,...,0.000471,0.000471,0.000456,0.002623,0.000456,0.000438,0.000438,0.000438,0.000438,0.000432


### starting with Chi2

In [8]:
chi2_selector = SelectKBest(chi2, k=1000)
kbest_df = chi2_selector.fit_transform(x, y)
column_names = x.columns[chi2_selector.get_support()]

In [9]:
kbest_df_chi2=pd.DataFrame(kbest_df, columns = column_names, index=df_cna['SAMPLE_ID'])

kbest_df_chi2.head()

Unnamed: 0_level_0,POGZ,TUFT1,OR10J3,OR10J1,OR10J5,APCS,CRP,DUSP23,FCRL6,SLAMF8,...,hsa-mir-3119-1,hsa-mir-378b,hsa-mir-3134,hsa-mir-3135,hsa-mir-3139,hsa-mir-548v,hsa-mir-548h-4,hsa-mir-302e,hsa-mir-3159,hsa-mir-220a
SAMPLE_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-2F-A9KO-01,0.006454,0.006454,0.006473,0.006473,0.006434,0.006375,0.006337,0.00628,0.006261,0.006243,...,0.006473,0.007256,0.00741,0.007738,0.000564,0.00338,0.00335,0.00081,0.000761,0.005014
TCGA-2F-A9KP-01,0.00338,0.00338,0.006473,0.006473,0.006434,0.006375,0.006337,0.00628,0.006261,0.006243,...,0.006473,0.010712,0.00741,0.007738,0.006201,0.00338,0.00335,0.008914,0.008371,0.005014
TCGA-2F-A9KQ-01,0.00338,0.00338,0.003391,0.003391,0.00337,0.003339,0.003319,0.003289,0.00328,0.00327,...,0.003391,0.003801,0.003881,0.004053,0.006201,0.006454,0.006395,0.008914,0.008371,0.000456
TCGA-2F-A9KR-01,0.00338,0.00338,0.003391,0.003391,0.00337,0.003339,0.003319,0.003289,0.00328,0.00327,...,0.006473,0.000346,0.000353,0.000368,0.006201,0.00338,0.00335,0.00081,0.000761,0.000456
TCGA-2F-A9KT-01,0.006454,0.006454,0.003391,0.003391,0.00337,0.003339,0.003319,0.003289,0.00328,0.00327,...,0.003391,0.003801,0.003881,0.004053,0.000564,0.00338,0.00335,0.00081,0.000761,0.000456


In [10]:
kbest_df_chi2.isnull().sum().sum()

0

In [11]:
kbest_df_chi2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 408 entries, TCGA-2F-A9KO-01 to TCGA-ZF-AA5P-01
Columns: 1000 entries, POGZ to hsa-mir-220a
dtypes: float64(1000)
memory usage: 3.1+ MB


In [12]:
kbest_df_chi2['TMB_CLASS'] = y.iloc[:,0].values

In [13]:
kbest_df_chi2.head()

Unnamed: 0_level_0,POGZ,TUFT1,OR10J3,OR10J1,OR10J5,APCS,CRP,DUSP23,FCRL6,SLAMF8,...,hsa-mir-378b,hsa-mir-3134,hsa-mir-3135,hsa-mir-3139,hsa-mir-548v,hsa-mir-548h-4,hsa-mir-302e,hsa-mir-3159,hsa-mir-220a,TMB_CLASS
SAMPLE_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-2F-A9KO-01,0.006454,0.006454,0.006473,0.006473,0.006434,0.006375,0.006337,0.00628,0.006261,0.006243,...,0.007256,0.00741,0.007738,0.000564,0.00338,0.00335,0.00081,0.000761,0.005014,2
TCGA-2F-A9KP-01,0.00338,0.00338,0.006473,0.006473,0.006434,0.006375,0.006337,0.00628,0.006261,0.006243,...,0.010712,0.00741,0.007738,0.006201,0.00338,0.00335,0.008914,0.008371,0.005014,1
TCGA-2F-A9KQ-01,0.00338,0.00338,0.003391,0.003391,0.00337,0.003339,0.003319,0.003289,0.00328,0.00327,...,0.003801,0.003881,0.004053,0.006201,0.006454,0.006395,0.008914,0.008371,0.000456,1
TCGA-2F-A9KR-01,0.00338,0.00338,0.003391,0.003391,0.00337,0.003339,0.003319,0.003289,0.00328,0.00327,...,0.000346,0.000353,0.000368,0.006201,0.00338,0.00335,0.00081,0.000761,0.000456,2
TCGA-2F-A9KT-01,0.006454,0.006454,0.003391,0.003391,0.00337,0.003339,0.003319,0.003289,0.00328,0.00327,...,0.003801,0.003881,0.004053,0.000564,0.00338,0.00335,0.00081,0.000761,0.000456,2


In [20]:
kbest_df_chi2.to_csv('./CNA-TMB-Chi2.csv',index=False)

### Using information gain probability based method

In [14]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif

In [15]:
ig_selector = SelectKBest(chi2, k=1000)
kbest_df = ig_selector.fit_transform(x, y)
column_names = x.columns[ig_selector.get_support()]

In [16]:
kbest_df_ig=pd.DataFrame(kbest_df, columns = column_names, index=df_cna['SAMPLE_ID'])

kbest_df_ig.head()

Unnamed: 0_level_0,POGZ,TUFT1,OR10J3,OR10J1,OR10J5,APCS,CRP,DUSP23,FCRL6,SLAMF8,...,hsa-mir-3119-1,hsa-mir-378b,hsa-mir-3134,hsa-mir-3135,hsa-mir-3139,hsa-mir-548v,hsa-mir-548h-4,hsa-mir-302e,hsa-mir-3159,hsa-mir-220a
SAMPLE_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-2F-A9KO-01,0.006454,0.006454,0.006473,0.006473,0.006434,0.006375,0.006337,0.00628,0.006261,0.006243,...,0.006473,0.007256,0.00741,0.007738,0.000564,0.00338,0.00335,0.00081,0.000761,0.005014
TCGA-2F-A9KP-01,0.00338,0.00338,0.006473,0.006473,0.006434,0.006375,0.006337,0.00628,0.006261,0.006243,...,0.006473,0.010712,0.00741,0.007738,0.006201,0.00338,0.00335,0.008914,0.008371,0.005014
TCGA-2F-A9KQ-01,0.00338,0.00338,0.003391,0.003391,0.00337,0.003339,0.003319,0.003289,0.00328,0.00327,...,0.003391,0.003801,0.003881,0.004053,0.006201,0.006454,0.006395,0.008914,0.008371,0.000456
TCGA-2F-A9KR-01,0.00338,0.00338,0.003391,0.003391,0.00337,0.003339,0.003319,0.003289,0.00328,0.00327,...,0.006473,0.000346,0.000353,0.000368,0.006201,0.00338,0.00335,0.00081,0.000761,0.000456
TCGA-2F-A9KT-01,0.006454,0.006454,0.003391,0.003391,0.00337,0.003339,0.003319,0.003289,0.00328,0.00327,...,0.003391,0.003801,0.003881,0.004053,0.000564,0.00338,0.00335,0.00081,0.000761,0.000456


In [17]:
kbest_df_ig.isnull().sum().sum()

0

In [18]:
kbest_df_ig.info()

<class 'pandas.core.frame.DataFrame'>
Index: 408 entries, TCGA-2F-A9KO-01 to TCGA-ZF-AA5P-01
Columns: 1000 entries, POGZ to hsa-mir-220a
dtypes: float64(1000)
memory usage: 3.1+ MB


In [19]:
kbest_df_ig['TMB_CLASS'] = y.iloc[:,0].values

In [20]:
kbest_df_ig.head()

Unnamed: 0_level_0,POGZ,TUFT1,OR10J3,OR10J1,OR10J5,APCS,CRP,DUSP23,FCRL6,SLAMF8,...,hsa-mir-378b,hsa-mir-3134,hsa-mir-3135,hsa-mir-3139,hsa-mir-548v,hsa-mir-548h-4,hsa-mir-302e,hsa-mir-3159,hsa-mir-220a,TMB_CLASS
SAMPLE_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-2F-A9KO-01,0.006454,0.006454,0.006473,0.006473,0.006434,0.006375,0.006337,0.00628,0.006261,0.006243,...,0.007256,0.00741,0.007738,0.000564,0.00338,0.00335,0.00081,0.000761,0.005014,2
TCGA-2F-A9KP-01,0.00338,0.00338,0.006473,0.006473,0.006434,0.006375,0.006337,0.00628,0.006261,0.006243,...,0.010712,0.00741,0.007738,0.006201,0.00338,0.00335,0.008914,0.008371,0.005014,1
TCGA-2F-A9KQ-01,0.00338,0.00338,0.003391,0.003391,0.00337,0.003339,0.003319,0.003289,0.00328,0.00327,...,0.003801,0.003881,0.004053,0.006201,0.006454,0.006395,0.008914,0.008371,0.000456,1
TCGA-2F-A9KR-01,0.00338,0.00338,0.003391,0.003391,0.00337,0.003339,0.003319,0.003289,0.00328,0.00327,...,0.000346,0.000353,0.000368,0.006201,0.00338,0.00335,0.00081,0.000761,0.000456,2
TCGA-2F-A9KT-01,0.006454,0.006454,0.003391,0.003391,0.00337,0.003339,0.003319,0.003289,0.00328,0.00327,...,0.003801,0.003881,0.004053,0.000564,0.00338,0.00335,0.00081,0.000761,0.000456,2


In [28]:
kbest_df_ig.to_csv('./CNA-TMB-IG.csv',index=False)

## using RFE (Recursive Feature Elimination)

In [21]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

In [22]:

start = time.time()
rfe_selector = RFE(estimator=RandomForestClassifier(), n_features_to_select=1000, step=100, verbose=5)
rfe_selector.fit(x, y_2)
rfe_support = rfe_selector.get_support()
end = time.time()
print(round(end-start,2))

Fitting estimator with 25128 features.
Fitting estimator with 25028 features.
Fitting estimator with 24928 features.
Fitting estimator with 24828 features.
Fitting estimator with 24728 features.
Fitting estimator with 24628 features.
Fitting estimator with 24528 features.
Fitting estimator with 24428 features.
Fitting estimator with 24328 features.
Fitting estimator with 24228 features.
Fitting estimator with 24128 features.
Fitting estimator with 24028 features.
Fitting estimator with 23928 features.
Fitting estimator with 23828 features.
Fitting estimator with 23728 features.
Fitting estimator with 23628 features.
Fitting estimator with 23528 features.
Fitting estimator with 23428 features.
Fitting estimator with 23328 features.
Fitting estimator with 23228 features.
Fitting estimator with 23128 features.
Fitting estimator with 23028 features.
Fitting estimator with 22928 features.
Fitting estimator with 22828 features.
Fitting estimator with 22728 features.
Fitting estimator with 22

In [23]:
rfe_feature = x.loc[:,rfe_support].columns.tolist()
print(rfe_feature)
print(len(rfe_feature))

['PHF13', 'DNAJC11', 'TNFRSF9', 'SLC45A1', 'H6PD', 'RBP7', 'CORT', 'FBXO44', 'CLCN6', 'MIIP', 'HNRNPCL1', 'PRAMEF12', 'PRAMEF17', 'PRAMEF1', 'PRAMEF4', 'DDI2', 'CLCNKA', 'PADI6', 'RCC2', 'RCAN3AS', 'AK2', 'GJA4', 'CLSPN', 'AGO3', 'TEKT2', 'CSF3R', 'MEAF6', 'MTF1', 'SF3A3', 'UTP11L', 'KIAA0754', 'TOE1', 'KTI12', 'GLIS1', 'YIPF1', 'SSBP3-AS1', 'MIR186', 'NEGR1', 'SNORD45B', 'GIPC2', 'LPHN2', 'HIAT1', 'EPS8L3', 'NBPF7', 'ACP6', 'FAM72D', 'HIST2H2AB', 'HIST2H3D', 'LINC00875', 'LOC102723769', 'LINC02591', 'PLEKHO1', 'CA14', 'C1orf54', 'FAM63A', 'EFNA1', 'MIR7851|chr1', 'OR10X1', 'FCRL6', 'C1orf204', 'LINC01133', 'SLAMF9', 'ATP1A2', 'ATP1A4', 'CASQ1', 'DCAF8', 'PEX19', 'NCSTN', 'SLAMF6', 'LY9', 'ITLN2', 'F11R', 'ARHGAP30', 'PVRL4', 'DEDD', 'ADAMTS4', 'FCER1G', 'TOMM40L', 'PCP4L1', 'NOS1AP', 'DDR2', 'NUF2', 'LOC100422212', 'PBX1-AS1', 'LMX1A', 'LRRC52', 'MIR3658', 'FAM78B', 'MIR921', 'TIPRL', 'LINC00626', 'XPR1', 'MR1', 'LAMC2', 'NMNAT2', 'APOBEC4', 'MIR548F1', 'CDC73', 'MIR1278', 'TNNI1', 'L

In [24]:
kbest_df_rfe=x[rfe_feature].copy()
kbest_df_rfe['TMB_CLASS'] = y.iloc[:,0].values
kbest_df_rfe['SAMPLE_ID'] = df_cna['SAMPLE_ID'].copy()
kbest_df_rfe.set_index('SAMPLE_ID', inplace=True)
kbest_df_rfe.head()
#x[rfe_feature]

Unnamed: 0_level_0,PHF13,DNAJC11,TNFRSF9,SLC45A1,H6PD,RBP7,CORT,FBXO44,CLCN6,MIIP,...,hsa-mir-1826,hsa-mir-3181,hsa-mir-548h-3,hsa-mir-633,hsa-mir-548d-2,hsa-mir-1302-11,hsa-mir-1302-5,hsa-mir-3195,hsa-mir-941-1,TMB_CLASS
SAMPLE_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-2F-A9KO-01,0.000428,0.000432,0.000427,0.000425,0.000423,0.000423,0.000427,0.000421,0.000421,0.000421,...,0.002532,0.005068,0.003185,0.00351,0.003532,0.005435,0.006115,0.006595,0.006595,2
TCGA-2F-A9KP-01,0.013282,0.013397,0.013225,0.013169,0.013113,0.013113,0.013225,0.013058,0.013058,0.013058,...,0.004834,0.005068,0.008975,0.00351,0.003532,0.010611,0.006115,0.006595,0.006595,1
TCGA-2F-A9KQ-01,0.004713,0.004754,0.004693,0.004673,0.004653,0.004653,0.004693,0.004634,0.004634,0.004634,...,0.004834,0.005068,0.00608,0.00351,0.003532,0.005435,0.003203,0.003455,0.003455,1
TCGA-2F-A9KR-01,0.004713,0.004754,0.004693,0.004673,0.004653,0.004653,0.004693,0.004634,0.004634,0.004634,...,0.004834,0.005068,0.003185,0.009892,0.009955,0.005435,0.006115,0.006595,0.006595,2
TCGA-2F-A9KT-01,0.004713,0.004754,0.004693,0.004673,0.004653,0.004653,0.004693,0.004634,0.004634,0.004634,...,0.004834,0.005068,0.003185,0.00351,0.003532,0.005435,0.009027,0.003455,0.003455,2


In [25]:
kbest_df_ig.to_csv('./CNA-TMB-RFE.csv',index=False)

## comparing features

In [26]:
kbest_df_chi2.drop('TMB_CLASS', axis=1 ,inplace=True)
list_of_fea_chi2 = kbest_df_chi2.columns.to_list()
len(list_of_fea_chi2)

1000

In [27]:
kbest_df_ig.drop('TMB_CLASS', axis=1, inplace=True)
list_of_fea_ig = kbest_df_ig.columns.to_list()
len(list_of_fea_ig)

1000

In [28]:
list_of_fea_rfe = rfe_feature
print(len(list_of_fea_rfe))

1000


In [29]:
# compare Chi2 to IG
common_features_chi2_ig = set(list_of_fea_chi2).intersection(list_of_fea_ig)
print(list(common_features_chi2_ig))
print(len(list(common_features_chi2_ig)))

['TRIM21', 'DPH3', 'MIR1295B', 'TRPC2', 'UPF3B', 'ADCY10', 'OR51G1', 'PRRX1', 'TMEM9B-AS1', 'LDHA', 'C11orf16', 'DCAF6', 'CCDC174', 'LYVE1', 'DKK3', 'SNORA52', 'OR52J3', 'OR56A3', 'APCS', 'IGSF9', 'PP2D1', 'PLAC8L1', 'OR51M1', 'NEFM', 'PRRC2C', 'ARL8B', 'HSPA6', 'C1orf105', 'CDKN1C', 'ADAM28', 'RP11-370A5.1', 'USH1C', 'METTL11B', 'C3orf83', 'MIR4485', 'PIWIL2', 'KRTAP5-3', 'CHMP7', 'EPHX2', 'CT47B1', 'USP21', 'ANKRD50', 'OR52K1', 'SCOC', 'LINC00294', 'ITLN2', 'TNNT3', 'C1orf226', 'OLFML2B', 'POLR3D', 'FCRLB', 'SLC6A5', 'PARVA', 'TSTD1', 'POGK', 'LSM6', 'FBXO16', 'LDHC', 'C1GALT1C1', 'ILK', 'FAM99A', 'MTUS1', 'HDAC11', 'OR52I1', 'EFCAB4A', 'ST5', 'DENND5A', 'SBF2-AS1', 'SLC6A1-AS1', 'SEPT6', 'ARMCX1', 'ZNF195', 'LHFPL4', 'MIR4442', 'PCDHB15', 'HRH1', 'NHLH1', 'PCDHB1', 'PFDN2', 'LINC00626', 'WRN', 'IL15', 'LINC02749', 'MIR4687', 'EFCAB6', 'PPP2R2B', 'ARPC4-TTLL3', 'DDR2', 'PCDH18', 'TUSC3', 'CCKBR', 'SH2D1B', 'LINC00620', 'DLGAP2', 'CT47A5', 'OR56A1', 'GVINP1', 'FMO1', 'KCNJ10', 'LOC100

In [30]:
# compare Chi2 to RFE
common_features_chi2_rfe = set(list_of_fea_chi2).intersection(list_of_fea_rfe)
print(list(common_features_chi2_rfe))
print(len(list(common_features_chi2_rfe)))

['AFF3', 'LINC01267', 'RHOXF1', 'PVRL4', 'RAB5A', 'SCUBE1', 'NKAPP1', 'LSM3', 'LEPROTL1', 'FGD5P1', 'C3orf83', 'PPARG', 'SLAMF9', 'RPL39', 'TNFRSF10C', 'BEX5', 'ITLN2', 'RP11-115J16.1', 'LMX1A', 'CAND2', 'C1GALT1C1', 'TOMM40L', 'EMC3', 'SLC6A1-AS1', 'SEPT6', 'MYOM2', 'TNFRSF13C', 'FGD5-AS1', 'CT47A11', 'HRH1', 'LINC00626', 'SLC25A5', 'EFCAB6', 'DDR2', 'SNORA69', 'FGD5', 'CASQ1', 'CSMD1', 'hsa-mir-548h-4', 'LRRC52', 'LOC100422212', 'ATP1A4', 'hsa-mir-302e', 'UBE2A', 'TMEM43', 'MIR320A', 'ADAMTS4', 'CIDECP', 'SYN2', 'NDUFA1', 'FUT10', 'hsa-mir-548v', 'RNF113A', 'IQSEC1', 'C1orf204', 'DAZL', 'hsa-mir-3134', 'LINC02466', 'LINC01239', 'MIR3658', 'ATP1A2', 'SLC25A43', 'hsa-mir-3119-1', 'TCEAL6', 'FCER1G', 'ARHGAP30', 'MRPS25', 'MIR921', 'EMC3-AS1', 'SLAMF6', 'LINC01133', 'DEDD', 'CDH2', 'TPRXL', 'EAF1', 'PCP4L1', 'PBX1-AS1', 'PRR5-ARHGAP8', 'GATA6', 'NCSTN', 'PCDH19', 'CT47A3', 'PEX19', 'TIPRL', 'FAM78B', 'FCRL6', 'VGLL4', 'F11R', 'SLC6A1', 'LY9', 'MIR4270', 'METTL6', 'MIR766', 'DCAF8', 'MIR

In [31]:
# compare IG to RFE
common_features_ig_rfe = set(list_of_fea_ig).intersection(list_of_fea_rfe)
print(list(common_features_ig_rfe))
print(len(list(common_features_ig_rfe)))

['AFF3', 'LINC01267', 'RHOXF1', 'PVRL4', 'RAB5A', 'SCUBE1', 'NKAPP1', 'LSM3', 'LEPROTL1', 'FGD5P1', 'C3orf83', 'PPARG', 'SLAMF9', 'RPL39', 'TNFRSF10C', 'BEX5', 'ITLN2', 'RP11-115J16.1', 'LMX1A', 'CAND2', 'C1GALT1C1', 'TOMM40L', 'EMC3', 'SLC6A1-AS1', 'SEPT6', 'MYOM2', 'TNFRSF13C', 'FGD5-AS1', 'CT47A11', 'HRH1', 'LINC00626', 'SLC25A5', 'EFCAB6', 'DDR2', 'SNORA69', 'FGD5', 'CASQ1', 'CSMD1', 'hsa-mir-548h-4', 'LRRC52', 'LOC100422212', 'ATP1A4', 'hsa-mir-302e', 'UBE2A', 'TMEM43', 'MIR320A', 'ADAMTS4', 'CIDECP', 'SYN2', 'NDUFA1', 'FUT10', 'hsa-mir-548v', 'RNF113A', 'IQSEC1', 'C1orf204', 'DAZL', 'hsa-mir-3134', 'LINC02466', 'LINC01239', 'MIR3658', 'ATP1A2', 'SLC25A43', 'hsa-mir-3119-1', 'TCEAL6', 'FCER1G', 'ARHGAP30', 'MRPS25', 'MIR921', 'EMC3-AS1', 'SLAMF6', 'LINC01133', 'DEDD', 'CDH2', 'TPRXL', 'EAF1', 'PCP4L1', 'PBX1-AS1', 'PRR5-ARHGAP8', 'GATA6', 'NCSTN', 'PCDH19', 'CT47A3', 'PEX19', 'TIPRL', 'FAM78B', 'FCRL6', 'VGLL4', 'F11R', 'SLC6A1', 'LY9', 'MIR4270', 'METTL6', 'MIR766', 'DCAF8', 'MIR

In [32]:
# common between all three methods
common_features = set(common_features_chi2_rfe).intersection(common_features_ig_rfe)
print(list(common_features))
print(len(list(common_features)))

['AFF3', 'LINC01267', 'RHOXF1', 'PVRL4', 'RAB5A', 'SCUBE1', 'NKAPP1', 'LSM3', 'LEPROTL1', 'FGD5P1', 'C3orf83', 'PPARG', 'SLAMF9', 'RPL39', 'TNFRSF10C', 'BEX5', 'ITLN2', 'RP11-115J16.1', 'LMX1A', 'CAND2', 'C1GALT1C1', 'TOMM40L', 'EMC3', 'SLC6A1-AS1', 'SEPT6', 'MYOM2', 'TNFRSF13C', 'FGD5-AS1', 'CT47A11', 'HRH1', 'LINC00626', 'SLC25A5', 'EFCAB6', 'DDR2', 'SNORA69', 'FGD5', 'CASQ1', 'CSMD1', 'hsa-mir-548h-4', 'LRRC52', 'LOC100422212', 'ATP1A4', 'hsa-mir-302e', 'UBE2A', 'TMEM43', 'MIR320A', 'ADAMTS4', 'CIDECP', 'SYN2', 'NDUFA1', 'FUT10', 'hsa-mir-548v', 'RNF113A', 'IQSEC1', 'C1orf204', 'DAZL', 'hsa-mir-3134', 'LINC02466', 'LINC01239', 'MIR3658', 'ATP1A2', 'SLC25A43', 'hsa-mir-3119-1', 'TCEAL6', 'FCER1G', 'ARHGAP30', 'MRPS25', 'MIR921', 'EMC3-AS1', 'SLAMF6', 'LINC01133', 'DEDD', 'CDH2', 'TPRXL', 'EAF1', 'PCP4L1', 'PBX1-AS1', 'PRR5-ARHGAP8', 'GATA6', 'NCSTN', 'PCDH19', 'CT47A3', 'PEX19', 'TIPRL', 'FAM78B', 'FCRL6', 'VGLL4', 'F11R', 'SLC6A1', 'LY9', 'MIR4270', 'METTL6', 'MIR766', 'DCAF8', 'MIR

In [34]:
## Creating the CSV file from those features
df_final = x[list(common_features)].copy()
df_final['TMB_CLASS'] = y.iloc[:,0].values
df_final['SAMPLE_ID'] = df_cna['SAMPLE_ID']
df_final.set_index('SAMPLE_ID', inplace=True)
df_final.head()

Unnamed: 0_level_0,AFF3,LINC01267,RHOXF1,PVRL4,RAB5A,SCUBE1,NKAPP1,LSM3,LEPROTL1,FGD5P1,...,SNORA7A,TMEM40,hsa-mir-3139,PCM1,hsa-mir-3159,hsa-mir-378b,CDKN2B,NOS1AP,BHLHE40-AS1,TMB_CLASS
SAMPLE_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-2F-A9KO-01,0.004446,0.007307,0.00506,0.005827,0.007738,0.005827,0.005014,0.007358,0.003319,0.007384,...,0.007014,0.006876,0.000564,0.00338,0.000761,0.007256,0.000403,0.005976,0.007598,2
TCGA-2F-A9KP-01,0.004446,0.007307,0.00506,0.008602,0.007738,0.003052,0.005014,0.007358,0.003319,0.007384,...,0.007014,0.006876,0.006201,0.00338,0.008371,0.010712,0.004428,0.005976,0.011216,1
TCGA-2F-A9KQ-01,0.000404,0.003827,0.00046,0.003052,0.004053,0.005827,0.000456,0.003854,0.006337,0.003868,...,0.003674,0.003602,0.006201,0.006454,0.008371,0.003801,0.000403,0.00313,0.00398,1
TCGA-2F-A9KR-01,0.004446,0.000348,0.00046,0.003052,0.000368,0.008602,0.000456,0.00035,0.003319,0.000352,...,0.000334,0.000327,0.006201,0.00338,0.000761,0.000346,0.004428,0.00313,0.000362,2
TCGA-2F-A9KT-01,0.004446,0.010786,0.00046,0.008602,0.004053,0.005827,0.000456,0.010862,0.003319,0.003868,...,0.003674,0.003602,0.000564,0.00338,0.000761,0.003801,0.000403,0.00313,0.00398,2


In [35]:
df_final.to_csv('./CNA-TMB-Reduced.csv', index=False)

## using csv2arff to convert .csv to arff : https://pypi.org/project/csv2arff/

In [36]:
#!pip install numpy
#!pip install python-javabridge
#!pip install python-weka-wrapper3

### installing the java runtime on the colab

In [37]:
!sudo apt-get update
!sudo apt-get install -y default-jdk 
!git clone https://github.com/heshida01/mrmd.git

Hit:1 http://azure.archive.ubuntu.com/ubuntu focal InRelease
Get:2 http://azure.archive.ubuntu.com/ubuntu focal-updates InRelease [114 kB]
Get:3 http://azure.archive.ubuntu.com/ubuntu focal-backports InRelease [108 kB]
Get:4 http://azure.archive.ubuntu.com/ubuntu focal-security InRelease [114 kB]
Get:5 http://azure.archive.ubuntu.com/ubuntu focal-security/restricted amd64 Packages [1241 kB]
Fetched 1577 kB in 1s (2574 kB/s)  
Reading package lists... Done
Reading package lists... Done
Building dependency tree       
Reading state information... Done
default-jdk is already the newest version (2:1.11-72).
0 upgraded, 0 newly installed, 0 to remove and 11 not upgraded.
fatal: destination path 'mrmd' already exists and is not an empty directory.


In [38]:
# to get terminal on colab ##
#!pip install colab-xterm
#%load_ext colabxterm
#%xterm

In [39]:
# start JVM
#import weka.core.jvm as jvm
#jvm.start()
#jvm.stop()

In [40]:
!pip install csv2arff



In [59]:
df_temp = pd.read_csv('./CNA-TMB-Reduced.csv')
df_temp.head()

Unnamed: 0,AFF3,LINC01267,RHOXF1,PVRL4,RAB5A,SCUBE1,NKAPP1,LSM3,LEPROTL1,FGD5P1,...,SNORA7A,TMEM40,hsa-mir-3139,PCM1,hsa-mir-3159,hsa-mir-378b,CDKN2B,NOS1AP,BHLHE40-AS1,TMB_CLASS
0,0.004446,0.007307,0.00506,0.005827,0.007738,0.005827,0.005014,0.007358,0.003319,0.007384,...,0.007014,0.006876,0.000564,0.00338,0.000761,0.007256,0.000403,0.005976,0.007598,2
1,0.004446,0.007307,0.00506,0.008602,0.007738,0.003052,0.005014,0.007358,0.003319,0.007384,...,0.007014,0.006876,0.006201,0.00338,0.008371,0.010712,0.004428,0.005976,0.011216,1
2,0.000404,0.003827,0.00046,0.003052,0.004053,0.005827,0.000456,0.003854,0.006337,0.003868,...,0.003674,0.003602,0.006201,0.006454,0.008371,0.003801,0.000403,0.00313,0.00398,1
3,0.004446,0.000348,0.00046,0.003052,0.000368,0.008602,0.000456,0.00035,0.003319,0.000352,...,0.000334,0.000327,0.006201,0.00338,0.000761,0.000346,0.004428,0.00313,0.000362,2
4,0.004446,0.010786,0.00046,0.008602,0.004053,0.005827,0.000456,0.010862,0.003319,0.003868,...,0.003674,0.003602,0.000564,0.00338,0.000761,0.003801,0.000403,0.00313,0.00398,2


In [41]:
#!csv2arff CNA-TMB-Chi2.csv CNA-TMB-Chi2.arff
!csv2arff CNA-TMB-Reduced.csv CNA-TMB-Reduced.arff

In [42]:
# replacing numeric with list for class TMB so that mrmd can work on it
!grep TMB CNA-TMB-Reduced.arff
!sed -i 's/TMB_CLASS numeric/TMB_CLASS \{1,2\}/' CNA-TMB-Reduced.arff
!grep TMB CNA-TMB-Reduced.arff

@relation CNA-TMB-Reduced
@attribute TMB_CLASS numeric
@relation CNA-TMB-Reduced
@attribute TMB_CLASS {1,2}


In [43]:
!java -jar ./mrmd/mrmd.jar -i ./CNA-TMB-Reduced-weka.arff -o ./CNA-TMB-MRMD.csv -m svm -a svm_output.arff

Initialization...
Initialization over!!!
Start to calculate Pearson's correlation coefficient...
Calculating Pearson's correlation coefficient over!!!
Start to calculate Euclidean Distance...
Calculating Euclidean Distance over!!!
MRMD over.
Feature selction optimation begin
model:svm
***********feature num: 1 rate: 0.5098039215686274
***********feature num: 2 rate: 0.5098039215686274
***********feature num: 3 rate: 0.5098039215686274
***********feature num: 4 rate: 0.5098039215686274
***********feature num: 5 rate: 0.5098039215686274
***********feature num: 6 rate: 0.5098039215686274
***********feature num: 7 rate: 0.5098039215686274
***********feature num: 8 rate: 0.5098039215686274
***********feature num: 9 rate: 0.5098039215686274
***********feature num: 10 rate: 0.5098039215686274
***********feature num: 11 rate: 0.5098039215686274
***********feature num: 12 rate: 0.5098039215686274
***********feature num: 13 rate: 0.5098039215686274
***********feature num: 14 rate: 0.509803921568

In [46]:
!head ./CNA-TMB-MRMD.csv

The number of selected features is: 114

The index of selected features start from 0

NO.		FeaName		Score

1		Fea111		1.0
2		Fea107		0.9961886508299538
3		Fea97		0.9824374430018402
4		Fea38		0.9824148790151633


In [47]:
!head ./svm_output.arff

@relation CNA-TMB-Reduced-weka.arff_feaSele

@attribute Fea0 numeric
@attribute TMB_CLASS {1,2}

@data

0.006224,2
0.006224,1
0.00326,1


In [48]:
!java -jar ./mrmd/mrmd.jar -i ./CNA-TMB-Reduced.arff -o ./CNA-TMB-MRMD-bagging.csv -m bagging -a bagging_output.arff


Initialization...
Initialization over!!!
Start to calculate Pearson's correlation coefficient...
Calculating Pearson's correlation coefficient over!!!
Start to calculate Euclidean Distance...
Calculating Euclidean Distance over!!!
MRMD over.
Feature selction optimation begin
model:bagging
feature num: 1 rate: 0.6262254901960784
feature num: 2 rate: 0.6262254901960784
feature num: 3 rate: 0.6262254901960784
feature num: 4 rate: 0.6237745098039216
feature num: 5 rate: 0.625
feature num: 6 rate: 0.6188725490196079
feature num: 7 rate: 0.6188725490196079
feature num: 8 rate: 0.6188725490196079
feature num: 9 rate: 0.6188725490196079
feature num: 10 rate: 0.6188725490196079
feature num: 11 rate: 0.6188725490196079
feature num: 12 rate: 0.6176470588235294
feature num: 13 rate: 0.6176470588235294
feature num: 14 rate: 0.6176470588235294
feature num: 15 rate: 0.6188725490196079
feature num: 16 rate: 0.6139705882352942
feature num: 17 rate: 0.6139705882352942
feature num: 18 rate: 0.62745098039

In [49]:
!head ./CNA-TMB-MRMD-bagging.csv

The number of selected features is: 106

The index of selected features start from 0

NO.		FeaName		Score

1		Fea79		1.0
2		Fea82		0.996025036198251
3		Fea70		0.9860644983757552
4		Fea16		0.9850337431017239


In [56]:
!head -106 ./bagging_output.arff

@relation CNA-TMB-Reduced.arff_feaSele

@attribute Fea0 numeric
@attribute Fea1 numeric
@attribute Fea2 numeric
@attribute Fea3 numeric
@attribute Fea4 numeric
@attribute Fea5 numeric
@attribute Fea6 numeric
@attribute Fea7 numeric
@attribute Fea8 numeric
@attribute Fea9 numeric
@attribute Fea10 numeric
@attribute Fea11 numeric
@attribute Fea12 numeric
@attribute Fea13 numeric
@attribute Fea14 numeric
@attribute Fea15 numeric
@attribute Fea16 numeric
@attribute Fea17 numeric
@attribute Fea18 numeric
@attribute Fea19 numeric
@attribute Fea20 numeric
@attribute Fea21 numeric
@attribute Fea22 numeric
@attribute Fea23 numeric
@attribute Fea24 numeric
@attribute Fea25 numeric
@attribute Fea26 numeric
@attribute Fea27 numeric
@attribute Fea28 numeric
@attribute Fea29 numeric
@attribute Fea30 numeric
@attribute Fea31 numeric
@attribute Fea32 numeric
@attribute Fea33 numeric
@attribute Fea34 numeric
@attribute Fea35 numeric
@attribute Fea36 numeric
@attribute Fea37 numeric
@attribute Fea38 num

In [57]:
!java -jar ./mrmd/mrmd.jar -i ./CNA-TMB-Reduced.arff -o ./CNA-TMB-MRMD-N.csv -m N  -a N_output.arff


Initialization...
Initialization over!!!
Start to calculate Pearson's correlation coefficient...
Calculating Pearson's correlation coefficient over!!!
Start to calculate Euclidean Distance...
Calculating Euclidean Distance over!!!
MRMD over.
Feature selction optimation begin
model:N
feature num: 1 rate: 0.6274509803921569
feature num: 2 rate: 0.6274509803921569
feature num: 3 rate: 0.6274509803921569
feature num: 4 rate: 0.6311274509803921
feature num: 5 rate: 0.6360294117647058
feature num: 6 rate: 0.6348039215686274
feature num: 7 rate: 0.6360294117647058
feature num: 8 rate: 0.633578431372549
feature num: 9 rate: 0.6348039215686274
feature num: 10 rate: 0.6372549019607843
feature num: 11 rate: 0.633578431372549
feature num: 12 rate: 0.6348039215686274
feature num: 13 rate: 0.633578431372549
feature num: 14 rate: 0.6348039215686274
feature num: 15 rate: 0.633578431372549
feature num: 16 rate: 0.642156862745098
feature num: 17 rate: 0.6409313725490196
feature num: 18 rate: 0.643382352

In [58]:
!head ./CNA-TMB-MRMD-N.csv

The number of selected features is: 106

The index of selected features start from 0

NO.		FeaName		Score

1		Fea79		1.0
2		Fea82		0.996025036198251
3		Fea70		0.9860644983757552
4		Fea16		0.9850337431017239


In [61]:
!head -106 ./N_output.arff

@relation CNA-TMB-Reduced.arff_feaSele

@attribute Fea0 numeric
@attribute Fea1 numeric
@attribute Fea2 numeric
@attribute Fea3 numeric
@attribute Fea4 numeric
@attribute Fea5 numeric
@attribute Fea6 numeric
@attribute Fea7 numeric
@attribute Fea8 numeric
@attribute Fea9 numeric
@attribute Fea10 numeric
@attribute Fea11 numeric
@attribute Fea12 numeric
@attribute Fea13 numeric
@attribute Fea14 numeric
@attribute Fea15 numeric
@attribute Fea16 numeric
@attribute Fea17 numeric
@attribute Fea18 numeric
@attribute Fea19 numeric
@attribute Fea20 numeric
@attribute Fea21 numeric
@attribute Fea22 numeric
@attribute Fea23 numeric
@attribute Fea24 numeric
@attribute Fea25 numeric
@attribute Fea26 numeric
@attribute Fea27 numeric
@attribute Fea28 numeric
@attribute Fea29 numeric
@attribute Fea30 numeric
@attribute Fea31 numeric
@attribute Fea32 numeric
@attribute Fea33 numeric
@attribute Fea34 numeric
@attribute Fea35 numeric
@attribute Fea36 numeric
@attribute Fea37 numeric
@attribute Fea38 num