In [1]:
# # Step 1: Install the package
!pip install git+https://github.com/smazzanti/mrmr


Collecting git+https://github.com/smazzanti/mrmr
  Cloning https://github.com/smazzanti/mrmr to /tmp/pip-req-build-kxvvc3fp
  Running command git clone -q https://github.com/smazzanti/mrmr /tmp/pip-req-build-kxvvc3fp
Collecting category_encoders
  Downloading category_encoders-2.3.0-py2.py3-none-any.whl (82 kB)
[K     |████████████████████████████████| 82 kB 223 kB/s 
Building wheels for collected packages: mrmr
  Building wheel for mrmr (setup.py) ... [?25l[?25hdone
  Created wheel for mrmr: filename=mrmr-0.1-py3-none-any.whl size=16139 sha256=2596471e178ea47dabac6466070ce5a872078917b5c8a3916a14ad8e4582ac9b
  Stored in directory: /tmp/pip-ephem-wheel-cache-tll3lfrk/wheels/97/75/f0/1af73f24fe1c223090326fd094140b43c61420a721e0ace303
Successfully built mrmr
Installing collected packages: category-encoders, mrmr
Successfully installed category-encoders-2.3.0 mrmr-0.1


In [4]:
# Step 2: Load packages

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from mrmr import mrmr_classif
from mrmr import mrmr_regression

# Plot styling: white background with grid lines for every seaborn figure
sns.set_style(style="whitegrid")

# Silence every warning category for the rest of the session
import warnings
warnings.simplefilter("ignore")


# Mount Google Drive so the input CSVs are reachable under /content/drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Step 3: Load data
# Both inputs live in the same folder on the mounted Drive: the clinical
# patient table (file1) and the mRNA z-score table (file2).
clinical_csv = '/content/drive/MyDrive/Data_Tutorial_BC/data_clinical_patient.csv'
expression_csv = '/content/drive/MyDrive/Data_Tutorial_BC/data_mRNA_median_all_sample_Zscores.csv'

file1 = pd.read_csv(clinical_csv)
file2 = pd.read_csv(expression_csv)

In [6]:
# Summarise the expression table: index range, column count, dtypes and memory usage
file2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24368 entries, 0 to 24367
Columns: 1906 entries, Hugo_Symbol to MB-4313
dtypes: float64(1905), object(1)
memory usage: 354.4+ MB


In [7]:
# Drop unused column (genes are identified by Hugo symbol from here on)
file2 = file2.drop('Entrez_Gene_Id', axis=1)

# Drop rows whose gene symbol is missing
file2 = file2[file2['Hugo_Symbol'].notna()]

# Verify that no missing gene symbols remain. A bare expression in the middle
# of a cell is never displayed, so assert the invariant instead.
assert file2['Hugo_Symbol'].notna().all(), 'Hugo_Symbol still contains missing values'

# Check duplicated values
print('The number of duplicated values of Hugo_Symbol in data:', file2['Hugo_Symbol'].duplicated().sum())

# Drop duplicated gene symbols, keeping the first occurrence of each
file2 = file2.drop_duplicates(subset=['Hugo_Symbol'])
print('After pre-processing, the number of duplicated values of Hugo_Symbol in data:',
     file2['Hugo_Symbol'].duplicated().sum())
print('Shape of Gene data:', file2.shape)

# Transpose so that each patient becomes a row (one column per gene); the former
# column labels end up in a regular 'PATIENT_ID' column so the table can be
# merged with the clinical data on that key.
file2 = (
    file2.set_index('Hugo_Symbol')
    .T
    .rename_axis('PATIENT_ID')
    .rename_axis(None, axis=1)
    .reset_index()
)
print('New shape of Gene data:', file2.shape)
file2.head(3)

0

In [11]:
# 3.1 Attach the survival columns to the gene table.
# Inner join on PATIENT_ID: only patients present in both tables are kept.
survival_cols = ['PATIENT_ID', 'OS_MONTHS', 'OS_STATUS']
data = file1[survival_cols].merge(file2, how="inner", on="PATIENT_ID")

In [12]:
# Preview the first rows of the merged frame (survival columns followed by gene columns)
data.head()

Unnamed: 0,PATIENT_ID,OS_MONTHS,OS_STATUS,473.0,494470.0,51533.0,1149.0,167153.0,10786.0,10993.0,245973.0,2152.0,196472.0,91750.0,542767.0,2911.0,2395.0,6548.0,5371.0,8763.0,126308.0,26236.0,26740.0,2787.0,8148.0,2137.0,10695.0,55009.0,246176.0,8209.0,5023.0,122769.0,482.0,134510.0,54507.0,282617.0,1674.0,440248.0,51334.0,28996.0,...,391196.0,387328.0,7045.0,90459.0,23325.0,28989.0,796.0,644809.0,9917.0,25800.0,7076.0,79077.0,60468.0,9982.0,22876.0,257101.0,4236.0,64418.0,10273.0,1991.0,387914.0,586.0,3071.0,257062.0,728095.0,29893.0,7003.0,9380.0,112398.0,9882.0,23452.0,55068.0,283104.0,317703.0,56886.0,6944.0,114788.0,54862.0,57549.0,149647.0
0,MB-0000,140.5,0:LIVING,1.3762,0.1172,-0.9217,3.8334,0.2327,0.3946,-2.2565,-0.2895,2.2327,0.5461,1.8587,-0.7744,-0.0783,-1.8107,-1.0442,0.0643,0.0505,-1.2169,0.6829,-0.2945,-0.3881,0.1801,-0.2353,-0.0924,-1.1665,-0.1022,1.1817,0.0094,-0.9257,0.0352,-0.7471,1.5412,1.2558,-1.0267,1.4681,0.7245,0.8091,...,1.5481,-0.3573,-0.3966,-0.395,-0.0323,-0.1059,-0.8943,0.6312,-0.4736,-0.5023,-2.5482,-1.3826,1.221,4.526,0.9605,-1.0011,-0.0716,0.1751,-0.7045,1.5764,-0.9827,-0.2256,0.0365,0.4823,-1.2252,-0.6151,0.5585,0.2225,-1.1379,0.9536,1.5148,1.6283,-1.2388,0.317,-1.1909,-0.4725,-0.1735,-0.3961,-1.6152,-0.7582
1,MB-0002,84.633333,0:LIVING,-0.0226,-0.929,-1.058,-0.1394,0.2341,1.3907,-1.4654,-0.2331,0.3193,1.5094,2.2304,-0.4856,1.4402,0.5351,-0.1628,-1.1188,0.542,0.78,0.7732,-0.5552,0.541,-1.9566,-0.0339,-0.6713,-0.009,0.4005,-1.0498,-0.0805,-0.3244,-1.5794,0.3662,-0.2706,0.4244,0.059,-0.3235,-0.1333,0.965,...,0.1116,-1.017,-0.3536,0.6907,0.8065,1.2343,-2.8612,0.3665,-0.2789,0.7577,-0.2237,0.915,-0.3749,-0.2859,-1.1426,-1.0643,-0.8158,0.7742,0.1348,0.4266,1.0541,-0.5618,-1.8892,-3.3594,1.774,0.82,2.3542,0.5591,-1.216,-1.0666,0.7127,-0.1851,-0.0826,-0.9186,0.0247,-0.5276,-0.4878,-0.0706,-0.0789,1.277
2,MB-0005,163.7,1:DECEASED,-2.2425,-1.2323,-0.4655,0.1429,1.3516,-0.1929,1.4458,-0.1441,2.0353,-1.3994,0.9565,0.1754,0.6339,-1.8873,-1.6586,-2.2144,4.0836,-1.3095,-0.4369,0.5033,-0.5381,-2.304,-0.8414,-0.0953,-1.3873,0.4561,-0.0876,-0.7237,2.028,-2.0347,3.3014,0.12,0.1631,0.1457,0.2634,-0.6139,-0.0198,...,-1.0464,1.2526,0.4309,3.1059,4.0191,1.03,-0.9386,0.8768,-0.8789,0.5521,-1.7638,0.8085,-0.2437,0.0877,-0.2497,-1.4789,0.0441,1.2259,-1.0296,-2.1508,0.8867,1.5554,-0.6723,0.1683,1.021,-0.1422,1.2947,-0.9314,-1.6151,0.7821,0.7938,-0.9642,-0.6574,-0.4322,-2.071,-0.3882,-0.1544,-0.1448,0.5685,1.4474
3,MB-0006,164.933333,0:LIVING,-1.7706,-1.4902,-0.4199,-1.0699,0.9961,-0.3918,-0.5995,2.0267,0.958,-1.0979,-0.4605,-0.9071,-0.6368,-1.0581,-2.3798,-1.7419,4.9051,-1.2521,-0.3924,0.9714,0.0681,-0.6412,-1.7677,-0.1735,-0.9527,0.2493,0.2737,-0.4892,1.0415,-0.4489,3.7216,-1.205,0.4123,1.0637,-1.6645,-0.2436,-1.4306,...,1.3707,1.7348,0.2411,4.0219,6.6217,1.7811,-0.3721,-1.5982,-2.3866,1.1588,-2.0756,-0.597,-0.112,-0.0524,-0.2434,0.1449,0.2237,-1.4179,-1.7483,-1.7097,1.3734,2.0644,-1.0798,-0.5155,-0.3004,-1.3129,0.391,0.6674,-1.3833,0.4492,0.4633,0.4361,-1.2354,1.8686,-1.309,-1.7143,-0.4777,0.5786,-1.2428,0.2156
4,MB-0008,41.366667,1:DECEASED,-2.0498,-1.2677,-0.0172,2.0533,-0.4041,0.3816,1.3112,0.5532,-0.3265,-1.0231,1.4937,2.1953,-0.3081,-0.0856,-0.173,-1.0259,-0.4131,-1.4403,-0.1939,-0.1588,-0.4344,-1.2376,-0.8028,0.5687,-1.4866,0.2204,1.4259,0.1507,0.4184,-1.261,-0.4316,-0.1861,0.7851,-0.2258,0.142,0.3015,0.133,...,-0.2759,-0.4812,-1.3467,1.1579,-0.2941,0.429,-0.1134,0.1754,0.2273,-0.0962,-1.8536,1.8303,-1.4793,-0.4155,-0.1982,-0.8052,0.4814,0.2395,0.0155,-0.4907,-1.1572,-0.2191,-0.6753,0.2345,-1.6906,0.7067,2.1183,-0.1455,0.5056,-1.9579,-0.1229,-0.1299,-2.0595,0.322,-1.3606,-0.2123,-0.3088,-0.0857,-1.0884,-1.2238


In [13]:
# Step 4: Preprocess data & Explore data

# 4.1 Check duplicated values
print('The number of duplicated values in data', data.duplicated().sum())
print('The shape of data is', data.shape)

# Step 4.2: Deal with missing values

# Check missing values
print('Total missing value in the dataset:', data.isnull().sum().sum())
cols_missvalue = data.columns[data.isnull().sum()>0]
print('List columns having missing data:', cols_missvalue)

# Replace missing values with the column mean.
# A mean only exists for numeric columns: taking `.mean()` over a selection that
# includes a text column (e.g. OS_STATUS) raises TypeError on pandas >= 2.0, so
# restrict the imputation to the numeric columns that have gaps.
# NOTE(review): if OS_MONTHS has gaps it is mean-imputed here as well — confirm
# that imputing the survival time is intended.
numeric_cols_missvalue = data[cols_missvalue].select_dtypes(include='number').columns
if len(numeric_cols_missvalue) > 0:
    data[numeric_cols_missvalue] = data[numeric_cols_missvalue].fillna(
        data[numeric_cols_missvalue].mean()
    )

# Check missing values again
print('After preprocessing, the number of missing values:', data.isna().sum().sum())


The number of duplicated values in data 0
The shape of data is (1904, 18224)


In [16]:
# Step 5: Feature extraction

# 5.1: Normalise data
# Scale every gene column to [0, 1]; the identifier and survival columns are
# excluded so that only expression values are candidate features.
ss = MinMaxScaler()
gene_features = data.drop(['OS_STATUS', 'OS_MONTHS','PATIENT_ID'], axis = 1)
# Keep the original row index so X_norm stays aligned with the target below.
X_norm = pd.DataFrame(
    ss.fit_transform(gene_features),
    columns=gene_features.columns,
    index=gene_features.index,
)


# 5.2: Select feature using MRMR
# OS_MONTHS is a continuous quantity, so the regression variant of mRMR is the
# appropriate scorer; the classification variant would treat every distinct
# survival time as its own class label.
# NOTE(review): if the intent was to select genes against the categorical
# OS_STATUS instead, switch the target and use mrmr_classif.
y_mrmr = data['OS_MONTHS']

features_selected = mrmr_regression(X_norm, y_mrmr, K = 50)
# Take the selected genes from the un-scaled frame; copy so that later column
# additions do not act on a slice of `data`.
X_mrmr = data[features_selected].copy()

In [21]:
# 5.3 Save to csv file
# Build a new frame with PATIENT_ID appended as the last column. `.assign`
# returns a copy, so X_mrmr itself is left untouched (plain `df_mrmr = X_mrmr`
# would only alias it, and the column would be written into X_mrmr too).
df_mrmr = X_mrmr.assign(PATIENT_ID=data['PATIENT_ID'])
df_mrmr.to_csv('/content/drive/MyDrive/Data_Tutorial_BC/Gene_MRMR_50.csv', index=False)

In [22]:
# Display the (rows, columns) of the frame that was written to disk
df_mrmr.shape

(1904, 51)