In [12]:
import pandas as pd

from sklearn.model_selection import KFold
from numpy.random import RandomState
from numpy import nonzero
from sklearn.feature_selection import mutual_info_regression
from sklearn.linear_model import LassoCV
from sklearn.feature_selection import mutual_info_classif
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel

In [23]:
from tools import make_binary

<h2>Dataset preprocessing</h2>
### Regression data ###

Dataset: DNA methylation beta values from blood samples, together with each sample's age.
Features: beta values at 487756 CpG sites.

Keep only the CpG sites that have fewer than 5 missing values.

In [None]:
# Load the GSE40279 blood methylation table. Later cells treat every row but
# the last as one feature and the last row as the regression target, with the
# first column skipped as a non-feature column.
dataset = pd.read_table('~/tcga_files/revised/multi_tissue/GSE40279_blood.tsv', sep='\t')

# Keep only the rows with fewer than 5 missing values. The per-row count is
# used directly as a mask, so no temporary 'count' column has to be added to
# the frame and dropped again.
dataset = dataset[dataset.isnull().sum(axis=1) < 5]

# Fill the remaining gaps with the mean of the same row. fillna() aligns a
# Series with column labels, hence the transpose / fill / transpose-back.
# numeric_only=True leaves non-numeric columns (presumably the identifier
# column) out of the mean; pandas >= 2.0 raises TypeError instead of dropping
# them silently.
dataset = dataset.T.fillna(dataset.mean(axis=1, numeric_only=True)).T

# Sanity check: shows False when no missing value is left.
dataset.isnull().values.any()

In [None]:
# Persist the cleaned regression table as TSV, without the row index.
dataset.to_csv(
    'data/blood_age_sample.tsv',
    sep='\t',
    index=False,
    na_rep=float('nan'),
)

In [None]:
# Load the GSE41169 blood methylation table used as the regression test set.
testset = pd.read_table('~/tcga_files/revised/multi_tissue/GSE41169_blood.tsv', sep='\t')

# Turn literal 'null' placeholders into real NaN before imputing.
testset = testset.replace('null', float('nan'))

# Fill gaps with the mean of the same row. fillna() aligns a Series with
# column labels, hence the transpose / fill / transpose-back.
# numeric_only=True leaves non-numeric columns out of the mean; pandas >= 2.0
# raises TypeError instead of dropping them silently.
testset = testset.T.fillna(testset.mean(axis=1, numeric_only=True)).T

# Rename the identifier column; rename() ignores a key that is not present.
testset = testset.rename(columns={'CpG_site': 'Composite Element REF'})

# Sanity check, kept as the last expression so the notebook displays it.
testset.isnull().values.any()

In [None]:
# Persist the cleaned regression test table. index=False matches every other
# export in this notebook; without it the file gains an unnamed leading index
# column that the training file does not have.
testset.to_csv('data/blood_age_test.tsv', sep='\t', index=False)

Feature Selection using Mutual Information

In [None]:
# Reload the cleaned regression table from the path the preprocessing cell
# wrote it to.
dataset = pd.read_table('data/blood_age_sample.tsv', sep='\t')

In [None]:
# Skip the first column, then flip the table so that each original column
# becomes one row.
input_data = dataset.iloc[:, 1:].T

In [None]:
# The last column is the target; everything before it is a feature.
y_data = input_data.iloc[:, -1]
X_data = input_data.iloc[:, :-1]

In [None]:
# Mutual-information score of every feature against the target.
# NOTE(review): RandomState(None) is unseeded, so the scores are not
# reproducible between runs.
selected = mutual_info_regression(
    X_data,
    y_data,
    n_neighbors=10,
    random_state=RandomState(None),
)

In [None]:
# Every row except the last one; `selected` holds one score per such row.
tmp = dataset.iloc[:-1]

# Keep the rows whose mutual-information score is at least 0.2.
mutual_info_dataset = tmp.loc[selected >= 0.2]

In [None]:
# Re-attach the final row of `dataset` (the row used as the target) below the
# selected feature rows. DataFrame.append was removed in pandas 2.0;
# pd.concat with a one-row frame (note the double brackets) builds the same
# table on old and new pandas alike.
mutual_info_dataset = pd.concat([mutual_info_dataset, dataset.iloc[[-1]]])

# rename() ignores a key that is not present.
# NOTE(review): the Lasso section renames 'CpG_site' on a frame derived from
# the same `dataset`, so one of the two keys is a silent no-op. Confirm the
# actual column name.
mutual_info_dataset = mutual_info_dataset.rename(columns={'CpG': 'Composite Element REF'})

In [None]:
# Persist the mutual-information selection as TSV, without the row index.
mutual_info_dataset.to_csv(
    'blood_age_selected_mutual_info.tsv',
    sep='\t',
    index=False,
    na_rep=float('nan'),
)

Feature Selection using L1 norm

In [None]:
# 10-fold shuffled cross-validation used by LassoCV to pick its alpha.
kf = KFold(n_splits=10, shuffle=True)

# L1-regularised regression; features left with a zero coefficient are
# treated as unselected by the following cells.
# NOTE(review): neither the fold shuffle nor the random coordinate selection
# is seeded, so the selected set can vary between runs.
lasso = LassoCV(
    eps=0.001,
    n_alphas=100,
    fit_intercept=True,
    max_iter=1000,
    tol=0.0001,
    cv=kf,
    n_jobs=10,
    positive=False,
    random_state=RandomState(None),
    selection='random',
)
lasso.fit(X_data, y_data)

In [None]:
# Every row except the last one; lasso.coef_ holds one coefficient per such row.
tmp = dataset.iloc[:-1]

# Keep the rows whose Lasso coefficient is non-zero.
a = pd.DataFrame(data=tmp.loc[lasso.coef_ != 0], columns=tmp.columns)

In [None]:
# Re-attach the final row of `dataset` (the row used as the target) below the
# Lasso-selected rows. DataFrame.append was removed in pandas 2.0; pd.concat
# with a one-row frame (note the double brackets) builds the same table on
# old and new pandas alike.
a = pd.concat([a, dataset.iloc[[-1]]])

# rename() ignores a key that is not present.
a = a.rename(columns={'CpG_site': 'Composite Element REF'})

In [None]:
# Display the Lasso-selected rows together with the re-attached final row.
a

In [None]:
# Persist the Lasso selection. The frame to write is `a` (selected rows plus
# the re-attached final row); `tmp` is the unfiltered table without that row,
# so exporting it would discard the feature selection entirely.
a.to_csv('blood_age_selected_lasso.tsv', sep='\t', index=False, na_rep=float('nan'))

In [None]:
# Positions of the features with a non-zero Lasso coefficient. The name was
# displayed here without being assigned in any visible cell, which raises
# NameError on a fresh kernel, so it is derived from the fitted model first.
non_zero_features = nonzero(lasso.coef_)[0]
non_zero_features

### Classification data ###

Keep only the CpG sites that have fewer than 5 missing values.

In [13]:
# Load the breast-cancer methylation table. Later cells treat every row but
# the last as one feature and the last row as the class label, with the first
# column skipped as a non-feature column.
data = pd.read_table('data/breast_cancer.tsv', sep='\t')

# Keep only the rows with fewer than 5 missing values. The per-row count is
# used directly as a mask, so no temporary 'count' column has to be added to
# the frame and dropped again.
data = data[data.isnull().sum(axis=1) < 5]

# Fill the remaining gaps with the mean of the same row. fillna() aligns a
# Series with column labels, hence the transpose / fill / transpose-back.
# numeric_only=True leaves non-numeric columns out of the mean; pandas >= 2.0
# raises TypeError instead of dropping them silently.
data = data.T.fillna(data.mean(axis=1, numeric_only=True)).T

# Sanity check: shows False when no missing value is left.
data.isnull().values.any()

  interactivity=interactivity, compiler=compiler, result=result)


False

In [None]:
# Persist the cleaned classification table as TSV, without the row index.
# NOTE(review): this writes the table loaded from 'data/breast_cancer.tsv' to
# the *_test path that the "Test set preprocessing" section later reads and
# overwrites, while the next section reloads 'data/breast_cancer.tsv'.
# Confirm the intended destination.
data.to_csv(
    'data/breast_cancer_test.tsv',
    sep='\t',
    index=False,
    na_rep=float('nan'),
)

Feature selection using mutual information

In [16]:
# Load the breast-cancer table that the feature-selection cells work on.
data = pd.read_table('data/breast_cancer.tsv', sep='\t')

In [31]:
# Skip the first column, then flip the table so that each original column
# becomes one row.
input_data = data.iloc[:, 1:].T

# Everything but the last column is a feature.
X_data = input_data.iloc[:, :-1]

# The last column holds the labels; make_binary comes from the local `tools`
# module (presumably it encodes 'normal' / 'cancer' as two classes; confirm).
y_data = make_binary('normal', 'cancer', input_data.iloc[:, -1])

In [32]:
# Mutual-information score of every feature against the class label.
# NOTE(review): RandomState(None) is unseeded, so the scores are not
# reproducible between runs.
selected = mutual_info_classif(
    X_data,
    y_data,
    n_neighbors=10,
    random_state=RandomState(None),
)

In [35]:
# Smallest mutual-information score across all features.
selected.min()

0.0

In [37]:
# Every row except the last one; `selected` holds one score per such row.
tmp = data.iloc[:-1]

# Preview the rows whose mutual-information score is at least 0.2.
tmp.loc[selected >= 0.2]

Unnamed: 0,ID_REF,GSM927093,GSM927094,GSM927095,GSM927096,GSM927097,GSM927098,GSM927099,GSM927100,GSM927101,...,GSM927155,GSM927156,GSM927157,GSM927158,GSM927159,GSM927160,GSM927161,GSM927162,GSM927163,GSM927164
1390,cg00058644,0.784065,0.780692,0.820477,0.798314,0.742386,0.794997,0.369205,0.742233,0.735129,...,0.880789,0.756852,0.59758,0.749848,0.750519,0.521512,0.771988,0.796992,0.804805,0.805947
1954,cg00084770,0.358018,0.408707,0.534225,0.350206,0.41307,0.781405,0.841123,0.55541,0.398989,...,0.479129,0.408709,0.595623,0.194516,0.35233,0.467913,0.458871,0.480786,0.601212,0.225512
3405,cg00152034,0.595345,0.5969,0.663836,0.626598,0.563423,0.885201,0.809189,0.727219,0.659332,...,0.652498,0.627451,0.672585,0.663323,0.605792,0.749786,0.668712,0.597943,0.708407,0.663037
4664,cg00216961,0.265486,0.30734,0.162164,0.3366,0.260682,0.150589,0.138591,0.224376,0.282884,...,0.170579,0.295687,0.288328,0.170333,0.280928,0.172252,0.282196,0.240308,0.210213,0.288561
5487,cg00256068,0.836738,0.849702,0.813606,0.891321,0.820377,0.822419,0.884159,0.691796,0.822121,...,0.749165,0.859808,0.845214,0.777662,0.870057,0.631658,0.848054,0.703863,0.877545,0.8212
6798,cg00322636,0.81142,0.799727,0.771265,0.785284,0.839917,0.411857,0.850584,0.80007,0.717398,...,0.830522,0.806099,0.726624,0.808131,0.760595,0.588012,0.754222,0.707546,0.832296,0.773557
9202,cg00435490,0.878521,0.872467,0.875944,0.891514,0.885619,0.915073,0.87789,0.898291,0.883722,...,0.868556,0.882261,0.898531,0.775767,0.886957,0.880454,0.867567,0.838888,0.881833,0.883505
9221,cg00436420,0.845707,0.866452,0.813905,0.894081,0.843406,0.464279,0.776172,0.673457,0.878857,...,0.49197,0.896085,0.890492,0.51712,0.876305,0.844143,0.861402,0.639013,0.566322,0.89534
9826,cg00464046,0.397727,0.390325,0.419323,0.366151,0.349977,0.395994,0.175783,0.237373,0.348795,...,0.304103,0.383709,0.334617,0.310494,0.417019,0.306025,0.339403,0.297772,0.294349,0.344501
10578,cg00498772,0.546192,0.535499,0.76568,0.501528,0.695205,0.77365,0.803837,0.710084,0.595995,...,0.766536,0.52706,0.610836,0.639866,0.555535,0.674619,0.603233,0.678779,0.675685,0.531291


In [None]:
# Keep the rows whose mutual-information score is at least 0.075 (a looser
# cut-off than the 0.2 used for the preview in the previous cell).
mutual_info_dataset = tmp[selected >= 0.075]

# Re-attach the final row of `data` (the row used as the class label).
# DataFrame.append was removed in pandas 2.0; pd.concat with a one-row frame
# (note the double brackets) builds the same table on old and new pandas alike.
mutual_info_dataset = pd.concat([mutual_info_dataset, data.iloc[[-1]]])
mutual_info_dataset

In [21]:
# Persist the mutual-information selection as TSV, without the row index.
mutual_info_dataset.to_csv(
    'data/breast_cancer_mutual_info.tsv',
    sep='\t',
    index=False,
    na_rep=float('nan'),
)

Feature selection using LinearSVC

In [13]:
# L1-penalised linear SVM; the L1 penalty drives many coefficients to zero,
# which the next cells use for feature selection.
# NOTE(review): RandomState(None) is unseeded.
svc = LinearSVC(
    penalty='l1',
    C=1.0,
    random_state=RandomState(None),
    max_iter=10000,
    loss='squared_hinge',
    dual=False,
)
svc.fit(X_data, y_data)

array([[ 0.        ,  0.        ,  0.        , ...,  0.11261584,
         0.08787427,  0.        ]])

In [42]:
# Wrap the SVM in a selector and fit it on the same data.
feature_select = SelectFromModel(svc)
feature_select.fit(X_data, y_data)

SelectFromModel(estimator=LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=10000,
     multi_class='ovr', penalty='l1',
     random_state=<mtrand.RandomState object at 0x7f2287ffc910>,
     tol=0.0001, verbose=0),
        prefit=False, threshold=None)

In [62]:
# Integer positions of the features the selector kept.
index = feature_select.get_support(indices=True)

# Pick the matching rows of `data` by position.
new_dataset = data.iloc[index]

In [65]:
# Re-attach the final row of `data` (the row used as the class label) below
# the selected rows. DataFrame.append was removed in pandas 2.0; pd.concat
# with a one-row frame (note the double brackets) builds the same table on
# old and new pandas alike.
new_dataset = pd.concat([new_dataset, data.iloc[[-1]]])

In [69]:
# Persist the LinearSVC selection as TSV, without the row index.
new_dataset.to_csv(
    'data/breast_cancer_svc.tsv',
    sep='\t',
    index=False,
    na_rep=float('nan'),
)

Test set preprocessing

In [5]:
# Load the breast-cancer test table.
testdata = pd.read_table('data/breast_cancer_test.tsv', sep='\t')

In [8]:
# Keep only the rows with fewer than 5 missing values.
testdata = testdata[testdata.isnull().sum(axis=1) < 5]

# Fill every remaining gap with the constant 0.5.
testdata = testdata.fillna(0.5)

# Sanity check: shows False when no missing value is left.
testdata.isnull().values.any()

False

In [9]:
# Turn literal 'null' placeholders into NaN and impute them right away with
# the same constant (0.5) the previous cell uses. Run on its own after the
# imputation, the replacement would put NaN back into the table and the
# export below would write them out as 'nan'.
testdata = testdata.replace('null', float('nan')).fillna(0.5)

In [11]:
# Write the cleaned test table back to the path it was loaded from, without
# the row index.
testdata.to_csv(
    'data/breast_cancer_test.tsv',
    sep='\t',
    index=False,
    na_rep=float('nan'),
)