In [12]:
import pandas as pd

from sklearn.model_selection import KFold
from numpy.random import RandomState
from numpy import nonzero
from sklearn.feature_selection import mutual_info_regression
from sklearn.linear_model import LassoCV
from sklearn.feature_selection import mutual_info_classif
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel

In [23]:
from tools import make_binary

<h2>Dataset preprocessing</h2>
### Regression data ###

Dataset: DNA methylation beta values of blood samples, together with each sample's age.
Features: beta values of 487756 CpG sites.

Keep only the CpG sites that have fewer than 5 missing values.

In [None]:
# Load the raw blood methylation table.
dataset = pd.read_table('~/tcga_files/revised/multi_tissue/GSE40279_blood.tsv', sep='\t')

# Keep only the rows with fewer than 5 missing values. The per-row count is
# held in a local Series instead of a temporary 'count' column on the frame.
missing_per_row = dataset.isnull().sum(axis=1)
dataset = dataset[missing_per_row < 5]

# Fill the remaining gaps with each row's own mean. numeric_only=True keeps
# non-numeric columns (e.g. an identifier column) out of the mean; without it
# pandas >= 2.0 raises TypeError on mixed-dtype frames, while older versions
# silently fell back to the numeric columns.
row_means = dataset.mean(axis=1, numeric_only=True)
# fillna with a Series matches on column labels, hence the transpose round trip.
dataset = dataset.T.fillna(row_means).T

# Sanity check: True means some missing values are still present.
dataset.isnull().values.any()

In [None]:
# Save the filtered and imputed table as TSV, without the pandas index.
dataset.to_csv(
    'data/blood_age_sample.tsv',
    sep='\t',
    index=False,
    na_rep=float('nan'),
)

In [None]:
# Load the independent blood test set and normalise literal 'null' markers to NaN.
testset = pd.read_table('~/tcga_files/revised/multi_tissue/GSE41169_blood.tsv', sep='\t')
testset = testset.replace('null', float('nan'))

# Fill gaps with each row's mean. numeric_only=True keeps non-numeric columns
# out of the mean (pandas >= 2.0 raises TypeError otherwise).
row_means = testset.mean(axis=1, numeric_only=True)
# fillna with a Series matches on column labels, hence the transpose round trip.
testset = testset.T.fillna(row_means).T

# Use the same identifier column name as the other exported tables.
testset = testset.rename(columns={'CpG_site': 'Composite Element REF'})

# Sanity check, kept as the last expression so the notebook displays it.
testset.isnull().values.any()

In [None]:
# Save the test set. index=False matches every other export in this notebook;
# without it pandas writes the row index as an extra unnamed first column.
testset.to_csv('data/blood_age_test.tsv', sep='\t', index=False)

Feature Selection using Mutual Information

In [None]:
# Reload the preprocessed sample table written earlier in this notebook.
dataset = pd.read_csv('data/blood_age_sample.tsv', sep='\t')

In [None]:
# Drop the first column, then flip the table so the former columns become rows.
input_data = dataset.iloc[:, 1:].T

In [None]:
# Every column except the last is a feature; the last column is the target.
X_data, y_data = input_data.iloc[:, :-1], input_data.iloc[:, -1]

In [None]:
# Mutual information between every feature and the target.
# The estimator adds random noise to the inputs, so the seed is fixed: with an
# unseeded RandomState the scores, and therefore the thresholded feature set,
# change on every run.
selected = mutual_info_regression(X_data, y_data, n_neighbors=10, random_state=0)

In [None]:
# All rows except the final one; the final row is the one used as y_data.
tmp = dataset.iloc[:-1]
# Keep the rows whose mutual-information score is at least 0.2.
keep_mask = selected >= 0.2
mutual_info_dataset = tmp[keep_mask]

In [None]:
# Re-attach the final row of `dataset` (the row used as y_data) beneath the
# selected rows. DataFrame.append was removed in pandas 2.0; pd.concat with a
# one-row frame gives the same result and keeps the row's index label.
mutual_info_dataset = pd.concat([mutual_info_dataset, dataset.iloc[[-1], :]])

# Standardise the identifier column name. This notebook refers to that column
# as both 'CpG' and 'CpG_site'; rename ignores absent keys, so mapping both is safe.
mutual_info_dataset = mutual_info_dataset.rename(
    columns={'CpG': 'Composite Element REF', 'CpG_site': 'Composite Element REF'}
)

In [None]:
# Write the mutual-information selection as TSV, without the pandas index.
mutual_info_dataset.to_csv(
    'blood_age_selected_mutual_info.tsv',
    sep='\t',
    index=False,
    na_rep=float('nan'),
)

Feature Selection using L1 norm

In [None]:
# 10-fold cross-validated Lasso; features with a non-zero coefficient are the
# L1-selected ones. Both the fold shuffle and the random coordinate selection
# are seeded, because unseeded they give a different feature set on every run.
kf = KFold(n_splits=10, shuffle=True, random_state=0)
lasso = LassoCV(eps=0.001, n_alphas=100, fit_intercept=True, max_iter=1000, tol=0.0001, cv=kf, n_jobs=10, positive=False, random_state=0, selection='random')
lasso.fit(X_data, y_data)

In [None]:
# All rows except the final one; the final row is the one used as y_data.
tmp = dataset.iloc[:-1]
# Rows whose Lasso coefficient is non-zero.
lasso_mask = lasso.coef_ != 0
a = pd.DataFrame(data=tmp[lasso_mask], columns=tmp.columns)

In [None]:
# Re-attach the final row of `dataset` (the row used as y_data) beneath the
# Lasso-selected rows. DataFrame.append was removed in pandas 2.0; pd.concat
# with a one-row frame gives the same result and keeps the row's index label.
a = pd.concat([a, dataset.iloc[[-1], :]])

# Standardise the identifier column name. This notebook refers to that column
# as both 'CpG' and 'CpG_site'; rename ignores absent keys, so mapping both is safe.
a = a.rename(
    columns={'CpG': 'Composite Element REF', 'CpG_site': 'Composite Element REF'}
)

In [None]:
# Display the Lasso-selected rows together with the re-attached final row of `dataset`.
a

In [None]:
# Write the Lasso selection. The frame to save is `a` (selected rows plus the
# re-attached final row); `tmp` is the unfiltered table and writing it would
# export every feature while dropping the final row.
a.to_csv('blood_age_selected_lasso.tsv', sep='\t', index=False, na_rep=float('nan'))

In [None]:
# Positions of the features with a non-zero Lasso coefficient.
# `non_zero_features` was never assigned anywhere in this notebook, so the bare
# name raised NameError on a fresh kernel; it is derived here from the fitted model.
non_zero_features = nonzero(lasso.coef_)[0]
non_zero_features

### Classification data ###

Keep only the CpG sites that have fewer than 5 missing values.

In [13]:
# Load the breast cancer table.
data = pd.read_table('data/breast_cancer.tsv', sep='\t')

# Keep only the rows with fewer than 5 missing values. The per-row count is
# held in a local Series instead of a temporary 'count' column on the frame.
missing_per_row = data.isnull().sum(axis=1)
data = data[missing_per_row < 5]

# Fill the remaining gaps with each row's own mean. numeric_only=True keeps
# non-numeric columns out of the mean; without it pandas >= 2.0 raises
# TypeError on mixed-dtype frames, while older versions silently fell back to
# the numeric columns.
# NOTE(review): columns that pandas parsed as object dtype do not contribute
# to the mean -- confirm the value columns load as numeric.
row_means = data.mean(axis=1, numeric_only=True)
# fillna with a Series matches on column labels, hence the transpose round trip.
data = data.T.fillna(row_means).T

# Sanity check: True means some missing values are still present.
data.isnull().values.any()

  interactivity=interactivity, compiler=compiler, result=result)


False

In [15]:
# Save the cleaned table. The target path is the same file the table was
# loaded from, so the earlier contents are overwritten.
data.to_csv(
    'data/breast_cancer.tsv',
    sep='\t',
    index=False,
    na_rep=float('nan'),
)

Feature selection using mutual information

In [16]:
# Reload the breast cancer table from disk.
data = pd.read_csv('data/breast_cancer.tsv', sep='\t')

In [31]:
# Drop the first column, then flip the table so the former columns become rows.
input_data = data.iloc[:, 1:].T
# Every column except the last is a feature.
X_data = input_data.iloc[:, :-1]
# The last column is the label; make_binary (from tools) is applied to it with
# the 'normal' and 'cancer' arguments.
y_data = make_binary('normal', 'cancer', input_data.iloc[:, -1])

In [32]:
# Mutual information between every feature and the class label.
# The estimator adds random noise to the inputs, so the seed is fixed: with an
# unseeded RandomState the scores, and therefore the thresholded feature set,
# change on every run.
selected = mutual_info_classif(X_data, y_data, n_neighbors=10, random_state=0)

In [None]:
# All rows except the final one; the final row is the one used for y_data.
tmp = data.iloc[:-1]
# Preview the rows whose mutual-information score is at least 0.075.
score_mask = selected >= 0.075
tmp[score_mask]

In [None]:
# Keep the rows whose mutual-information score is at least 0.075.
mutual_info_dataset = tmp[selected >= 0.075]
# Re-attach the final row of `data` (the row used for y_data).
# DataFrame.append was removed in pandas 2.0; pd.concat with a one-row frame
# gives the same result and keeps the row's index label.
mutual_info_dataset = pd.concat([mutual_info_dataset, data.iloc[[-1], :]])
mutual_info_dataset

In [21]:
# Write the mutual-information selection as TSV, without the pandas index.
mutual_info_dataset.to_csv(
    'data/breast_cancer_mutual_info.tsv',
    sep='\t',
    index=False,
    na_rep=float('nan'),
)

Feature selection using LinearSVC

In [13]:
# L1-penalised linear SVM solved in the primal (dual=False).
svc = LinearSVC(
    penalty='l1',
    loss='squared_hinge',
    dual=False,
    C=1.0,
    max_iter=10000,
    random_state=RandomState(None),
)
svc.fit(X_data, y_data)

array([[ 0.        ,  0.        ,  0.        , ...,  0.11261584,
         0.08787427,  0.        ]])

In [42]:
# Wrap the SVM in a selector and fit it on the same inputs.
feature_select = SelectFromModel(estimator=svc)
feature_select.fit(X_data, y_data)

SelectFromModel(estimator=LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=10000,
     multi_class='ovr', penalty='l1',
     random_state=<mtrand.RandomState object at 0x7f2287ffc910>,
     tol=0.0001, verbose=0),
        prefit=False, threshold=None)

In [62]:
# Integer positions of the features the selector kept.
index = feature_select.get_support(indices=True)
# Take the rows of `data` at those positions, with all columns.
new_dataset = data.iloc[index]

In [65]:
# Re-attach the final row of `data` (the row used for y_data) beneath the
# selected rows. DataFrame.append was removed in pandas 2.0; pd.concat with a
# one-row frame gives the same result and keeps the row's index label.
new_dataset = pd.concat([new_dataset, data.iloc[[-1], :]])

In [69]:
# Write the SVM-based selection as TSV, without the pandas index.
new_dataset.to_csv(
    'data/breast_cancer_svc.tsv',
    sep='\t',
    index=False,
    na_rep=float('nan'),
)

Test set preprocessing

In [5]:
# Load the breast cancer test table.
testdata = pd.read_csv('data/breast_cancer_test.tsv', sep='\t')

In [8]:
# Count missing values per row, keep the rows with fewer than 5, drop the
# helper column, then fill whatever is still missing with the constant 0.5.
testdata = (
    testdata
    .assign(count=testdata.isnull().sum(axis=1))
    .loc[lambda frame: frame['count'] < 5]
    .drop(['count'], axis=1)
    .fillna(0.5)
)

# Sanity check: True means some missing values are still present.
testdata.isnull().values.any()

False

In [9]:
# Turn any literal 'null' strings into NaN, modifying the frame in place.
testdata.replace(to_replace='null', value=float('nan'), inplace=True)

In [11]:
# Save the cleaned test table. The target path is the same file the table was
# loaded from, so the earlier contents are overwritten.
testdata.to_csv(
    'data/breast_cancer_test.tsv',
    sep='\t',
    index=False,
    na_rep=float('nan'),
)