# Create datasets using Feature Selection

In [1]:
import numpy as np
import pandas as pd

In [137]:
# File names of every dataset derived from the normalized MA dataset.
# All of them are resolved relative to sFolder.
sFolder, sOrigDataSet = './datasets/', 'nds_MA.csv'

# --- Unbalanced (original) datasets ---
ds_orig_pool = 'nds.orig.pool.csv'                       # full dataset
ds_orig_fs_rf = 'nds.orig.fs.rf.csv'                     # feature selection with RF
ds_orig_fs_univar_chi = 'nds.orig.fs.univar.chi.csv'     # univariate FS, chi-squared test
ds_orig_fs_univar_info = 'nds.orig.fs.univar.info.csv'   # univariate FS, mutual information

# --- Reduced dataset ---
ds_orig_pca = 'nds.orig.pca.csv'                         # PCA dataset

# --- Down-sampled balanced datasets: full dataset, then FS with RF or other methods ---
ds_down_pool, ds_down_fs_rf = 'nds.down.pool.csv', 'nds.down.fs.rf.csv'

# --- Up-sampled balanced datasets: full dataset, then FS with RF or other methods ---
ds_up_pool, ds_up_fs_rf = 'nds.up.pool.csv', 'nds.up.fs.rf.csv'

In [3]:
# Load the normalized source table; the pool, FS and PCA datasets
# created below are all derived from this dataframe.
print('>> Reading source dataset ...')
df_MA = pd.read_csv('{}{}'.format(sFolder, sOrigDataSet))
print('Done')

>> Reading source dataset ...
Done


## Create nds.orig.pool.csv

In [9]:
# ds_orig_pool is the source dataframe written back unchanged,
# without the dataframe index as an extra column.
print('>> Saving {} ...'.format(ds_orig_pool))
df_MA.to_csv(path_or_buf=sFolder + ds_orig_pool, index=False)
print('Done!')

>> Saving nds.orig.pool.csv ...
Done!


## Model Based Ranking

### RF feature selection (nds.orig.fs.rf)

We can fit a classifier to each feature individually and rank the features by their predictive power. This method selects the most powerful features individually but ignores the predictive power when features are combined.

Random Forest Classifier is used in this case because it is robust, nonlinear, and doesn't require scaling.

In [4]:
# Model Based Ranking
# https://www.kaggle.com/dkim1992/feature-selection-ranking
#
# Every feature is scored on its own: a Random Forest is cross-validated on
# that single column and the mean score becomes the feature's rank value.

# Split the table into candidate features (X) and the output column 'Lij' (y)
X = df_MA.drop('Lij', axis = 1)
y = df_MA['Lij']

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# A fixed random_state makes the ranking, and therefore every dataset derived
# from it, reproducible after a kernel restart.
clf = RandomForestClassifier(n_estimators = 50, max_depth = 4, n_jobs = -1, random_state = 42)

scores = []
num_features = len(X.columns)  # reused by the univariate scoring cells further down
for col in X.columns:
    # cross_val_score expects a 2-D feature matrix, hence the reshape of the single column
    score = np.mean(cross_val_score(clf, X[col].values.reshape(-1,1), y, cv=10))
    # Stored as (integer percent, feature name).
    # NOTE(review): truncating to an integer creates many ties, which are then
    # broken by feature name when the list is sorted - confirm this is intended.
    scores.append((int(score*100), col))

# print(sorted(scores, reverse = True))

[(75, 'MA-Kier2-ASSAY_CHEMBLID'), (75, 'MA-Kier1-ASSAY_CHEMBLID'), (74, 'MA-topoShape-ASSAY_ORGANISM'), (74, 'MA-khs.aaN-ASSAY_ORGANISM'), (74, 'MA-comp_P-ASSAY_CHEMBLID'), (74, 'MA-VC.4-ASSAY_CHEMBLID'), (74, 'MA-HybRatio-ASSAY_CHEMBLID'), (74, 'MA-C2SP1-ASSAY_TYPE'), (74, 'MA-ALogP-ASSAY_CHEMBLID'), (73, 'MA-topoShape-TARGET_MAPPING'), (73, 'MA-naAromAtom-ASSAY_ORGANISM'), (73, 'MA-nRotB-ORGANISM'), (73, 'MA-nHBDon-TARGET_TYPE'), (73, 'MA-nHBDon-TARGET_MAPPING'), (73, 'MA-nHBDon-ORGANISM'), (73, 'MA-nHBDon-ASSAY_TYPE'), (73, 'MA-nHBDon-ASSAY_ORGANISM'), (73, 'MA-nHBAcc-TARGET_TYPE'), (73, 'MA-nHBAcc-TARGET_MAPPING'), (73, 'MA-nHBAcc-ORGANISM'), (73, 'MA-nHBAcc-ASSAY_TYPE'), (73, 'MA-nHBAcc-ASSAY_ORGANISM'), (73, 'MA-nB-TARGET_TYPE'), (73, 'MA-nB-TARGET_MAPPING'), (73, 'MA-nB-ORGANISM'), (73, 'MA-nB-ASSAY_TYPE'), (73, 'MA-nB-ASSAY_ORGANISM'), (73, 'MA-nAtomP-TARGET_MAPPING'), (73, 'MA-nAtomP-ORGANISM'), (73, 'MA-nAtomP-ASSAY_ORGANISM'), (73, 'MA-nAtomLAC-TARGET_TYPE'), (73, 'MA-nAtomL

In [12]:
# Tabulate the (score, feature name) pairs, best score first
df_RF_scores = pd.DataFrame(
    data=sorted(scores, reverse=True),
    columns=['RFscore', 'FeatureName'],
)

# Write the ranked RF scores to disk
print('>> Saving RF scores ...')
df_RF_scores.to_csv(path_or_buf=sFolder + 'scores_RF.csv', index=False)
print('Done!')

>> Saving RF scores ...
Done!


In [13]:
# Same RF scores, kept in the order in which the features were scored (not sorted)
pd.DataFrame(
    data=scores,
    columns=['RFscore', 'FeatureName'],
).to_csv(path_or_buf=sFolder + 'scores_RF_not_ordered.csv', index=False)

In [92]:
# A feature is treated as a protein descriptor when its name contains
# one of the tags 'CHOC', 'BIGC', 'CHAM' or 'comp_'
protein_descriptors = [
    name for name in X.columns
    if any(tag in name for tag in ('CHOC', 'BIGC', 'CHAM', 'comp_'))
]
# One-column dataframe with those names; the scoring cells merge it with the score tables
df_prot_descr = pd.DataFrame(data=protein_descriptors, columns=['FeatureName'])

In [93]:
# Keep the RF score of protein descriptors only, by joining the protein
# feature names with the RF score table on 'FeatureName'
df_protein_scores = df_prot_descr.merge(df_RF_scores, on=['FeatureName'])
df_protein_scores_sorted = df_protein_scores.sort_values(by='RFscore', ascending=False)
# Write the ranked protein descriptor scores to disk
df_protein_scores_sorted.to_csv(path_or_buf=sFolder + 'scores_RF_onlyProts_ordered.csv', index=False)

In [102]:
# Number of descriptors kept from each family (protein and drug)
nFeats = 25
# Best nFeats protein descriptors (table already sorted by RF score, best first)
BestProteinFeatures = list(df_protein_scores_sorted.FeatureName.head(nFeats))
# Best nFeats drug descriptors. Protein descriptors are removed from the full
# ranking first: taking the top of the unfiltered table would pick high-scoring
# protein descriptors a second time and produce duplicated columns in the dataset.
BestDrugFeatures = list(
    df_RF_scores.FeatureName[~df_RF_scores.FeatureName.isin(protein_descriptors)].head(nFeats)
)

# Protein descriptors first, then drug descriptors, for the RF FS dataset
BestRFDescriptors = BestProteinFeatures + BestDrugFeatures
# Add output feature Lij
BestRFDescriptors.append('Lij')

In [105]:
# ds_orig_fs_rf: unbalanced dataset restricted to the RF-selected columns plus the output
nds_fsRF = df_MA.loc[:, BestRFDescriptors]
# Write the RF feature-selected dataset to disk
print('>> Saving RF FS dataset ...')
nds_fsRF.to_csv(path_or_buf=sFolder + ds_orig_fs_rf, index=False)
print('Done!')

>> Saving RF FS dataset ...
Done!


## Univariate Feature Selection, UFS (nds.orig.fs.univar)

### Univariate feature selection using chi-squared test

In [109]:
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif

# Fit a chi-squared univariate selector on all features.
# The following cell reads the per-feature values from test.scores_.
test = SelectKBest(chi2, k=2)
test.fit(X, y)

SelectKBest(k=2, score_func=<function chi2 at 0x0000023E9D9348C8>)

In [110]:
# Pair every chi-squared score with its feature name.
# `test` must be the chi-squared selector fitted in the previous cell.
scoresUFS = list(zip(test.scores_, X.columns))

# Create a dataframe with the UFS scores for each feature, best score first.
# Fix: the table is built from scoresUFS; it previously sorted `scores`, the
# Random Forest ranking, so the chi-squared scores were never used.
# NOTE(review): a NaN score would make this sort order unreliable -
# confirm that test.scores_ contains no NaN for this dataset.
df_UFS_scores = pd.DataFrame(sorted(scoresUFS, reverse = True), columns=['UFSscore','FeatureName'])

# Save the UFS scores on file
print('>> Saving UFS scores ...')
df_UFS_scores.to_csv(sFolder + 'scores_UFS.csv', index=False)
print('Done!')

>> Saving UFS scores ...
Done!


In [111]:
# Keep the UFS score of protein descriptors only, by joining the protein
# feature names with the UFS score table on 'FeatureName'
df_protein_UFS_scores = df_prot_descr.merge(df_UFS_scores, on=['FeatureName'])
df_protein_UFS_scores_sorted = df_protein_UFS_scores.sort_values(by='UFSscore', ascending=False)
# Write the ranked protein descriptor scores to disk
df_protein_UFS_scores_sorted.to_csv(path_or_buf=sFolder + 'scores_UFS_onlyProts_ordered.csv', index=False)

In [112]:
# Number of descriptors kept from each family (protein and drug)
nFeats = 25
# Best nFeats protein descriptors (table already sorted by UFS score, best first)
BestProteinFeaturesUFS = list(df_protein_UFS_scores_sorted.FeatureName.head(nFeats))
# Best nFeats drug descriptors. Protein descriptors are removed from the full
# ranking first: taking the top of the unfiltered table would pick high-scoring
# protein descriptors a second time and produce duplicated columns in the dataset.
BestDrugFeaturesUFS = list(
    df_UFS_scores.FeatureName[~df_UFS_scores.FeatureName.isin(protein_descriptors)].head(nFeats)
)

# Protein descriptors first, then drug descriptors, for the UFS dataset
BestUFSDescriptors = BestProteinFeaturesUFS + BestDrugFeaturesUFS
# Add output feature Lij
BestUFSDescriptors.append('Lij')

In [113]:
# ds_orig_fs_univar_chi: unbalanced dataset restricted to the columns
# selected by univariate FS, plus the output
nds_fsUFS = df_MA.loc[:, BestUFSDescriptors]
# Write the UFS dataset to disk
print('>> Saving Univariate FS dataset using chi test ...')
nds_fsUFS.to_csv(path_or_buf=sFolder + ds_orig_fs_univar_chi, index=False)
print('Done!')

>> Saving Univariate FS dataset ...
Done!


### Univariate feature selection with mutual information

In [116]:
from functools import partial

# mutual_info_classif accepts a random_state; fixing it keeps the scores (and
# the dataset built from them) reproducible between runs.
# Note: this rebinds `test`, which previously held the chi-squared selector.
test = SelectKBest(score_func = partial(mutual_info_classif, random_state=42), k=2)
test.fit(X, y)

SelectKBest(k=2,
      score_func=<function mutual_info_classif at 0x0000023E9D694950>)

In [118]:
# Pair every mutual-information score with its feature name.
# `test` must be the mutual-information selector fitted in the previous cell.
scoresUFS_info = list(zip(test.scores_, X.columns))

# Create a dataframe with the mutual-information scores, best score first.
# Fix: the table is built from scoresUFS_info; it previously sorted `scores`,
# the Random Forest ranking, so the mutual-information scores were never used.
df_UFSinfo_scores = pd.DataFrame(sorted(scoresUFS_info, reverse = True), columns=['UFSinfo_score','FeatureName'])

# Save the UFSinfo scores on file
print('>> Saving UFSinfo scores ...')
df_UFSinfo_scores.to_csv(sFolder + 'scores_UFSinfo.csv', index=False)
print('Done!')

# Keep the UFSinfo score of protein descriptors only, by joining the protein
# feature names with the UFSinfo score table on 'FeatureName'
df_protein_UFSinfo_scores = df_prot_descr.merge(df_UFSinfo_scores, on=['FeatureName'])
df_protein_UFSinfo_scores_sorted = df_protein_UFSinfo_scores.sort_values(by='UFSinfo_score', ascending=False)
# Write the ranked protein descriptor scores to disk
df_protein_UFSinfo_scores_sorted.to_csv(path_or_buf=sFolder + 'scores_UFSinfo_onlyProts_ordered.csv', index=False)


# Number of descriptors kept from each family (protein and drug)
nFeats = 25

# Best nFeats protein descriptors (table already sorted by UFSinfo score, best first)
BestProteinFeaturesUFSinfo = list(df_protein_UFSinfo_scores_sorted.FeatureName.head(nFeats))
# Best nFeats drug descriptors. Protein descriptors are removed from the full
# ranking first: taking the top of the unfiltered table would pick high-scoring
# protein descriptors a second time and produce duplicated columns in the dataset.
BestDrugFeaturesUFSinfo = list(
    df_UFSinfo_scores.FeatureName[~df_UFSinfo_scores.FeatureName.isin(protein_descriptors)].head(nFeats)
)

# Protein descriptors first, then drug descriptors, for the UFSinfo dataset
BestUFSinfoDescriptors = BestProteinFeaturesUFSinfo + BestDrugFeaturesUFSinfo
# Add output feature Lij
BestUFSinfoDescriptors.append('Lij')

# ds_orig_fs_univar_info: unbalanced dataset restricted to the columns
# selected by univariate FS with mutual information, plus the output
nds_fsUFSinfo = df_MA.loc[:, BestUFSinfoDescriptors]

# Write the UFS info dataset to disk
print('>> Saving Univariate FS dataset using mutual information ...')
nds_fsUFSinfo.to_csv(path_or_buf=sFolder + ds_orig_fs_univar_info, index=False)
print('Done!')

>> Saving UFSinfo scores ...
Done!
>> Saving Univariate FS dataset using mutual information ...
Done!


You could try different FS sets! (https://www.kaggle.com/dkim1992/feature-selection-ranking)

## Reduced datasets (PCA/t-SNE)

Let's try to project the data into a lower-dimensional space.

### PCA

scikit-learn will choose the minimum number of principal components such that 99% of the variance is retained.

In [119]:
from sklearn.model_selection import train_test_split

# Plain arrays of the feature values and of the output values
Xdata = X.values
ydata = y.values

# 75% / 25% train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    Xdata,
    ydata,
    test_size=0.25,
    train_size=0.75,
    random_state=42,
)

In [120]:
from sklearn.decomposition import PCA

# A fractional n_components asks PCA to keep as many components as needed
# to reach that share of the variance (0.99 here); an integer such as
# n_components=50 would fix the number of new dimensions instead.
pca = PCA(n_components=.99).fit(X_train)

# Report how many components were selected and the variance ratio of each one
print("No of PCA components for 0.99 variance:", pca.n_components_)
print("List of variance for each PCA component:", pca.explained_variance_ratio_, sep='\n')

No of PCA components for 0.99 variance: 79
List of variance for each PCA component:
[4.18460103e-01 1.25561097e-01 7.55177616e-02 5.64172606e-02
 3.80604302e-02 2.93960834e-02 2.29081956e-02 2.01887494e-02
 1.91241753e-02 1.29915838e-02 1.11167946e-02 1.07081841e-02
 9.97199657e-03 9.65508381e-03 9.08091065e-03 7.85202822e-03
 6.14431619e-03 5.64117789e-03 5.38210200e-03 5.25375594e-03
 4.40879091e-03 4.07670205e-03 3.79726459e-03 3.68470119e-03
 3.47191095e-03 3.25208732e-03 3.06413699e-03 3.02100503e-03
 2.81086596e-03 2.75071579e-03 2.54412255e-03 2.53473478e-03
 2.42137687e-03 2.30044339e-03 2.27268616e-03 2.17738014e-03
 2.00884818e-03 1.99123854e-03 1.90535333e-03 1.85309091e-03
 1.78484754e-03 1.74687850e-03 1.69444740e-03 1.55571989e-03
 1.52100198e-03 1.44048617e-03 1.37318178e-03 1.32066354e-03
 1.26393092e-03 1.15830096e-03 1.07204860e-03 1.03887024e-03
 1.01584759e-03 1.00364168e-03 9.95903804e-04 9.56114992e-04
 9.30777910e-04 9.12566785e-04 8.37783461e-04 7.91710454e-04
 

So, 79 dimensions are needed to explain 99% of the variance. Let's use only 50 dimensions, so that the PCA dataset has the same total number of features (50) as the feature-selected datasets.

In [121]:
# Same total number of features as the feature-selected datasets: 2 * nFeats
TotFeats = nFeats * 2
# The number of components is fixed here; it is no longer derived from a variance target
pca = PCA(n_components=TotFeats)

pca.fit(X_train)
# Fix: the label used to read "for 0.99 variance", which is wrong for a fixed component count
print("No of PCA components:", pca.n_components_)
print("List of variance for each PCA component:")
print(pca.explained_variance_ratio_)  # variance ratio of each PCA component

No of PCA components for 0.99 variance: 50
List of variance for each PCA component:
[0.4184601  0.1255611  0.07551776 0.05641726 0.03806043 0.02939608
 0.0229082  0.02018875 0.01912418 0.01299158 0.01111679 0.01070818
 0.009972   0.00965508 0.00908091 0.00785203 0.00614432 0.00564118
 0.0053821  0.00525376 0.00440879 0.0040767  0.00379726 0.0036847
 0.00347191 0.00325209 0.00306414 0.00302101 0.00281087 0.00275072
 0.00254412 0.00253473 0.00242138 0.00230044 0.00227269 0.00217738
 0.00200885 0.00199124 0.00190535 0.00185309 0.00178484 0.00174686
 0.0016944  0.0015557  0.00152082 0.00144045 0.00137296 0.00132026
 0.00126345 0.00115813]


In [123]:
# Add up the per-component variance ratios of the fitted PCA to get the total share retained
print('Total PCA explained variance:', sum(pca.explained_variance_ratio_))

Total PCA explained variance: 0.970637123209547


Using 50 PCA components, we can obtain an explained variance of 97%. Let's save the transformed dataset:

In [125]:
# Project every row of the dataset (train and test alike) with the fitted PCA
X_PCA = pca.transform(Xdata)
# Show the dimensions of the projected data
np.shape(X_PCA)

(12766, 50)

In [134]:
# Build the PCA dataframe (columns PCA1 .. PCA<TotFeats>) and append the output values.
# NOTE(review): concat aligns rows on the index, so this assumes y carries the
# same 0..n-1 index as the new dataframe - confirm.
df_PCA = pd.concat(
    [
        pd.DataFrame(data=X_PCA,
                     columns=['PCA{}'.format(i) for i in range(1, TotFeats + 1)]),
        y,
    ],
    axis=1,
)

# Write the PCA dataset to disk
print('>> Saving PCA dataset ...')
df_PCA.to_csv(path_or_buf=sFolder + ds_orig_pca, index=False)
print('Done!')

>> Saving PCA dataset ...
Done!
