<a href="https://colab.research.google.com/github/hudanjhh/SECB4313/blob/main/BMS_SVM_RFE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive into the Colab runtime at /content/drive so the project
# files stored there can be read and written with ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
from sklearn.feature_selection import RFE
import warnings # Standard-library module that controls how warnings are reported
# NOTE: with no category/module filter this silences EVERY warning for the rest
# of the session, not only irrelevant ones, so genuinely useful warnings raised
# by later cells are hidden as well.
warnings.filterwarnings('ignore')

In [3]:
# Google Drive folder holding the project's CSV files. Other cells build file
# paths with `data_path + '<file name>'`, so the trailing '/' must be kept.
data_path = '/content/drive/My Drive/Bio Modelling & Simulation/Breast Cancer for Group Project/'

In [5]:
# Read the omics table: a 'Sample ID' column, the numeric feature columns,
# and a trailing 'Label' column holding the class of each sample.
omic = pd.read_csv(filepath_or_buffer=data_path + 'omic_class.csv')

In [6]:
# Preview the first rows to confirm the layout: 'Sample ID', feature columns, 'Label'.
omic.head()

Unnamed: 0,Sample ID,0,1,2,3,4,5,6,7,8,...,359.2,360.2,361.2,362.2,363.2,364.2,365.2,366.2,367.2,Label
0,TCGA.3C.AAAU.01,0.520453,0.289989,0.667695,0.966118,0.794771,0.152592,0.676224,0.235005,0.888044,...,0.306446,0.381551,0.761737,0.376389,0.587891,0.814829,0.564357,0.357618,0.781823,1
1,TCGA.3C.AALI.01,0.882304,0.441655,0.513831,0.955438,0.719837,0.776939,0.771318,0.048206,0.839268,...,0.391569,0.383333,0.337995,0.465384,0.279263,0.661252,0.726921,0.455099,0.271641,4
2,TCGA.3C.AALJ.01,0.825815,0.619812,0.637452,0.954691,0.788179,0.24233,0.66703,0.1729,0.454904,...,0.647046,0.120418,0.37084,0.293082,0.338759,0.767904,0.836007,0.420596,0.648107,2
3,TCGA.3C.AALK.01,0.779689,0.748916,0.602097,0.924903,0.728475,0.714145,0.634296,0.126887,0.880805,...,0.308502,0.285007,0.312642,0.285412,0.329648,0.702706,0.474745,0.699326,0.523885,1
4,TCGA.5L.AAT0.01,0.399848,0.671264,0.651998,0.930839,0.710646,0.772222,0.669729,0.209022,0.862095,...,0.240436,0.080876,0.289608,0.324665,0.338159,0.75618,0.467601,0.623251,0.412952,1


In [11]:
# Feature matrix: every column except the sample identifier and the class label.
# The columns are picked by NAME rather than by position so that
#   * a change in the CSV column order cannot silently leak 'Sample ID' or
#     'Label' into the features, and
#   * X has exactly the same columns, in the same order, as the `features`
#     frame that a later cell builds with the same name-based drop. The RFE
#     result is stored as positional indices into X's columns and is applied
#     to that frame, so the two must line up.
X = omic.drop(columns=['Sample ID', 'Label'])

# Target vector: the class label of each sample.
y = omic['Label']

In [12]:
# Hold out 20% of the samples for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# --- Baseline linear SVM trained on every feature ---------------------------
# A linear kernel is used so the fitted model exposes per-feature weights
# (`coef_`), which RFE relies on to rank features.
estimator = SVC(kernel="linear").fit(X_train, y_train)
result = estimator.predict(X_test)
print("accuracy: ", accuracy_score(y_true=y_test, y_pred=result))
print("weights of omics: ", estimator.coef_)

# --- Recursive feature elimination ------------------------------------------
# Fitted on the training split only. Features are discarded in batches of 200
# until 12000 remain.
selector = RFE(estimator, n_features_to_select=12000, step=200).fit(X_train, y_train)

# Positional indices (into the columns of X_train) of the features RFE kept.
selected_variables = np.flatnonzero(selector.support_)
print("\nSelected variables indices:", selected_variables)

accuracy:  0.8888888888888888
weights of omics:  [[-4.95694274e-03  7.23452063e-03  3.82794157e-03 ...  1.71856177e-03
   4.37699440e-03 -6.28719134e-04]
 [-4.55886231e-04  4.21128706e-04  6.65493368e-04 ... -8.46241220e-04
  -3.38570629e-04 -1.65422735e-03]
 [ 2.15918296e-03  5.97100013e-04  2.69142904e-03 ...  1.95339334e-03
   1.17921920e-03 -3.22100399e-04]
 ...
 [ 1.80219406e-03 -5.59999389e-04  1.10145655e-03 ...  1.73348041e-03
   4.51721302e-04  1.04481686e-03]
 [ 4.62503130e-04  8.53334478e-05  1.34743914e-03 ...  3.37768634e-04
  -2.29502297e-03  4.25926085e-03]
 [-2.63253551e-03 -7.36566231e-04  7.32760604e-04 ... -9.24784219e-04
  -2.57153887e-03  2.15262383e-03]]

Selected variables indices: [    0     1     2 ... 37615 37621 37622]


In [14]:
# Keep/drop flag for every feature column (True = kept by RFE).
features_selected = pd.DataFrame({'Columns': X_train.columns, 'Selected': selector.get_support()})
print("\nSelected Features: \n",features_selected)

# RFE ranking per feature: rank 1 marks the kept features, larger ranks mark
# eliminated ones.
features_rank = pd.DataFrame({'Columns': X_train.columns, 'Ranking': selector.ranking_})
print("\nFeatures Ranking: \n",features_rank)

# Names of the columns RFE discarded (inverse of the support mask).
features_unselected = X_train.columns[~selector.support_]
print("\nUnselected Features: \n", features_unselected)


Selected Features: 
       Columns  Selected
0           0      True
1           1      True
2           2      True
3           3     False
4           4      True
...       ...       ...
37618   363.2     False
37619   364.2     False
37620   365.2     False
37621   366.2      True
37622   367.2      True

[37623 rows x 2 columns]

Features Ranking: 
       Columns  Ranking
0           0        1
1           1        1
2           2        1
3           3       13
4           4        1
...       ...      ...
37618   363.2       53
37619   364.2       59
37620   365.2        5
37621   366.2        1
37622   367.2        1

[37623 rows x 2 columns]

Unselected Features: 
 Index(['3', '6', '7', '8', '9', '10', '11', '12', '14', '15',
       ...
       '339.2', '341.2', '342.2', '349.2', '359.2', '361.2', '362.2', '363.2',
       '364.2', '365.2'],
      dtype='object', length=25623)


In [24]:
# Split the full table into its three parts: sample identifiers, class labels,
# and the numeric feature columns (everything that is neither of the first two).
sampleid = omic['Sample ID']
label = omic['Label']
features = omic.drop(columns=['Sample ID', 'Label'])

In [21]:
# Flip to a features-by-samples layout so that each feature becomes a row.
featuresT = features.T

In [22]:
# Keep only the rows (features) RFE selected. `selected_variables` holds
# positional indices into the columns of X, so this relies on `features`
# having the same columns in the same order as X.
selected_features = featuresT.iloc[selected_variables, :]

In [23]:
# Display the selected features (rows = features, columns = samples) for a visual check.
selected_features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,661,662,663,664,665,666,667,668,669,670
0,0.520453,0.882304,0.825815,0.779689,0.399848,0.940916,0.877582,0.874979,0.634968,0.812758,...,0.595168,0.912950,0.776165,0.849368,0.592049,0.806582,0.750347,0.657978,0.855324,0.585315
1,0.289989,0.441655,0.619812,0.748916,0.671264,0.459632,0.941642,0.752312,0.583879,0.384910,...,0.550642,0.216929,0.609646,0.849751,0.568298,0.851500,0.847763,0.669353,0.812983,0.246498
2,0.667695,0.513831,0.637452,0.602097,0.651998,0.687138,0.674940,0.838471,0.519847,0.643791,...,0.615687,0.288419,0.895455,0.791560,0.591059,0.730897,0.884052,0.340158,0.700029,0.728697
4,0.794771,0.719837,0.788179,0.728475,0.710646,0.854993,0.025541,0.678752,0.646211,0.692107,...,0.850923,0.293421,0.701358,0.474657,0.618568,0.733526,0.644478,0.735896,0.577335,0.638120
5,0.152592,0.776939,0.242330,0.714145,0.772222,0.102698,0.022095,0.735208,0.340723,0.073815,...,0.046074,0.816473,0.072334,0.801254,0.112498,0.588830,0.589524,0.093216,0.588421,0.050180
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
357.2,0.424878,0.446824,0.700012,0.413087,0.515279,0.497920,0.526085,0.429559,0.379015,0.485893,...,0.576529,0.654156,0.539772,0.373529,0.515607,0.351509,0.525156,0.475639,0.273227,0.837578
358.2,0.602760,0.188317,0.404718,0.130772,0.091184,0.672866,0.070614,0.282657,0.117536,0.279485,...,0.756335,0.151897,0.522520,0.216717,0.408330,0.090849,0.308859,0.145190,0.198301,0.116767
360.2,0.381551,0.383333,0.120418,0.285007,0.080876,0.347967,0.178805,0.286512,0.300102,0.142932,...,0.347443,0.385984,0.431707,0.235572,0.216847,0.451626,0.251333,0.270420,0.348925,0.465544
366.2,0.357618,0.455099,0.420596,0.699326,0.623251,0.347576,0.882062,0.721524,0.749174,0.543267,...,0.177315,0.625748,0.597025,0.767562,0.668743,0.707533,0.802207,0.715436,0.710496,0.567854


In [26]:
# Assemble the export table in one step: sample IDs first, then the selected
# features turned back to a samples-by-features layout, then the class label.
# All three pieces share the row index of `omic`, so they align row for row.
final_mo = pd.concat([sampleid, selected_features.T, label], axis=1)
final_mo

Unnamed: 0,Sample ID,0,1,2,4,5,13,19,29,39,...,353.2,354.2,355.2,356.2,357.2,358.2,360.2,366.2,367.2,Label
0,TCGA.3C.AAAU.01,0.520453,0.289989,0.667695,0.794771,0.152592,0.085257,0.162053,0.770148,0.085497,...,0.709519,0.405177,0.418195,0.754511,0.424878,0.602760,0.381551,0.357618,0.781823,1
1,TCGA.3C.AALI.01,0.882304,0.441655,0.513831,0.719837,0.776939,0.066492,0.660669,0.714136,0.532519,...,0.564290,0.187257,0.206425,0.515165,0.446824,0.188317,0.383333,0.455099,0.271641,4
2,TCGA.3C.AALJ.01,0.825815,0.619812,0.637452,0.788179,0.242330,0.073562,0.257596,0.826405,0.490446,...,0.611077,0.278506,0.296027,0.523748,0.700012,0.404718,0.120418,0.420596,0.648107,2
3,TCGA.3C.AALK.01,0.779689,0.748916,0.602097,0.728475,0.714145,0.518062,0.374080,0.866040,0.848814,...,0.293331,0.368270,0.389947,0.705412,0.413087,0.130772,0.285007,0.699326,0.523885,1
4,TCGA.5L.AAT0.01,0.399848,0.671264,0.651998,0.710646,0.772222,0.113149,0.426588,0.679740,0.671611,...,0.195962,0.322275,0.346405,0.521261,0.515279,0.091184,0.080876,0.623251,0.412952,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
666,TCGA.WT.AB44.01,0.806582,0.851500,0.730897,0.733526,0.588830,0.062016,0.306683,0.829830,0.000000,...,0.135084,0.263854,0.283072,0.716154,0.351509,0.090849,0.451626,0.707533,0.372267,1
667,TCGA.XX.A899.01,0.750347,0.847763,0.884052,0.644478,0.589524,0.086712,0.623134,0.819196,0.639909,...,0.353730,0.534451,0.534234,0.550520,0.525156,0.308859,0.251333,0.802207,0.371451,1
668,TCGA.XX.A89A.01,0.657978,0.669353,0.340158,0.735896,0.093216,0.068325,0.500651,0.663540,0.449037,...,0.273503,0.303364,0.299129,0.431705,0.475639,0.145190,0.270420,0.715436,0.459216,1
669,TCGA.Z7.A8R5.01,0.855324,0.812983,0.700029,0.577335,0.588421,0.395544,0.380333,0.874528,0.000000,...,0.173776,0.352135,0.372494,0.368058,0.273227,0.198301,0.348925,0.710496,0.424886,1


In [27]:
# Write the reduced dataset next to the input file; the row index is omitted
# because the 'Sample ID' column already identifies each row.
final_mo.to_csv(path_or_buf=data_path + 'SVMRFE.csv', index=False, header=True)