In [4]:
import numpy as np
import pandas as pd

# Read the unlabeled sarcoma FPKM gene-expression table from disk;
# the trailing expression displays its (rows, columns) dimensions.
sarcoma_df = pd.read_csv(
    '../Data/sarcoma-gene-exp-FPKM-no-labels.csv'
)
sarcoma_df.shape

(206, 20605)

In [5]:
# Read the label table that accompanies the expression data;
# the trailing expression displays its (rows, columns) dimensions.
sarcoma_labels_df = pd.read_csv(
    '../Data/sarcoma-gene-exp-FPKM-labels-only.csv'
)
sarcoma_labels_df.shape

(206, 1)

In [3]:
# Chi-squared feature selection: score every column of the expression
# table against the labels and keep the 200 highest-scoring columns.
from sklearn.feature_selection import SelectKBest, chi2

chi_best = SelectKBest(score_func=chi2, k=200)
# Fit the selector first, then project the same table onto the kept columns.
chi_best.fit(sarcoma_df, sarcoma_labels_df)
sarcoma_fs = chi_best.transform(sarcoma_df)
sarcoma_fs.shape

(206, 200)

In [4]:
# Wrap the selected-feature matrix in a DataFrame and preview the first rows.
# NOTE(review): no column labels are passed here, so the names of the
# selected features are not carried into this frame -- confirm that
# downstream consumers do not need them.
sarcoma_features_df = pd.DataFrame(data=sarcoma_fs)
sarcoma_features_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,386.746486,2.449039,1650.942083,7.320192,313.564817,6.238991,75.287285,174.512529,450.6798,1795.391642,...,250.438493,298.183852,2.989608,1406.132345,3418.02384,279.534051,138.308413,193.530635,133.307419,1405.361806
1,221.84744,1.618259,751.351877,28.012846,218.204368,0.176337,109.493391,329.665178,524.600755,1787.683581,...,800.432084,231.602964,67.087453,126.631062,2056.122159,9.777337,0.607324,379.004516,10.237354,155.380148
2,1501.132266,10.334774,671.737434,1.393683,222.882056,0.138505,174.749952,574.80284,1081.536709,1394.694007,...,357.708504,787.152822,17.067148,274.471193,2689.111858,0.178776,1.784545,455.80806,117.733055,1376.375842
3,566.908583,0.860552,597.198368,0.292326,140.009636,96.305853,181.779696,315.580757,283.240867,727.107574,...,220.592312,510.943255,2.735652,278.641625,1684.797516,5.367955,48.083169,295.288686,9.234245,276.010753
4,209.786803,0.029524,3249.513159,10.173948,400.744764,1.254673,174.919558,360.738096,223.623012,2129.88401,...,694.151744,419.882274,1.503315,1856.74547,2249.123326,2.039145,1.955179,217.97407,24.899399,232.425521


In [5]:
# Save the selected-feature matrix as CSV (row index omitted).
# Written under ../Data/ so the output sits alongside the input CSVs that
# this notebook reads and the gene-symbol list it writes.
sarcoma_features_df.to_csv("../Data/sarcoma-gene-exp-FPKM-chi2-features.csv", index=False)

In [6]:
# Get the names associated with the 200 best features
feature_names = list(sarcoma_df)
# Boolean mask over the input columns: True where the selector kept the column
mask = chi_best.get_support()

# Names of the K best features, with everything from the last '.' onward
# removed (presumably an Ensembl version suffix -- TODO confirm against the
# CSV header).
new_features = []
for is_selected, feature in zip(mask, feature_names):
    # Loop variable deliberately not called `bool`: that would shadow the
    # builtin for every later cell in the notebook.
    if not is_selected:
        continue
    prefix, sep, _suffix = feature.rpartition('.')
    # When the name contains no '.', rpartition yields an empty prefix;
    # keep the full name instead of appending an empty string.
    new_features.append(prefix if sep else feature)

print(len(new_features))
print(new_features)

200
['ENSG00000227097', 'ENSG00000105048', 'ENSG00000198888', 'ENSG00000211679', 'ENSG00000211459', 'ENSG00000124107', 'ENSG00000138326', 'ENSG00000198034', 'ENSG00000074800', 'ENSG00000197956', 'ENSG00000239951', 'ENSG00000163661', 'ENSG00000174807', 'ENSG00000135506', 'ENSG00000049540', 'ENSG00000198786', 'ENSG00000198899', 'ENSG00000137124', 'ENSG00000125730', 'ENSG00000211892', 'ENSG00000143632', 'ENSG00000167526', 'ENSG00000142937', 'ENSG00000054938', 'ENSG00000104879', 'ENSG00000100316', 'ENSG00000211598', 'ENSG00000198938', 'ENSG00000269936', 'ENSG00000152583', 'ENSG00000019582', 'ENSG00000198727', 'ENSG00000140416', 'ENSG00000197746', 'ENSG00000182107', 'ENSG00000087245', 'ENSG00000012171', 'ENSG00000135446', 'ENSG00000234745', 'ENSG00000145423', 'ENSG00000101335', 'ENSG00000170891', 'ENSG00000177600', 'ENSG00000196126', 'ENSG00000196154', 'ENSG00000112306', 'ENSG00000087086', 'ENSG00000198125', 'ENSG00000175084', 'ENSG00000107796', 'ENSG00000172403', 'ENSG00000133392', 'ENSG00

In [7]:
import mygene

# Query the mygene client for the 'symbol' field of every selected feature
# ID, requesting the result as a DataFrame, then print it.
mg = mygene.MyGeneInfo()
gene_symbol_list = mg.getgenes(
    new_features,
    fields='symbol',
    as_dataframe=True,
)
print(gene_symbol_list)

querying 1-200...done.
                             _id     _score   symbol notfound
query                                                        
ENSG00000227097  ENSG00000227097   6.910001  RPS28P7      NaN
ENSG00000105048             7138  20.365880    TNNT1      NaN
ENSG00000198888             4535  19.651003      ND1      NaN
ENSG00000211679  ENSG00000211679  21.327978    IGLC3      NaN
ENSG00000211459  ENSG00000211459  20.545593  MT-RNR1      NaN
...                          ...        ...      ...      ...
ENSG00000163431            25802  19.844347    LMOD1      NaN
ENSG00000131471             8639  19.967790     AOC3      NaN
ENSG00000158710             8407  21.115300   TAGLN2      NaN
ENSG00000109610             6649  19.696651     SOD3      NaN
ENSG00000102265             7076  19.391690    TIMP1      NaN

[200 rows x 4 columns]


In [8]:
# Write the gene-symbol lookup table for the best features to CSV.
# NOTE(review): index=False leaves the frame's index out of the file; if the
# index holds the queried feature IDs they are not saved -- confirm this is
# intended.
features_list_df = pd.DataFrame(gene_symbol_list)
features_list_df.to_csv(
    "../Data/sarcoma-gene-exp-FPKM-chi2-gene_symbols-list.csv",
    index=False,
)