In [1]:
import pandas as pd
import os
import gzip
import shutil

In [2]:
# Peek at harmonizome/dataset_info.txt by echoing every raw line.
with open('../datasets/harmonizome/dataset_info.txt') as info_file:
    for raw_line in info_file.readlines():
        print(raw_line)

# Finding: the file appears to be just a description of the datasets in harmonizome.

abbreviation	name	path

biogpshuman_cleaned	BioGPS Human Cell Type and Tissue Gene Expression Profiles - Cleaned	C:/Users/ar988996/Documents/Harmonizome/datasets/biogpshuman/gene_attribute_matrix_cleaned.txt.gz

biogpsmouse_cleaned	BioGPS Mouse Cell Type and Tissue Gene Expression Profiles - Cleaned	C:/Users/ar988996/Documents/Harmonizome/datasets/biogpsmouse/gene_attribute_matrix_cleaned.txt.gz

biogrid	BioGRID Protein-Protein Interactions	C:/Users/ar988996/Documents/Harmonizome/datasets/biogrid/gene_attribute_matrix.txt.gz

brainatlasadulthuman_cleaned	Allen Brain Atlas Adult Human Brain Tissue Gene Expression Profiles - Cleaned	C:/Users/ar988996/Documents/Harmonizome/datasets/brainatlasadulthuman/gene_attribute_matrix_cleaned.txt.gz

brainatlasadultmouse_cleaned	Allen Brain Atlas Adult Mouse Brain Tissue Gene Expression Profiles - Cleaned	C:/Users/ar988996/Documents/Harmonizome/datasets/brainatlasadultmouse/gene_attribute_matrix_cleaned.txt.gz

clinvar	ClinVar SNP-Phenotype Associat

In [3]:
# Which validated final features exist, before any are removed for model selection?
# Keep one row per (abbreviation, feature) pair and discard rows missing either key.
key_cols = ['abbreviation', 'feature']
df = (
    pd.read_csv('../datasets/generalizable_features/generalizable_summary.csv')
    .drop_duplicates(subset=key_cols)
    .dropna(subset=key_cols)
)
print(len(df))
print(df)

51
       validation_rep  validation_fold  \
0                 0.0              0.0   
1                 0.0              0.0   
3                 0.0              0.0   
4                 0.0              0.0   
5                 0.0              0.0   
6                 0.0              0.0   
7                 0.0              0.0   
8                 0.0              0.0   
9                 0.0              0.0   
10                0.0              0.0   
11                0.0              0.0   
12                0.0              0.0   
13                0.0              0.0   
14                0.0              0.0   
15                0.0              0.0   
16                0.0              0.0   
17                0.0              0.0   
18                0.0              0.0   
19                0.0              0.0   
20                0.0              0.0   
25                0.0              2.0   
26                0.0              2.0   
30                0.0          

In [4]:
# Narrow df down to the two columns the later cells read.
df = df.loc[:, ['abbreviation', 'feature']]
print(df)

                       abbreviation                       feature
0      brainatlasadultmouse_cleaned                 r3 roof plate
1      brainatlasadultmouse_cleaned                          mean
3      brainatlasadulthuman_cleaned                          mean
4               biogpsmouse_cleaned                          mean
5            hpatissuesmrna_cleaned                          mean
6            hpatissuesmrna_cleaned                          stdv
7               biogpshuman_cleaned                          mean
8               biogpshuman_cleaned                          stdv
9         jensentissueexpts_cleaned                          mean
10        jensentissueexpts_cleaned              peripheral blood
11        jensentissueexpts_cleaned            hematopoietic cell
12        jensentissueexpts_cleaned                  t-lymphocyte
13        jensentissueexpts_cleaned                   marrow cell
14        jensentissueexpts_cleaned                   spinal cord
15        

In [13]:
# Locate a dataset's feature matrix, keep only the requested features, save it as csv.
def save_df(dataset, features,
            data_dir='/Users/kanetian7/omic-features-successful-targets/datasets/nonredundant_features/',
            out_dir='/Users/kanetian7/omic-features-successful-targets/kane/'):
    """Extract selected feature columns of one dataset and write them to ``<out_dir>/<dataset>.csv``.

    The input is ``<data_dir>/<dataset>.txt`` (tab separated). If
    ``<data_dir>/<dataset>.txt.gz`` exists it is first decompressed to that
    ``.txt`` path (overwriting it), so a fresh checkout that only ships the
    gzip works on the first call.

    Parameters
    ----------
    dataset : str
        Dataset abbreviation; used as the file stem and as the prefix of the
        feature column names (``<dataset>_<feature>``).
    features : list of str
        Un-prefixed feature names to keep.
    data_dir : str, optional
        Directory holding the ``.txt`` / ``.txt.gz`` matrices.
    out_dir : str, optional
        Directory the csv is written to (created if missing).

    Raises
    ------
    FileNotFoundError
        If neither ``<dataset>.txt`` nor ``<dataset>.txt.gz`` is in ``data_dir``.
    KeyError
        If a requested feature (or GeneSym/class/GeneID) is not a column.
    """
    txt_path = os.path.join(data_dir, dataset + '.txt')
    gz_path = txt_path + '.gz'

    if os.path.isfile(gz_path):
        # Decompress up front so the read below never depends on directory listing order.
        with gzip.open(gz_path, 'rb') as f_in, open(txt_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    elif not os.path.isfile(txt_path):
        raise FileNotFoundError(
            'no ' + dataset + '.txt or ' + dataset + '.txt.gz found in ' + data_dir)

    data = pd.read_csv(txt_path, sep='\t')

    # Drop the extra header rows, which carry '#' in the first column.
    data = data[~data['#'].str.contains("#")]

    # Feature names come from the file header (5th column onwards); the names of
    # the first four columns come from the first remaining row.
    feature_cols = [dataset + '_' + str(x) for x in list(data.columns)[4:]]
    meta_cols = list(data.iloc[0, :].values)[:4]

    # Remove that label row itself (its first cell contains "Gene").
    data = data[~data['#'].str.contains("Gene")]
    data.columns = meta_cols + feature_cols

    wanted = ['GeneSym', 'class', 'GeneID'] + [dataset + '_' + str(x) for x in features]
    data = data[wanted]
    # Keep labelled examples only.
    data = data[data['class'] != 'unknown'].reset_index(drop=True)

    print(data.columns)
    print(len(data))

    os.makedirs(out_dir, exist_ok=True)
    data.to_csv(os.path.join(out_dir, dataset + '.csv'), index=False)

In [14]:
# Build one csv per dataset.
# First collect, in row order, the list of selected features for each dataset abbreviation.
datasets_features = {}
for dataset, feature in zip(df['abbreviation'], df['feature']):
    datasets_features.setdefault(dataset, []).append(feature)

# Then export every dataset with its own feature list.
for dataset in datasets_features:
    print('on ' + dataset)
    save_df(dataset, datasets_features[dataset])

on brainatlasadultmouse_cleaned
Index(['GeneSym', 'class', 'GeneID',
       'brainatlasadultmouse_cleaned_r3 roof plate',
       'brainatlasadultmouse_cleaned_mean'],
      dtype='object')
287
on brainatlasadulthuman_cleaned
Index(['GeneSym', 'class', 'GeneID', 'brainatlasadulthuman_cleaned_mean'], dtype='object')
287
on biogpsmouse_cleaned
Index(['GeneSym', 'class', 'GeneID', 'biogpsmouse_cleaned_mean',
       'biogpsmouse_cleaned_stdv'],
      dtype='object')
313
on hpatissuesmrna_cleaned
Index(['GeneSym', 'class', 'GeneID', 'hpatissuesmrna_cleaned_mean',
       'hpatissuesmrna_cleaned_stdv'],
      dtype='object')
314
on biogpshuman_cleaned
Index(['GeneSym', 'class', 'GeneID', 'biogpshuman_cleaned_mean',
       'biogpshuman_cleaned_stdv'],
      dtype='object')
320
on jensentissueexpts_cleaned
Index(['GeneSym', 'class', 'GeneID', 'jensentissueexpts_cleaned_mean',
       'jensentissueexpts_cleaned_peripheral blood',
       'jensentissueexpts_cleaned_hematopoietic cell',
       'jense

In [21]:
# Combine all the per-dataset csvs into one table keyed on gene identity and class.
kane_dir = '/Users/kanetian7/omic-features-successful-targets/kane/'
merge_keys = ['GeneSym', 'class', 'GeneID']

# os.listdir returns names in arbitrary order; sort so the column/row order of
# final_data.csv is the same on every run and machine. Skip the output file itself.
csv_names = sorted(
    name for name in os.listdir(kane_dir)
    if name.endswith('.csv') and name != 'final_data.csv'
)
if not csv_names:
    raise FileNotFoundError('no per-dataset csv files found in ' + kane_dir)

fdf = None
for name in csv_names:
    part = pd.read_csv(os.path.join(kane_dir, name))
    # Outer merge keeps genes that are present in only some of the datasets.
    fdf = part if fdf is None else pd.merge(fdf, part, how='outer', on=merge_keys)

print(len(fdf.columns))
# Genes missing from a dataset get 0 for that dataset's features.
fdf = fdf.fillna(0)
fdf.to_csv(os.path.join(kane_dir, 'final_data.csv'), index=False)

54


In [22]:
# How many positive and how many negative examples ended up in the merged table?
class_labels = fdf['class']
print(int((class_labels == 'positive').sum()))
print(int((class_labels == 'negative').sum()))

259
72
