In [8]:
import os
import json

import pandas as pd
import numpy as np
import matplotlib 
import matplotlib.pyplot as plt

### Load data and sample metadata

In [2]:
# Directory containing the compendium files, located one level above the
# notebook's current working directory, under "data".
data_path = os.path.join(
    os.getcwd(), "..", "data", "S_cereviseae_compendia_recount_bio_2022_03_16"
)

In [29]:
# Read the aggregated metadata file. JSON is UTF-8 encoded, so decode it
# explicitly instead of relying on the platform's locale default encoding.
with open(
        os.path.join(data_path, 'aggregated_metadata.json'),
        'r', encoding='utf-8') as jsonfile:
    metadata_file = json.load(jsonfile)

# Re-key the experiment records by their accession code. The original keys of
# the "experiments" mapping are not used, so only its values are iterated.
tables = {
    experiment["accession_code"]: experiment
    for experiment in metadata_file['experiments'].values()
}

# Transpose so that each accession code becomes a row of the metadata table.
metadata = pd.DataFrame(tables).T

In [68]:
# Load the tab-separated compendium table: the first line supplies the column
# names and the first column is used as the row index.
compendium_tsv = os.path.join(data_path, "SACCHAROMYCES_CEREVISIAE.tsv")
data = pd.read_csv(
    compendium_tsv,
    sep="\t",
    header=0,
    index_col=0,
)
print("Loaded data with dimensions:", str(data.shape))

Loaded data with dimensions: (5370, 12428)


In [69]:
# Randomly draw 80% of the columns for the training split; the fixed
# random_state makes the draw repeatable.
train = data.sample(
    frac=0.8,
    axis="columns",
    random_state=42,
)
print("Train data dimensions:", train.shape)

Train data dimensions: (5370, 9942)


In [70]:
# Held-out split: every column of `data` that was not drawn into `train`.
test = data.drop(columns=train.columns)
print("Test data dimensions:", test.shape)

Test data dimensions: (5370, 2486)


### Load gene metadata

In [71]:
# Get metadata for genes and extract genes with "transporter" annotation.
# The file is read as having no header row, so column names are given here.
gene_mapper = pd.read_csv(
    os.path.join(os.getcwd(), "..", "data", "yeast_orf_dict.csv"),
    header=None,
    names=["id", "symbol", "name", "description"],
)

# na=False counts genes with a missing description as non-matches; without it
# the mask contains NaN and boolean indexing with .loc raises a ValueError.
# regex=False makes this an explicit literal (case-sensitive) substring match.
# NOTE(review): descriptions spelled "Transporter" are not matched - confirm
# that case-sensitive matching is intended.
is_transporter = gene_mapper["description"].str.contains(
    "transporter", regex=False, na=False
)
transporters = gene_mapper.loc[is_transporter]
transporters_list = transporters["id"].tolist()

In [74]:
# Select the transporter gene rows from each split by row label.
# reindex keeps every requested ID, so an ID that is absent from the data
# index appears as an all-NaN row rather than being dropped.
train_transporters = train.reindex(index=transporters_list)

test_transporters = test.reindex(index=transporters_list)