In [9]:
import os
import json

import pandas as pd
import numpy as np
import matplotlib 
import matplotlib.pyplot as plt

from linear_modeling import *

### Load data and sample metadata

In [2]:
# Directory holding the expression compendium, resolved relative to the
# notebook's current working directory (one level up, under `data/`).
data_path = os.path.join(
    os.getcwd(), "..", "data", "S_cereviseae_compendia_recount_bio_2022_03_16"
)

In [3]:
# Read the aggregated metadata shipped alongside the expression matrix.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's locale default.
with open(
    os.path.join(data_path, "aggregated_metadata.json"), "r", encoding="utf-8"
) as jsonfile:
    metadata_file = json.load(jsonfile)

# Re-key the experiment records by their accession code. The original keys of
# `metadata_file["experiments"]` are not needed, so only the values are walked.
tables = {
    experiment["accession_code"]: experiment
    for experiment in metadata_file["experiments"].values()
}

# Building the frame from a dict of records yields one column per accession
# code; transposing gives one row per experiment instead.
metadata = pd.DataFrame(tables).T

In [4]:
# Tab-separated expression matrix: the first row supplies the column labels
# and the first column supplies the row index.
data = pd.read_csv(
    os.path.join(data_path, "SACCHAROMYCES_CEREVISIAE.tsv"),
    sep="\t",
    header=0,
    index_col=0,
)
print("Loaded data with dimensions:", data.shape)

Loaded data with dimensions: (5370, 12428)


In [277]:
# Training split: a reproducible random draw of 60% of the columns
# (fixed seed), keeping every row.
train = data.sample(axis=1, frac=0.6, random_state=42)
print("Train data dimensions:", train.shape)

Train data dimensions: (5370, 7457)


In [278]:
# Test split: whatever columns were not drawn into `train`.
test = data.drop(columns=train.columns)
print("Test data dimensions:", test.shape)

Test data dimensions: (5370, 4971)


### Load gene metadata

In [279]:
# Load the gene annotation table. The CSV has no header row, so the column
# names are supplied explicitly.
gene_mapper = pd.read_csv(
    os.path.join(os.getcwd(), "..", "data", "yeast_orf_dict.csv"),
    header=None,
    names=["id", "symbol", "name", "description"],
)

# Keep genes whose description mentions "transporter" (case-sensitive).
# `na=False` treats a missing description as "no match"; without it the mask
# contains NaN and `.loc` refuses to index with it. `regex=False` makes the
# plain-substring intent explicit.
is_transporter = gene_mapper["description"].str.contains(
    "transporter", na=False, regex=False
)
transporters = gene_mapper.loc[is_transporter]
transporters_list = transporters["id"].tolist()

In [280]:
# Restrict both splits to the transporter rows, in the order given by
# `transporters_list`. `reindex` does not raise for ids absent from the
# frame; those rows are filled with NaN.
train_transporters = train.reindex(index=transporters_list)
test_transporters = test.reindex(index=transporters_list)

In [281]:
# Look up the id of the transporter that serves as the prediction target.
# The first matching row is used; an unknown symbol raises IndexError.
_name = "CTP1"
_this = transporters.loc[transporters["symbol"] == _name, "id"].tolist()[0]

# Labels: the target row of each split, transposed so that the split's
# columns become the rows of an (n, 1) array.
training_labels = np.array(train.loc[[_this]].T)
test_labels = np.array(test.loc[[_this]].T)

# Inputs: every row except the target, transposed the same way.
training_data = np.array(train.loc[train.index != _this].T)
test_data = np.array(test.loc[test.index != _this].T)

# Append a trailing column of ones to each input matrix
# (presumably the intercept term for the linear model -- confirm downstream).
training_data = np.hstack((training_data, np.ones((training_data.shape[0], 1))))
test_data = np.hstack((test_data, np.ones((test_data.shape[0], 1))))

In [282]:
# Short aliases for the training inputs and labels (same objects, no copy).
X, y = training_data, training_labels