### Imports

In [1]:
import pandas as pd
import numpy as np
import zipfile
import pandas as pd
import json
from sklearn.preprocessing import StandardScaler

### Unzip and read in data

In [None]:
# Where the zipped archive lives, and which member inside it holds the table
zip_path = "data/expression_data.zip"
tsv_filename = "master_expression_data.tsv"

# Read the tab-separated member directly from the opened archive into a dataframe
# Should take ~4 mins
with zipfile.ZipFile(zip_path, mode="r") as archive, archive.open(tsv_filename) as tsv_file:
    df = pd.read_csv(tsv_file, sep="\t")

### Filter and process data based on genes.json

In [None]:
# Open the gene markers list and read it in.
# The code below treats it as a mapping: marker name -> list of gene column names.
with open("../genes.json", "r", encoding="utf-8") as f:
    marker_genes = json.load(f)

# We don't need everything - just the ids and the genes of interest.
# De-duplicate with dict.fromkeys (keeps first-seen order) instead of set():
# iteration order of a set of strings changes between interpreter runs, which
# would shuffle the column order of the saved table from run to run.
keep = ["id", "project_id"]
gene_cols = list(
    dict.fromkeys(gene for genes in marker_genes.values() for gene in genes)
)
# .copy() so the assignments below modify an independent frame, not a view of df
data = df[keep + gene_cols].copy()
# Drop the first five characters of every project id
# NOTE(review): presumably strips a fixed-length prefix - confirm against the raw data
data["project_id"] = data["project_id"].str[5:]

# Log transform (log(1 + x)) the gene columns, then z-score each gene column.
# Columns are addressed by name rather than by position after the two id columns.
data[gene_cols] = np.log1p(data[gene_cols])
scaler = StandardScaler()
data[gene_cols] = scaler.fit_transform(data[gene_cols])

# Get intermediate scores for each marker: row-wise mean over that marker's genes
for marker, genes in marker_genes.items():
    data[marker] = data[genes].mean(axis=1)

# Get the overall score: row-wise mean over the per-marker scores
data["overall_score"] = data[list(marker_genes)].mean(axis=1)

### Save data as processed_data.tsv

In [4]:
# Write the processed table out as tab-separated text, without the row index
data.to_csv(
    "data/processed_data.tsv",
    index=False,
    sep="\t",
)