# Generate nodes 1

In [1]:
# prepare_condition_nodes.py
from pathlib import Path
import numpy as np
import pandas as pd

# ---- paths ----
# Project root; every input and output location below is derived from it.
BASE = Path("/storage/users/job37yv/Projects/Franziska_faber")
ANALYSIS = BASE.joinpath("analysis")
DATA_PROCESSED = BASE.joinpath("data", "processed_data")

# Per-condition node tables are written here (directory is created if missing).
OUT_DIR = ANALYSIS.joinpath("Networks", "nodes")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Inputs: the expression AnnData file and an optional pickled mapping bundle.
H5AD = DATA_PROCESSED.joinpath("host_bulk_fractional_counts_ensembl.h5ad")
MAPS_PKL = DATA_PROCESSED.joinpath("ensembl_mappings.pkl")

# ---- params ----
COND_COL = "condition"  # column of adata.obs used to group samples
CUTOFF_Q = 0.05         # 5th percentile on nonzero condition-mean values

# ---- load adata_ens ----
import anndata as ad
adata = ad.read_h5ad(H5AD)
assert COND_COL in adata.obs.columns, f"'{COND_COL}' not found in adata.obs"

# use raw fractional counts (layers['counts']) if available, else fall back to adata.X
_mat = adata.layers["counts"] if "counts" in adata.layers else adata.X
# np.asarray() on a scipy sparse matrix produces a 0-d object array rather than
# a 2-D numeric array, which breaks the DataFrame construction below; densify
# explicitly whenever the matrix offers toarray().
if hasattr(_mat, "toarray"):
    _mat = _mat.toarray()
# rows = samples (obs_names), columns = genes (var_names)
counts = pd.DataFrame(np.asarray(_mat), index=adata.obs_names, columns=adata.var_names)

# ---- replicate means per condition ----
meta = adata.obs[[COND_COL]].copy()
# `observed` is passed explicitly: grouping by a categorical column without it
# triggers a pandas FutureWarning about the changing default. False keeps the
# grouping this cell has produced so far (every category gets a row).
cond_means = counts.groupby(meta[COND_COL], observed=False).mean()  # rows = conditions, cols = Ensembl (with version)

# ---- per-condition threshold & filter ----
def thresh_series(s: pd.Series, q=CUTOFF_Q):
    """Zero out the entries of ``s`` that fall below a quantile-based cutoff.

    The cutoff is the ``q``-quantile of the strictly positive entries of ``s``;
    it is 0.0 when ``s`` has no positive entry.

    Returns
    -------
    tuple
        ``(filtered, cutoff)`` where ``filtered`` is a new Series in which every
        value that is not ``>= cutoff`` (NaN included) is replaced by 0.0, and
        ``cutoff`` is the float threshold that was applied.
    """
    positive = s[s > 0]
    if len(positive):
        cutoff = float(np.quantile(positive, q))
    else:
        cutoff = 0.0
    # mask() replaces where the condition is True, i.e. wherever the value
    # fails the ">= cutoff" test
    fails_cutoff = ~(s >= cutoff)
    return s.mask(fails_cutoff, 0.0), cutoff

# Apply the cutoff to every condition's row of means, remembering both the
# filtered values and the cutoff that was used.
filtered, thresholds = {}, {}
for cond in cond_means.index:
    filtered[cond], thresholds[cond] = thresh_series(cond_means.loc[cond], q=CUTOFF_Q)
filtered = pd.DataFrame(filtered).T  # same shape as cond_means

print("[INFO] per-condition thresholds (5th percentile of nonzero means):")
for cond_name, cutoff in thresholds.items():
    print(f"  {cond_name}: {cutoff:.4g}")

# ---- Ensembl(version) → UniProt mapping ----
# Sources are tried in order; a later source is only consulted while the
# mapping is still empty.
ens2uniprot = {}

# 1) try pickle: usable only if it holds a dict with a dict under "ens2uniprot"
if MAPS_PKL.exists():
    import pickle
    # NOTE(review): pickle.load can run arbitrary code; only load a trusted file.
    with MAPS_PKL.open("rb") as fh:
        maps = pickle.load(fh)
    candidate = maps.get("ens2uniprot") if isinstance(maps, dict) else None
    if isinstance(candidate, dict):
        ens2uniprot = candidate

# 2) try adata.var column if pickle absent/empty
if not ens2uniprot and "uniprot_id" in adata.var.columns:
    def first_u(x):
        """Return the first item of a list (None if it is empty); pass anything else through."""
        if not isinstance(x, list):
            return x
        return x[0] if x else None
    ens2uniprot = dict(zip(adata.var_names, map(first_u, adata.var["uniprot_id"])))

# 3) last resort: mygene (strip versions)
if not ens2uniprot:
    try:
        import mygene
        mg = mygene.MyGeneInfo()
        ids = pd.Index(adata.var_names.astype(str))
        core = ids.str.replace(r"\.\d+$", "", regex=True)
        q = mg.querymany(
            core.tolist(),
            scopes="ensembl.gene",
            fields="uniprot.Swiss-Prot",
            species="human",
            as_dataframe=True,
            returnall=False,
            verbose=False,
        )
        if isinstance(q, pd.DataFrame) and not q.empty:
            # Keep one hit per version-less ID, then re-attach the versioned IDs.
            # (assumes the result frame is indexed by "query" — TODO confirm)
            df = q.reset_index().rename(columns={"query": "ensembl_core"})
            df = df.drop_duplicates("ensembl_core")
            id_table = pd.DataFrame({"ensembl_core": core.values, "ensg_ver": ids.values})
            back = id_table.merge(df, on="ensembl_core", how="left")

            def pick(v):
                """Return the first item of a list (None if it is empty); pass anything else through."""
                if not isinstance(v, list):
                    return v
                return v[0] if v else None

            # If no hit carried the field, the column is absent: map every ID to None.
            if "uniprot.Swiss-Prot" in back.columns:
                hits = back["uniprot.Swiss-Prot"]
            else:
                hits = [None] * len(back)
            ens2uniprot = {ensg: pick(hit) for ensg, hit in zip(back["ensg_ver"], hits)}
    except Exception as e:
        print(f"[WARN] mygene fallback failed: {e}")

# clean mapping (drop Nones/NaNs/empty)
def _clean_uniprot(value):
    """Reduce one mapping value to a usable accession string, or None.

    Lists, tuples and arrays are reduced to their first usable element, in line
    with the first-element choice made when the mapping is built from adata.var
    or mygene. None, NaN/NA, empty strings and the textual placeholders
    "nan"/"None" all yield None.
    """
    if isinstance(value, (list, tuple, np.ndarray)):
        for item in value:
            cleaned = _clean_uniprot(item)
            if cleaned is not None:
                return cleaned
        return None
    if value is None:
        return None
    try:
        if pd.isna(value):
            return None
    except (TypeError, ValueError):
        pass  # not something pandas can classify as a scalar; use the text check
    text = str(value).strip()
    return text if text not in ("", "nan", "None") else None


# A plain membership test such as `v not in [None, [], "nan"]` never matches a
# float NaN (NaN compares unequal to everything), so NaNs would survive it.
# Normalise each value instead and keep only real accessions.
ens2uniprot = {
    ensg: acc
    for ensg, acc in ((k, _clean_uniprot(v)) for k, v in ens2uniprot.items())
    if acc is not None
}

# ---- for each condition: keep expressed genes, map to UniProt, aggregate ----
for cond in filtered.index:
    cond_row = filtered.loc[cond]
    expressed = cond_row[cond_row > 0.0]  # survived the per-condition cutoff
    if expressed.empty:
        print(f"[WARN] {cond}: no genes pass threshold; skipping file.")
        continue

    # map Ensembl(version) → UniProt; IDs missing from the mapping become NaN
    uni = expressed.index.to_series().map(ens2uniprot)
    keep = uni.notna()
    if not keep.any():
        print(f"[WARN] {cond}: no UniProt matches; skipping file.")
        continue

    mapped = pd.DataFrame({
        "uniprot": uni[keep].astype(str).values,
        "expr": expressed[keep].values,
    })
    # several Ensembl IDs can share one UniProt accession: keep the highest value
    best_per_protein = mapped.groupby("uniprot", as_index=True)["expr"].max()
    df_nodes = best_per_protein.sort_values(ascending=False).reset_index()

    outp = OUT_DIR / f"{cond}_nodes.csv"
    df_nodes.to_csv(outp, index=False)
    print(f"[OK] {cond}: wrote {len(df_nodes)} nodes → {outp}")

print("[DONE] node preparation complete.")


  cond_means = counts.groupby(meta[COND_COL]).mean()  # rows = conditions, cols = Ensembl (with version)


[INFO] per-condition thresholds (5th percentile of nonzero means):
  Bt: 0.06333
  Cd: 0.05667
  Co: 0.06333
  Mock: 0.09017
[OK] Bt: wrote 16484 nodes → /storage/users/job37yv/Projects/Franziska_faber/analysis/Networks/nodes/Bt_nodes.csv
[OK] Cd: wrote 16504 nodes → /storage/users/job37yv/Projects/Franziska_faber/analysis/Networks/nodes/Cd_nodes.csv
[OK] Co: wrote 16502 nodes → /storage/users/job37yv/Projects/Franziska_faber/analysis/Networks/nodes/Co_nodes.csv
[OK] Mock: wrote 16580 nodes → /storage/users/job37yv/Projects/Franziska_faber/analysis/Networks/nodes/Mock_nodes.csv
[DONE] node preparation complete.


# generate nodes 2

In [3]:
# === Build and save expr_mean (genes × conditions) with UniProt & symbol ===
from pathlib import Path
import numpy as np
import pandas as pd

# Project root; inputs and outputs below are derived from it.
BASE = Path("/storage/users/job37yv/Projects/Franziska_faber")
ANALYSIS = BASE.joinpath("analysis")
DATA_PROCESSED = BASE.joinpath("data", "processed_data")
OUT = ANALYSIS.joinpath("expr_mean")
OUT.mkdir(parents=True, exist_ok=True)

H5AD = DATA_PROCESSED.joinpath("host_bulk_fractional_counts_ensembl.h5ad")
EXPR_MEAN_CSV = OUT.joinpath("expr_mean.csv")          # main table
EXPR_MEAN_PKL = OUT.joinpath("expr_mean.pkl")          # exact dtypes (optional)

COND_COL = "condition"   # change if needed

import anndata as ad
adata = ad.read_h5ad(H5AD)
# fail early if the grouping column is missing from the sample annotations
assert COND_COL in adata.obs.columns, f"'{COND_COL}' not found in adata.obs"

# choose a layer for averaging:
# - prefer 'log1p' if you want log-scale means
# - else use 'counts' (fractional), else adata.X
# NOTE(review): a later cell applies log2(x+1) to the table derived from these
# means; if the 'log1p' layer is picked here, the values end up log-transformed
# twice — confirm which layer is intended.
if "log1p" in adata.layers:
    _mat = adata.layers["log1p"]
elif "counts" in adata.layers:
    _mat = adata.layers["counts"]
else:
    _mat = adata.X

# np.asarray() on a scipy sparse matrix yields a 0-d object array instead of a
# 2-D numeric array, so densify explicitly when the matrix offers toarray().
if hasattr(_mat, "toarray"):
    _mat = _mat.toarray()
MAT = np.asarray(_mat)   # samples × genes

counts = pd.DataFrame(MAT, index=adata.obs_names, columns=adata.var_names)

# Average the replicates of each condition (rows = conditions, cols = genes).
meta = adata.obs[[COND_COL]].copy()
by_condition = counts.groupby(meta[COND_COL], observed=False)  # explicit `observed` silences FutureWarning
cond_means = by_condition.mean()

# expr_mean layout: one row per gene ID, one column per condition
expr_mean = cond_means.transpose().copy()
ensg_idx = expr_mean.index.astype(str)

# ---- Add UniProtID (Swiss-Prot) ----
# Priority: existing adata.var column -> mygene fallback
def _first_accession(v):
    """Return one accession for a cell of an adata.var UniProt column.

    Strings and other scalars (including NaN) pass through unchanged; lists,
    tuples and arrays are reduced to their first element, or None when empty.
    """
    if isinstance(v, str):
        return v
    if isinstance(v, (list, tuple, np.ndarray)):
        return v[0] if len(v) else None
    return v


uni = None
for col in ["uniprot_id", "uniprot", "UniProtID", "uniprot_swissprot"]:
    if col in adata.var.columns:
        # Normalise every cell on its own. Inspecting only the first cell would
        # fail on an empty table and would turn strings/tuples into None in a
        # column whose first entry happens to be a list.
        uni = pd.Series(
            [_first_accession(v) for v in adata.var[col]],
            index=adata.var_names,
            dtype="object",
        )
        break

# No annotation column was found: query mygene with version-less Ensembl IDs.
if uni is None:
    try:
        import mygene
        mg = mygene.MyGeneInfo()
        core = pd.Index(ensg_idx).str.replace(r"\.\d+$", "", regex=True)
        q = mg.querymany(
            core.tolist(),
            scopes="ensembl.gene",
            fields="uniprot.Swiss-Prot",
            species="human",
            as_dataframe=True,
            returnall=False,
            verbose=False,
        )
        if isinstance(q, pd.DataFrame) and not q.empty:
            # one hit per version-less ID, then re-attach the versioned IDs
            # (assumes the result frame is indexed by "query" — TODO confirm)
            df = q.reset_index().rename(columns={"query": "ensembl_core"})
            df = df.drop_duplicates("ensembl_core")
            id_table = pd.DataFrame({"ensembl_core": core.values, "ensg_ver": ensg_idx})
            back = id_table.merge(df, on="ensembl_core", how="left")

            def pick(v):
                """Return the first item of a list (None if it is empty); pass anything else through."""
                if not isinstance(v, list):
                    return v
                return v[0] if v else None

            uni = pd.Series([pick(x) for x in back.get("uniprot.Swiss-Prot")], index=back["ensg_ver"])
    except Exception as e:
        print(f"[WARN] mygene unavailable/failed: {e}")

# Still nothing: use an all-missing column so the later attach step works.
if uni is None:
    uni = pd.Series(index=expr_mean.index, dtype="object")

# ---- Add gene_symbol ----
# Prefer adata.var['gene_symbol']; fall back to the row's Ensembl ID
ensg_fallback = pd.Series(expr_mean.index, index=expr_mean.index)
if "gene_symbol" in adata.var.columns:
    # Keep missing values as real NaN here. Casting to str first would turn NaN
    # into the text "nan", notna() would then be True for every row, and the
    # Ensembl-ID fallback could never apply.
    sym = pd.Series(adata.var["gene_symbol"].astype(object).values, index=adata.var_names)
    sym = sym.reindex(expr_mean.index)
    # also treat blank cells and textual placeholders as missing
    missing = sym.isna() | sym.astype(str).str.strip().isin(["", "nan", "None"])
    sym = sym.mask(missing, ensg_fallback)
else:
    sym = ensg_fallback

# attach annotations (aligned on the gene index of expr_mean)
expr_mean["UniProtID"]   = uni.reindex(expr_mean.index).astype("object")
expr_mean["gene_symbol"] = sym.astype(str)

# (optional) drop rows with no UniProt or symbol
# expr_mean = expr_mean.dropna(subset=["UniProtID", "gene_symbol"])

# Save the annotated table as CSV and as pickle.
expr_mean.to_csv(EXPR_MEAN_CSV)
expr_mean.to_pickle(EXPR_MEAN_PKL)
print(f"[OK] expr_mean saved →\n  {EXPR_MEAN_CSV}\n  {EXPR_MEAN_PKL}")

# Preview: up to five condition columns (the two annotation columns are the
# last two columns of the table) followed by the annotations.
print("Preview:")
n_cond_preview = min(5, expr_mean.shape[1] - 2)
preview_cols = [*expr_mean.columns[:n_cond_preview], "UniProtID", "gene_symbol"]
print(expr_mean.head(5)[preview_cols])


Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed


[OK] expr_mean saved →
  /storage/users/job37yv/Projects/Franziska_faber/analysis/expr_mean/expr_mean.csv
  /storage/users/job37yv/Projects/Franziska_faber/analysis/expr_mean/expr_mean.pkl
Preview:
condition                   Bt          Cd      Co         Mock UniProtID  \
ENSG00000223972.5     5.033333   11.420000    4.97    10.010000       NaN   
ENSG00000227232.5  1052.246667  946.046667  800.13  1313.433333       NaN   
ENSG00000278267.1     0.000000    0.000000    0.00     0.000000       NaN   
ENSG00000243485.5     0.323333    0.400000    0.93     1.036667       NaN   
ENSG00000284332.1     0.000000    0.000000    0.00     0.000000       NaN   

condition          gene_symbol  
ENSG00000223972.5      DDX11L1  
ENSG00000227232.5       WASH7P  
ENSG00000278267.1    MIR6859-1  
ENSG00000243485.5  MIR1302-2HG  
ENSG00000284332.1    MIR1302-2  


In [5]:
from pathlib import Path
import pandas as pd
import numpy as np

BASE = Path("/storage/users/job37yv/Projects/Franziska_faber")
OUT = BASE.joinpath("analysis", "expr_mean")
EXPR_MEAN_CSV = OUT.joinpath("expr_mean.csv")           # already created in your run
EXPR_MEAN_PKL = OUT.joinpath("expr_mean.pkl")

# Load the current expr_mean (genes × conditions + UniProtID + gene_symbol).
# NOTE(review): the log2-transform cells further down write to this same
# expr_mean.pkl path, so re-running this cell after them loads the transformed
# table instead of the raw means.
expr_mean = pd.read_pickle(EXPR_MEAN_PKL)
cond_cols = [
    col for col in expr_mean.columns
    if col != "UniProtID" and col != "gene_symbol"
]

# ---------- 1) Coverage report for current table ----------
n_genes = expr_mean.shape[0]
denom = max(n_genes, 1)  # keeps the percentages defined for an empty table

# gene_symbol is stored as text with the row's Ensembl ID as fallback, so
# notna() is True for every row and would always report 100%. Count only real
# symbols: not blank, not a textual placeholder, not equal to the row's own ID.
sym_text = expr_mean["gene_symbol"].astype(str).str.strip()
row_ids = expr_mean.index.astype(str).to_numpy()
is_placeholder = sym_text.isin(["", "nan", "None"]) | (sym_text.to_numpy() == row_ids)
n_symbol = int((~is_placeholder).sum())
n_uniprot = int(expr_mean["UniProtID"].notna().sum())
print(f"[COVERAGE] rows={n_genes} | gene_symbol mapped={n_symbol} ({n_symbol/denom:.1%}) | "
      f"UniProtID mapped={n_uniprot} ({n_uniprot/denom:.1%})")

# Optional: how many row IDs look like Ensembl gene IDs (version suffix optional
# in the pattern); cast to str so a non-string index cannot break the .str call
is_ens = expr_mean.index.to_series().astype(str).str.match(r"^ENSG\d+(?:\.\d+)?$").fillna(False)
print(f"[INFO] Ensembl-like IDs (with version): {is_ens.sum()} ({is_ens.mean():.1%})")

# ---------- 2) Remap UniProt after stripping Ensembl version ----------
ids = expr_mean.index.astype(str)
ens_core = ids.str.replace(r"\.\d+$", "", regex=True)  # strip version suffix

# Query mygene with the version-less IDs. If the query fails or returns
# nothing, the existing UniProtID column is reused unchanged (see below).
uni_novers = None
try:
    import mygene
    mg = mygene.MyGeneInfo()
    q = mg.querymany(
        ens_core.tolist(),
        scopes="ensembl.gene",
        fields="uniprot.Swiss-Prot",
        species="human",
        as_dataframe=True,
        returnall=False,
        verbose=False,
    )
    if isinstance(q, pd.DataFrame) and not q.empty:
        # one hit per version-less ID
        # (assumes the result frame is indexed by "query" — TODO confirm)
        df = q.reset_index().rename(columns={"query":"ensembl_core"})
        df = df.drop_duplicates("ensembl_core")
        # map back to expr_mean order
        id_table = pd.DataFrame({"ensembl_core": ens_core.values, "ensg_ver": ids})
        back = id_table.merge(df, on="ensembl_core", how="left")

        def pick(v):
            """Return the first item of a list (None if it is empty); pass anything else through."""
            if not isinstance(v, list):
                return v
            return v[0] if v else None

        uni_novers = pd.Series([pick(x) for x in back.get("uniprot.Swiss-Prot")], index=back["ensg_ver"])
except Exception as e:
    print(f"[WARN] mygene remap failed: {e}")

if uni_novers is None:
    # fall back to the existing column (no change)
    uni_novers = expr_mean["UniProtID"].copy()

# coverage report after remap
cover_remap = uni_novers.notna().sum()
print(f"[REMAP] UniProtID (version-stripped Ensembl) mapped={cover_remap}/{n_genes} ({cover_remap/n_genes:.1%})")

# ---------- 3) Build a second expr_mean with the remapped UniProt ----------
# assign() works on a copy, so expr_mean itself keeps its current UniProtID column
expr_mean_novers = expr_mean.assign(UniProtID=uni_novers.astype("object"))

# (optional) keep a clean version with only mapped UniProt rows
# expr_mean_novers_mapped = expr_mean_novers.dropna(subset=["UniProtID"])

# ---------- 4) Save both ----------
expr_mean.to_csv(OUT / "expr_mean_current.csv")
for writer, file_name in (
    (expr_mean_novers.to_csv, "expr_mean_novers.csv"),
    (expr_mean_novers.to_pickle, "expr_mean_novers.pkl"),
):
    writer(OUT / file_name)

print(f"[OK] wrote:\n  {OUT/'expr_mean_current.csv'}\n  {OUT/'expr_mean_novers.csv'}\n  {OUT/'expr_mean_novers.pkl'}")

# quick peek: first 10 rows that carry a UniProt ID after the remap
peek = expr_mean_novers.dropna(subset=["UniProtID"]).head(10)
print("\n[PREVIEW remapped (first 10 with UniProt)]")
print(peek.loc[:, [*cond_cols, "UniProtID", "gene_symbol"]])


Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed


[COVERAGE] rows=67016 | gene_symbol mapped=67016 (100.0%) | UniProtID mapped=21807 (32.5%)
[INFO] Ensembl-like IDs (with version): 67016 (100.0%)
[REMAP] UniProtID (version-stripped Ensembl) mapped=21806/67016 (32.5%)
[OK] wrote:
  /storage/users/job37yv/Projects/Franziska_faber/analysis/expr_mean/expr_mean_current.csv
  /storage/users/job37yv/Projects/Franziska_faber/analysis/expr_mean/expr_mean_novers.csv
  /storage/users/job37yv/Projects/Franziska_faber/analysis/expr_mean/expr_mean_novers.pkl

[PREVIEW remapped (first 10 with UniProt)]
condition                   Bt          Cd          Co        Mock UniProtID  \
ENSG00000186092.6     0.243333    1.110000    0.250000    0.513333    Q8NH21   
ENSG00000284733.1     0.000000    0.000000    0.000000    0.000000    Q6IEY1   
ENSG00000284662.1     0.000000    0.000000    0.000000    0.000000    Q6IEY1   
ENSG00000187634.12   54.796667   59.793333   64.236667   73.033333    Q96NU1   
ENSG00000188976.11  895.780000  782.843333  753.106667 

In [7]:
# Show the first 45 rows; `preview_cols` comes from the cell that built expr_mean.
print(expr_mean_novers.head(45).loc[:, preview_cols])

condition                    Bt           Cd           Co          Mock  \
ENSG00000223972.5      5.033333    11.420000     4.970000     10.010000   
ENSG00000227232.5   1052.246667   946.046667   800.130000   1313.433333   
ENSG00000278267.1      0.000000     0.000000     0.000000      0.000000   
ENSG00000243485.5      0.323333     0.400000     0.930000      1.036667   
ENSG00000284332.1      0.000000     0.000000     0.000000      0.000000   
ENSG00000237613.2      0.120000     0.730000     0.660000      0.820000   
ENSG00000268020.3      0.000000     0.036667     0.000000      0.583333   
ENSG00000240361.2      0.443333     0.193333     0.396667      0.046667   
ENSG00000186092.6      0.243333     1.110000     0.250000      0.513333   
ENSG00000238009.6     86.910000   114.360000    67.123333    119.203333   
ENSG00000239945.1      0.000000     0.000000     0.000000      0.000000   
ENSG00000233750.3      8.760000     7.750000    11.173333      8.276667   
ENSG00000268903.1     86.

## Uniprot only

In [8]:
# === Reduce expr_mean to UniProt-mapped rows only ===

from pathlib import Path
import pandas as pd

BASE = Path("/storage/users/job37yv/Projects/Franziska_faber")
OUT = BASE.joinpath("analysis", "expr_mean")

# load the remapped version
expr_mean_novers = pd.read_pickle(OUT / "expr_mean_novers.pkl")

# keep only the rows that carry a UniProtID
has_uniprot = expr_mean_novers["UniProtID"].notna()
expr_mean_up = expr_mean_novers.loc[has_uniprot].copy()

print(f"[FILTER] kept {expr_mean_up.shape[0]}/{expr_mean_novers.shape[0]} rows "
      f"({expr_mean_up.shape[0]/expr_mean_novers.shape[0]:.1%}) with UniProt IDs")

# save reduced table (CSV and pickle)
for writer, file_name in (
    (expr_mean_up.to_csv, "expr_mean_uniprot.csv"),
    (expr_mean_up.to_pickle, "expr_mean_uniprot.pkl"),
):
    writer(OUT / file_name)

# preview first 10 rows: condition columns first, annotations last
cond_cols = [col for col in expr_mean_up.columns if col not in {"UniProtID","gene_symbol"}]
print(expr_mean_up.head(10).loc[:, [*cond_cols, "UniProtID", "gene_symbol"]])


[FILTER] kept 21806/67016 rows (32.5%) with UniProt IDs
condition                   Bt          Cd          Co        Mock UniProtID  \
ENSG00000186092.6     0.243333    1.110000    0.250000    0.513333    Q8NH21   
ENSG00000284733.1     0.000000    0.000000    0.000000    0.000000    Q6IEY1   
ENSG00000284662.1     0.000000    0.000000    0.000000    0.000000    Q6IEY1   
ENSG00000187634.12   54.796667   59.793333   64.236667   73.033333    Q96NU1   
ENSG00000188976.11  895.780000  782.843333  753.106667  823.106667    Q9Y3T9   
ENSG00000187961.14  272.196667  224.173333  200.083333  285.500000    Q6TDP4   
ENSG00000187583.11  149.853333  111.120000  134.616667  141.266667    Q494U1   
ENSG00000187642.9    22.803333   19.370000   25.050000   18.526667    Q5SV97   
ENSG00000188290.11  276.123333  171.333333  177.183333  152.000000    Q9HCC6   
ENSG00000187608.10  381.853333  312.763333  349.496667  306.050000    P05161   

condition          gene_symbol  
ENSG00000186092.6        OR4F5

## Log2(x+1) transform

In [10]:
# === Log2-transform expr_mean_uniprot and save as expr_mean ===

import numpy as np
import pandas as pd
from pathlib import Path

BASE = Path("/storage/users/job37yv/Projects/Franziska_faber")
OUT = BASE.joinpath("analysis", "expr_mean")

# load UniProt-filtered version
expr_mean_up = pd.read_pickle(OUT / "expr_mean_uniprot.pkl")

# numeric condition columns = everything except the two annotation columns
cond_cols = [col for col in expr_mean_up.columns if col not in {"UniProtID", "gene_symbol"}]

# log2(x+1) on the condition columns only; annotations are carried over as-is
log2_values = np.log2(expr_mean_up[cond_cols] + 1)
expr_mean_log2 = expr_mean_up.copy()
expr_mean_log2[cond_cols] = log2_values

print("[TRANSFORM] Applied log2(x+1) to expression values.")

# Save as the main expr_mean.
# NOTE(review): this overwrites expr_mean.csv / expr_mean.pkl, the same paths
# the expr_mean-building cell writes its untransformed table to.
expr_mean_log2.to_csv(OUT / "expr_mean.csv")
expr_mean_log2.to_pickle(OUT / "expr_mean.pkl")

# quick preview
print(expr_mean_log2.head(10).loc[:, [*cond_cols, "UniProtID", "gene_symbol"]])


[TRANSFORM] Applied log2(x+1) to expression values.
condition                 Bt        Cd        Co      Mock UniProtID  \
ENSG00000186092.6   0.314213  1.077243  0.321928  0.597730    Q8NH21   
ENSG00000284733.1   0.000000  0.000000  0.000000  0.000000    Q6IEY1   
ENSG00000284662.1   0.000000  0.000000  0.000000  0.000000    Q6IEY1   
ENSG00000187634.12  5.802107  5.925841  6.027611  6.210103    Q96NU1   
ENSG00000188976.11  9.808610  9.614422  9.558625  9.686687    Q9Y3T9   
ENSG00000187961.14  8.093796  7.814892  7.651650  8.162391    Q6TDP4   
ENSG00000187583.11  7.237003  6.808900  7.083391  7.152454    Q494U1   
ENSG00000187642.9   4.573092  4.348374  4.703211  4.287374    Q5SV97   
ENSG00000188290.11  8.114384  7.429058  7.477219  7.257388    Q9HCC6   
ENSG00000187608.10  8.580648  8.293533  8.453257  8.262330    P05161   

condition          gene_symbol  
ENSG00000186092.6        OR4F5  
ENSG00000284733.1       OR4F29  
ENSG00000284662.1       OR4F16  
ENSG00000187634.12     

## Save table

In [11]:
# === Log2-transform and save expr_mean ===
import numpy as np
import pandas as pd
from pathlib import Path

BASE = Path("/storage/users/job37yv/Projects/Franziska_faber")
OUT = BASE.joinpath("analysis", "expr_mean")
OUT.mkdir(parents=True, exist_ok=True)

# load the UniProt-filtered version first
expr_mean_up = pd.read_pickle(OUT / "expr_mean_uniprot.pkl")

# numeric condition columns (exclude annotation)
cond_cols = [col for col in expr_mean_up.columns if col not in {"UniProtID", "gene_symbol"}]

# log2(x+1) on the condition columns only; annotations are carried over as-is
log2_values = np.log2(expr_mean_up[cond_cols] + 1)
expr_mean = expr_mean_up.copy()
expr_mean[cond_cols] = log2_values

# Save only the log2-transformed version.
# NOTE(review): these are the same paths the expr_mean-building cell writes its
# untransformed table to, so that file is replaced here.
EXPR_MEAN_PKL = OUT.joinpath("expr_mean.pkl")
EXPR_MEAN_CSV = OUT.joinpath("expr_mean.csv")

expr_mean.to_pickle(EXPR_MEAN_PKL)
expr_mean.to_csv(EXPR_MEAN_CSV)

print(f"[OK] log2-transformed expr_mean saved:\n  {EXPR_MEAN_PKL}\n  {EXPR_MEAN_CSV}")
print(expr_mean.shape)
print(expr_mean.head())


[OK] log2-transformed expr_mean saved:
  /storage/users/job37yv/Projects/Franziska_faber/analysis/expr_mean/expr_mean.pkl
  /storage/users/job37yv/Projects/Franziska_faber/analysis/expr_mean/expr_mean.csv
(21806, 6)
condition                 Bt        Cd        Co      Mock UniProtID  \
ENSG00000186092.6   0.314213  1.077243  0.321928  0.597730    Q8NH21   
ENSG00000284733.1   0.000000  0.000000  0.000000  0.000000    Q6IEY1   
ENSG00000284662.1   0.000000  0.000000  0.000000  0.000000    Q6IEY1   
ENSG00000187634.12  5.802107  5.925841  6.027611  6.210103    Q96NU1   
ENSG00000188976.11  9.808610  9.614422  9.558625  9.686687    Q9Y3T9   

condition          gene_symbol  
ENSG00000186092.6        OR4F5  
ENSG00000284733.1       OR4F29  
ENSG00000284662.1       OR4F16  
ENSG00000187634.12      SAMD11  
ENSG00000188976.11       NOC2L  


## Load later

In [12]:
from pathlib import Path
import pandas as pd

BASE = Path("/storage/users/job37yv/Projects/Franziska_faber")
_expr_dir = BASE.joinpath("analysis", "expr_mean")
EXPR_MEAN_PKL = _expr_dir / "expr_mean.pkl"
EXPR_MEAN_CSV = _expr_dir / "expr_mean.csv"

# pick one (the pickle is read by default; the CSV line is the alternative):
expr_mean = pd.read_pickle(EXPR_MEAN_PKL)
# expr_mean = pd.read_csv(EXPR_MEAN_CSV, index_col=0)

print(expr_mean.shape)
expr_mean.head()


(21806, 6)


condition,Bt,Cd,Co,Mock,UniProtID,gene_symbol
ENSG00000186092.6,0.314213,1.077243,0.321928,0.59773,Q8NH21,OR4F5
ENSG00000284733.1,0.0,0.0,0.0,0.0,Q6IEY1,OR4F29
ENSG00000284662.1,0.0,0.0,0.0,0.0,Q6IEY1,OR4F16
ENSG00000187634.12,5.802107,5.925841,6.027611,6.210103,Q96NU1,SAMD11
ENSG00000188976.11,9.80861,9.614422,9.558625,9.686687,Q9Y3T9,NOC2L
