In [None]:
import urllib.request, gzip, json, time, os

# Build an Ensembl-gene-ID -> gene-symbol lookup from the GENCODE v36 GTF
# and store it as JSON for later cells.
gtf_gz = "/content/gencode.v36.annotation.gtf.gz"
if not os.path.exists(gtf_gz):
    print("download GENCODE v36...")
    tic = time.time()
    # Download into a ".part" file and rename it only once the transfer has
    # finished.  Writing straight to gtf_gz would leave a truncated archive
    # behind after an interrupted download, and the os.path.exists() check
    # above would then accept that broken file on every later run.
    gtf_part = gtf_gz + ".part"
    try:
        urllib.request.urlretrieve(
            "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_36/gencode.v36.annotation.gtf.gz",
            gtf_part
        )
        os.replace(gtf_part, gtf_gz)
    finally:
        # Only still present when the download or the rename failed.
        if os.path.exists(gtf_part):
            os.remove(gtf_part)
    print("download:", round(time.time() - tic, 1), "second.")

id2sym = {}
with gzip.open(gtf_gz, "rt") as f:
    for ln in f:
        # Keep only "gene" feature rows that carry both attributes we need.
        if "\tgene\t" in ln and 'gene_id "' in ln and 'gene_name "' in ln:
            # Text between the quotes after gene_id, cut at the first "."
            # so the version suffix is dropped.
            gid  = ln.split('gene_id "')[1].split('"')[0].split('.')[0]
            name = ln.split('gene_name "')[1].split('"')[0]
            # A later line with the same unversioned ID overwrites the earlier one.
            id2sym[gid] = name

out_path = "/content/drive/MyDrive/Data_RNA-Seq/id2sym.json"
with open(out_path, "w") as f:
    json.dump(id2sym, f)

print(f" ENSG → symbol: {len(id2sym):,} gene.")
print("Save:", out_path)


In [None]:
import os

# Path of the zip archive to extract.
zip_path = "/content/drive/MyDrive/Data_RNA-Seq/TCGA_CESC_STAR.zip"

# Directory that receives the extracted .tsv files.
flat_dir = "/content/TCGA_CESC_STAR_flat"

# Remove and recreate the target directory so nothing from an earlier run remains.
!rm -rf "$flat_dir"
!mkdir -p "$flat_dir"

# Extract only *.tsv members; -j discards the directory paths stored in the
# archive, so every file is written directly into flat_dir.
# NOTE(review): members sharing a base name would collide under -j — confirm names are unique.
!unzip -j "$zip_path" "*.tsv" -d "$flat_dir"
# List the first entries of the directory as a quick check.
!ls -lh "$flat_dir" | head


In [None]:
import glob, os, pandas as pd, numpy as np, tqdm

# Directory scanned for per-sample .tsv tables.
OUT_DIR = "/content/TCGA_CESC_STAR_flat"

# Collect every .tsv directly inside OUT_DIR, in sorted path order.
tsv_files = glob.glob(os.path.join(OUT_DIR, "*.tsv"))
tsv_files.sort()
print("Number file .tsv:", len(tsv_files))

def read_tpm(path):
    """Load one tab-separated table and return its TPM column per gene.

    Lines starting with "#" are treated as comments.  Only rows whose
    ``gene_id`` starts with "ENSG" are kept.  The ``tpm_unstranded`` column
    is renamed to the sample ID, taken as the part of the file name before
    its first ".".

    Returns a two-column DataFrame: ``gene_id`` and the sample-ID column.
    """
    sample_id = os.path.basename(path).split(".")[0]
    table = pd.read_csv(path, sep="\t", comment="#")
    is_gene_row = table["gene_id"].astype(str).str.startswith("ENSG")
    tpm = table.loc[is_gene_row, ["gene_id", "tpm_unstranded"]]
    return tpm.rename(columns={"tpm_unstranded": sample_id})

print("⇢ mapping file...")
# Fail with a clear message instead of a bare IndexError when nothing was found.
if not tsv_files:
    raise FileNotFoundError("No .tsv files to merge; extract the archive first.")

# Read every file once and join all samples in a single concat.  Merging one
# file at a time re-copies the ever-growing frame on each iteration.
per_sample = [read_tpm(f).set_index("gene_id") for f in tqdm.tqdm(tsv_files)]
mat = (
    pd.concat(per_sample, axis=1, join="outer")  # union of gene IDs, NaN where a sample lacks a gene
      .sort_index()                              # same lexicographic key order an outer merge produces
      .rename_axis("gene_id")
      .reset_index()                             # keep gene_id as a regular column, as before
)


In [None]:
from pathlib import Path
import pandas as pd, numpy as np, urllib.request, gzip, os, re, time

# Filter the expression matrix, log2-transform it, keep protein-coding genes
# and write the result as gzip CSV and parquet.
#
# `mat` must already exist in the kernel when this cell runs.
# NOTE: `mat` is overwritten in place below, so running this cell twice in a
# row applies the filters and the log2 transform a second time.
PROJECT_ID = "TCGA-CESC"
SAVE_DIR   = Path("/content/drive/MyDrive/Data_RNA-Seq")
csv_out    = SAVE_DIR / f"{PROJECT_ID}_TPM_log2_filtered_coding.csv.gz"
# with_suffix() replaces only the final ".gz", so this ends in ".csv.parquet".
pq_out     = csv_out.with_suffix(".parquet")

# Use gene_id as the index (if it is still a column), coerce every value to a
# number, and turn anything unparsable or missing into 0.
mat = (mat.set_index("gene_id")  if "gene_id" in mat.columns else mat) \
        .apply(pd.to_numeric, errors="coerce").fillna(0).astype(np.float32)

print("After type conversion:", mat.shape, mat.dtypes.unique())

# Keep genes with a value > 1 in at least one sample, transform to
# log2(x + 1), then keep genes whose transformed value is > 1 in at least
# 10 % of the samples.
mat = mat[(mat > 1).any(axis=1)]
mat = np.log2(mat + 1)
mat = mat[(mat > 1).sum(axis=1) >= 0.1 * mat.shape[1]]
print("After TPM filtering & log₂ transform:", mat.shape)

gtf_gz = "/content/gencode.v36.annotation.gtf.gz"
if not os.path.exists(gtf_gz):
    print("↻ Downloading GENCODE v36…")
    tic = time.time()
    # Download into a ".part" file and rename it only once the transfer has
    # finished; a truncated file left at gtf_gz by an interrupted download
    # would otherwise be accepted by the os.path.exists() check on later runs.
    gtf_part = gtf_gz + ".part"
    try:
        urllib.request.urlretrieve(
            "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_36/"
            "gencode.v36.annotation.gtf.gz", gtf_part)
        os.replace(gtf_part, gtf_gz)
    finally:
        # Only still present when the download or the rename failed.
        if os.path.exists(gtf_part):
            os.remove(gtf_part)
    print("   Done in", round(time.time() - tic, 1), "s")

# Unversioned IDs of every "gene" row annotated as protein_coding.
coding = set()
with gzip.open(gtf_gz, "rt") as fh:
    for ln in fh:
        if "\tgene\t" in ln and "gene_type \"protein_coding\"" in ln:
            gid = ln.split('gene_id "')[1].split('"')[0].split('.')[0]
            coding.add(gid)

# Strip a trailing ".<digits>" version from the matrix IDs before the lookup.
# IDs that do not end in ".<digits>" are left unchanged by this pattern.
idx_base = mat.index.str.replace(r"\.\d+$", "", regex=True)
mat      = mat.loc[idx_base.isin(coding)]
print("After retaining protein-coding genes:", mat.shape)

mat.to_csv(csv_out, compression="gzip")
mat.to_parquet(pq_out, compression="zstd")
print("Saved files:")
print("   •", csv_out)
print("   •", pq_out)


In [None]:
# kNN
import pandas as pd, numpy as np, time, json, os, urllib.request, re
from sklearn.neighbors import NearestNeighbors

# Load the expression matrix, relabel its rows with gene symbols, and keep
# only the genes that appear in the PPI edge list.
mat = pd.read_parquet(
    "/content/drive/MyDrive/Data_RNA-Seq/TCGA-CESC_TPM_log2_filtered_coding.csv.parquet")
print("Expression matrix:", mat.shape)

if mat.index.str.startswith("ENSG").any():
    # Read the mapping inside a `with` block so the file handle is closed.
    with open("/content/drive/MyDrive/Data_RNA-Seq/id2sym.json") as fh:
        id2sym = json.load(fh)
    # Drop a trailing ".<digits>" version, then translate to symbols; IDs
    # without a mapping keep their unversioned ID as the row label.
    base   = mat.index.to_series().str.replace(r"\.\d+$", "", regex=True)
    mat.index = base.map(id2sym).fillna(base)

ppi_path = "/content/drive/MyDrive/PPI_STRING/ppi_for_gnn_filled.csv"
ppi_df   = pd.read_csv(ppi_path)[["protein1", "protein2"]].rename(
              columns={"protein1": "gene1", "protein2": "gene2"})
ppi_nodes = set(ppi_df["gene1"]) | set(ppi_df["gene2"])
# NOTE(review): the // 2 presumes every edge is listed in both directions — confirm against the file.
print("Original PPI:", len(ppi_df)//2, "edges |", len(ppi_nodes), "nodes")

mat = mat.loc[mat.index.isin(ppi_nodes)]
print("Filtered matrix intersected with PPI:", mat.shape)

# For every gene (row of `mat`) find its k nearest genes by correlation
# distance and collect the pairs that are not already PPI edges.
k = 3
X = mat.values.astype(np.float32)

t0 = time.time()
nbrs = NearestNeighbors(n_neighbors=k,
                        metric='correlation',
                        algorithm='brute',
                        n_jobs=-1).fit(X)
# kneighbors() without a query returns, for each fitted row, its k neighbours
# with the row itself excluded.  Asking for k+1 neighbours of X and dropping
# column 0 assumes the row is always its own first hit, which ties (e.g.
# identical expression rows) can break.
ind = nbrs.kneighbors(return_distance=False)
print("k-NN completed in:", round(time.time()-t0,1), "seconds")

genes = mat.index.to_numpy()
# Existing edges as order-independent (a, b) keys.
ppi_set = {tuple(sorted(t)) for t in ppi_df.itertuples(index=False, name=None)}
new_pairs = set()

for src, neigh in zip(genes, ind):
    for dst in genes[neigh]:
        pair = tuple(sorted((src, dst)))
        if pair not in ppi_set:
            new_pairs.add(pair)

print("New k-NN edges (not in original PPI):", len(new_pairs))

# Append the new k-NN pairs to the PPI edge list, drop edges that repeat an
# earlier one in either orientation, and save the merged network.
# sorted() gives the new rows a stable order; iterating the set directly can
# produce a different row order from one run to the next.
new_edges = pd.DataFrame(sorted(new_pairs), columns=["gene1", "gene2"])
all_edges = pd.concat([ppi_df, new_edges], ignore_index=True)

# Order-independent key per edge, built in one vectorised step: sort the two
# endpoints within each row, then flag rows whose key was already seen.
pair_keys = pd.DataFrame(
    np.sort(all_edges[["gene1", "gene2"]].to_numpy(dtype=object), axis=1))
is_repeat = pair_keys.duplicated().to_numpy()
all_edges = all_edges.loc[~is_repeat, ["gene1", "gene2"]]

print("Original PPI edges:", len(ppi_df))
print("New k-NN edges   :", len(new_edges))
print("Total edges after merging:", len(all_edges))

out_path = "/content/drive/MyDrive/PPI_STRING/ppi_CESC_plus_knn3.csv.gz"
all_edges.to_csv(out_path, index=False, compression="gzip")
print("New network saved to:", out_path)


In [None]:
import pandas as pd
# Read the edge list and write a copy in which each edge is also present in
# the reversed direction.
src = "/content/drive/MyDrive/PPI_STRING/ppi_CESC_plus_knn3.csv.gz"
edges = pd.read_csv(src)

print("⭐ Number of unique undirected edges:", len(edges))

# Mirror every edge whose endpoints differ; a self-loop reversed is the same
# row, so those are not mirrored.
not_self_loop = edges["gene1"] != edges["gene2"]
edges_rev = edges[not_self_loop].rename(columns={"gene1": "gene2", "gene2": "gene1"})

# concat aligns on column names, so the swapped labels end up in the right columns.
edges_bidir = pd.concat([edges, edges_rev], ignore_index=True)
edges_bidir = edges_bidir.drop_duplicates()

print("⭐ Number of edges after adding reversed direction:", len(edges_bidir))
print("→ Each undirected edge appears twice as (A, B) & (B, A)")

dst = "/content/drive/MyDrive/PPI_STRING/ppi_CESC_plus_knn3_bidirectional.csv.gz"
edges_bidir.to_csv(dst, index=False, compression="gzip")
print("💾 Saved bidirectional network to:", dst)
