In [2]:
%reload_ext autoreload
%autoreload 2
import os
import shutil
import sys
from pathlib import Path

sys.path.append('/home/sebastian/masters/') # add my repo to python path

import torch
import torch.nn.functional as F
import torch_geometric
import kmbio  # fork of biopython PDB with some changes in how the structure, chain, etc. classes are defined.
import numpy as np
import pandas as pd
import proteinsolver
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import KFold
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import *
from torch import nn, optim

import modules
from modules.dataset import *
from modules.utils import *
from modules.models import *
from modules.lstm_utils import *

### Extract sequences from the TCR-pMHC model structures and write them to a FASTA file

In [4]:
# Filesystem layout: every location below is derived from `root`.
root = Path("/home/sebastian/masters/data/")
data_root = root.joinpath("neat_data")
metadata_path = data_root.joinpath("metadata.csv")
processed_dir = data_root.joinpath("processed", "tcr_binding")
state_file = root.joinpath("state_files", "e53-s1952148-d93703104.state")
out_dir = root.joinpath("state_files", "tcr_binding")
model_dir = data_root.joinpath("raw", "tcrpmhc")

# Three-letter residue name -> one-letter amino-acid code (20 entries).
# Insertion order is kept identical to the original literal.
one_letter_map = dict(
    CYS="C", ASP="D", SER="S", GLN="Q", LYS="K",
    ILE="I", PRO="P", THR="T", PHE="F", ASN="N",
    GLY="G", HIS="H", LEU="L", ARG="R", TRP="W",
    ALA="A", VAL="V", GLU="E", TYR="Y", MET="M",
)

# Index the model files by the integer ID that prefixes each file name
# ("<ID>_..."). glob("*") matches every directory entry, so anything without
# an integer prefix (e.g. ".ipynb_checkpoints", ".DS_Store") is skipped here
# instead of aborting the whole cell with a ValueError from int().
# Sorting makes the listing order deterministic across runs.
paths = []
join_key = []
for model_path in sorted(model_dir.glob("*")):
    try:
        model_id = int(model_path.name.split("_")[0])
    except ValueError:
        continue
    paths.append(model_path)
    join_key.append(model_id)
path_df = pd.DataFrame({'#ID': join_key, 'path': paths})

# Inner join on "#ID": only metadata rows that have a model file are kept,
# and each kept row gains a "path" column.
metadata = pd.read_csv(metadata_path)
metadata = metadata.join(path_df.set_index("#ID"), on="#ID", how="inner")  # filter to non-missing data
metadata = metadata.reset_index(drop=True)

# Write one FASTA record per metadata row: a ">ID" header line followed by the
# one-letter sequence of the residues extracted as chain "A" after merge_chains.
out_path = data_root / "full_seqs_raw.fsa"

with open(out_path, "w") as fasta_out:
    id_and_path = zip(metadata["#ID"], metadata["path"])
    for idx, raw_file in id_and_path:
        # Load the model, merge its chains, then wrap the extracted "A" chain
        # of the first model in a new structure named after the row ID.
        structure_all = merge_chains(kmbio.PDB.load(raw_file))
        structure = kmbio.PDB.Structure(idx, structure_all[0].extract("A"))

        # A residue name missing from one_letter_map raises KeyError before
        # anything is written for this record.
        seq = "".join(one_letter_map[res.resname] for res in structure.residues)

        fasta_out.write(f">{idx}\n{seq}\n")

### Rename the model files and organize them more neatly

In [None]:
# Move each TCR-pMHC model out of its per-complex sub-directory into one flat
# directory, renaming it to "<ID>_tcrpmhc.pdb", where <ID> is the part of the
# sub-directory name before the first underscore.
from_dir = "/home/sebastian/masters/data/210916_TCRpMHCmodels/models/"
to_dir = "/home/sebastian/masters/data/neat_data/tcrpmhc/"
model_suffix = "model_TCR-pMHC.pdb"

os.makedirs(to_dir, exist_ok=True)  # `mv` would fail on a missing target directory
missing = []
for subdir in sorted(os.listdir(from_dir)):
    subdir_id = subdir.split("_")[0]
    new_name = f"{subdir_id}_tcrpmhc.pdb"
    src = os.path.join(from_dir, subdir, model_suffix)
    dst = os.path.join(to_dir, new_name)
    if not os.path.isfile(src):
        # Entry has no model file (or is not a directory): record it so the
        # skip is visible, rather than discarding a failed `mv` exit status.
        missing.append(src)
        continue
    # shutil.move instead of os.system("mv ..."): no shell is involved, so
    # paths with spaces or metacharacters are safe, and failures raise.
    shutil.move(src, dst)

if missing:
    print(f"Skipped {len(missing)} entries without {model_suffix}:", *missing, sep="\n  ")

In [None]:
# Move each pMHC model out of its per-complex sub-directory into one flat
# directory, renaming it to "pmhc_<ID>.pdb", where <ID> is the part of the
# sub-directory name before the first underscore.
from_dir = "/home/sebastian/masters/data/embedding_verification/raw_filtered_models"
to_dir = "/home/sebastian/masters/data/neat_data/pmhc/"
model_suffix = "model_pMHC.pdb"

os.makedirs(to_dir, exist_ok=True)  # `mv` would fail on a missing target directory
missing = []
for subdir in sorted(os.listdir(from_dir)):
    subdir_id = subdir.split("_")[0]
    new_name = f"pmhc_{subdir_id}.pdb"
    src = os.path.join(from_dir, subdir, model_suffix)
    dst = os.path.join(to_dir, new_name)
    if not os.path.isfile(src):
        # Entry has no model file (or is not a directory): record it so the
        # skip is visible, rather than discarding a failed `mv` exit status.
        missing.append(src)
        continue
    # shutil.move instead of os.system("mv ..."): no shell is involved, so
    # paths with spaces or metacharacters are safe, and failures raise.
    shutil.move(src, dst)

if missing:
    print(f"Skipped {len(missing)} entries without {model_suffix}:", *missing, sep="\n  ")

In [None]:
# Move each peptide-only model out of its per-complex sub-directory into one
# flat directory, renaming it to "p_<ID>.pdb", where <ID> is the part of the
# sub-directory name before the first underscore.
from_dir = "/home/sebastian/masters/data/embedding_verification/raw_filtered_models"
to_dir = "/home/sebastian/masters/data/neat_data/p/"
model_suffix = "model_p.pdb"

os.makedirs(to_dir, exist_ok=True)  # `mv` would fail on a missing target directory
missing = []
for subdir in sorted(os.listdir(from_dir)):
    subdir_id = subdir.split("_")[0]
    new_name = f"p_{subdir_id}.pdb"
    src = os.path.join(from_dir, subdir, model_suffix)
    dst = os.path.join(to_dir, new_name)
    if not os.path.isfile(src):
        # Entry has no model file (or is not a directory): record it so the
        # skip is visible, rather than discarding a failed `mv` exit status.
        missing.append(src)
        continue
    # shutil.move instead of os.system("mv ..."): no shell is involved, so
    # paths with spaces or metacharacters are safe, and failures raise.
    shutil.move(src, dst)

if missing:
    print(f"Skipped {len(missing)} entries without {model_suffix}:", *missing, sep="\n  ")

In [2]:
# Normalise file names under raw/<kind>/ from "<kind>_<ID>.pdb" to
# "<ID>_<kind>.pdb". The new name is built from the second "_"-separated token
# of the file stem, exactly as before.
#
# The cell is now safe to re-run: previously, running it on files already named
# "<ID>_<kind>.pdb" took the kind as the token, so every file was renamed to
# "<kind>_<kind>.pdb" and each one overwrote the last.
structure_kinds = ["p", "pmhc", "tcrpmhc"]

for kind in structure_kinds:
    path = f"/home/sebastian/masters/data/neat_data/raw/{kind}/"
    for file in sorted(os.listdir(path)):
        tokens = file.split(".")[0].split("_")
        if len(tokens) < 2:
            # No "_" in the stem: nothing to reorder (used to be an IndexError).
            print(f"Skipping {path}{file}: unexpected file name")
            continue
        name = tokens[1]
        if name == kind:
            # Already in "<ID>_<kind>.pdb" form.
            continue
        new_name = f"/home/sebastian/masters/data/neat_data/raw/{kind}/{name}_{kind}.pdb"
        if os.path.exists(new_name):
            # Never overwrite an existing file.
            print(f"Skipping {path}{file}: {new_name} already exists")
            continue
        # Same-directory rename, so os.rename is sufficient; no shell string
        # is built and a failure raises instead of being ignored.
        os.rename(os.path.join(path, file), new_name)