# Dynamic GAIN-GRN Assignment
This is an interactive notebook for assigning the GAIN Generic Residue Numbering Scheme to an unknown protein. Please enter the UniProtKB accession ID below.

In [None]:
# Initialize modules and functions.
import os, tempfile, re, json
import pickle as pkl
import nglview as nv
# LOCAL IMPORTS
from gaingrn.utils.gain_classes import GainDomainNoAln
import gaingrn.utils.structure_utils
import gaingrn.utils.request_utils
import gaingrn.utils.bb_angle_tools
import gaingrn.utils.io
import gaingrn.utils.assign

In [None]:
# Candidate locations of the third-party executables; adapt these to your machine.
gesamt_candidate = "/opt/xtal/ccp4-9/bin/gesamta"
stride_candidate = "/home/guille/Programs/stride/stridea"

# check_3rd_party hands back the (GESAMT, STRIDE) pair that the following cells use.
# NOTE(review): presumably it validates the candidates and may substitute other locations - confirm in gaingrn.utils.io.
GESAMT_BIN, STRIDE_BIN = gaingrn.utils.io.check_3rd_party(gesamt_candidate, stride_candidate)

### Here are some examples of distant GAIN domains
#### Invertebrates
- "Q8SZ78" D.melanogaster mayo/CG11318
- "G5EDW2" C.elegans lat-1
- "A1Z7G7" D.melanogaster Cirl
#### PKD GAIN domains
- "Q8R526" M.musculus PKD1L1
- "H2LRU7" O.latipes PKD2
- "P98161" H.sapiens PKD1
#### OTHER
- "B3SDA6" Trichoplax adhaerens Protein kinase domain-containing protein (NEGATIVE CONTROL)
- "Q8CJ12" M.musculus ADGRG2 (already in dataset)

In [None]:
# --- User settings ---
# UniProtKB accession of the protein to be analysed.
uniprot_accession = "G5EDW2"
# Folder receiving all downloaded and generated files. It is derived from the accession so that
# changing the accession above cannot silently write into another protein's folder.
output_folder = f"../../{uniprot_accession}"
# The maximum number of residues in a model to be evaluated. Increase this if you suspect the GAIN to be larger than that.
maxres = 360

#### Run the following cells one after another. An **nglviewer** widget will display the step-by-step progress.
Follow the documentation under http://nglviewer.org/nglview/release/v0.5.1/index.html#installation if you experience issues with the nglviewer widget.

In [None]:
# Query UniProtKB for the accession. The result is a list of dictionaries.
uniprot_info = gaingrn.utils.request_utils.request_uniprot(uniprot_accession)
if uniprot_info is None:
    raise NameError("The provided UniProtKB accession did not yield any results. Please check your accession number.")

# gps_end may come back as None; the following cell then asks for the boundary manually.
gps_end, gps_end_sequence, protein_name = gaingrn.utils.request_utils.extract_gain_end(uniprot_accession, uniprot_info)
if gps_end is not None and gps_end < maxres:
    # Fewer than maxres residues precede the GAIN end: shrink the evaluation window accordingly.
    maxres = gps_end

# makedirs also creates missing parent directories and tolerates an already existing folder.
os.makedirs(output_folder, exist_ok=True)
# The context manager guarantees that the JSON file is flushed and closed.
with open(f'{output_folder}/uniprotkb_{uniprot_accession}.json', 'w') as uniprot_json:
    json.dump(uniprot_info, uniprot_json)

gaingrn.utils.request_utils.request_alphafolddb_model(uniprot_accession, output_folder)
# NOTE(review): presumably these are the files request_alphafolddb_model stores in output_folder - confirm the naming there.
pdbfile = f'{output_folder}/AF-{uniprot_accession}-F1.pdb'
jsonfile = f'{output_folder}/AF-{uniprot_accession}-F1.json'

# Show the complete model; use it to determine gps_end by eye if it could not be extracted automatically.
view = nv.show_file(pdbfile, background='white')
view.background = 'white'
view

#### Display the information about this GAIN domain in the NGL viewer: Subdomain A is shown in blue, Subdomain B in orange. The respective boundary residues are labeled.

In [None]:
if gps_end is None:
    print("You need to manually set the C-terminal GAIN domain boundary. Please look into the structure and find the residue matching the GAIN domain end. You can likely find it by looking directly N-terminal of the seven-transmembrane domain, if present.")
    gps_end = int(input("Manually set the C-terminal GAIN end: "))
    print("Set gps_end to", gps_end)
    gps_end_sequence = gaingrn.utils.request_utils.get_uniprot_seq(uniprot_info, uniprot_accession, c_end=gps_end)

# The previous cell only clamps maxres when gps_end was found automatically. Repeat the clamp here so
# that a manually entered boundary smaller than maxres cannot turn gps_end-maxres negative (which
# would slice the sequence from its end and pass a negative start to truncate_stride_dict).
if gps_end < maxres:
    maxres = gps_end
truncated_sequence = gps_end_sequence[gps_end-maxres:] # matches the sequences to the PDB sequence (zero-indexed!)

# Run STRIDE for evaluating the secondary structure items.
target_stride = f"{output_folder}/AF-{uniprot_accession}-F1.stride"

gaingrn.utils.io.run_stride(pdbfile, target_stride, STRIDE_BIN) # This will be a stride analysis of the WHOLE protein.
outlier_stride = f"{output_folder}/AF-{uniprot_accession}-F1.outliers.stride"
gaingrn.utils.bb_angle_tools.detect_outliers(target_stride, outlier_stride, sigmas=2)
complete_sse_dict = gaingrn.utils.io.read_sse_loc(outlier_stride)

# Cut the complete_sse_dict down to only include entries before the detected end residue and within the maximum residue number.
truncated_sse_dict = gaingrn.utils.structure_utils.truncate_stride_dict(complete_sse_dict, start=gps_end-maxres, end=gps_end)

# Find the GAIN domain start and subdomain boundary, if applicable
gain_start, gain_subdomain_boundary = gaingrn.utils.structure_utils.find_boundaries(truncated_sse_dict, seq_len=gps_end, bracket_size=30, domain_threshold=15, coil_weight=0.08, truncate_N=3)
# Cut the model down to the detected domain (gain_start to gps_end); the returned value is handed to nv.show_file below.
truncated_pdbfile = gaingrn.utils.structure_utils.truncate_pdb(pdbfile, start=gain_start, end=gps_end)
# Negative start index: keeps the tail of gps_end_sequence from gain_start onwards.
# NOTE(review): assumes gps_end_sequence ends at residue gps_end and gain_start is 1-indexed - confirm.
gain_sequence = gps_end_sequence[gain_start-gps_end-1:]

# Highlight Subdomain A, Subdomain B and the Boundary between them.
view = nv.show_file(truncated_pdbfile)
view.clear()
marker_colors = {gain_subdomain_boundary:"red",gain_start:"blue",gps_end:"orange"}
for marker_res in [gain_subdomain_boundary, gain_start, gps_end]:
    # Label the residue and draw it as hyperball in its marker color.
    view.add_representation(repr_type="label", name ="label", showBackground =True, labelType="res", color=marker_colors[marker_res],
                     sele = f"{marker_res} and .CA", xOffset = 0.5 , zOffset =5, fixedSize=True )
    view.add_hyperball(selection=f"{marker_res}", color=marker_colors[marker_res])
view.add_cartoon(selection=f'{gain_subdomain_boundary}-{gps_end}', color='orange')
view.add_cartoon(selection=f'{gain_start}-{gain_subdomain_boundary}', color='blue')
view

#### With the detected boundaries of GAIN subdomains, proceed to map them onto the available templates.
For this, find the best fitting templates for each subdomain, align and assign the GAIN-GRN.

In [None]:
# Build the GainDomain object from the detected boundaries and the outlier-annotated STRIDE file.
# The name is composed once and reused for the log message and the object.
gain_name = f'{uniprot_accession}_{protein_name.replace(" ","-")}'
print(f'[NOTE] Creating instance of GainDomain: {gain_name}')
target_gain = GainDomainNoAln(
                start=gain_start, 
                subdomain_boundary=gain_subdomain_boundary, 
                end=gps_end,
                name=gain_name,
                sequence=gain_sequence,
                explicit_stride_file=outlier_stride,
                is_truncated=True,
                stride_outlier_mode=True,
                debug=False)

print("THE FOLLOWING HELICAL SEGMENTS IN SUBDOMAIN A WERE DETECTED:", target_gain.sda_helices, sep="\n")
print("THE FOLLOWING STRAND SEGMENTS IN SUBDOMAIN B WERE DETECTED:", target_gain.sdb_sheets, sep="\n")

# Persist the GainDomain object; the context manager closes the file handle reliably.
with open(f"{output_folder}/{uniprot_accession}.pkl",'wb') as gain_pkl:
    pkl.dump(target_gain, gain_pkl)

gaingrn.utils.io.write2fasta(sequence=target_gain.sequence, name=target_gain.name, filename=f"{output_folder}/{target_gain.name}.fa")

# Map the target onto the templates and assign the GAIN-GRN labels.
element_intervals, element_centers, residue_labels, unindexed_elements, params = gaingrn.utils.assign.assign_indexing(
                                gain_obj=target_gain,
                                file_prefix=f"{output_folder}/indexing",
                                gain_pdb=truncated_pdbfile,
                                template_dir='../data/template_pdbs/',
                                template_json='../data/template_data.json',
                                gesamt_bin=GESAMT_BIN,
                                debug=False,
                                create_pdb=True,
                                hard_cut={"S2":7,"S6":3,"H5":3},
                                patch_gps=True,
                                template_mode='extent',
                                sda_mode='q'
                                )

# Persist the complete indexing result.
with open(f"{output_folder}/indexing.pkl",'wb') as indexing_pkl:
    pkl.dump([element_intervals, element_centers, residue_labels, unindexed_elements, params], indexing_pkl)

print("[DEBUG]", residue_labels)

# Read the RMSD of each subdomain match from the indexing_<subdomain>.out files.
# NOTE(review): presumably these are the alignment logs written by assign_indexing under file_prefix - confirm.
rmsd_pattern = re.compile(r"RMSD\W+\:\W+[0-9]+\.[0-9]+")
rmsds = {}
for sd in ["sda","sdb"]:
    with open(f"{output_folder}/indexing_{sd}.out") as alignment_out:
        rmsd_match = rmsd_pattern.search(alignment_out.read())
    if rmsd_match is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError(f"No RMSD entry found in {output_folder}/indexing_{sd}.out. Please check this file for errors of the structural alignment.")
    # The matched text ends with the number, so the last whitespace-separated token is the RMSD value.
    rmsds[sd] = float(rmsd_match.group(0).split()[-1])

print(f"The RMSD values of Subdomain Matches are:\n\tSDA: {round(rmsds['sda'], 3)} A\n\tSDB: {round(rmsds['sdb'], 3)} A")
if rmsds["sdb"] > 2.0: print("WARNING: The Matching RMSD in Subdomain B is very high. This GAIN domain is likely not a good fit, if a GAIN domain at all. Please check your protein further.")

# Invert residue_labels (label -> residue) into residue -> label, skipping labels without a residue.
res2label = {v:k for k,v in residue_labels.items() if v is not None}

#### Map the generated GAIN-GRN indexing visually on the GAIN Domain model

In [None]:
# Start from an empty, white viewer of the truncated model.
view = nv.show_file(truncated_pdbfile)
view.clear()
view.background = 'white'

# Look up the index of each residue's CA entry, using the offset reported for the truncated PDB.
pdb_offset = gaingrn.utils.structure_utils.get_pdb_offset(truncated_pdbfile)
ca_indices = gaingrn.utils.structure_utils.get_ca_indices(truncated_pdbfile, offset=pdb_offset)

# Map CA index -> GAIN-GRN label for every label that has a residue assigned.
label_dict = {}
for label, res in residue_labels.items():
    if res is None:
        continue
    label_dict[ca_indices[res]] = label

# Comma-separated list of the labeled indices, used as "@..." selection below.
mysel = ",".join(map(str, label_dict))

view.add_representation(
    repr_type="label",
    name="label",
    showBackground=True,
    labelType="text",
    color='black',
    labelText=label_dict,
    sele=f'@{mysel}',
    xOffset=0.5,
    zOffset=5,
    fixedSize=False,
)

# Subdomain B in orange, Subdomain A in blue.
view.add_cartoon(selection=f'{gain_subdomain_boundary}-{gps_end}', color='orange')
view.add_cartoon(selection=f'{gain_start}-{gain_subdomain_boundary}', color='blue')

view


#### Lastly, the generated GAIN-GRN indexing is written to file.
All GAIN-GRN-related info can now be found in your target directory.

In [None]:
# Store the generated GAIN-GRN next to the input models and as CSV table.
full_grn_pdb = pdbfile.replace(".pdb","_grn.pdb")
truncated_grn_pdb = truncated_pdbfile.replace(".pdb","_grn.pdb")

# One labeled copy each for the complete model and the truncated GAIN domain model.
# NOTE(review): label2b presumably writes the labels into the B-factor column - confirm in gaingrn.utils.io.
for source_pdb, labeled_pdb in ((pdbfile, full_grn_pdb), (truncated_pdbfile, truncated_grn_pdb)):
    gaingrn.utils.io.label2b(pdbfile=source_pdb, outfile=labeled_pdb, res2label=res2label, clear_b=True)

gaingrn.utils.io.grn2csv(res2label, outfile=f"{output_folder}/{uniprot_accession}_grn.csv", target_gain=target_gain)

# Echo both mappings for a quick check.
print(res2label)
print(label_dict)

In [None]:
import pandas as pd

# Tabulate the assignment: one row per labeled residue, indexed by its GAIN-GRN label.
grn_labels = list(res2label.values())
residue_numbers = list(res2label.keys())
pd.DataFrame(residue_numbers, index=grn_labels, columns=["resSeq"])

In [None]:
# Preview the first ten lines of the GAIN-GRN CSV file; the context manager closes the file handle again.
with open(f"{output_folder}/{uniprot_accession}_grn.csv") as grn_csv:
    csv_preview = grn_csv.read().splitlines()[:10]
csv_preview