# Yersinia phage Phi R1-37

The stages of the analysis are presented in the order in which they were performed for Yersinia phage Phi R1-37.

In [2]:
#required imports
import os
import pathlib
import re

import numpy as np
import pandas as pd

import Bio.SeqIO as SeqIO
import Bio.SeqUtils.ProtParam as bp

import matplotlib.pyplot as plt
import seaborn as sns

import ungin_functions as ung

In [3]:
# Load the proteome of this phage from its genome file into `df`.
# `ung.parse_uracil_genome` is a project helper; judging by the printed output,
# it yields one row per protein with the columns ID, Sequence and Seq Length.
# NOTE(review): the .gb extension suggests a GenBank record - confirm.
genome = "phages/phir1-37.gb"
df = ung.parse_uracil_genome(genome=genome)
print(df)

       ID                                           Sequence  Seq Length
0    g001  MKKNVCVITERDVVLAKRRKGCKLQTDVDVALTEMNNIRFDFRVII...          90
1    g002  MKILKATNPRWFAQSNSTFSKFAIRGATEVLSDWMDNAFSAFCQLY...          83
2    g003  MYITRRKKRFNTTVYYNIYLVPGKMLTVRDKNIYSVYKIFEKELDF...          82
3    g004  MKTIKARGFNKNKILDLTPIKETRRSAMDTLMRRILFIEVDISNHM...          72
4    g005  MKIVRISRKKDRDIGRLIIGHIFATYKELKGTLLFRYDYIGYKCKV...          62
..    ...                                                ...         ...
362  g363  MKTIRTIKHLPFDRYNEQMTKTFLLDQPISKIFMIRKDAEDFDNEN...          84
363  g364  MYISRTNEREMNFKRKSNPKTVREIYTNLLHDDFSLPKYKMYFGFE...          97
364  g365  MKALSIIDDEEREGSIRDYDYKILEPINEILEDLEEEIMISGIGVT...          75
365  g366  MRRLKGKVVGIDNLLRWDRQWGGRMWLEGSAFNDYDVLFKDTVLEF...          74
366  g367  MKIKKFKMKYEYVYHDWLQQNAEEIIWGYQLIKIDPRHFSDIKVEI...          61

[367 rows x 3 columns]


In [4]:
# Export every sequence held in `df` to a single FASTA file.
all_seqs_fasta = "uploads/all_seqs.fasta"
ung.write_df_to_fasta(df=df, output_file=all_seqs_fasta)

## Structure prediction: confidence filter

In [None]:
#monomer prediction using esm_fold
#note: only 363 of the 367 sequences could be analysed
#the four missing sequences are g083, g099, g294 and g295

In [4]:
# Complex prediction using esm_fold: generate the required FASTA file.
# Every second line of uploads/all_seqs.fasta (0-based odd positions) is rewritten as
# "<fixed partner sequence>:<original line>"; all other lines are copied unchanged.
# NOTE(review): this assumes the input strictly alternates header / sequence lines,
# and that ':' is the chain separator expected by esm_fold - confirm both.
# NOTE(review): the partner sequence is presumably the protein referred to as "ye"
# elsewhere in this notebook - confirm.
partner_seq = 'MPASLTWHDVIGQEKEQPYFKDTLAYVAAERNAGKTIYPAQHDVFNAFRLTELDQVKVVILGQDPYHGPNQAHGLSFSVLPGVPAPPSLVNIYKELATDIPGFQRPNHGFLQSWAEQGVLLLNTVLTVEAGNAHSHANLGWETFTDKVIAALNEHRDGVIFMLWGAHAQKKGRIIDTQRHFILKAPHPSPLSAHRGFLGCKHFSQANQLLQQHGQQPIDWQPKLPTAE'

with open('uploads/all_seqs.fasta', 'r') as src:
    scraped_text = src.readlines()

with open('uploads/ye_mult.fasta', "w") as dst:
    for position, line in enumerate(scraped_text):
        on_odd_position = position % 2 == 1
        # `line` keeps its trailing newline, so nothing extra is appended here.
        dst.write(f'{partner_seq}:{line}' if on_odd_position else line)
# Note: only 356 of the 367 sequences could be analysed, due to lack of available memory on Colab.
# Missing: the 4 already missing from the monomer prediction, plus g061, g070, g160, g196, g207, g234 and g296.

In [None]:
# First filter = confidence of the predicted structure.
# Analyse the confidence data output of the monomer predictions.
# NOTE(review): the chdir afterwards suggests that `ung.analyse_confidence` changes the
# working directory - confirm. The reset is placed in `finally` so that a failure
# part-way through does not leave the kernel in another directory, which would break
# the relative paths used by the other cells (e.g. "filter1.csv", "uploads/...").
# NOTE(review): the project directory is a hard-coded absolute path and is only valid
# on the original machine.
thesis_dir = "/d/user6/tl003/thesis"
try:
    ung.analyse_confidence(
        input_dir=f"{thesis_dir}/monomer_prediction",
        df=df,
        scale=True,
        prediction="monomer",
    )
finally:
    os.chdir(thesis_dir)

In [None]:
# Analyse the confidence data output of the ye (complex) predictions.
# Again, this may need to be run in batches.
# NOTE(review): the chdir afterwards suggests that `ung.analyse_confidence` changes the
# working directory - confirm. The reset is placed in `finally` so that a failed batch
# does not leave the kernel in another directory, which would break the relative paths
# used by the other cells (e.g. "filter1.csv", "uploads/...").
# NOTE(review): the project directory is a hard-coded absolute path and is only valid
# on the original machine.
thesis_dir = "/d/user6/tl003/thesis"
try:
    ung.analyse_confidence(
        input_dir=f"{thesis_dir}/ye_prediction",
        df=df,
        scale=True,
        prediction="ye",
    )
finally:
    os.chdir(thesis_dir)

In [7]:
# Save the current table to CSV (row index omitted) and print it.
filter1_csv = "filter1.csv"
df.to_csv(filter1_csv, index=False)
print(df)

       ID                                           Sequence  Seq Length  \
0    g001  MKKNVCVITERDVVLAKRRKGCKLQTDVDVALTEMNNIRFDFRVII...          90   
1    g002  MKILKATNPRWFAQSNSTFSKFAIRGATEVLSDWMDNAFSAFCQLY...          83   
2    g003  MYITRRKKRFNTTVYYNIYLVPGKMLTVRDKNIYSVYKIFEKELDF...          82   
3    g004  MKTIKARGFNKNKILDLTPIKETRRSAMDTLMRRILFIEVDISNHM...          72   
4    g005  MKIVRISRKKDRDIGRLIIGHIFATYKELKGTLLFRYDYIGYKCKV...          62   
..    ...                                                ...         ...   
362  g363  MKTIRTIKHLPFDRYNEQMTKTFLLDQPISKIFMIRKDAEDFDNEN...          84   
363  g364  MYISRTNEREMNFKRKSNPKTVREIYTNLLHDDFSLPKYKMYFGFE...          97   
364  g365  MKALSIIDDEEREGSIRDYDYKILEPINEILEDLEEEIMISGIGVT...          75   
365  g366  MRRLKGKVVGIDNLLRWDRQWGGRMWLEGSAFNDYDVLFKDTVLEF...          74   
366  g367  MKIKKFKMKYEYVYHDWLQQNAEEIIWGYQLIKIDPRHFSDIKVEI...          61   

     Mean pLDDT: monomer  Mean pLDDT: ye  ...ye only  ...without ye  
0              29

## Structure alignment: similarity filter

In [3]:
# Second filter = structural similarity to known UngIns.
# The alignment itself was run with a local TM-align install:
#bash align_structures.sh -a tm -t template -q predicted_structures -o tmalign_output_1.txt

# Load the alignment results, highest TM-score first (original row index is kept).
tm = pd.read_csv('uploads/tmalign_output_1.txt').sort_values(by='TMScore', ascending=False)

# Keep only alignments with a TM-score above 0.5, renumbering rows from 0.
tm_filt = tm.loc[tm['TMScore'] > 0.5].reset_index(drop=True)

# Summary statistics: all alignments first, then the filtered subset.
for summary in (tm.describe(), tm_filt.describe()):
    print(summary)

ung.plot_tm(df=tm, filename="monomer_prediction/plots/TM")

       Aligned length        RMSD       SeqID     TMScore
count      824.000000  824.000000  824.000000  824.000000
mean        46.354369    3.703932    0.073885    0.352559
std         14.233964    0.901139    0.041982    0.080700
min         14.000000    0.460000    0.000000    0.119780
25%         37.000000    3.200000    0.045000    0.304513
50%         46.000000    3.710000    0.070000    0.350795
75%         55.000000    4.340000    0.098000    0.405210
max         88.000000    5.620000    0.243000    0.577660
       Aligned length       RMSD      SeqID    TMScore
count       32.000000  32.000000  32.000000  32.000000
mean        53.218750   3.194687   0.078406   0.523643
std          6.690071   0.364877   0.038815   0.020086
min         44.000000   2.510000   0.000000   0.502210
25%         50.000000   2.950000   0.052250   0.510840
50%         51.500000   3.140000   0.079000   0.517085
75%         56.000000   3.405000   0.098750   0.524905
max         79.000000   4.230000   0.1

In [4]:
# Known UngIns compared with each other (template-vs-template alignment).
#bash align_structures.sh -a tm -t template -q template -o tmalign_output_2.txt
template_ids = ["8AIM_A", "8AIN_B", "8AIL_CF", "6XQI_A"]
tm = pd.read_csv('uploads/tmalign_output_2.txt')
ung.plot_template_comp(
    df=tm,
    filename="monomer_prediction/plots/template_comp",
    templates=template_ids,
)

In [45]:
# Complex structures: load the alignment results for the complex predictions.
tm = pd.read_csv('uploads/ye_output.txt')

# Print the table ordered by descending TM-score; `tm` itself stays unsorted for plotting.
tm_ranked = tm.sort_values(by='TMScore', ascending=False)
print(tm_ranked)

ung.plot_tm(df=tm, filename="ye_prediction/plots/TM")

     Template Query  Aligned length  RMSD  SeqID  TMScore
136   5JK7_GH  g244             270  3.26  0.474  0.80492
108   5JK7_GH  g208             267  3.13  0.457  0.80490
64    5JK7_GH  g149             261  2.86  0.471  0.79828
61    5JK7_GH  g145             261  3.01  0.475  0.79453
24    5JK7_GH  g084             258  2.87  0.484  0.79072
..        ...   ...             ...   ...    ...      ...
229  8AIL_BCF  g077             219  1.35  0.516  0.64625
389  8AIL_BCF  g301             219  1.35  0.516  0.64618
395  8AIL_BCF  g307             219  1.35  0.516  0.64611
407  8AIL_BCF  g329             219  1.37  0.516  0.64527
832   8AIN_AB  g337             215  1.32  0.502  0.64489

[836 rows x 6 columns]
