In [None]:
import numpy as np
import pandas as pd
import plotnine as p9
from Bio import SeqIO
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import colors
import src.rng_sequences, src.human_sequences,src.constants,src.uORF_Statistics
plt.style.use('seaborn-v0_8-colorblind')
import polygraph.input, polygraph.sequence, polygraph.visualize

# Generating data files

In [None]:
# Global parameters: sequence count and sequence length passed to the
# random-sequence generator in the cells below.
seqnum, seqlen = 90_000, 160
# Seed NumPy's legacy global RNG so repeated runs start from the same state.
np.random.seed(12345)

In [None]:
# Two random sequence sets built with identical size, length and seed.
# The first requests gc_content=0.7; the second leaves gc_content at the
# generator's default.
# NOTE(review): the group name "Random_gc50" suggests that default is 0.5 —
# confirm in src.rng_sequences. Both calls also reuse seed 12345; verify the
# two sets are meant to share a seed.
seqs1 = src.rng_sequences.randomseqs(
    seqnum,
    seqlen,
    group_name="Random_gc70",
    seed=12345,
    gc_content=0.7,
)
seqs2 = src.rng_sequences.randomseqs(
    seqnum,
    seqlen,
    group_name="Random_gc50",
    seed=12345,
)

In [None]:
# Stack both random sets, prepend a column of continuous IDs
# ("seq_1", "seq_2", ...) and write the result as a tab-separated text file.
randomseqs = np.concatenate([seqs1, seqs2], axis=0)
randomseqsindex = np.array([f"seq_{n}" for n in range(1, len(randomseqs) + 1)])
# column_stack puts the ID array in front of the existing columns, row by row.
randomseqs = np.column_stack((randomseqsindex, randomseqs))
np.savetxt("data/random_dataset.txt", randomseqs, fmt="%s", delimiter="\t", newline="\n")

In [None]:
# Read the human sequences from the FASTA file, pass them through delDupes
# and write the result as a tab-separated text file.
# NOTE(review): the file name suggests these are GENCODE 5'UTR sequences and
# delDupes presumably removes duplicate entries — confirm in src.human_sequences.
humanseqs = src.human_sequences.delDupes(
    src.human_sequences.readFASTA("data/gencode_5utrs_human.fa")
)
np.savetxt("data/humanseqsRNA.txt", humanseqs, fmt="%s", delimiter="\t", newline="\n")

In [None]:
# Concatenate the random and the human sequence files, in list order,
# into a single text file.
seqfiles = ['data/random_dataset.txt', 'data/humanseqsRNA.txt']
with open('data/full_dataset.txt', 'w') as outfile:
    for fname in seqfiles:
        with open(fname) as infile:
            # A file object iterates line by line, so this copies every line verbatim.
            outfile.writelines(infile)

In [None]:
# Load the combined dataset from data/full_dataset.txt with polygraph's reader.
# NOTE(review): incl_ids=True presumably tells read_seqs that the file carries
# a sequence-ID column — confirm against the polygraph documentation.
seqs=polygraph.input.read_seqs('data/full_dataset.txt',incl_ids=True)
# Bare expression as the last line: the notebook renders `seqs` as the cell output.
seqs

# Generating statistics for the sequences

In [None]:
'''GC Content'''
# Store the result of polygraph.sequence.gc(seqs) in a new "GC Content" column.
# NOTE(review): presumably one GC value per sequence row — confirm the scale
# (fraction vs. percent) against the polygraph documentation.
seqs['GC Content']=polygraph.sequence.gc(seqs)

In [None]:
# uORF statistics: compute them with the project helper and attach the
# resulting columns to `seqs`.
counts = src.uORF_Statistics.uORFs(seqs)
seqs = pd.concat([seqs, counts], axis=1)
# Keep the cell idempotent: on a re-run the concat above appends a second copy
# of every statistics column, after which seqs['uORF_countssum'] would return a
# DataFrame instead of a Series. Retain only the newest copy of each label.
# On a first run there are no duplicate labels, so this is a no-op.
seqs = seqs.loc[:, ~seqs.columns.duplicated(keep='last')]

In [None]:
# Length of each entry of the "Sequence" column, stored as "Sequence Length".
seqs['Sequence Length'] = seqs['Sequence'].map(len)

In [None]:
# Display `seqs` with the statistics columns added in the preceding cells.
seqs

# Plots

In [None]:
# Violin plot with an overlaid narrow box plot of the "ouORF_countssum" column
# for every value of "Group".
# The title previously read "GC Content vs. Group" although the y aesthetic is
# "ouORF_countssum"; it now names the column that is actually plotted.
# NOTE(review): "ouORF_countssum" is presumably a column produced by
# src.uORF_Statistics.uORFs — confirm the name, and that GC content was not
# the intended y variable.
(p9.ggplot(seqs, p9.aes(x="Group", y="ouORF_countssum"))
 + p9.geom_violin()
 + p9.geom_boxplot(width=0.1, outlier_size=0.05)
 + p9.ggtitle("ouORF Count vs. Group")
 + p9.theme_classic()
 # Rotate the group names so the long labels do not overlap.
 + p9.theme(axis_text_x=p9.element_text(rotation=90, hjust=1),figure_size=(4,3))
)


In [None]:
# Violin plot with an overlaid narrow box plot of the "uORF_countssum" column
# for every value of "Group".
(
    p9.ggplot(data=seqs, mapping=p9.aes(x="Group", y="uORF_countssum"))
    + p9.ggtitle("uORF Count vs. Group")
    + p9.geom_violin()
    + p9.geom_boxplot(width=0.1, outlier_size=0.05)
    + p9.theme_classic()
    # Applied after theme_classic so these settings override it.
    + p9.theme(
        figure_size=(4, 3),
        axis_text_x=p9.element_text(rotation=90, hjust=1),
    )
)

In [None]:
# Side-by-side 2D histograms of "GC Content" against "uORF_countssum" for the
# two random groups, with log-scaled colour normalisation.
fig, axs = plt.subplots(nrows=1, ncols=2, layout='constrained')
gc50seqs = seqs.loc[seqs['Group'] == 'Random_gc50']
gc70seqs = seqs.loc[seqs['Group'] == 'Random_gc70']
# NOTE(review): the two panels use different bin counts (12 vs. 10); kept
# as-is — confirm this is deliberate.
im1 = axs[0].hist2d(gc50seqs['GC Content'], gc50seqs['uORF_countssum'],
                    norm=colors.LogNorm(), cmap='inferno', bins=12)
im2 = axs[1].hist2d(gc70seqs['GC Content'], gc70seqs['uORF_countssum'],
                    norm=colors.LogNorm(), cmap='inferno', bins=10)
axs[0].set_title("50% GC Content")
axs[1].set_title("70% GC Content")
# Label both panels so the figure can be read without the surrounding code.
for ax in axs:
    ax.set_xlabel("GC Content")
    ax.set_ylabel("uORF count (uORF_countssum)")
# hist2d returns (counts, xedges, yedges, image); the image at index 3 is the
# mappable the colorbar needs.
fig.colorbar(im1[3], ax=axs[0], label="Number of sequences")
fig.colorbar(im2[3], ax=axs[1], label="Number of sequences")
plt.show()

# Metadata

In [None]:
# Runtimes of the uORF_statistics method over the size of the data
# (number or length of sequences). The values are hard-coded measurements.
# NOTE(review): the time unit is not recorded here (presumably seconds) —
# confirm and add it to the y-axis label.
times = [0.212, 1.920, 19.402, 200.295]
sizes = [100, 1000, 10000, 100000]
runtime_fig, runtime_ax = plt.subplots()
# Sizes and times each span several orders of magnitude; on linear axes the
# three smallest measurements collapse into one point, so use log-log axes.
runtime_ax.loglog(sizes, times, marker='o')
runtime_ax.set_xlabel("Data size (number or length of sequences)")
runtime_ax.set_ylabel("Runtime")
runtime_ax.set_title("uORF_statistics runtime vs. data size")
plt.show()