In [10]:
import pandas as pd

from pyranges import PyRanges
from s2rnai.io import GffRow

# Let the notebook render up to 200 rows of a DataFrame before truncating.
pd.set_option("display.max_rows", 200)


In [11]:
# Column labels shared by both interval tables built below (six-column,
# BED-style layout); the same list names the columns of the BED file and of
# the records extracted from the GTF.
colnames = [
    "Chromosome",
    "Start",
    "End",
    "Name",
    "Score",
    "Strand",
]


In [12]:
# Load the workflow's sample table (tab-separated).
sampletable = pd.read_table("../../rnai-aln-wf/config/sampletable.tsv")

# Distinct values of the `drsc` column; later cells use them to filter both
# interval sources down to the reagents present in the sample table.
drscs = sampletable["drsc"].unique()


In [13]:
# Intervals from the local BED output, restricted to reagents whose Name is
# listed in the sample table; the index is renumbered after filtering.
justin = (
    pd.read_table("../../output/drsc.bed", names=colnames, header=None)
    .loc[lambda bed: bed["Name"].isin(drscs)]
    .reset_index(drop=True)
)


In [14]:
# Collect BED-style records from Lee's GTF for the reagents in the sample table.

# Build the lookup once: `in` against the array produced by `.unique()` rescans
# every id for each GTF line, whereas set membership is a single hash lookup.
drsc_ids = set(drscs)

res = []
with open("../../data/external/lee/DRSC_regents.gtf", "r") as fh:
    for row in fh:
        gff = GffRow(row)
        # Read the attribute once and reuse it for both the filter and the record.
        transcript_id = gff.parsed_attributes["transcript_id"]
        if transcript_id in drsc_ids:
            res.append(
                (
                    gff.seqid,
                    gff.start,
                    gff.end,
                    transcript_id,
                    gff.score,
                    gff.strand,
                )
            )

# Same column layout as the BED-derived table so the two can be merged on Name.
lee = pd.DataFrame(res, columns=colnames)



In [15]:
def check_equality(series):
    """Return True when the Justin and Lee records in one merged row agree.

    Parameters
    ----------
    series : pandas.Series
        One row of the merged table, holding ``Chromosome``, ``Start``,
        ``End`` and ``Strand`` values under the ``_justin`` and ``_lee``
        suffixes.

    Returns
    -------
    bool
        True if the chromosome (with ``"chr"`` removed from Lee's value),
        start, end and strand all match; False if any of them differ or if
        any compared value is missing.
    """
    # An outer merge leaves NaN in rows found in only one source. Those rows
    # cannot match, and calling `.replace` / `int()` on NaN would raise, so
    # report them as unequal instead of crashing the whole comparison.
    for field in ("Chromosome", "Start", "End", "Strand"):
        for source in ("justin", "lee"):
            if pd.isna(series[f"{field}_{source}"]):
                return False

    # Lee's chromosome names carry a "chr" marker that Justin's do not.
    lee_chromosome = series.Chromosome_lee.replace("chr", "")

    return bool(
        series.Chromosome_justin == lee_chromosome
        and int(series.Start_justin) == int(series.Start_lee)
        and int(series.End_justin) == int(series.End_lee)
        and str(series.Strand_justin) == str(series.Strand_lee)
    )


In [16]:
# Reagents whose coordinates disagree between the BED table (justin) and
# Lee's GTF table (lee).

# Outer join on reagent Name keeps reagents present in only one of the tables.
justin_lee = justin.merge(lee, on="Name", how="outer", suffixes=("_justin", "_lee"))

# Flag each merged row with the result of the row-wise comparison.
justin_lee = justin_lee.assign(equal=justin_lee.apply(check_equality, axis=1))

# Display only the rows flagged as not equal.
justin_lee.loc[~justin_lee["equal"]]



Unnamed: 0,Chromosome_justin,Start_justin,End_justin,Name,Score_justin,Strand_justin,Chromosome_lee,Start_lee,End_lee,Score_lee,Strand_lee,equal


In [17]:
# Wrap the BED-derived table in a PyRanges object for interval operations.
intervals: PyRanges = PyRanges(justin)


In [18]:
# DRSC reagents that overlap one another (the original note counts 107).
# After aggregation the "Chromosome" column holds the number of reagents in
# each cluster and "Name" holds their ids joined with "|"; only clusters with
# more than one reagent are kept.
overlap = (
    intervals.cluster()
    .df.groupby("Cluster")
    .agg({"Chromosome": "size", "Name": lambda names: names.str.cat(sep="|")})
    .loc[lambda clusters: clusters["Chromosome"] > 1]
)

overlap


Unnamed: 0_level_0,Chromosome,Name
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
7,2,DRSC00793|DRSC30091
10,2,DRSC29550|DRSC00451
17,2,DRSC03073|DRSC37498
34,2,DRSC02765|DRSC23798
42,2,DRSC25397|DRSC01927
49,2,DRSC26463|DRSC02046
51,2,DRSC26006|DRSC03346
64,2,DRSC00324|DRSC24890
77,2,DRSC00744|DRSC28177
84,2,DRSC03463|DRSC33063
