In [1]:
# Smoke test: confirm the kernel executes cells.
print("ciao")

ciao


In [62]:
# Scratch inputs used to try out the repeat expansion and the start-offset
# arithmetic on a few hand-written rows.
repeats = [
    "..A..................................CC.",
    ".....................................---",
    "GG............-......................--.",
]
reference = "AAGTTTCCGTCCCCTTTCGGGGAATCATTTAGAAAAT--A"
line = "   145662      33    93.9      33  TAGTATTTAA    ...............T................    AGAAAACCGCATAAGGACCGATACCACATTACA"

# Number of '-' characters located before the first '.' of a sample string.
stringa = ".--.-...............T................"
first_dot = stringa.find(".")
count = stringa[:first_dot].count("-")

# Same computation applied to the sixth whitespace-separated field of `line`,
# subtracted from the integer in the first field.
seqs = line.split()
aligned_repeat = seqs[5]
start = int(seqs[0]) - aligned_repeat[:aligned_repeat.find(".")].count("-")
def develop_repeats(repeats, reference):
    '''
    Expand dot-abbreviated repeats into explicit sequences.

    In every repeat, a '.' at index i is replaced by reference[i]; all other
    characters are kept. Afterwards every '-' is removed from the result
    (including a '-' copied in from the reference).

    Args:
        repeats (iterable of str): repeats written against the reference
        reference (str): the sequence the '.' characters stand for

    Returns:
        list: one expanded string per input repeat, in the same order
    '''
    reference_length = len(reference)
    developed_repeats = []
    for repeat in repeats:
        # Only look up positions that exist in the reference: a repeat shorter
        # than the reference no longer raises IndexError, and characters past
        # the end of the reference are kept unchanged.
        expanded = ''.join(
            reference[i] if symbol == '.' and i < reference_length else symbol
            for i, symbol in enumerate(repeat)
        )
        developed_repeats.append(expanded.replace('-', ''))
    return developed_repeats

# Expand the sample repeats against the sample reference; the returned list is
# not bound to a name here.
develop_repeats(repeats, reference)
# Final expression of the cell: the start value computed above.
start
# print(develop_repeats(repeats, reference))

145662

In [89]:
# PILER-CR parsing
class CRISPR:
    '''
    A class used to represent a CRISPR array

    Attributes:
        file_name (str): the name of the file that the CRISPR was found in (MAG name)
        contig_name (str): the name of the contig that the CRISPR was found in
        start (int): Position of the first base in the CRISPR (one-indexed, inclusive)
        end (int): position of the last base in the CRISPR (one-indexed, inclusive)
        spacers (list): a list of the ordered spacers in the CRISPR
        repeats (list): a list of the ordered repeats in the CRISPR
        flankers (dict): a dictionary with the left and right flankers of the CRISPR
    
    Methods:
        __init__(): Constructor
        __repr__(): Multi-line debug representation listing every attribute
        __str__(): Labelled, human-readable listing of every attribute
        __len__(): Returns the length of the CRISPR calculated as the sum of the lengths of the spacers and repeats
        __bool__(): Returns True if the CRISPR is valid, False otherwise
        __eq__(other): Compares file_name, contig_name, start and end
        setFile_name(file_name): Sets the file_name attribute
        setContig_name(contig_name): Sets the contig_name attribute
        setStart(start): Sets the start attribute
        setEnd(end): Sets the end attribute
        addRepeat(repeat): Adds a repeat to the repeats list
        addSpacer(spacer): Adds a spacer to the spacers list
        setFlankerLeft(left): Sets the left flanker
        setFlankerRight(right): Sets the right flanker
    '''
    # Defining __eq__ already makes instances unhashable; spell it out because
    # the compared attributes are mutable through the setters.
    __hash__ = None

    def __init__(self, file_name=None, contig_name=None, start=None, end=None):
        self.file_name = file_name
        self.contig_name = contig_name
        self.start = start
        self.end = end
        self.spacers = []
        self.repeats = []
        self.flankers = {'left': '', 'right': ''}
    
    def __repr__(self):
        return f'<CRISPR object: (\n{self.file_name}\n{self.contig_name}\n{self.start}\n{self.end}\n{self.spacers}\n{self.repeats}\n{self.flankers})>\n'
    
    def __str__(self):
        return f'f_name: {self.file_name}\ncontig: {self.contig_name}\nstart: {self.start}\nend: {self.end}\nspacers: {self.spacers}\nrepeats: {self.repeats}\nflankers: {self.flankers}\n'
    
    def __len__(self):
        # Flankers are not part of the array and are not counted.
        return sum(len(spacer) for spacer in self.spacers) + sum(len(repeat) for repeat in self.repeats)
    
    def __bool__(self):
        # Type checks come first so the arithmetic below never sees None.
        return (isinstance(self.file_name, str) and self.file_name != '' and
                isinstance(self.contig_name, str) and self.contig_name != '' and
                isinstance(self.start, int) and self.start >= 0 and
                isinstance(self.end, int) and self.end >= self.start and
                len(self) == (self.end - self.start + 1))
    
    def __eq__(self, other):
        # Any object exposing the four attributes can be compared; anything
        # else (None, str, ...) yields NotImplemented instead of raising, so
        # `crispr == None` evaluates to False.
        try:
            return (self.file_name == other.file_name and
                    self.contig_name == other.contig_name and
                    self.start == other.start and
                    self.end == other.end)
        except AttributeError:
            return NotImplemented
    
    def setFile_name(self, file_name):
        self.file_name = file_name
    
    def setContig_name(self, contig_name):
        self.contig_name = contig_name
    
    def setStart(self, start):
        self.start = start

    def setEnd(self, end):
        self.end = end
    
    def addRepeat(self, repeat):
        self.repeats.append(repeat)
    
    def addSpacer(self, spacer):
        self.spacers.append(spacer)

    def setFlankerLeft(self, left):
        self.flankers['left'] = left

    def setFlankerRight(self, right):
        self.flankers['right'] = right


def develop_repeats(repeats, reference):
    '''
    Expand dot-abbreviated repeats into explicit sequences.

    In every repeat, a '.' at index i is replaced by reference[i]; all other
    characters are kept. Afterwards every '-' is removed from the result
    (including a '-' copied in from the reference).

    Args:
        repeats (iterable of str): repeats written against the reference
        reference (str): the sequence the '.' characters stand for

    Returns:
        list: one expanded string per input repeat, in the same order
    '''
    reference_length = len(reference)
    developed_repeats = []
    for repeat in repeats:
        # Only look up positions that exist in the reference: a repeat shorter
        # than the reference no longer raises IndexError, and characters past
        # the end of the reference are kept unchanged.
        expanded = ''.join(
            reference[i] if symbol == '.' and i < reference_length else symbol
            for i, symbol in enumerate(repeat)
        )
        developed_repeats.append(expanded.replace('-', ''))
    return developed_repeats

def parse_pilercr(file_path):
    '''
    Parse a PILER-CR report into a list of CRISPR objects.

    Lines are stripped and dispatched on their first characters:
        "Array..."   : must not appear while an array is still open
        ">..."       : the rest of the line becomes the current contig name
        digit-first  : array rows, told apart by their number of
                       whitespace-separated fields (7, 6 or 4, see below)
        "SUMMARY..." : parsing stops

    Args:
        file_path (str): path of the report; the part of its base name before
            the first '.' is stored as the file_name of every CRISPR

    Returns:
        list: the CRISPR objects in the order they appear in the report

    Raises:
        ValueError: if an array is still open at the next "Array" line, at
            "SUMMARY" or at the end of the file, or if a completed array does
            not pass the CRISPR validity check
    '''
    # Loop-invariant: '/'-separated base name, cut at its first dot.
    file_name = file_path.split('/')[-1].split('.')[0]
    crisprs = []
    contig_name = None  # set by the most recent '>' line
    crispr_tmp = None   # array being assembled, None between arrays
    repeats = []        # raw (dot-abbreviated) repeats of the open array
    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()
            if line.startswith("Array"):
                if crispr_tmp is not None:
                    raise ValueError(f"CRISPR not finished in file {file_path}, contig {crispr_tmp.contig_name}")
            elif line.startswith(">"):
                contig_name = line[1:]
            elif line[:1].isdigit():
                seqs = line.split()
                if len(seqs) == 7 and crispr_tmp is None: # first line: flankerLeft - repeat - spacer
                    # adjust start position if there are gaps in the repeat:
                    # count the '-' placed before the first '.'. partition()
                    # returns the whole field when it holds no '.', whereas
                    # find() would return -1 and silently drop the last char.
                    leading = seqs[5].partition(".")[0]
                    start = int(seqs[0]) + leading.count("-")
                    crispr_tmp = CRISPR(file_name=file_name, contig_name=contig_name, start=start, end=None)
                    crispr_tmp.setFlankerLeft(seqs[4])
                    repeats = [seqs[5]]
                    crispr_tmp.addSpacer(seqs[6])
                elif len(seqs) == 7 and crispr_tmp is not None: # next line: repeat - spacer
                    repeats.append(seqs[5])
                    crispr_tmp.addSpacer(seqs[6])
                elif len(seqs) == 6 and crispr_tmp is not None: # last line: repeat - flankerRight
                    repeats.append(seqs[4])
                    crispr_tmp.setFlankerRight(seqs[5])
                elif len(seqs) == 4 and crispr_tmp is not None: # consensus repeat
                    consensus = seqs[3]
                    for repeat in develop_repeats(repeats, consensus):
                        crispr_tmp.addRepeat(repeat)
                    repeats = []
                    # The end coordinate is derived from the expanded length.
                    crispr_tmp.setEnd(crispr_tmp.start + len(crispr_tmp) - 1)
                    if bool(crispr_tmp):
                        crisprs.append(crispr_tmp)
                        crispr_tmp = None
                    else:
                        raise ValueError(f"Invalid CRISPR format in file {file_path}")
            elif line.startswith("SUMMARY"):
                break
    # An array without its consensus row used to be dropped silently here,
    # although the same condition raises at the next "Array" line.
    if crispr_tmp is not None:
        raise ValueError(f"CRISPR not finished in file {file_path}, contig {crispr_tmp.contig_name}")
    return crisprs



f_name: example
contig: MGYG000197636_2
start: 145663
end: 145895
spacers: ['AGAAAACCGCATAAGGACCGATACCACATTACA', 'AACCCATTGTCGCTCTTGCGGTTCTAAACTTTCA', 'GGCTGAGGCGGAGAATGCCGCAAGCTGGGACTTTT']
repeats: ['CTGGCAACCCACTTGTCTACGAGGATTGCAAC', 'GCTGGCAACCCACTTGCCTACGAGGATTGCAAC', 'GCTGGCAACCCACTTGCCTACGAGGATTGCAAC', 'GCTGGCAACCCACTTGCCTACGAGGATTGCAAC']
flankers: {'left': 'TAGTATTTAA', 'right': 'TGCAAAATAT'}



In [82]:
# MINCED parsing ISACCO
import os
import pandas as pd
import time

class CRISPR(object):
    '''
    A class used to represent a CRISPR array

    Attributes:
        file_name (str): the name of the file that the CRISPR was found in (MAG name)
        contig_name (str): the name of the contig that the CRISPR was found in
        start (int): Position of the first base in the CRISPR (one-indexed, inclusive)
        end (int): position of the last base in the CRISPR (one-indexed, inclusive)
        spacers (list): a list of the ordered spacers in the CRISPR
        repeats (list): a list of the ordered repeats in the CRISPR
    
    Methods:
        __init__(): Constructor
        __repr__(): Multi-line debug representation listing every attribute
        __str__(): Labelled, human-readable listing of every attribute
        __len__(): Returns the length of the CRISPR calculated as the sum of the lengths of the spacers and repeats
        __bool__(): Returns True if the CRISPR is valid, False otherwise
        __eq__(other): Compares file_name, contig_name, start and end
        setFile_name(file_name): Sets the file_name attribute
        setContig_name(contig_name): Sets the contig_name attribute
        setPos(start, end): Sets the start and end attributes
        addRepeat(repeat): Adds a repeat to the repeats list
        addSpacer(spacer): Adds a spacer to the spacers list
    '''
    # Defining __eq__ already makes instances unhashable; spell it out because
    # the compared attributes are mutable through the setters.
    __hash__ = None

    def __init__(self, file_name=None, contig_name=None, start=None, end=None):
        self.file_name = file_name
        self.contig_name = contig_name
        self.start = start
        self.end = end
        self.spacers = []
        self.repeats = []

    def __repr__(self):
        return f'<CRISPR object: (\n{self.file_name}\n{self.contig_name}\n{self.start}\n{self.end}\n{self.spacers}\n{self.repeats})>\n'

    def __str__(self):
        return f'f_name: {self.file_name}\ncontig: {self.contig_name}\nstart: {self.start}\nend: {self.end}\nspacers: {self.spacers}\nrepeats: {self.repeats}\n'

    def __len__(self):
        return sum(len(spacer) for spacer in self.spacers) + sum(len(repeat) for repeat in self.repeats)

    def __bool__(self):
        # The type checks must run before the coordinate arithmetic:
        # evaluating `self.end - self.start` first raised TypeError for a
        # CRISPR whose coordinates are still None.
        return (isinstance(self.file_name, str) and self.file_name != '' and
                isinstance(self.contig_name, str) and self.contig_name != '' and
                isinstance(self.start, int) and self.start >= 0 and
                isinstance(self.end, int) and self.end >= self.start and
                len(self) == (self.end - self.start + 1))

    def __eq__(self, other):
        # Any object exposing the four attributes can be compared; anything
        # else (None, str, ...) yields NotImplemented instead of raising, so
        # `crispr == None` evaluates to False.
        try:
            return (self.file_name == other.file_name and
                    self.contig_name == other.contig_name and
                    self.start == other.start and
                    self.end == other.end)
        except AttributeError:
            return NotImplemented
    
    def setFile_name(self, file_name):
        self.file_name = file_name

    def setContig_name(self, contig_name):
        self.contig_name = contig_name

    def setPos(self, start, end):
        self.start = start
        self.end = end

    def addRepeat(self, repeat):
        self.repeats.append(repeat)

    def addSpacer(self, spacer):
        self.spacers.append(spacer)

def parse_minced(file_path):
    '''
    Parse a MinCED report into a list of CRISPR objects.

    Lines are dispatched on their first characters:
        "Sequence '" : the text between the first pair of single quotes
                       becomes the current contig name
        "CRISPR"     : opens a new array; the 4th and 6th whitespace-separated
                       fields are read as its start and end
        digit-first  : 7 fields -> repeat (2nd) and spacer (3rd);
                       2 fields -> repeat only
        "Repeats"    : closes the array, which is kept if it is valid

    Args:
        file_path (str): path of the report; the part of its base name before
            the first '.' is stored as the file_name of every CRISPR

    Returns:
        list: the CRISPR objects in the order they appear in the report

    Raises:
        ValueError: if a row or a "Repeats" line appears before any "CRISPR"
            line, or if a closed array does not pass the validity check
    '''
    # Loop-invariant: '/'-separated base name, cut at its first dot.
    file_name = file_path.split('/')[-1].split('.')[0]
    crisprs = []
    # Bound up front so that a malformed report raises the ValueError below
    # rather than an UnboundLocalError.
    contig_name = None
    crispr_tmp = None
    with open(file_path, 'r') as file:
        for line in file:
            if line.startswith("Sequence '"):
                contig_name = line.split("'")[1]
            elif line.startswith("CRISPR"):
                start, end = map(int, line.split()[3:6:2]) # Take from 4th to 6th element, step 2
                crispr_tmp = CRISPR(file_name=file_name, contig_name=contig_name, start=start, end=end)
            elif line[:1].isdigit():
                if crispr_tmp is None:
                    raise ValueError(f"Invalid CRISPR format in file {file_path}")
                seqs = line.split()
                if len(seqs) == 7:
                    crispr_tmp.addRepeat(seqs[1])
                    crispr_tmp.addSpacer(seqs[2])
                elif len(seqs) == 2:
                    crispr_tmp.addRepeat(seqs[1])
            # Save the instance
            elif line.startswith("Repeats"):
                if bool(crispr_tmp):
                    crisprs.append(crispr_tmp)
                else:
                    raise ValueError(f"Invalid CRISPR format in file {file_path}")
    return crisprs


# Parse every MinCED report below input_dir, export one row per array to a
# TSV file next to that directory, and print the elapsed seconds.
time1 = time.time()
input_dir = '/Users/isaccocenacchi/Desktop/Tirocinio/out/MAGs_mini_minced'
output = f"{input_dir}_parsed.tsv"

# Recursively collect the reports (files ending in '.crispr').
files = []
for dirpath, _dirnames, filenames in os.walk(input_dir):
    for filename in filenames:
        if filename.endswith('.crispr'):
            files.append(os.path.join(dirpath, filename))

# Flatten the per-file results into a single list.
crisprs_total = []
for file in files:
    crisprs_total.extend(parse_minced(file))

# One row per array; spacers and repeats are comma-joined into one cell each.
rows = []
for a in crisprs_total:
    rows.append([a.file_name, a.contig_name, a.start, a.end, ','.join(a.spacers), ','.join(a.repeats)])
crisprs_df = pd.DataFrame(rows, columns=['MAG', 'contig', 'start', 'end', 'spacers', 'repeats'])

crisprs_df.to_csv(output, sep='\t')
time2 = time.time()
print(time2 - time1)
print(*crisprs_total)


0.013678789138793945
f_name: M1440811244
contig: RubelMA_2020_k141_57984
start: 4275
end: 4571
spacers: ['ACACCTACAATATTAGGGTATTGTACCGGGACTGT', 'TGTAATGTGCGGATTAAAGCACGATAACCCCAT', 'TTCCAAAAGCAATTGCAGCAAGAAGCGAAAGCCTA', 'ATCAATGGCCCAGTATTCGGCAATATCGAAGATG']
repeats: ['GTCACACCCTACGTGGGTGTGTGGATTGAAAC', 'GTCACACCCTACGTGGGTGTGTGGATTGAAAC', 'GTCACACCCTACGTGGGTGTGTGGATTGAAAC', 'GTCACACCCTACGTGGGTGTGTGGATTGAAAC', 'GTCACACCCTGCGTGGGAATATATGGATTGGA']
 f_name: M1975430591
contig: RubelMA_2020_k141_34054
start: 18399
end: 19002
spacers: ['ACCTTCATCGCGCGCTCGTAAGGAACGGCAACGGTATAGGT', 'AGTTTTTCATAATTATAAAATAATTTTGGT', 'AAATTGATACCCAACCGGCGAGCAACAAAGACC', 'CAAAACCGCATACATAATCCTTATTTTTCAGTCTGTGGC', 'CCAAATGTGATAACTCCGTAATTGTCAAACAGAAGAA', 'GGGTAATCGCCAAGAGTGAAGACGTACTCGAAAGTAC', 'TGCAACTCTTGCCAAAAACAGCCAAAAAATAAATAAT', 'ACTTGTTATAGCTCACATTATGAACCGTTGCCGCA']
repeats: ['GTATCCATTGTCTACCGCTTAGGCGGTATTGAGAC', 'ATTTCTATTGTCTACTGCTGAGGCGGTATTGAGAC', 'ATTTCTATTGTCTACCGCTGAGACGGTATTGAGAC', 'ATTTCTATTGTCTACC

In [None]:
# CRISPR parsing MATTEO
import os
import glob
import pandas as pd
import re

class CRISPR(object):
    '''
    A CRISPR array: the sequence and MAG it belongs to, its coordinates and
    its ordered repeats and spacers.

    Attributes:
        sequence (str): the sequence argument with trailing whitespace removed
        mag: the MAG argument, stored unchanged
        start (int or None): set by setPos(); None until then
        end (int or None): set by setPos(); None until then
        repeats (list): repeats in insertion order
        spacers (list): non-empty spacers in insertion order
    '''
    def __init__(self, sequence, MAG):
        self.sequence = sequence.rstrip()
        self.repeats = []
        self.spacers = []
        self.mag = MAG
        # Created here so the attributes exist even if setPos() is never
        # called.
        self.start = None
        self.end = None

    def setPos(self, start, end):
        '''Store start and end as ints; strings with surrounding whitespace and ints are accepted.'''
        # int() ignores surrounding whitespace itself, so the former
        # .rstrip() (which rejected ints) is not needed.
        self.start = int(start)
        self.end = int(end)

    def addRepeat(self, repeat):
        '''Append the repeat, without trailing whitespace.'''
        self.repeats.append(repeat.rstrip())

    def addSpacer(self, spacer):
        '''Append the spacer, without trailing whitespace, unless that leaves it empty.'''
        put_spacer = spacer.rstrip()
        if put_spacer:
            self.spacers.append(put_spacer)

def parse_minced(path):
    '''
    Parse a MinCED report into a list of CRISPR objects.

    Lines are dispatched on their first characters:
        "Sequence" : remembers the accession (the line without "Sequence '"
                     and without everything from "' (" onwards)
        "CRISPR"   : opens a new array and reads the two numbers around
                     " - " that follow "Range: " as its start and end
        digit-first: 7 fields -> repeat (2nd) and spacer (3rd);
                     2 fields -> repeat only
        "Repeats"  : appends the current array to the result

    Args:
        path (str): path of the report; the part of its base name before the
            first '.' is passed to CRISPR as the MAG

    Returns:
        list: the CRISPR objects in the order they appear in the report
    '''
    crisprs = []
    # The context manager closes the file even when a line fails to parse.
    with open(path, 'r') as file:
        for ll in file:
            # Record sequence accession
            if ll.startswith('Sequence'):
                # Raw string: '\(' is an invalid escape in a normal literal.
                sequence_current = re.sub(r"' \(.*", '', re.sub("Sequence '", '', ll))
            # Create instance of CRISPR and add positions
            elif ll.startswith('CRISPR'):
                crisp_tmp = CRISPR(sequence_current, path.split('/')[-1].split('.')[0])
                pos = re.sub(r'.*Range: ', '', ll)
                start = re.sub(r' - .*', '', pos)
                end = re.sub(r'.* - ', '', pos)
                crisp_tmp.setPos(start, end)
            # Add Repeats and Spacers to the current instance
            elif ll[:1].isdigit():
                lll = ll.split()
                if len(lll) == 7:
                    crisp_tmp.addRepeat(lll[1])
                    crisp_tmp.addSpacer(lll[2])
                elif len(lll) == 2:
                    crisp_tmp.addRepeat(lll[1])
            # Save the instance
            elif ll.startswith('Repeats'):
                crisprs.append(crisp_tmp)

    return crisprs

# Parse every MinCED report found under basedir/<release>/<chunk>/ and export
# one row per array to a TSV file, then print the elapsed seconds.
# NOTE(review): `time` is not among this cell's own imports; it is only
# available if the earlier cell that imports it has been run.
time1 = time.time()
basedir = '/Users/isaccocenacchi/Desktop/Tirocinio/out/MAGs_short_minced'
outfile = f"{basedir}_parsed_matteo.tsv"
releases = ['Aug19']
columns = ['MAG', 'contig', 'start', 'end', 'spacers', 'repeats']

for rel in releases:
    rel_dfs = []
    for chunk in os.listdir(os.path.join(basedir, rel)):
        crispr_paths = glob.glob(os.path.join(basedir, rel, chunk, '*.crispr'))
        chunk_arrays = [array for path in crispr_paths for array in parse_minced(path)]
        # Naming the columns up front replaces the former `shape[1]==6` test,
        # whose only effect was to skip chunks without any array.
        chunk_array_df = pd.DataFrame(
            [[a.mag, a.sequence, a.start, a.end, ','.join(a.spacers), ','.join(a.repeats)] for a in chunk_arrays],
            columns=columns,
        )
        if not chunk_array_df.empty:
            rel_dfs.append(chunk_array_df)
    # pd.concat raises ValueError on an empty list; write a header-only table
    # when no chunk of the release yielded an array.
    if rel_dfs:
        rel_data = pd.concat(rel_dfs).reset_index(drop=True)
    else:
        rel_data = pd.DataFrame(columns=columns)
    # NOTE(review): every release writes to the same outfile, so with several
    # releases only the last one would be kept.
    rel_data.to_csv(outfile, sep='\t')
time2 = time.time()
print(time2 - time1)