In [17]:
import pandas as pd


crispr_file = '/Users/isaccocenacchi/Desktop/Tirocinio/out/crispr_parsed.tsv'
cas_database = '/Users/isaccocenacchi/Desktop/Tirocinio/samples/Aug19_cas_genes.tsv'
output_file = '/Users/isaccocenacchi/Desktop/Tirocinio/out/crispr_parsed_cas.tsv'
cas_output_file = '/Users/isaccocenacchi/Desktop/Tirocinio/out/cas_CRISPRtools.tsv'


def count_spacers(spacers):
    """Return the number of comma-separated spacers in ``spacers``.

    Anything that is not a string (e.g. the NaN pandas produces for an empty
    TSV field) counts as zero spacers instead of raising ``AttributeError``.
    """
    return len(spacers.split(',')) if isinstance(spacers, str) else 0


def annotate_cas_proximity(crisprs, cas, tool_codename="minced_Paper"):
    """Count the Cas genes around every CRISPR array, binned by distance.

    CRISPR arrays and Cas genes are paired when they share ``MAG`` and
    ``Contig``. For each pair the distance is the gap between the two
    intervals (0 when they touch) or -1 when they overlap.

    Parameters
    ----------
    crisprs : pandas.DataFrame
        Needs the columns ``MAG``, ``Contig``, ``Start``, ``End``, ``Spacers``.
    cas : pandas.DataFrame
        Needs the columns ``MAG``, ``Contig``, ``Start``, ``End``.
    tool_codename : str
        Value written to ``ToolCodename`` and name of the column added to the
        Cas table.

    Returns
    -------
    (crisprs_out, cas_out, errors)
        ``crisprs_out`` is a copy of ``crisprs`` with ``ToolCodename`` and the
        four count columns ``Cas_0-1000``, ``Cas_1000-10000``, ``Cas_>100000``
        and ``Cas_overlayed`` appended.
        ``cas_out`` is a copy of ``cas`` with a ``tool_codename`` column holding
        the total number of spacers of the CRISPR arrays lying within
        0-10000 of that Cas gene.
        ``errors`` lists one message per pair whose coordinates are missing;
        such pairs are not counted in any bin.

    The inputs are not modified.
    """
    # Row positions (not index labels) identify the records, so the function
    # works whatever index the caller's frames carry.
    left = (
        crisprs[['MAG', 'Contig', 'Start', 'End', 'Spacers']]
        .rename(columns={'Start': 'Start_CRISPR', 'End': 'End_CRISPR'})
        .assign(crispr_row=list(range(len(crisprs))))
    )
    right = (
        cas[['MAG', 'Contig', 'Start', 'End']]
        .rename(columns={'Start': 'Start_Cas', 'End': 'End_Cas'})
        .assign(cas_row=list(range(len(cas))))
    )
    pairs = left.merge(right, on=['MAG', 'Contig'], how='inner')

    # Gap when the Cas gene starts at/after the CRISPR end, or ends at/before
    # the CRISPR start; -1 marks an overlap.
    gap_cas_after = pairs['Start_Cas'] - pairs['End_CRISPR']
    gap_cas_before = pairs['Start_CRISPR'] - pairs['End_Cas']
    distance = gap_cas_after.where(
        gap_cas_after >= 0,
        gap_cas_before.where(gap_cas_before >= 0, -1),
    )

    # A missing coordinate fails every comparison above and would otherwise be
    # reported as an overlap; turn it into NaN so it lands in no bin.
    coordinate_columns = ['Start_CRISPR', 'End_CRISPR', 'Start_Cas', 'End_Cas']
    missing = pairs[coordinate_columns].isna().any(axis=1)
    distance = distance.mask(missing)

    # NOTE(review): the label 'Cas_>100000' actually holds distances > 10000
    # (extra zero in the name). It is kept so the output schema stays the same.
    bin_masks = {
        'Cas_0-1000': distance.between(0, 1000),
        'Cas_1000-10000': (distance > 1000) & (distance <= 10000),
        'Cas_>100000': distance > 10000,
        'Cas_overlayed': distance == -1,
    }

    crisprs_out = crisprs.copy()
    crisprs_out["ToolCodename"] = tool_codename
    crispr_rows = pd.RangeIndex(len(crisprs))
    for column, in_bin in bin_masks.items():
        per_crispr = pairs.loc[in_bin, 'crispr_row'].value_counts()
        crisprs_out[column] = (
            per_crispr.reindex(crispr_rows, fill_value=0).astype(int).to_numpy()
        )

    # Every Cas gene accumulates the spacers of the CRISPR arrays that lie
    # within 0-10000 of it (the two "near" bins together).
    nearby = distance.between(0, 10000)
    spacers_per_pair = pairs['Spacers'].map(count_spacers).astype(int)
    spacers_per_cas = (
        spacers_per_pair[nearby].groupby(pairs.loc[nearby, 'cas_row']).sum()
    )
    cas_out = cas.copy()
    cas_out[tool_codename] = (
        spacers_per_cas.reindex(pd.RangeIndex(len(cas)), fill_value=0)
        .astype(int)
        .to_numpy()
    )

    errors = [
        f'Error: Distance of MAG: {mag} and Contig: {contig} is {dist}'
        for mag, contig, dist in zip(
            pairs.loc[missing, 'MAG'],
            pairs.loc[missing, 'Contig'],
            distance[missing],
        )
    ]
    return crisprs_out, cas_out, errors


# Inside a notebook kernel __name__ is "__main__", so this section runs as
# usual; the guard only lets the helpers above be imported without any file I/O.
if __name__ == "__main__":
    crisprs_df = pd.read_csv(
        crispr_file,
        delimiter='\t',
        usecols=['MAG', 'Contig', 'Start', 'End', 'Spacers', 'Repeats'],
        dtype={'MAG': str, 'Contig': str, 'Start': int, 'End': int},
        index_col=False,
    )
    # Read the merge keys as text, like the CRISPR table, so identifiers that
    # look numeric still match.
    cas_df = pd.read_csv(cas_database, delimiter='\t', dtype={'MAG': str, 'Contig': str})

    crisprs_df, cas_df, errors = annotate_cas_proximity(crisprs_df, cas_df)

    # Print the errors
    for error in errors:
        print(error)

    # Save the DataFrames to TSV files
    crisprs_df.to_csv(output_file, sep='\t')
    cas_df.to_csv(cas_output_file, sep='\t')



In [1]:
# Cas distance

import os
import sys
import pandas as pd

crispr_file = '/home/isacco.cenacchi/data/Tirocinio/out/MAGs_minced_parsed.tsv'
cas_database = '/home/isacco.cenacchi/data/Tirocinio/samples/Aug19_cas_genes.tsv'
output_file = '/home/isacco.cenacchi/data/Tirocinio/out/MAGs_minced_parsed_cas.tsv'

# Alternative paths for a local run:
# input_file = '/Users/isaccocenacchi/Desktop/Tirocinio/out/MAGs_short_minced_parsed.tsv'
# cas_database = '/Users/isaccocenacchi/Desktop/Tirocinio/samples/Aug19_cas_genes.tsv'
# output_file = '/Users/isaccocenacchi/Desktop/Tirocinio/out/MAGs_short_minced_parsed_cas.tsv'


def count_cas_by_distance(crispr_df, cas_df):
    """Return a copy of ``crispr_df`` with per-distance-bin Cas gene counts.

    CRISPR arrays and Cas genes are paired when they share ``MAG`` and
    ``Contig``. For each pair the distance is the gap between the two
    intervals (0 when they touch) or -1 when they overlap. Four integer
    columns are appended: ``Cas_0-1000``, ``Cas_1000-10000``, ``Cas_>100000``
    and ``Cas_overlayed``.

    Both frames need the columns ``MAG``, ``Contig``, ``Start`` and ``End``.
    Pairs with a missing coordinate are not counted in any bin. The inputs
    are not modified.
    """
    # Row positions (not index labels) identify the CRISPR records, so the
    # function works whatever index the caller's frame carries.
    left = (
        crispr_df[['MAG', 'Contig', 'Start', 'End']]
        .rename(columns={'Start': 'Start_CRISPR', 'End': 'End_CRISPR'})
        .assign(crispr_row=list(range(len(crispr_df))))
    )
    right = cas_df[['MAG', 'Contig', 'Start', 'End']].rename(
        columns={'Start': 'Start_Cas', 'End': 'End_Cas'}
    )
    pairs = left.merge(right, on=['MAG', 'Contig'], how='inner')

    # Gap when the Cas gene starts at/after the CRISPR end, or ends at/before
    # the CRISPR start; -1 marks an overlap.
    gap_cas_after = pairs['Start_Cas'] - pairs['End_CRISPR']
    gap_cas_before = pairs['Start_CRISPR'] - pairs['End_Cas']
    distance = gap_cas_after.where(
        gap_cas_after >= 0,
        gap_cas_before.where(gap_cas_before >= 0, -1),
    )

    # Defensive: a missing coordinate fails every comparison above and would
    # otherwise be reported as an overlap.
    coordinate_columns = ['Start_CRISPR', 'End_CRISPR', 'Start_Cas', 'End_Cas']
    distance = distance.mask(pairs[coordinate_columns].isna().any(axis=1))

    # NOTE(review): the label 'Cas_>100000' actually holds distances > 10000
    # (extra zero in the name). It is kept so the output schema stays the same.
    bin_masks = {
        'Cas_0-1000': distance.between(0, 1000),
        'Cas_1000-10000': (distance > 1000) & (distance <= 10000),
        'Cas_>100000': distance > 10000,
        'Cas_overlayed': distance == -1,
    }

    annotated = crispr_df.copy()
    crispr_rows = pd.RangeIndex(len(crispr_df))
    for column, in_bin in bin_masks.items():
        per_crispr = pairs.loc[in_bin, 'crispr_row'].value_counts()
        annotated[column] = (
            per_crispr.reindex(crispr_rows, fill_value=0).astype(int).to_numpy()
        )
    return annotated


# Inside a notebook kernel __name__ is "__main__", so this section runs as
# usual; the guard only lets the helper above be imported without any file I/O.
if __name__ == "__main__":
    # Load the TSV files. The error is re-raised after the hint is printed:
    # continuing would either fail later with a NameError or, in a live
    # kernel, silently reuse a DataFrame left over from a previous run.
    try:
        CRISPR_df = pd.read_csv(crispr_file, delimiter='\t', usecols=['MAG', 'Contig', 'Start', 'End', 'Spacers', 'Repeats'], dtype={'MAG': str, 'Contig': str, 'Start': int, 'End': int}, index_col=False)
    except FileNotFoundError:
        print("The input file does not exist, check the path", file=sys.stderr)
        raise
    except ValueError as e:
        print('Errore: ', e)
        print('Check the column names in the input file (MAG, Contig, Start, End, Spacers, Repeats), and secure that file is a TSV file')
        raise

    try:
        Cas_df = pd.read_csv(cas_database, delimiter='\t', usecols=['MAG', 'Contig', 'Start', 'End'], dtype={'MAG': str, 'Contig': str, 'Start': int, 'End': int})
    except FileNotFoundError:
        print("The cas database file does not exist, check the path", file=sys.stderr)
        raise
    except ValueError as e:
        print('Errore: ', e)
        print('Check the column names in the input file (MAG, Contig, Start, End), and secure that file is a TSV file')
        raise

    # Add the Cas count columns to the CRISPR table
    CRISPR_df = count_cas_by_distance(CRISPR_df, Cas_df)

    # Save the DataFrame to a TSV file
    CRISPR_df.to_csv(output_file, sep='\t')