# Script that co-localizes the small exons/introns with the genomic region of each mismatch

In [1]:
def fasta2List(pathFasta):
    """Parse a FASTA file into a dictionary of header line -> sequence.

    Args:
        pathFasta: Path of the FASTA file to read.

    Returns:
        A dict mapping each header line (leading ">" kept, newline removed)
        to its sequence with every newline character removed. Any text found
        before the first header is discarded. When the same header line
        occurs several times, the last sequence is the one kept.
    """
    title = []
    seq = []
    seq_temp = []
    # The context manager releases the file handle even if reading fails.
    with open(pathFasta, "r") as f:
        for line in f:
            if line.startswith(">"):
                # A new header closes the record accumulated so far.
                seq.append(''.join(seq_temp).replace("\n", ""))
                title.append(line.replace("\n", ""))
                seq_temp = []
            else:
                seq_temp.append(line)
    # Flush the record that was still open when the file ended.
    seq.append(''.join(seq_temp).replace("\n", ""))
    # The first entry holds whatever preceded the first header (usually the
    # empty string), so it has no matching title and is dropped.
    seq.pop(0)
    return dict(zip(title, seq))

In [2]:
# Load every input table and sequence set used by the cells below
import pandas as pd

# Intron and exon maps: tab-separated, no header row (columns are 0, 1, 2, ...)
intron_file = pd.read_csv(
    "../raw/uniprot-exon-map/Intron_map.tab",
    sep="\t",
    header=None,
)
exon_file = pd.read_csv(
    "../raw/uniprot-exon-map/Exon_map.tab",
    sep="\t",
    header=None,
)

# Transcript table: tab-separated, first line is used as the header
ID_file = pd.read_csv(
    "../raw/uniprot-exon-map/transcript_ensembl.tab",
    sep = "\t",
)

# Filtered error list: space-separated, no header row
Error_file = pd.read_csv(
    "../raw/uniprot-exon-map/uniprot_new_errors_filt.txt",
    sep=" ",
    header=None,
)

# FASTA files parsed into {header line: sequence} dictionaries
my_Genomic = fasta2List("../raw/uniprot-exon-map/genomics_new.fa")
my_CDS = fasta2List("../raw/uniprot-exon-map/cds_new.fa")

In [3]:
# Build the table that maps each mismatch to the first and last exon it spans
def _find_mismatch_exon_span(mismatch_cds, exon_numbers, exon_seqs):
    """Return the (first, last) exon numbers that bracket ``mismatch_cds``.

    Exons are dropped one at a time from the front of the list: the first
    exon whose removal makes ``mismatch_cds`` disappear from the joined
    remaining sequences is the first exon of the span. The same is done from
    the back of the list to find the last exon of the span.

    If ``mismatch_cds`` is not a substring of the joined exons at all, the
    span is the whole list (first exon, last exon).

    Args:
        mismatch_cds: Nucleotide string to locate.
        exon_numbers: Exon identifiers, in the same order as ``exon_seqs``.
        exon_seqs: Exon sequences (strings).

    Returns:
        A ``(first_exon, last_exon)`` tuple, or ``None`` when the span cannot
        be determined (empty ``mismatch_cds`` or no exon available).
    """
    # An empty string is contained in every string, so no exon removal could
    # ever make it "disappear"; likewise nothing can be found without exons.
    if not mismatch_cds or not exon_numbers:
        return None

    first_exon = last_exon = None
    for j in range(len(exon_seqs)):
        # exon_seqs[j + 1:] is what remains once exons 0..j have been removed.
        if mismatch_cds not in ''.join(exon_seqs[j + 1:]):
            first_exon = exon_numbers[j]
            break
    for k in range(len(exon_seqs) - 1, -1, -1):
        # exon_seqs[:k] is what remains once exons k..end have been removed.
        if mismatch_cds not in ''.join(exon_seqs[:k]):
            last_exon = exon_numbers[k]
            break
    return first_exon, last_exon


# The context manager closes the output file even if a row raises.
with open("../raw/uniprot-mismatch-colocalize/mismatch_exon_pos.tab", "w") as f:
    f.write("Alignement\tError\tUniprotID\tPosStartError\tPosStopError\tFirstExonError\tLastExonError\n")
    for _, row in Error_file.iterrows():
        prot_name = row[2]
        error_start = row[3]
        error_stop = row[4]

        # NOTE(review): substring match on the FASTA header; an ID that is a
        # prefix of another ID would also match - confirm header format.
        CDS = [val for key, val in my_CDS.items() if prot_name in key]
        if not CDS:
            continue

        # Positions are multiplied by 3 to index into the CDS; presumably they
        # are 0-based residue positions with an inclusive stop - TODO confirm.
        mismtach_CDS = CDS[0][error_start*3:error_stop*3+3]

        subset = exon_file.loc[exon_file[0] == prot_name]
        exon_span = _find_mismatch_exon_span(
            mismtach_CDS,
            subset[3].to_list(),
            subset[6].to_list(),
        )
        if exon_span is None:
            # Skip the row rather than writing exon numbers left over from a
            # previous row (or failing on undefined names for the first row).
            continue
        start_exon, stop_exon = exon_span

        fields = (row[0], row[1], row[2], row[3], row[4], start_exon, stop_exon)
        f.write('\t'.join(map(str, fields)) + "\n")

In [3]:
# Load the mismatch -> exon position table (first line is the header), as a
# starting point for looking at the size distribution of the exons/introns
# found in the mismatches
mismatch_pos = pd.read_csv(
    "../raw/uniprot-mismatch-colocalize/mismatch_exon_pos.tab",
    sep="\t",
)

In [30]:
# Collect the exon and intron records located in each mismatch region.
# Both outputs are opened in one `with` so they are closed even if a lookup fails.
with open("../raw/uniprot-mismatch-colocalize/mismatch_exon_seq.tab", "w", newline='\n') as f, \
        open("../raw/uniprot-mismatch-colocalize/mismatch_intron_seq.tab", "w", newline='\n') as f2:
    for _, row in mismatch_pos.iterrows():
        # mismatch_pos has a header row, so its rows are labelled by column
        # name; .iloc makes the positional access explicit.
        # Positions 2, 5 and 6 are UniprotID, FirstExonError and LastExonError.
        uniprot_id = row.iloc[2]
        first_exon = int(row.iloc[5])
        last_exon = int(row.iloc[6])

        subset_exon = exon_file.loc[exon_file[0] == uniprot_id]
        subset_intron = intron_file.loc[intron_file[0] == uniprot_id]

        # Every exon from the first to the last one of the mismatch (inclusive).
        for i in range(first_exon, last_exon + 1):
            # Raises IndexError if no exon record carries this number.
            row_to_list = subset_exon.loc[subset_exon[3] == i].values.tolist()[0]
            f.write('\t'.join(map(str, row_to_list)) + "\n")

        # Introns numbered first_exon .. last_exon - 1 (one fewer than exons).
        for i in range(first_exon, last_exon):
            # Raises IndexError if no intron record carries this number.
            row_to_list = subset_intron.loc[subset_intron[3] == i].values.tolist()[0]
            f2.write('\t'.join(map(str, row_to_list)) + "\n")

In [13]:
# Build the CDS restricted to the mismatch region: one FASTA record per
# identifier found in column 0 of the exon table
myExonicMap = pd.read_csv("../raw/uniprot-mismatch-colocalize/mismatch_exon_seq.tab", sep="\t", header=None)
# dict.fromkeys de-duplicates while keeping first-seen order, so the output
# file has the same record order on every run (iterating a set of strings
# does not guarantee that across interpreter runs).
UniprotID = list(dict.fromkeys(myExonicMap.iloc[:, 0].to_list()))
with open("../raw/uniprot-mismatch-colocalize/CDS_seq_mismatch.fa", "w") as CDS_file:
    for i in UniprotID:
        subset = myExonicMap.loc[myExonicMap[0] == i]
        # NOTE(review): rows are concatenated in file order; if the same exon
        # was written for several mismatches it appears several times here.
        myCDS = ''.join(subset[6].to_list())
        # Header: column 1 of the first row, the identifier, then a fixed tag.
        CDS_file.write(">"+subset.iloc[0,1]+" "+i+" CDS_mismatch"+"\n"+myCDS+"\n")

In [12]:
# Build the genomic sequence restricted to the mismatch region: exons
# interleaved with introns, one FASTA record per identifier
myExonicMap = pd.read_csv("../raw/uniprot-mismatch-colocalize/mismatch_exon_seq.tab", sep="\t", header=None)
# NOTE(review): exon_file / intron_file are rebound here to the mismatch-only
# tables, replacing the full exon/intron maps loaded at the top of the file.
exon_file = pd.read_csv("../raw/uniprot-mismatch-colocalize/mismatch_exon_seq.tab", sep="\t", header=None)
intron_file = pd.read_csv("../raw/uniprot-mismatch-colocalize/mismatch_intron_seq.tab", sep="\t", header=None)
# Order-preserving de-duplication keeps the output order identical between runs.
UniprotID = list(dict.fromkeys(myExonicMap.iloc[:, 0].to_list()))

with open("../raw/uniprot-mismatch-colocalize/genomic_seq_mismatch.fa", "w") as genomic_file:
    for i in UniprotID:
        # A dedicated name is used for the pieces so that the `my_CDS`
        # dictionary of CDS sequences loaded earlier is not overwritten.
        genomic_parts = []
        subset_exon = exon_file.loc[exon_file[0] == i]
        subset_intron = intron_file.loc[intron_file[0] == i]
        intron_count = len(subset_intron.index)
        for n in range(len(subset_exon.index)):
            genomic_parts.append(subset_exon.iloc[n, 6])
            # Exon n is followed by intron n when there is one; there are
            # normally fewer introns than exons, so the last exon(s) have none.
            if n < intron_count:
                genomic_parts.append(subset_intron.iloc[n, 6])
        # Header: column 1 of the first exon row, the identifier, then a fixed tag.
        genomic_file.write(">"+subset_exon.iloc[0,1]+" "+i+" Genomic_mismatch"+"\n"+''.join(genomic_parts)+"\n")