# Exploring MAJIQ/VOILA outputs
Exploring MAJIQ/VOILA outputs to understand the format and how to use them

### Load data

In [72]:
# Directory that holds every per-event-type TSV table read by this notebook.
# It is defined once so the notebook can be pointed at another location by
# editing a single line instead of fifteen.
DATA_DIR = "/home/bia/LandscapeSplicingGrasses/SplicingLandscapeGrasses/merging_outputs/data"

alta = f"{DATA_DIR}/alt3prime.tsv"
p_alta = f"{DATA_DIR}/p_alt3prime.tsv"  # putative
altd = f"{DATA_DIR}/alt5prime.tsv"
p_altd = f"{DATA_DIR}/p_alt5prime.tsv"
altad = f"{DATA_DIR}/alt3and5prime.tsv"
p_altad = f"{DATA_DIR}/p_alt3and5prime.tsv"
alt_first_exon = f"{DATA_DIR}/alternate_first_exon.tsv"
p_alt_first_exon = f"{DATA_DIR}/p_alternate_first_exon.tsv"
p_alt_last_exon = f"{DATA_DIR}/p_alternate_last_exon.tsv"
alt_last_exon = f"{DATA_DIR}/alternate_last_exon.tsv"
alt_intron = f"{DATA_DIR}/alternative_intron.tsv"
cassette = f"{DATA_DIR}/cassette.tsv"
tandem_cassette = f"{DATA_DIR}/tandem_cassette.tsv"
multi_exon_spanning = f"{DATA_DIR}/multi_exon_spanning.tsv"
mutually_exclusive = f"{DATA_DIR}/mutually_exclusive.tsv"



### ALT3/ALTA
In the context of RNA splicing, an alternative acceptor (also referred to as alternative 3' splice site) is a mechanism of alternative splicing where different 3' splice sites (acceptor sites) are selected during the processing of a pre-mRNA transcript. 

- When 'spliced_with' is E2 and 'strand' is "+", 'reference_exon_coord_start' should be less than 'spliced_with_coord_start'.

- When 'strand' is "-", the opposite should be true for E2: 'reference_exon_coord_start' should be greater than 'spliced_with_coord_start'.

- For E1, everything is the opposite. So, if E1 and "+", reference start is greater; if E1 and "-", reference start is less.


In [16]:
import pandas as pd

# Load the alt3prime table; lines starting with "#" are skipped as comments.
alta_df = pd.read_csv(alta, sep="\t", header=0, comment="#")

# Each of these columns holds a "start-end" string.
coord_fields = ("reference_exon_coord", "spliced_with_coord", "junction_coord")

# First pass: split every coordinate column into <name>_start / <name>_end.
for field in coord_fields:
    alta_df[[f"{field}_start", f"{field}_end"]] = alta_df[field].str.split("-", expand=True)

# Second pass: cast the six new string columns to int so they compare numerically.
for field in coord_fields:
    for edge in ("start", "end"):
        alta_df[f"{field}_{edge}"] = alta_df[f"{field}_{edge}"].astype(int)

# print(alta_df.head())

In [21]:
# Boolean masks shared by the four strand/partner rules.
plus_strand = alta_df["strand"] == "+"
minus_strand = alta_df["strand"] == "-"
partner_e1 = alta_df["spliced_with"] == "E1"
partner_e2 = alta_df["spliced_with"] == "E2"
ref_start_lower = alta_df["reference_exon_coord_start"] < alta_df["spliced_with_coord_start"]
ref_start_higher = alta_df["reference_exon_coord_start"] > alta_df["spliced_with_coord_start"]

# E2: reference start is lower on "+", higher on "-". E1 is the mirror image.
condicao_e2_positivo = partner_e2 & plus_strand & ref_start_lower
condicao_e2_negativo = partner_e2 & minus_strand & ref_start_higher
condicao_e1_positivo = partner_e1 & plus_strand & ref_start_higher
condicao_e1_negativo = partner_e1 & minus_strand & ref_start_lower

# A row is valid when it satisfies any one of the four rules.
alta_df["condicao_valida"] = (
    condicao_e2_positivo
    | condicao_e2_negativo
    | condicao_e1_positivo
    | condicao_e1_negativo
)
linhas_invalidas = alta_df.loc[~alta_df["condicao_valida"]]
print("Linhas que violam a teoria:")
print(linhas_invalidas)
print(alta_df["spliced_with"].value_counts(), alta_df["condicao_valida"].value_counts())


Linhas que violam a teoria:
Empty DataFrame
Columns: [module_id, gene_id, gene_name, seqid, strand, lsv_id, event_id, complex, denovo, reference_exon_coord, spliced_with, spliced_with_coord, junction_name, junction_coord, event_size, event_non_changing, event_changing, junction_changing, SRR11684912_median_psi, SRR11684912_var_psi, reference_exon_coord_start, reference_exon_coord_end, spliced_with_coord_start, spliced_with_coord_end, junction_coord_start, junction_coord_end, condicao_valida]
Index: []

[0 rows x 27 columns]
E2    2876
E1      98
Name: spliced_with, dtype: int64 True    2974
Name: condicao_valida, dtype: int64


### ALT5/ALTD
In RNA splicing, an alternative donor (or alternative 5' splice site) refers to a mechanism where different 5' splice sites (donor sites) are selected during pre-mRNA processing.

- When 'spliced_with' is E2 and 'strand' is "+", 'reference_exon_coord_start' should be less than 'spliced_with_coord_start'.

- When 'strand' is "-", the opposite should be true for E2: 'reference_exon_coord_start' should be greater than 'spliced_with_coord_start'.

- For E1, everything is the opposite. So, if E1 and "+", reference start is greater; if E1 and "-", reference start is less.

- ! Unknown coordinates are marked as -1

In [29]:
# Load the alt5prime table; lines starting with "#" are skipped as comments.
altd_df = pd.read_csv(altd, sep="\t", header=0, comment="#")

def split_coord(coord):
    """Split a "start-end" coordinate string whose start or end may be "-1".

    A plain split on "-" would trip over the minus sign of a "-1" value, so
    the two layouts that contain it are handled before the generic case.

    Parameters
    ----------
    coord : str
        Coordinate text such as "100-200", "-1-200" or "100--1".

    Returns
    -------
    tuple or list
        The start and end parts as strings. A 2-tuple is returned when one
        side is "-1"; otherwise the list produced by ``str.split`` is returned.
    """
    unknown = "-1"
    if coord.startswith("-1-"):
        # "-1" start: the end is everything after the leading "-1-".
        return unknown, coord.partition("-1-")[2]
    if coord.endswith("--1"):
        # "-1" end: the start is everything before the first "--1".
        return coord.partition("--1")[0], unknown
    return coord.split("-", maxsplit=1)

# Split every "start-end" coordinate column into <name>_start / <name>_end.
# spliced_with_coord goes through split_coord (which handles "-1" values);
# the other two columns use a plain split on "-".
coord_fields = ("reference_exon_coord", "spliced_with_coord", "junction_coord")
for field in coord_fields:
    if field == "spliced_with_coord":
        halves = altd_df[field].apply(split_coord).tolist()
    else:
        halves = altd_df[field].str.split("-", expand=True)
    altd_df[[f"{field}_start", f"{field}_end"]] = halves

# Cast the six new string columns to int so they compare numerically.
for field in coord_fields:
    for edge in ("start", "end"):
        altd_df[f"{field}_{edge}"] = altd_df[f"{field}_{edge}"].astype(int)

print(altd_df.head())


                   module_id                  gene_id  gene_name seqid strand  \
0  AT3G29160.Araport11.447_2  AT3G29160.Araport11.447  AT3G29160  Chr3      -   
1  AT3G29160.Araport11.447_2  AT3G29160.Araport11.447  AT3G29160  Chr3      -   
2  AT1G06010.Araport11.447_1  AT1G06010.Araport11.447  AT1G06010  Chr1      +   
3  AT1G06010.Araport11.447_1  AT1G06010.Araport11.447  AT1G06010  Chr1      +   
4  AT2G03810.Araport11.447_1  AT2G03810.Araport11.447  AT2G03810  Chr2      +   

                                        lsv_id  \
0  AT3G29160.Araport11.447:s:11131004-11131188   
1  AT3G29160.Araport11.447:s:11131004-11131188   
2    AT1G06010.Araport11.447:s:1823344-1823410   
3    AT1G06010.Araport11.447:s:1823344-1823410   
4    AT2G03810.Araport11.447:t:1162633-1162712   

                         event_id  complex  denovo reference_exon_coord  ...  \
0  AT3G29160.Araport11.447_2_A5_1    False   False    11131004-11131188  ...   
1  AT3G29160.Araport11.447_2_A5_1    False    True  

In [31]:
# Boolean masks shared by the four strand/partner rules.
plus_strand = altd_df["strand"] == "+"
minus_strand = altd_df["strand"] == "-"
partner_e1 = altd_df["spliced_with"] == "E1"
partner_e2 = altd_df["spliced_with"] == "E2"
ref_start_lower = altd_df["reference_exon_coord_start"] < altd_df["spliced_with_coord_start"]
ref_start_higher = altd_df["reference_exon_coord_start"] > altd_df["spliced_with_coord_start"]

# E2: reference start is lower on "+", higher on "-". E1 is the mirror image.
condicao_e2_positivo = partner_e2 & plus_strand & ref_start_lower
condicao_e2_negativo = partner_e2 & minus_strand & ref_start_higher
condicao_e1_positivo = partner_e1 & plus_strand & ref_start_higher
condicao_e1_negativo = partner_e1 & minus_strand & ref_start_lower

# A row is valid when it satisfies any one of the four rules.
altd_df["condicao_valida"] = (
    condicao_e2_positivo
    | condicao_e2_negativo
    | condicao_e1_positivo
    | condicao_e1_negativo
)
linhas_invalidas = altd_df.loc[~altd_df["condicao_valida"]]
print("Linhas que violam a teoria:")
print(linhas_invalidas)
print(altd_df["spliced_with"].value_counts(), altd_df["condicao_valida"].value_counts())


Linhas que violam a teoria:
Empty DataFrame
Columns: [module_id, gene_id, gene_name, seqid, strand, lsv_id, event_id, complex, denovo, reference_exon_coord, spliced_with, spliced_with_coord, junction_name, junction_coord, event_size, event_non_changing, event_changing, junction_changing, SRR11684912_median_psi, SRR11684912_var_psi, reference_exon_coord_start, reference_exon_coord_end, spliced_with_coord_start, spliced_with_coord_end, junction_coord_start, junction_coord_end, condicao_valida]
Index: []

[0 rows x 27 columns]
E2    1296
E1     108
Name: spliced_with, dtype: int64 True    1404
Name: condicao_valida, dtype: int64


### ALTA and ALTD 

- When 'spliced_with' is E2 and 'strand' is "+", 'reference_exon_coord_start' should be less than 'spliced_with_coord_start'.

- When 'strand' is "-", the opposite should be true for E2: 'reference_exon_coord_start' should be greater than 'spliced_with_coord_start'.

- For E1, everything is the opposite. So, if E1 and "+", reference start is greater; if E1 and "-", reference start is less.

- Rows with NaN in the 'lsv_id' column are duplicates of other rows and are removed before the analysis.

- reference_exon_coord and spliced_with_coord are the same for ALTA and ALTD

In [45]:
# Load the alt3and5prime table; lines starting with "#" are skipped as comments.
altad_df = pd.read_csv(altad, sep="\t", header=0, comment="#")

def check_inverted_duplicates(df):
    """Check that rows 3-4 of every 4-row group mirror rows 1-2.

    The frame is walked by position in groups of four rows. Within a group,
    row 1 is compared with row 3 and row 2 with row 4. Two rows count as
    inverted duplicates when the ``reference_exon_coord`` of one equals the
    ``spliced_with_coord`` of the other, and vice versa.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``reference_exon_coord`` and
        ``spliced_with_coord``.

    Returns
    -------
    None
        Findings are printed; nothing is printed for groups that pass.
    """
    for i in range(0, len(df), 4):  # walk the frame in groups of 4 rows
        group1 = df.iloc[i:i+2]  # rows 1 and 2 of the group
        group2 = df.iloc[i+2:i+4]  # rows 3 and 4 of the group

        # iloc slicing silently truncates at the end of the frame, so a trailing
        # group can be short. Indexing into it below would raise IndexError,
        # so report it and move on instead.
        if len(group1) < 2 or len(group2) < 2:
            print(f"Grupo {i//4 + 1} incompleto: {len(group1) + len(group2)} linha(s), esperado 4.")
            continue

        # Rows 1 and 2 must share the same reference and spliced-with coordinates.
        if not (group1["reference_exon_coord"].nunique() == 1 and
                group1["spliced_with_coord"].nunique() == 1):
            print(f"Erro de consistência no grupo {i//4 + 1}:")
            print(group1)
            continue

        # Compare row 1 with row 3 and row 2 with row 4.
        for j in range(2):
            row1 = group1.iloc[j]
            row2 = group2.iloc[j]

            # Inverted duplicate: the two coordinate columns are swapped.
            is_inverted = (row1["reference_exon_coord"] == row2["spliced_with_coord"] and
                           row1["spliced_with_coord"] == row2["reference_exon_coord"])
            if not is_inverted:
                print(f"Linha {i + j} e {i + 2 + j} NÃO são duplicatas invertidas.")

# Apply the function to the DataFrame
check_inverted_duplicates(altad_df)

In [46]:
# Keep only the rows that have an lsv_id. The explicit .copy() makes the result
# an independent frame, so the column assignments below never act on a
# filtered view of the original (which can trigger SettingWithCopyWarning).
altad_df = altad_df.dropna(subset=["lsv_id"]).copy()

# Each of these columns holds a "start-end" string.
coord_fields = ("reference_exon_coord", "spliced_with_coord", "junction_coord")

# First pass: split every coordinate column into <name>_start / <name>_end.
for field in coord_fields:
    altad_df[[f"{field}_start", f"{field}_end"]] = altad_df[field].str.split("-", expand=True)

# Second pass: cast the six new string columns to int so they compare numerically.
for field in coord_fields:
    for edge in ("start", "end"):
        altad_df[f"{field}_{edge}"] = altad_df[f"{field}_{edge}"].astype(int)

print(altad_df.head())

                   module_id                  gene_id  gene_name seqid strand  \
0  AT5G15540.Araport11.447_1  AT5G15540.Araport11.447  AT5G15540  Chr5      -   
1  AT5G15540.Araport11.447_1  AT5G15540.Araport11.447  AT5G15540  Chr5      -   
4  AT3G54500.Araport11.447_1  AT3G54500.Araport11.447  AT3G54500  Chr3      -   
5  AT3G54500.Araport11.447_1  AT3G54500.Araport11.447  AT3G54500  Chr3      -   
8  AT3G54500.Araport11.447_2  AT3G54500.Araport11.447  AT3G54500  Chr3      -   

                                        lsv_id  \
0    AT5G15540.Araport11.447:s:5056537-5056646   
1    AT5G15540.Araport11.447:s:5056537-5056646   
4  AT3G54500.Araport11.447:s:20178940-20179070   
5  AT3G54500.Araport11.447:s:20178940-20179070   
8  AT3G54500.Araport11.447:s:20178123-20178491   

                           event_id  complex  denovo reference_exon_coord  \
0  AT5G15540.Araport11.447_1_A3A5_1    False   False      5056537-5056646   
1  AT5G15540.Araport11.447_1_A3A5_1    False   False      

In [47]:
# Boolean masks shared by the four strand/partner rules.
plus_strand = altad_df["strand"] == "+"
minus_strand = altad_df["strand"] == "-"
partner_e1 = altad_df["spliced_with"] == "E1"
partner_e2 = altad_df["spliced_with"] == "E2"
ref_start_lower = altad_df["reference_exon_coord_start"] < altad_df["spliced_with_coord_start"]
ref_start_higher = altad_df["reference_exon_coord_start"] > altad_df["spliced_with_coord_start"]

# E2: reference start is lower on "+", higher on "-". E1 is the mirror image.
condicao_e2_positivo = partner_e2 & plus_strand & ref_start_lower
condicao_e2_negativo = partner_e2 & minus_strand & ref_start_higher
condicao_e1_positivo = partner_e1 & plus_strand & ref_start_higher
condicao_e1_negativo = partner_e1 & minus_strand & ref_start_lower

# A row is valid when it satisfies any one of the four rules.
altad_df["condicao_valida"] = condicao_e2_positivo | condicao_e2_negativo | condicao_e1_positivo | condicao_e1_negativo

# Select the violating rows from altad_df, the table validated in this cell.
# Selecting from altd_df would report on a different table.
linhas_invalidas = altad_df[~altad_df["condicao_valida"]]
print("Linhas que violam a teoria:")
print(linhas_invalidas)
print(altad_df["spliced_with"].value_counts(), altad_df["condicao_valida"].value_counts())


Linhas que violam a teoria:
Empty DataFrame
Columns: [module_id, gene_id, gene_name, seqid, strand, lsv_id, event_id, complex, denovo, reference_exon_coord, spliced_with, spliced_with_coord, junction_name, junction_coord, event_size, event_non_changing, event_changing, junction_changing, SRR11684912_median_psi, SRR11684912_var_psi, reference_exon_coord_start, reference_exon_coord_end, spliced_with_coord_start, spliced_with_coord_end, junction_coord_start, junction_coord_end, condicao_valida]
Index: []

[0 rows x 27 columns]
E2    192
E1     14
Name: spliced_with, dtype: int64 True    206
Name: condicao_valida, dtype: int64


### Alternate first exon
In RNA splicing, alternative first exon refers to a mechanism where different initial exons (first exons) are used to start the transcription of a gene. This occurs when a gene has multiple promoters or transcription start sites, each associated with a unique first exon.

- When 'spliced_with' is E2 and 'strand' is "+", 'reference_exon_coord_start' should be less than 'spliced_with_coord_start'.

- When 'strand' is "-", the opposite should be true for E2: 'reference_exon_coord_start' should be greater than 'spliced_with_coord_start'.

- For E1, everything is the opposite. So, if E1 and "+", reference start is greater; if E1 and "-", reference start is less.

- 'spliced_with' is always "A" for this event type

- ! Unknown coordinates are marked as -1

In [50]:
# Load the alternate_first_exon table; lines starting with "#" are skipped as comments.
alt_first_exon_df = pd.read_csv(alt_first_exon, sep="\t", header=0, comment="#")

# Split every "start-end" coordinate column into <name>_start / <name>_end.
# spliced_with_coord goes through split_coord (which handles "-1" values);
# the other two columns use a plain split on "-".
coord_fields = ("reference_exon_coord", "spliced_with_coord", "junction_coord")
for field in coord_fields:
    if field == "spliced_with_coord":
        halves = alt_first_exon_df[field].apply(split_coord).tolist()
    else:
        halves = alt_first_exon_df[field].str.split("-", expand=True)
    alt_first_exon_df[[f"{field}_start", f"{field}_end"]] = halves

# Cast the six new string columns to int so they compare numerically.
for field in coord_fields:
    for edge in ("start", "end"):
        alt_first_exon_df[f"{field}_{edge}"] = alt_first_exon_df[f"{field}_{edge}"].astype(int)

print(alt_first_exon_df.head())

                   module_id                  gene_id  gene_name seqid strand  \
0  AT3G29160.Araport11.447_1  AT3G29160.Araport11.447  AT3G29160  Chr3      -   
1  AT3G29160.Araport11.447_1  AT3G29160.Araport11.447  AT3G29160  Chr3      -   
2  AT2G03620.Araport11.447_1  AT2G03620.Araport11.447  AT2G03620  Chr2      -   
3  AT2G03620.Araport11.447_1  AT2G03620.Araport11.447  AT2G03620  Chr2      -   
4  AT2G03810.Araport11.447_1  AT2G03810.Araport11.447  AT2G03810  Chr2      +   

                                        lsv_id  \
0  AT3G29160.Araport11.447:t:11131318-11131513   
1  AT3G29160.Araport11.447:t:11131318-11131513   
2    AT2G03620.Araport11.447:t:1101727-1102292   
3    AT2G03620.Araport11.447:t:1101727-1102292   
4    AT2G03810.Araport11.447:t:1162633-1162712   

                          event_id  complex  denovo reference_exon_coord  ...  \
0  AT3G29160.Araport11.447_1_afe_1    False   False    11131318-11131513  ...   
1  AT3G29160.Araport11.447_1_afe_1    False   Fals

In [52]:
# Boolean masks shared by the four rules below.
plus_strand = alt_first_exon_df["strand"] == "+"
minus_strand = alt_first_exon_df["strand"] == "-"
partner_a = alt_first_exon_df["spliced_with"] == "A"
ref_start_lower = alt_first_exon_df["reference_exon_coord_start"] < alt_first_exon_df["spliced_with_coord_start"]
ref_start_higher = alt_first_exon_df["reference_exon_coord_start"] > alt_first_exon_df["spliced_with_coord_start"]

# NOTE(review): all four rules filter on spliced_with == "A", so both orderings
# are accepted on both strands. Combined, the mask only rejects rows whose two
# start coordinates are equal, whose strand is neither "+" nor "-", or whose
# spliced_with is not "A". Confirm this is the intended check.
condicao_e2_positivo = partner_a & plus_strand & ref_start_lower
condicao_e2_negativo = partner_a & minus_strand & ref_start_higher
condicao_e1_positivo = partner_a & plus_strand & ref_start_higher
condicao_e1_negativo = partner_a & minus_strand & ref_start_lower

# A row is valid when it satisfies any one of the four rules.
alt_first_exon_df["condicao_valida"] = (
    condicao_e2_positivo
    | condicao_e2_negativo
    | condicao_e1_positivo
    | condicao_e1_negativo
)
linhas_invalidas = alt_first_exon_df.loc[~alt_first_exon_df["condicao_valida"]]
print("Linhas que violam a teoria:")
print(linhas_invalidas)
print(alt_first_exon_df["spliced_with"].value_counts(), alt_first_exon_df["condicao_valida"].value_counts())


Linhas que violam a teoria:
Empty DataFrame
Columns: [module_id, gene_id, gene_name, seqid, strand, lsv_id, event_id, complex, denovo, reference_exon_coord, spliced_with, spliced_with_coord, junction_name, junction_coord, event_non_changing, event_changing, junction_changing, SRR11684912_median_psi, SRR11684912_var_psi, reference_exon_coord_start, reference_exon_coord_end, spliced_with_coord_start, spliced_with_coord_end, junction_coord_start, junction_coord_end, condicao_valida]
Index: []

[0 rows x 26 columns]
A    428
Name: spliced_with, dtype: int64 True    428
Name: condicao_valida, dtype: int64


### Alternate last exon 
In RNA splicing, alternative last exon refers to a mechanism where different terminal exons (last exons) are used to conclude the transcription of a gene. This occurs when a gene has multiple polyadenylation sites or transcription termination signals, each associated with a unique last exon.

- When 'spliced_with' is E2 and 'strand' is "+", 'reference_exon_coord_start' should be less than 'spliced_with_coord_start'.

- When 'strand' is "-", the opposite should be true for E2: 'reference_exon_coord_start' should be greater than 'spliced_with_coord_start'.

- For E1, everything is the opposite. So, if E1 and "+", reference start is greater; if E1 and "-", reference start is less.

- 'spliced_with' is always "A" for this event type

- ! Unknown coordinates are marked as -1

In [53]:
# Load the alternate_last_exon table; lines starting with "#" are skipped as comments.
alt_last_exon_df = pd.read_csv(alt_last_exon, sep="\t", header=0, comment="#")

# Split every "start-end" coordinate column into <name>_start / <name>_end.
# spliced_with_coord goes through split_coord (which handles "-1" values);
# the other two columns use a plain split on "-".
coord_fields = ("reference_exon_coord", "spliced_with_coord", "junction_coord")
for field in coord_fields:
    if field == "spliced_with_coord":
        halves = alt_last_exon_df[field].apply(split_coord).tolist()
    else:
        halves = alt_last_exon_df[field].str.split("-", expand=True)
    alt_last_exon_df[[f"{field}_start", f"{field}_end"]] = halves

# Cast the six new string columns to int so they compare numerically.
for field in coord_fields:
    for edge in ("start", "end"):
        alt_last_exon_df[f"{field}_{edge}"] = alt_last_exon_df[f"{field}_{edge}"].astype(int)

print(alt_last_exon_df.head())

                   module_id                  gene_id  gene_name seqid strand  \
0  AT1G70850.Araport11.447_1  AT1G70850.Araport11.447  AT1G70850  Chr1      -   
1  AT1G70850.Araport11.447_1  AT1G70850.Araport11.447  AT1G70850  Chr1      -   
2  AT1G70850.Araport11.447_1  AT1G70850.Araport11.447  AT1G70850  Chr1      -   
3  AT1G70850.Araport11.447_1  AT1G70850.Araport11.447  AT1G70850  Chr1      -   
4  AT5G03560.Araport11.447_1  AT5G03560.Araport11.447  AT5G03560  Chr5      -   

                                        lsv_id  \
0  AT1G70850.Araport11.447:s:26716540-26716901   
1  AT1G70850.Araport11.447:s:26716540-26716901   
2  AT1G70850.Araport11.447:s:26716540-26716901   
3  AT1G70850.Araport11.447:s:26716540-26716901   
4      AT5G03560.Araport11.447:s:902412-902770   

                          event_id  complex  denovo reference_exon_coord  ...  \
0  AT1G70850.Araport11.447_1_ale_1     True    True    26716540-26716901  ...   
1  AT1G70850.Araport11.447_1_ale_1     True    Tru

In [54]:
# Boolean masks shared by the four rules below.
plus_strand = alt_last_exon_df["strand"] == "+"
minus_strand = alt_last_exon_df["strand"] == "-"
partner_a = alt_last_exon_df["spliced_with"] == "A"
ref_start_lower = alt_last_exon_df["reference_exon_coord_start"] < alt_last_exon_df["spliced_with_coord_start"]
ref_start_higher = alt_last_exon_df["reference_exon_coord_start"] > alt_last_exon_df["spliced_with_coord_start"]

# NOTE(review): all four rules filter on spliced_with == "A", so both orderings
# are accepted on both strands. Combined, the mask only rejects rows whose two
# start coordinates are equal, whose strand is neither "+" nor "-", or whose
# spliced_with is not "A". Confirm this is the intended check.
condicao_e2_positivo = partner_a & plus_strand & ref_start_lower
condicao_e2_negativo = partner_a & minus_strand & ref_start_higher
condicao_e1_positivo = partner_a & plus_strand & ref_start_higher
condicao_e1_negativo = partner_a & minus_strand & ref_start_lower

# A row is valid when it satisfies any one of the four rules.
alt_last_exon_df["condicao_valida"] = (
    condicao_e2_positivo
    | condicao_e2_negativo
    | condicao_e1_positivo
    | condicao_e1_negativo
)
linhas_invalidas = alt_last_exon_df.loc[~alt_last_exon_df["condicao_valida"]]
print("Linhas que violam a teoria:")
print(linhas_invalidas)
print(alt_last_exon_df["spliced_with"].value_counts(), alt_last_exon_df["condicao_valida"].value_counts())


Linhas que violam a teoria:
Empty DataFrame
Columns: [module_id, gene_id, gene_name, seqid, strand, lsv_id, event_id, complex, denovo, reference_exon_coord, spliced_with, spliced_with_coord, junction_name, junction_coord, event_non_changing, event_changing, junction_changing, SRR11684912_median_psi, SRR11684912_var_psi, reference_exon_coord_start, reference_exon_coord_end, spliced_with_coord_start, spliced_with_coord_end, junction_coord_start, junction_coord_end, condicao_valida]
Index: []

[0 rows x 26 columns]
A    138
Name: spliced_with, dtype: int64 True    138
Name: condicao_valida, dtype: int64


### Intron retention or Alternative Intron
!!!!!!!!!!!!!!!!
Intron retention is a mechanism of alternative splicing where an intron, which is typically removed during the processing of pre-mRNA into mature mRNA, is retained in the final transcript. Instead of being spliced out, the intron remains within the mRNA, effectively becoming part of the coding or non-coding sequence.


In [63]:
# Load the alternative_intron table; lines starting with "#" are skipped as comments.
alt_intron_df = pd.read_csv(alt_intron, sep="\t", header=0, comment="#")

# Each of these columns holds a "start-end" string.
coord_fields = ("reference_exon_coord", "spliced_with_coord", "junction_coord")

# First pass: split every coordinate column into <name>_start / <name>_end.
for field in coord_fields:
    alt_intron_df[[f"{field}_start", f"{field}_end"]] = alt_intron_df[field].str.split("-", expand=True)

# Second pass: cast the six new string columns to int so they compare numerically.
for field in coord_fields:
    for edge in ("start", "end"):
        alt_intron_df[f"{field}_{edge}"] = alt_intron_df[f"{field}_{edge}"].astype(int)

print(alt_intron_df["junction_name"].head())

0     C1_C2_intron
1    C1_C2_spliced
2     C1_C2_intron
3    C1_C2_spliced
4     C1_C2_intron
Name: junction_name, dtype: object


In [71]:
# Boolean masks shared by the four strand/partner rules.
# In this table the partner exon is labelled C1 / C2.
plus_strand = alt_intron_df["strand"] == "+"
minus_strand = alt_intron_df["strand"] == "-"
partner_c1 = alt_intron_df["spliced_with"] == "C1"
partner_c2 = alt_intron_df["spliced_with"] == "C2"
ref_start_lower = alt_intron_df["reference_exon_coord_start"] < alt_intron_df["spliced_with_coord_start"]
ref_start_higher = alt_intron_df["reference_exon_coord_start"] > alt_intron_df["spliced_with_coord_start"]

# C2: reference start is lower on "+", higher on "-". C1 is the mirror image.
condicao_e2_positivo = partner_c2 & plus_strand & ref_start_lower
condicao_e2_negativo = partner_c2 & minus_strand & ref_start_higher
condicao_e1_positivo = partner_c1 & plus_strand & ref_start_higher
condicao_e1_negativo = partner_c1 & minus_strand & ref_start_lower

# A row is valid when it satisfies any one of the four rules.
alt_intron_df["condicao_valida"] = (
    condicao_e2_positivo
    | condicao_e2_negativo
    | condicao_e1_positivo
    | condicao_e1_negativo
)

# Split the table into violating and conforming rows, then compare how the
# "complex" flag is distributed in each.
linhas_invalidas = alt_intron_df.loc[~alt_intron_df["condicao_valida"]]
validas = alt_intron_df.loc[alt_intron_df["condicao_valida"]]
print("Linhas que violam a teoria:")
print(linhas_invalidas["complex"].value_counts())
print(validas["complex"].value_counts())
print(alt_intron_df["spliced_with"].value_counts(), alt_intron_df["condicao_valida"].value_counts())


Linhas que violam a teoria:
True     27
False     4
Name: complex, dtype: int64
False    104
True      81
Name: complex, dtype: int64
C2    154
C1     62
Name: spliced_with, dtype: int64 True     185
False     31
Name: condicao_valida, dtype: int64


In [69]:
def test_conditions(df):
    for i in range(0, len(df), 2):  # Processa grupos de 4 linhas
        group = df.iloc[i:i+2]  # Grupo de 4 linhas

        # 2) Verifica compatibilidade das coordenadas nas linhas marcadas como splice
        for j in range(1, 2, 2):  # Apenas linhas ímpares (splice)
            splice_row = group.iloc[j]

            # Extrai coordenadas
            junction_start, junction_end = map(int, splice_row["junction_coord"].split("-"))
            ref_start, ref_end = map(int, splice_row["reference_exon_coord"].split("-"))
            spliced_start, spliced_end = map(int, splice_row["spliced_with_coord"].split("-"))

            # Verifica compatibilidade
            if not (junction_start in [ref_start, ref_end, spliced_start, spliced_end] or
                    junction_end in [ref_start, ref_end, spliced_start, spliced_end]):
                print(f"Erro de compatibilidade no grupo {i//2 + 1}, linha {i + j}:")
                print("As coordenadas de junção não são compatíveis com reference_exon_coord ou spliced_with_coord.")
                print(splice_row)

# Apply the function to the DataFrame
test_conditions(linhas_invalidas)

Erro de compatibilidade no grupo 2, linha 3:
As coordenadas de junção não são compatíveis com reference_exon_coord ou spliced_with_coord.
module_id                                       AT4G25080.Araport11.447_1
gene_id                                           AT4G25080.Araport11.447
gene_name                                                       AT4G25080
seqid                                                                Chr4
strand                                                                  +
lsv_id                        AT4G25080.Araport11.447:t:12878382-12878703
event_id                                   AT4G25080.Araport11.447_1_AI_2
complex                                                              True
denovo                                                              False
reference_exon_coord                                    12878339-12878359
spliced_with                                                           C1
spliced_with_coord                              

IndexError: single positional indexer is out-of-bounds

### Cassette or Exon Skipping
Exon skipping, also known as cassette exon, is a form of alternative splicing where an exon is either included or excluded from the final mRNA transcript. This process allows a single gene to produce multiple protein isoforms, contributing to the diversity of the proteome.

- Organized in stacks of 4 rows, where junction_names are respectively: C1_C2, C1_A, C2_C1, C2_A. 

- !! TODO: C1_C2 and C2_C1 are the same, but represent different LSVs, which means they have different mean PSI values. 

- For + strand: 
    - C1_C2: reference_exon_coord_start < spliced_with_coord_start
    - C1_A: reference_exon_coord_start < spliced_with_coord_start
    - C2_C1: reference_exon_coord_start > spliced_with_coord_start
    - C2_A: reference_exon_coord_start > spliced_with_coord_start

- For - strand:
    - C1_C2: reference_exon_coord_start > spliced_with_coord_start
    - C1_A: reference_exon_coord_start > spliced_with_coord_start
    - C2_C1: reference_exon_coord_start < spliced_with_coord_start
    - C2_A: reference_exon_coord_start < spliced_with_coord_start

In [None]:
# Load the cassette table; lines starting with "#" are skipped as comments.
cassette_df = pd.read_csv(cassette, sep="\t", header=0, comment="#")

# Each of these columns holds a "start-end" string.
coord_fields = ("reference_exon_coord", "spliced_with_coord", "junction_coord")

# First pass: split every coordinate column into <name>_start / <name>_end.
for field in coord_fields:
    cassette_df[[f"{field}_start", f"{field}_end"]] = cassette_df[field].str.split("-", expand=True)

# Second pass: cast the six new string columns to int so they compare numerically.
for field in coord_fields:
    for edge in ("start", "end"):
        cassette_df[f"{field}_{edge}"] = cassette_df[f"{field}_{edge}"].astype(int)

print(cassette_df.head())

def check_junction_name_order(df, expected_order=("C1_C2", "C1_A", "C2_C1", "C2_A")):
    """Report every consecutive group of rows whose junction names break the expected pattern.

    The frame is walked in consecutive, non-overlapping groups of
    ``len(expected_order)`` rows. For each group, the values of the
    ``junction_name`` column are compared, in order, against ``expected_order``.
    Groups that differ are printed together with the expected and the found
    names. A trailing group shorter than ``expected_order`` is therefore
    reported as a mismatch too.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``junction_name`` column.
    expected_order : sequence of str, optional
        Junction names expected in every group, in order. Defaults to the
        cassette pattern ``C1_C2, C1_A, C2_C1, C2_A``.

    Returns
    -------
    None
        Mismatches are only printed.

    Raises
    ------
    ValueError
        If ``expected_order`` is empty (the group size would be zero).
    """
    # Normalise to a list so the comparison with .tolist() and the printed
    # representation do not depend on the sequence type passed by the caller.
    expected = list(expected_order)
    group_size = len(expected)
    if group_size == 0:
        raise ValueError("expected_order must contain at least one junction name")

    for start in range(0, len(df), group_size):
        group = df.iloc[start:start + group_size]
        junction_names = group["junction_name"].tolist()

        if junction_names != expected:
            print(f"Erro de ordem no grupo {start // group_size + 1}:")
            print(f"Esperado: {expected}")
            print(f"Encontrado: {junction_names}")
            print(group)

# Apply the function to the DataFrame
check_junction_name_order(cassette_df)

                   module_id                  gene_id  gene_name seqid strand  \
0  AT1G06220.Araport11.447_1  AT1G06220.Araport11.447  AT1G06220  Chr1      +   
1  AT1G06220.Araport11.447_1  AT1G06220.Araport11.447  AT1G06220  Chr1      +   
2  AT1G06220.Araport11.447_1  AT1G06220.Araport11.447  AT1G06220  Chr1      +   
3  AT1G06220.Araport11.447_1  AT1G06220.Araport11.447  AT1G06220  Chr1      +   
4  AT2G26770.Araport11.447_1  AT2G26770.Araport11.447  AT2G26770  Chr2      -   

                                        lsv_id  \
0    AT1G06220.Araport11.447:s:1899951-1900107   
1    AT1G06220.Araport11.447:s:1899951-1900107   
2    AT1G06220.Araport11.447:t:1900512-1901264   
3    AT1G06220.Araport11.447:t:1900512-1901264   
4  AT2G26770.Araport11.447:s:11408200-11408421   

                         event_id  complex  denovo reference_exon_coord  ...  \
0  AT1G06220.Araport11.447_1_CE_1    False   False      1899951-1900107  ...   
1  AT1G06220.Araport11.447_1_CE_1    False   False  

In [None]:
def check_coordinate_conditions(df):
    """Flag rows whose exon ordering disagrees with their junction name and strand.

    A row is marked valid when one of these holds:

    * junction name is C1_C2 or C1_A, strand is "+", and the reference exon
      starts before the spliced-with exon;
    * junction name is C2_C1 or C2_A, strand is "+", and the reference exon
      starts after the spliced-with exon;
    * the same two rules with the comparison reversed for strand "-".

    The result is written to a new boolean column ``condicao_valida`` on
    ``df`` itself (the frame is modified in place). The invalid rows and the
    value counts of the new column are printed. Nothing is returned.
    """
    names = df["junction_name"]
    ref_start = df["reference_exon_coord_start"]
    partner_start = df["spliced_with_coord_start"]

    # Building blocks shared by the four rules.
    from_c1 = names.isin(["C1_C2", "C1_A"])
    from_c2 = names.isin(["C2_C1", "C2_A"])
    on_plus = df["strand"] == "+"
    on_minus = df["strand"] == "-"
    ref_first = ref_start < partner_start
    ref_last = ref_start > partner_start

    # A row is valid when it satisfies any one of the four rules.
    df["condicao_valida"] = (
        (from_c1 & on_plus & ref_first)
        | (from_c2 & on_plus & ref_last)
        | (from_c1 & on_minus & ref_last)
        | (from_c2 & on_minus & ref_first)
    )

    # Rows that break the expected ordering.
    violating_rows = df.loc[~df["condicao_valida"]]
    print("Linhas que violam a teoria:")
    print(violating_rows)

    # Summary of valid vs. invalid rows.
    print("\nContagem de valores em 'condicao_valida':")
    print(df["condicao_valida"].value_counts())

# Apply the function to the DataFrame
check_coordinate_conditions(cassette_df)

Linhas que violam a teoria:
Empty DataFrame
Columns: [module_id, gene_id, gene_name, seqid, strand, lsv_id, event_id, complex, denovo, reference_exon_coord, spliced_with, spliced_with_coord, junction_name, junction_coord, event_size, event_non_changing, event_changing, junction_changing, SRR11684912_median_psi, SRR11684912_var_psi, reference_exon_coord_start, reference_exon_coord_end, spliced_with_coord_start, spliced_with_coord_end, junction_coord_start, junction_coord_end, condicao_valida]
Index: []

[0 rows x 27 columns]

Contagem de valores em 'condicao_valida':
True    884
Name: condicao_valida, dtype: int64


### Tandem Cassette Exons
Same as Cassette (Exon Skipping), but with two or more consecutive exons being skipped together instead of a single one. 

- Organized in stacks of 4 rows, where junction_names are respectively: C1_C2, C1_A, C2_C1, C2_A_Last. 

- !! TODO: C1_C2 and C2_C1 are the same junction, but they are reported under different LSVs (source `s` vs. target `t`), which means they have different PSI values. 

- For + strand: 
    - C1_C2: reference_exon_coord_start < spliced_with_coord_start
    - C1_A: reference_exon_coord_start < spliced_with_coord_start
    - C2_C1: reference_exon_coord_start > spliced_with_coord_start
    - C2_A_Last: reference_exon_coord_start > spliced_with_coord_start

- For - strand:
    - C1_C2: reference_exon_coord_start > spliced_with_coord_start
    - C1_A: reference_exon_coord_start > spliced_with_coord_start
    - C2_C1: reference_exon_coord_start < spliced_with_coord_start
    - C2_A_Last: reference_exon_coord_start < spliced_with_coord_start

- !! TODO: Has two new columns (exons_skipped_coords, num_skipped_exons) that disrupt the column layout shared by the other event tables.

In [90]:
# Load the tandem cassette table; lines starting with "#" are skipped as comments.
tandem_cassette_df = pd.read_csv(tandem_cassette, sep="\t", header=0, comment="#")

# Each coordinate column holds a "start-end" string: split it and store both halves as integers.
tandem_cassette_df[["reference_exon_coord_start", "reference_exon_coord_end"]] = (
    tandem_cassette_df["reference_exon_coord"].str.split("-", expand=True).astype(int)
)
tandem_cassette_df[["spliced_with_coord_start", "spliced_with_coord_end"]] = (
    tandem_cassette_df["spliced_with_coord"].str.split("-", expand=True).astype(int)
)
tandem_cassette_df[["junction_coord_start", "junction_coord_end"]] = (
    tandem_cassette_df["junction_coord"].str.split("-", expand=True).astype(int)
)

print(tandem_cassette_df.head())

def check_junction_name_order(df, expected_order=("C1_C2", "C1_A", "C2_C1", "C2_A_Last")):
    """Report every consecutive group of rows whose junction names break the expected pattern.

    The frame is walked in consecutive, non-overlapping groups of
    ``len(expected_order)`` rows. For each group, the values of the
    ``junction_name`` column are compared, in order, against ``expected_order``.
    Groups that differ are printed together with the expected and the found
    names. A trailing group shorter than ``expected_order`` is therefore
    reported as a mismatch too.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``junction_name`` column.
    expected_order : sequence of str, optional
        Junction names expected in every group, in order. Defaults to the
        tandem cassette pattern ``C1_C2, C1_A, C2_C1, C2_A_Last``.

    Returns
    -------
    None
        Mismatches are only printed.

    Raises
    ------
    ValueError
        If ``expected_order`` is empty (the group size would be zero).
    """
    # Normalise to a list so the comparison with .tolist() and the printed
    # representation do not depend on the sequence type passed by the caller.
    expected = list(expected_order)
    group_size = len(expected)
    if group_size == 0:
        raise ValueError("expected_order must contain at least one junction name")

    for start in range(0, len(df), group_size):
        group = df.iloc[start:start + group_size]
        junction_names = group["junction_name"].tolist()

        if junction_names != expected:
            print(f"Erro de ordem no grupo {start // group_size + 1}:")
            print(f"Esperado: {expected}")
            print(f"Encontrado: {junction_names}")
            print(group)

# Apply the function to the DataFrame
check_junction_name_order(tandem_cassette_df)

                   module_id                  gene_id  gene_name seqid strand  \
0  AT2G39950.Araport11.447_1  AT2G39950.Araport11.447  AT2G39950  Chr2      -   
1  AT2G39950.Araport11.447_1  AT2G39950.Araport11.447  AT2G39950  Chr2      -   
2  AT2G39950.Araport11.447_1  AT2G39950.Araport11.447  AT2G39950  Chr2      -   
3  AT2G39950.Araport11.447_1  AT2G39950.Araport11.447  AT2G39950  Chr2      -   
4  AT1G60640.Araport11.447_1  AT1G60640.Araport11.447  AT1G60640  Chr1      -   

                                        lsv_id  \
0  AT2G39950.Araport11.447:s:16679657-16679772   
1  AT2G39950.Araport11.447:s:16679657-16679772   
2  AT2G39950.Araport11.447:t:16678035-16678278   
3  AT2G39950.Araport11.447:t:16678035-16678278   
4  AT1G60640.Araport11.447:s:22338467-22338526   

                          event_id  complex  denovo reference_exon_coord  ...  \
0  AT2G39950.Araport11.447_1_TCE_1     True   False    16679657-16679772  ...   
1  AT2G39950.Araport11.447_1_TCE_1     True   Fals

In [91]:
def check_coordinate_conditions(df):
    """Flag rows whose exon ordering disagrees with their junction name and strand.

    A row is marked valid when one of these holds:

    * junction name is C1_C2 or C1_A, strand is "+", and the reference exon
      starts before the spliced-with exon;
    * junction name is C2_C1 or C2_A_Last, strand is "+", and the reference
      exon starts after the spliced-with exon;
    * the same two rules with the comparison reversed for strand "-".

    The result is written to a new boolean column ``condicao_valida`` on
    ``df`` itself (the frame is modified in place). A few identifying columns
    of the invalid rows and the value counts of the new column are printed.
    Nothing is returned.
    """
    names = df["junction_name"]
    ref_start = df["reference_exon_coord_start"]
    partner_start = df["spliced_with_coord_start"]

    # Building blocks shared by the four rules.
    from_c1 = names.isin(["C1_C2", "C1_A"])
    from_c2 = names.isin(["C2_C1", "C2_A_Last"])
    on_plus = df["strand"] == "+"
    on_minus = df["strand"] == "-"
    ref_first = ref_start < partner_start
    ref_last = ref_start > partner_start

    # A row is valid when it satisfies any one of the four rules.
    df["condicao_valida"] = (
        (from_c1 & on_plus & ref_first)
        | (from_c2 & on_plus & ref_last)
        | (from_c1 & on_minus & ref_last)
        | (from_c2 & on_minus & ref_first)
    )

    # Rows that break the expected ordering; only the identifying columns are shown.
    violating_rows = df.loc[~df["condicao_valida"]]
    shown_columns = ["reference_exon_coord", "spliced_with_coord", "junction_coord", "junction_name", "strand"]
    print("Linhas que violam a teoria:")
    print(violating_rows[shown_columns])

    # Summary of valid vs. invalid rows.
    print("\nContagem de valores em 'condicao_valida':")
    print(df["condicao_valida"].value_counts())

# Apply the function to the DataFrame
check_coordinate_conditions(tandem_cassette_df)

Linhas que violam a teoria:
Empty DataFrame
Columns: [reference_exon_coord, spliced_with_coord, junction_coord, junction_name, strand]
Index: []

Contagem de valores em 'condicao_valida':
True    32
Name: condicao_valida, dtype: int64


### Mutually exclusive exons
Mutually exclusive exons are a form of alternative splicing where only one exon from a pair (or set) of exons is included in the mature mRNA transcript, while the other exon(s) are excluded. This mechanism allows for the generation of different protein isoforms from a single gene, contributing to protein diversity.

- When 'strand' is "+", 'reference_exon_coord_start' should be less than 'spliced_with_coord_start'.

- When 'strand' is "-", the opposite is true: 'reference_exon_coord_start' should be greater than 'spliced_with_coord_start'.

In [92]:
# Load the mutually exclusive exons table; lines starting with "#" are skipped as comments.
mutually_exclusive_df = pd.read_csv(mutually_exclusive, sep="\t", header=0, comment="#")

# Each coordinate column holds a "start-end" string: split it and store both halves as integers.
mutually_exclusive_df[["reference_exon_coord_start", "reference_exon_coord_end"]] = (
    mutually_exclusive_df["reference_exon_coord"].str.split("-", expand=True).astype(int)
)
mutually_exclusive_df[["spliced_with_coord_start", "spliced_with_coord_end"]] = (
    mutually_exclusive_df["spliced_with_coord"].str.split("-", expand=True).astype(int)
)
mutually_exclusive_df[["junction_coord_start", "junction_coord_end"]] = (
    mutually_exclusive_df["junction_coord"].str.split("-", expand=True).astype(int)
)

print(mutually_exclusive_df.head())


                   module_id                  gene_id  gene_name seqid strand  \
0  AT1G10600.Araport11.447_1  AT1G10600.Araport11.447  AT1G10600  Chr1      +   
1  AT1G10600.Araport11.447_1  AT1G10600.Araport11.447  AT1G10600  Chr1      +   
2  AT1G10600.Araport11.447_1  AT1G10600.Araport11.447  AT1G10600  Chr1      +   
3  AT1G10600.Araport11.447_1  AT1G10600.Araport11.447  AT1G10600  Chr1      +   
4  AT5G42770.Araport11.447_1  AT5G42770.Araport11.447  AT5G42770  Chr5      +   

                                        lsv_id  \
0    AT1G10600.Araport11.447:s:3504024-3504113   
1    AT1G10600.Araport11.447:s:3504024-3504113   
2    AT1G10600.Araport11.447:t:3504569-3504643   
3    AT1G10600.Araport11.447:t:3504569-3504643   
4  AT5G42770.Araport11.447:s:17152530-17152604   

                          event_id  complex  denovo reference_exon_coord  ...  \
0  AT1G10600.Araport11.447_1_mxe_1    False   False      3504024-3504113  ...   
1  AT1G10600.Araport11.447_1_mxe_1    False   Fals

In [93]:
# Strand-specific comparisons between the reference exon start and the spliced-with exon start.
# NOTE(review): taken together, the four masks below accept either ordering on either
# strand, so only rows with equal start coordinates (or a strand other than "+"/"-")
# can end up flagged as invalid. Confirm this is the intended check.
condicao_e2_positivo = (
    (mutually_exclusive_df["strand"] == "+")
    & (mutually_exclusive_df["reference_exon_coord_start"] < mutually_exclusive_df["spliced_with_coord_start"])
)
condicao_e2_negativo = (
    (mutually_exclusive_df["strand"] == "-")
    & (mutually_exclusive_df["reference_exon_coord_start"] > mutually_exclusive_df["spliced_with_coord_start"])
)
condicao_e1_positivo = (
    (mutually_exclusive_df["strand"] == "+")
    & (mutually_exclusive_df["reference_exon_coord_start"] > mutually_exclusive_df["spliced_with_coord_start"])
)
condicao_e1_negativo = (
    (mutually_exclusive_df["strand"] == "-")
    & (mutually_exclusive_df["reference_exon_coord_start"] < mutually_exclusive_df["spliced_with_coord_start"])
)

# A row is valid when any of the four masks holds; the result is stored on the frame itself.
mutually_exclusive_df["condicao_valida"] = (
    condicao_e2_positivo
    | condicao_e2_negativo
    | condicao_e1_positivo
    | condicao_e1_negativo
)
linhas_invalidas = mutually_exclusive_df.loc[~mutually_exclusive_df["condicao_valida"]]
print("Linhas que violam a teoria:")
print(linhas_invalidas)
print(mutually_exclusive_df["spliced_with"].value_counts(), mutually_exclusive_df["condicao_valida"].value_counts())


Linhas que violam a teoria:
Empty DataFrame
Columns: [module_id, gene_id, gene_name, seqid, strand, lsv_id, event_id, complex, denovo, reference_exon_coord, spliced_with, spliced_with_coord, junction_name, junction_coord, event_non_changing, event_changing, junction_changing, SRR11684912_median_psi, SRR11684912_var_psi, reference_exon_coord_start, reference_exon_coord_end, spliced_with_coord_start, spliced_with_coord_end, junction_coord_start, junction_coord_end, condicao_valida]
Index: []

[0 rows x 26 columns]
A1    14
A2    14
Name: spliced_with, dtype: int64 True    28
Name: condicao_valida, dtype: int64
