# Setup

# This file requires using the environment created from environment-gtfparse.yml

In [12]:
# Imports

import polars as pl
import duckdb
from pathlib import Path
import math
import os

In [2]:
# Paths
project_dir = Path("/data/teamgdansk/mwaleron/carmen-analysis")

data_dir = project_dir.joinpath("data")
temp_dir = project_dir.joinpath("temp")

data_subs_dir = data_dir.joinpath("subsidiary-files")
data_pub_dir = data_dir.joinpath("to-be-published")

cosmic_data_dir = data_subs_dir.joinpath("COSMIC")
cosmic_genome_screening_file = cosmic_data_dir.joinpath("Cosmic_GenomeScreensMutant_v99_GRCh38.tsv")
cosmic_classification_file = cosmic_data_dir.joinpath("Cosmic_Classification_v99_GRCh38.tsv")

gendode_data_dir = data_subs_dir.joinpath("GENCODE")
gencodev39_stripped = gendode_data_dir.joinpath("gencode.v39.annotation.stripped.versions.parquet")

contig_scaffold_list_file = data_pub_dir.joinpath("scaff_all_expanded.tsv")
contig_peptide_list_file = data_pub_dir.joinpath("contig_unique_peptide_list.tsv")
contig_hotspot_score_file = data_subs_dir.joinpath("hotspot_score_3_axis_per_contig.parquet")
contig_hotspot_score_top_quadrant_file = data_subs_dir.joinpath("contigs_top_quadrant_by_peptides_coverage.parquet")
pogo_mapping_filtered_by_top_quadrant_contigs = data_subs_dir.joinpath("pogo_tq.parquet")

exon_ranges_in_contigs_file = data_subs_dir.joinpath("evenbiggerjoin_cause_its_exons_not_just_genes_v2_just_the_contig_genes.parquet")

cosmic_somatics_overlap_with_contigs_file = data_subs_dir.joinpath("cosmic_somatics_overlap_with_contigs.parquet")
cosmic_somatics_overlap_with_top_quadrant_file = data_subs_dir.joinpath("cosmic_somatics_overlap_with_tq.parquet")
cosmic_mutations_in_contigs_semi_anti_file_B = data_subs_dir.joinpath("contig_cosmic_mutations.parquet")
cosmic_mutations_in_contigs_semi_anti_file_B_compliment = data_subs_dir.joinpath("non_contig_cosmic_mutations.parquet")
cosmic_mutations_in_top_quadrant_semi_anti_file_B = data_subs_dir.joinpath("tq_cosmic_mutations.parquet")
cosmic_mutations_in_top_quadrant_semi_anti_file_B_compliment = data_subs_dir.joinpath("non_tq_cosmic_mutations.parquet")

main_samples_file = data_dir.joinpath("main-output-table-1.tsv")
main_samples_file_2 = data_dir.joinpath("main-output-table-2.tsv")
main_samples_file_3 = data_subs_dir.joinpath("emilia_umap_with_ids.parquet")

In [3]:
cosmic_somatics = pl.read_csv(
    cosmic_genome_screening_file, 
    separator='\t')
cosmic_classification = pl.read_csv(
    cosmic_classification_file, 
    separator='\t')
contig_scaffold_list = pl.read_csv(
    contig_scaffold_list_file,
    separator='\t')
contig_peptide_list = pl.read_csv(
    contig_peptide_list_file, 
    separator='\t')

In [None]:
#exec disabled - just read the saved parquet
%%script false --no-raise-error

justcontigs = contig_scaffold_list.filter(pl.col("gap_width") == 0)
mutations_in_contigs = duckdb.sql('''
                                select *
                                from justcontigs j, cosmic_somatics_chr c
                                where (j.Chromosome = c.CHROMOSOME)
                                and(
                                    (c.GENOME_START between j.Gene_start and j.Gene_end)
                                or  (c.GENOME_STOP between j.Gene_Start and j.Gene_end)
                                )
                                ''')
mutations_in_contigs_res = mutations_in_contigs.pl()
mutations_in_contigs_res.write_parquet(cosmic_somatics_overlap_with_contigs_file)

In [8]:
cosmic_mutcount_per_contig_tmp = temp_dir.joinpath("mutation_count_per_contig.parquet")

mutations_in_contigs_res = pl.read_parquet(cosmic_somatics_overlap_with_contigs_file)
mutation_count_per_contig = mutations_in_contigs_res\
                            .group_by("Id").len().sort("len")
mutation_count_per_contig_C_name =  contig_scaffold_list\
                                    .join(
                                       mutation_count_per_contig,
                                       on="Id",
                                       how="left")\
                                    .fill_null(0)\
                                    .filter(pl.col("gap_width") == 0)\
                                    .select(["Contigs", "len"])
mutation_sum_per_scaffold = contig_scaffold_list\
                            .join(
                                mutation_count_per_contig_C_name,
                                on="Contigs")\
                            .select(["Id", "len"])\
                            .group_by("Id").sum().sort("len")
mutation_sum_per_scaffold.filter(pl.col("Id").str.starts_with("S_0"))\
                         .write_parquet(cosmic_mutcount_per_contig_tmp)

  mutation_count_per_contig_C_name =  contig_scaffold_list\


In [None]:
unique_peptides_per_contig_tmp = temp_dir.joinpath("unique_peptides_per_contig.parquet")
contig_peptide_one_to_one = contig_peptide_list\
                            .with_columns(pl.col("pep_list").str.split(','))\
                            .explode("pep_list")\
                            .unique(["pep_list", "contig"])\
                            .select(["pep_list", "contig"])
scaffold_peptide = contig_scaffold_list\
                   .join(
                       contig_peptide_one_to_one,
                       left_on="Contigs",
                       right_on="contig")\
                   .unique(["Id", "pep_list"])\
                   .group_by("Id").agg("pep_list")
emilia_umap = pl.read_csv(main_samples_file, separator="\t")
emilia_umap_peptides = pl.read_csv(main_samples_file_2, separator="\t")
emilia_umap_with_ids =  emilia_umap\
                        .with_row_index()\
                        .hstack(emilia_umap_peptides.select("Id", "Peptides"))\
                        .with_columns(Peptides = pl.col("Peptides").str.split(","))
emilia_umap_with_ids.write_parquet(main_samples_file_3)
unique_peptides_per_contig =    scaffold_peptide\
                                .with_columns(unique_peptides = pl.col("pep_list").list.len())\
                                .filter(pl.col("Id").str.starts_with("S_0"))
unique_peptides_per_contig.write_parquet(unique_peptides_per_contig_tmp)

calculation_spam_dir = temp_dir.joinpath("parquetspam")
calculation_spam_dir.mkdir(parents=True,exist_ok=True)
contig_peptide = scaffold_peptide.filter(pl.col("Id").str.starts_with("S_0"))
number_of_shards = math.ceil(len(contig_peptide)/10000)
for i in range(number_of_shards):
    contig_peptide.slice(10000*i, 10000)\
                  .write_parquet(calculation_spam_dir.joinpath(f"contig_slice_{i}.parquet"))

In [None]:
# run the compute on the slices
# it is recommended to use parallel, SLURM or other parallelization method here
for i in range(number_of_shards):
    os.system(f"python ./calculate_immune_score.py {i}")

In [10]:
test=pl.read_parquet(calculation_spam_dir.joinpath("scored_contig_slice_*"))
popcov_per_contig_file_tmp = temp_dir.joinpath("population_coverage_per_contig.parquet")
test.write_parquet(popcov_per_contig_file_tmp)
mutation_sum_per_scaffold = pl.read_parquet(cosmic_mutcount_per_contig_tmp)
test.join(unique_peptides_per_contig, on="Id")\
    .join(mutation_sum_per_scaffold, on="Id")\
    .drop("pep_list", "pep_list_right")\
    .rename({"len":"mutation_count"})\
    .write_parquet(contig_hotspot_score_file)
exon_join=pl.scan_parquet(exon_ranges_in_contigs_file)
contig_exon_len = exon_join\
                    .filter(pl.col("gap_width")==0)\
                    .with_columns(exon_length = pl.col("exon_end") - pl.col("exon_start"))\
                    .select("Id", "exon_length")\
                    .group_by("Id").sum().collect()
hotspot_score_3_axis_per_contig = pl.read_parquet(contig_hotspot_score_file).join(contig_exon_len, on="Id")
hotspot_score_3_axis_per_contig =  hotspot_score_3_axis_per_contig\
                                   .with_columns(normalized_unique_pep = pl.col("unique_peptides") / pl.col("exon_length"))
hotspot_score_3_axis_per_contig = hotspot_score_3_axis_per_contig\
                                  .sort(pl.col("popcov_but_sqrt"), descending=True)\
                                  .with_row_index("pop_cov_idx")\
                                  .sort(pl.col("mutation_count"), descending=True)\
                                  .with_row_index("mut_count_idx")\
                                  .sort(pl.col("normalized_unique_pep"), descending=True)\
                                  .with_row_index("unique_pep_idx")\
                                  .with_columns(
                                      ranking= pl.col("unique_pep_idx") + pl.col("mut_count_idx") + pl.col("pop_cov_idx"))\
                                  .sort("ranking")
hotspot_score_3_axis_per_contig.filter(
                                        pl.col("unique_pep_idx") < 124025 , 
                                        pl.col("pop_cov_idx") < 124025)\
                                .write_parquet(contig_hotspot_score_top_quadrant_file)
top_quadrant = pl.read_parquet(contig_hotspot_score_top_quadrant_file)
justcontigs = contig_scaffold_list.filter(pl.col("gap_width") == 0, pl.col("Id").is_in(top_quadrant["Id"]))


UsageError: Line magic function `%%script` not found.


In [None]:
#exec disabled - just read the saved parquet
%%script false --no-raise-error

cosmic_somatics_chr = pl.read_csv(
    cosmic_genome_screening_file, 
    separator='\t').with_columns(CHROMOSOME = pl.lit("chr")+pl.col("CHROMOSOME").str.replace("MT","M"))
justtq = contig_scaffold_list.filter(pl.col("gap_width") == 0, pl.col("Id").is_in(top_quadrant["Id"]))
mutations_in_tq = duckdb.sql('''
                                select *
                                from justtq j, cosmic_somatics_chr c
                                where (j.Chromosome = c.CHROMOSOME)
                                and(
                                    (c.GENOME_START between j.Gene_start and j.Gene_end)
                                or  (c.GENOME_STOP between j.Gene_Start and j.Gene_end)
                                )
                                ''')
mutations_in_tq_res = mutations_in_tq.pl()
mutations_in_tq_res.write_parquet(cosmic_somatics_overlap_with_top_quadrant_file)

In [None]:
# This is the correct set in the sense mutations aren't doubled 
# Thus this one will be used for statistics
%%script false --no-raise-error
setB = cosmic_somatics.with_columns(
    CHROMOSOME=pl.col("CHROMOSOME").str.replace("MT","M")
    ).rename({"CHROMOSOME":"CHROMOSOME_1"}
             ).join(
                mutations_in_contigs_res.with_columns(
                    CHROMOSOME_1=pl.col("CHROMOSOME_1").str.strip_chars_start("chr")), 
                on=["GENOME_START","GENOME_STOP","CHROMOSOME_1"], 
                how="semi").collect()
setB.write_parquet(cosmic_mutations_in_contigs_semi_anti_file_B)


In [None]:
# This is complimentary to setB
# ye just read the parquet as usual
%%script false --no-raise-error
non_contig_mutations = cosmic_somatics.with_columns(
    CHROMOSOME=pl.col("CHROMOSOME").str.replace("MT","M")
    ).rename({"CHROMOSOME":"CHROMOSOME_1"}
             ).join(
                mutations_in_contigs_res.with_columns(
                    CHROMOSOME_1=pl.col("CHROMOSOME_1").str.strip_chars_start("chr")), 
                on=["GENOME_START","GENOME_STOP","CHROMOSOME_1"], 
                how="anti").collect()

non_contig_mutations.write_parquet(cosmic_mutations_in_contigs_semi_anti_file_B_compliment)

In [None]:
mutations_in_tq_res = pl.read_parquet(cosmic_somatics_overlap_with_top_quadrant_file)

In [None]:
setB = pl.read_parquet(cosmic_mutations_in_contigs_semi_anti_file_B)


In [None]:
non_contig_mutations = pl.read_parquet(cosmic_mutations_in_contigs_semi_anti_file_B_compliment)

In [4]:
setBtq = cosmic_somatics.with_columns(
    CHROMOSOME=pl.col("CHROMOSOME").str.replace("MT","M")
    ).rename({"CHROMOSOME":"CHROMOSOME_1"}
             ).join(
                mutations_in_tq_res.with_columns(
                    CHROMOSOME_1=pl.col("CHROMOSOME_1").str.strip_chars_start("chr")), 
                on=["GENOME_START","GENOME_STOP","CHROMOSOME_1"], 
                how="semi")
setBtq.write_parquet(cosmic_mutations_in_top_quadrant_semi_anti_file_B)
setBtq = pl.read_parquet(cosmic_mutations_in_top_quadrant_semi_anti_file_B)

non_tq_mutations = cosmic_somatics.with_columns(
    CHROMOSOME=pl.col("CHROMOSOME").str.replace("MT","M")
    ).rename({"CHROMOSOME":"CHROMOSOME_1"}
             ).join(
                mutations_in_tq_res.with_columns(
                    CHROMOSOME_1=pl.col("CHROMOSOME_1").str.strip_chars_start("chr")), 
                on=["GENOME_START","GENOME_STOP","CHROMOSOME_1"], 
                how="anti")

non_tq_mutations.write_parquet(cosmic_mutations_in_top_quadrant_semi_anti_file_B_compliment)
non_tq_mutations = pl.read_parquet(cosmic_mutations_in_top_quadrant_semi_anti_file_B_compliment)

# Final Cleanup

This is to clean up and delete all additional files and directories created throughout the analysis.

**Do not run the second cell unless you want to end your work here or start over.**

In [None]:
# This is a safety code

raise KeyboardInterrupt("Are you sure you want to run the cell below?")

In [None]:
cosmic_mutcount_per_contig_tmp.unlink()
unique_peptides_per_contig_tmp.unlink()
calculation_spam_dir.joinpath("*").unlink()
calculation_spam_dir.rmdir()
popcov_per_contig_file_tmp.unlink()