In [2]:
import importlib.resources
import polars as pl

In [11]:
# Column names the mass table must have, in exactly this order.
_COLS = ["nucleoside", "monoisotopic_mass"]

# Read the tab-separated nucleoside mass table (path is relative, not packaged).
# Other sources that were used at some point and are currently disabled:
#   - importlib.resources.files(__package__) / "assets" / "masses.tsv"
#   - "assets/masses4.tsv"
MASSES = pl.read_csv("assets/masses_all.tsv", separator="\t")
# Note: "masses_all.tsv" (the table with all modifications) has multiple
# nucleosides with the same mass!

# Stop right away if the header of the file is not the expected one.
assert MASSES.columns == _COLS

print(MASSES)

shape: (145, 2)
┌────────────┬───────────────────┐
│ nucleoside ┆ monoisotopic_mass │
│ ---        ┆ ---               │
│ str        ┆ f64               │
╞════════════╪═══════════════════╡
│ 00G        ┆ 495.1003          │
│ 309U       ┆ 345.117215        │
│ 01G        ┆ 311.123           │
│ 101G       ┆ 311.123           │
│ 8U         ┆ 246.085186        │
│ …          ┆ …                 │
│ 253U       ┆ 317.068156        │
│ 053U       ┆ 315.10665         │
│ 2164A      ┆ 440.1114          │
│ 2161A      ┆ 381.1471          │
│ 621A       ┆ 327.1001          │
└────────────┴───────────────────┘


In [13]:
# TODO: Add the appropriate backbone masses and the terminal extra masses to the nucleosides!

# Round all masses to 5 decimal places; the rounded frame replaces MASSES.
MASSES = MASSES.with_columns(pl.col("monoisotopic_mass").round(5))
print(MASSES)

# For every distinct rounded mass keep the first nucleoside listed with it
# (groups stay in order of first appearance), then put the columns back in
# the _COLS order.
_first_per_mass = MASSES.group_by("monoisotopic_mass", maintain_order=True).first()
UNIQUE_MASSES = _first_per_mass.select(pl.col(_COLS))
print(UNIQUE_MASSES)

# Show which of A, C, G and U are still present after de-duplication.
_is_acgu = pl.col("nucleoside").is_in(["A", "C", "G", "U"])
print(UNIQUE_MASSES.filter(_is_acgu))

shape: (145, 2)
┌────────────┬───────────────────┐
│ nucleoside ┆ monoisotopic_mass │
│ ---        ┆ ---               │
│ str        ┆ f64               │
╞════════════╪═══════════════════╡
│ 00G        ┆ 495.1003          │
│ 309U       ┆ 345.11721         │
│ 01G        ┆ 311.123           │
│ 101G       ┆ 311.123           │
│ 8U         ┆ 246.08519         │
│ …          ┆ …                 │
│ 253U       ┆ 317.06816         │
│ 053U       ┆ 315.10665         │
│ 2164A      ┆ 440.1114          │
│ 2161A      ┆ 381.1471          │
│ 621A       ┆ 327.1001          │
└────────────┴───────────────────┘
shape: (105, 2)
┌────────────┬───────────────────┐
│ nucleoside ┆ monoisotopic_mass │
│ ---        ┆ ---               │
│ str        ┆ f64               │
╞════════════╪═══════════════════╡
│ 00G        ┆ 495.1003          │
│ 309U       ┆ 345.11721         │
│ 01G        ┆ 311.123           │
│ 8U         ┆ 246.08519         │
│ 510U       ┆ 273.09609         │
│ …          ┆ …       

In [5]:
# Resolution used to turn masses into integers. For perfect matching,
# TOLERANCE should be the precision (digits after the decimal point) to which
# the masses of nucleosides and sequences are reported.
TOLERANCE = 1e-5

# Matching threshold: a target counts as matched when
#   -MATCHING_THRESHOLD < (sum(masses) - target_mass) < MATCHING_THRESHOLD.
# If TOLERANCE resolves fewer decimals than the masses are reported with,
# MATCHING_THRESHOLD should be at least the number of nucleotides expected
# for a target mass.
# NOTE(review): with strict inequalities a threshold of 0 can never be
# satisfied -- presumably the bounds are meant to be inclusive; confirm
# against the code that consumes this constant.
MATCHING_THRESHOLD = 0

# Express each mass as a whole number of TOLERANCE units.
_mass_in_tolerance_units = pl.col("monoisotopic_mass") / TOLERANCE
EXPLANATION_MASSES = UNIQUE_MASSES.with_columns(
    _mass_in_tolerance_units.round(0).cast(pl.Int64).alias("tolerated_integer_masses")
)

print(EXPLANATION_MASSES)

shape: (106, 3)
┌────────────┬───────────────────┬──────────────────────────┐
│ nucleoside ┆ monoisotopic_mass ┆ tolerated_integer_masses │
│ ---        ┆ ---               ┆ ---                      │
│ str        ┆ f64               ┆ i64                      │
╞════════════╪═══════════════════╪══════════════════════════╡
│ 00G        ┆ 495.1003          ┆ 49510030                 │
│ 309U       ┆ 345.117215        ┆ 34511721                 │
│ 01G        ┆ 311.123           ┆ 31112300                 │
│ 8U         ┆ 246.085186        ┆ 24608519                 │
│ 510U       ┆ 273.096085        ┆ 27309609                 │
│ …          ┆ …                 ┆ …                        │
│ 253U       ┆ 317.068156        ┆ 31706816                 │
│ 053U       ┆ 315.10665         ┆ 31510665                 │
│ 2164A      ┆ 440.1114          ┆ 44011140                 │
│ 2161A      ┆ 381.1471          ┆ 38114710                 │
│ 621A       ┆ 327.1001          ┆ 32710010           

In [14]:
# Show only the rows whose nucleoside is A, C, G or U.
print(
    EXPLANATION_MASSES.filter(
        pl.col("nucleoside").is_in(["A", "C", "G", "U"])
    )
)

shape: (4, 3)
┌────────────┬───────────────────┬──────────────────────────┐
│ nucleoside ┆ monoisotopic_mass ┆ tolerated_integer_masses │
│ ---        ┆ ---               ┆ ---                      │
│ str        ┆ f64               ┆ i64                      │
╞════════════╪═══════════════════╪══════════════════════════╡
│ C          ┆ 243.08552         ┆ 24308552                 │
│ A          ┆ 267.09675         ┆ 26709675                 │
│ G          ┆ 283.09167         ┆ 28309167                 │
│ U          ┆ 244.06954         ┆ 24406954                 │
└────────────┴───────────────────┴──────────────────────────┘


In [9]:
# Keep the first row for every distinct integer mass, preserving the order in
# which the masses first appear. A trailing `.select(pl.col(_COLS))` is not
# applied here, so all three columns are kept (grouping key first).
UNIQUE_EXPLANATION_MASSES = EXPLANATION_MASSES.group_by(
    "tolerated_integer_masses", maintain_order=True
).first()
print(UNIQUE_EXPLANATION_MASSES)

shape: (105, 3)
┌──────────────────────────┬────────────┬───────────────────┐
│ tolerated_integer_masses ┆ nucleoside ┆ monoisotopic_mass │
│ ---                      ┆ ---        ┆ ---               │
│ i64                      ┆ str        ┆ f64               │
╞══════════════════════════╪════════════╪═══════════════════╡
│ 49510030                 ┆ 00G        ┆ 495.1003          │
│ 34511721                 ┆ 309U       ┆ 345.117215        │
│ 31112300                 ┆ 01G        ┆ 311.123           │
│ 24608519                 ┆ 8U         ┆ 246.085186        │
│ 27309609                 ┆ 510U       ┆ 273.096085        │
│ …                        ┆ …          ┆ …                 │
│ 31706816                 ┆ 253U       ┆ 317.068156        │
│ 31510665                 ┆ 053U       ┆ 315.10665         │
│ 44011140                 ┆ 2164A      ┆ 440.1114          │
│ 38114710                 ┆ 2161A      ┆ 381.1471          │
│ 32710010                 ┆ 621A       ┆ 327.1001    

In [7]:
# Rows of EXPLANATION_MASSES that have no identical row (on all three columns)
# in UNIQUE_EXPLANATION_MASSES, i.e. the rows dropped by the de-duplication.
_all_columns = ["tolerated_integer_masses", "nucleoside", "monoisotopic_mass"]
difference = EXPLANATION_MASSES.join(
    UNIQUE_EXPLANATION_MASSES, on=_all_columns, how="anti"
)
print(difference)

shape: (1, 3)
┌────────────┬───────────────────┬──────────────────────────┐
│ nucleoside ┆ monoisotopic_mass ┆ tolerated_integer_masses │
│ ---        ┆ ---               ┆ ---                      │
│ str        ┆ f64               ┆ i64                      │
╞════════════╪═══════════════════╪══════════════════════════╡
│ U          ┆ 244.06954         ┆ 24406954                 │
└────────────┴───────────────────┴──────────────────────────┘


In [7]:
TEST_MASSES = [
    267.09675 * 2,  # 2A
    283.09167 * 2,  # 2G
    243.08552 * 2,  # 2C
    244.06954 * 2,  # 2U
    1037.34348,  # AUGC
    1563.52067,  # CCUAGG
]

TEST_SEQ = [
    ("A", "A"),
    ("G", "G"),
    ("C", "C"),
    ("U", "U"),
    ("A", "U", "G", "C"),
    ("A", "U", "G", "G", "C", "C"),
]

MASS_SEQ_DICT = dict(zip(TEST_MASSES, TEST_SEQ))

MASS_SEQ_DICT

# _TESTCASES = importlib.resources.files("tests") / "testcases"

# @pytest.mark.parametrize("testcase", _TESTCASES.iterdir())

{534.1935: ('A', 'A'),
 566.18334: ('G', 'G'),
 486.17104: ('C', 'C'),
 488.13908: ('U', 'U'),
 1037.34348: ('A', 'U', 'G', 'C'),
 1563.52067: ('A', 'U', 'G', 'G', 'C', 'C')}

In [None]:
# View over the (mass, sequence) pairs of MASS_SEQ_DICT; printed to inspect
# what the parametrized test will receive as individual test cases.
iterable = MASS_SEQ_DICT.items()
print(iterable)

dict_items([(534.1935, ('A', 'A')), (566.18334, ('G', 'G')), (486.17104, ('C', 'C')), (488.13908, ('U', 'U')), (1037.34348, ('A', 'U', 'G', 'C')), (1563.52067, ('A', 'U', 'G', 'G', 'C', 'C'))])


In [38]:
# Print every test case as "<mass> <sequence>", one per line.
for mass, sequence in iterable:
    print(mass, sequence)

534.1935 ('A', 'A')
566.18334 ('G', 'G')
486.17104 ('C', 'C')
488.13908 ('U', 'U')
1037.34348 ('A', 'U', 'G', 'C')
1563.52067 ('A', 'U', 'G', 'G', 'C', 'C')


In [None]:
import pytest


@pytest.mark.parametrize("testcase", MASS_SEQ_DICT.items())
def test_testcase(testcase):
    """Check that the expected sequence is among the explanations of its mass.

    Args:
        testcase: One ``(mass, sequence)`` item of ``MASS_SEQ_DICT``.

    ``explain_mass`` is not defined in the visible part of this notebook;
    presumably it is provided elsewhere -- TODO confirm.
    """
    # An earlier draft read the cases from "meta.yaml" files below a testcases
    # directory and compared against the parsed "true_sequence"; that path is
    # currently disabled in favour of the in-notebook dictionary.
    target_mass, expected_seq = testcase

    explanations = explain_mass(target_mass)

    print(target_mass, expected_seq)

    assert expected_seq in explanations

In [27]:
# NOTE(review): the recorded run of this cell collected 0 items -- presumably
# because test_testcase exists only in this notebook's kernel, where pytest's
# file discovery cannot see it; confirm, and move the test into a test_*.py
# file if it is meant to run.
!pytest

platform darwin -- Python 3.13.0, pytest-8.3.4, pluggy-1.5.0
rootdir: /Users/localadmin-moshir.harsh01/Work/code/lionelmssq
configfile: pyproject.toml
collected 0 items                                                              [0m

