In [36]:
import sc2ts
import tszip

import numpy as np
from tqdm import tqdm
from Bio import SeqIO
import ast
import glob
import pandas as pd

In [39]:
ts = tszip.load("sc2ts_viridian_v1.1.trees.tsz")
ds = sc2ts.Dataset("viridian_mafft_2024-10-14_v1.vcz")
#### dots "." are actually missing data i.e. N

In [38]:
nodes_df = sc2ts.node_data(ts)
in_ARG_set = set(list(nodes_df.sample_id))

In [40]:
def find_gap_ranges(seq):
    import numpy as np

    start, end = 0, 29903
    offset = start + 1  # for 1-based indexing

    arr = np.fromiter((c == '-' for c in seq[start:end]), dtype=bool)
    indices = np.flatnonzero(arr) + offset

    if indices.size == 0:
        return []

    # Find where gaps break (difference > 1), then split
    grouped = np.split(indices, np.where(np.diff(indices) > 1)[0] + 1)

    return [list(map(int, group)) for group in grouped]

In [41]:
import numpy as np
from tqdm import tqdm
from collections import Counter

arr = np.array(list(sc2ts.IUPAC_ALLELES + "N"))
counter = Counter()

for i, h in tqdm(ds.haplotypes.items(), desc="processing sequences"):
    if i not in in_ARG_set:
        continue

    seq = "".join(arr[h])     # if find_gap_ranges requires string
    deletions = find_gap_ranges(seq)

    # directly update with tuples, no intermediate list
    counter.update(tuple(d) for d in deletions)

processing sequences: 100%|████████| 4484157/4484157 [4:54:52<00:00, 253.44it/s]


NameError: name 'df' is not defined

In [45]:
# Convert to a pandas Series 
deletion_counts = pd.DataFrame(counter.items(), columns=["deletion", "count"])
deletion_counts = deletion_counts.sort_values("count", ascending=False).reset_index(drop=True)
print(deletion_counts.head(20))   # top 20 most common deletions

                                             deletion    count
0                                            (28271,)  1331409
1          (28248, 28249, 28250, 28251, 28252, 28253)  1113392
2          (22029, 22030, 22031, 22032, 22033, 22034)  1107941
3   (29855, 29856, 29857, 29858, 29859, 29860, 298...  1007210
4   (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...   954928
5   (28362, 28363, 28364, 28365, 28366, 28367, 283...   850483
6   (11288, 11289, 11290, 11291, 11292, 11293, 112...   843291
7          (21765, 21766, 21767, 21768, 21769, 21770)   820194
8   (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...   745346
9   (29867, 29868, 29869, 29870, 29871, 29872, 298...   719736
10  (21633, 21634, 21635, 21636, 21637, 21638, 216...   531973
11  (29734, 29735, 29736, 29737, 29738, 29739, 297...   526026
12                                 (6513, 6514, 6515)   341380
13  (11283, 11284, 11285, 11286, 11287, 11288, 112...   339810
14  (21987, 21988, 21989, 21990, 21991, 21992, 219...  

In [73]:
deletion_counts["length"] = deletion_counts["deletion"].apply(len)  # Add tuple length

start = []
end = []
for i in deletion_counts["deletion"]: 
    start.append(i[0])
    end.append(i[-1])

deletion_counts["start"] = start
deletion_counts["end"] = start

#counts_df = counts_df[counts_df["deletion"].apply(lambda x: x[0] != 266)]
#counts_df = counts_df[counts_df["deletion"].apply(lambda x: x[-1] != 29674)].reset_index(drop=True)
deletion_counts = deletion_counts.sort_values(by = "count", ascending = False)

In [80]:
low, high = 255, 29674

deletion_counts[deletion_counts["deletion"].apply(lambda lst: all(low <= d <= high for d in lst))].head(20)

Unnamed: 0,deletion,count,length,start,end
0,"(28271,)",1331409,1,28271,28271
1,"(28248, 28249, 28250, 28251, 28252, 28253)",1113392,6,28248,28248
2,"(22029, 22030, 22031, 22032, 22033, 22034)",1107941,6,22029,22029
5,"(28362, 28363, 28364, 28365, 28366, 28367, 283...",850483,9,28362,28362
6,"(11288, 11289, 11290, 11291, 11292, 11293, 112...",843291,9,11288,11288
7,"(21765, 21766, 21767, 21768, 21769, 21770)",820194,6,21765,21765
10,"(21633, 21634, 21635, 21636, 21637, 21638, 216...",531973,9,21633,21633
12,"(6513, 6514, 6515)",341380,3,6513,6513
13,"(11283, 11284, 11285, 11286, 11287, 11288, 112...",339810,9,11283,11283
14,"(21987, 21988, 21989, 21990, 21991, 21992, 219...",338633,9,21987,21987


In [94]:
deletion_counts.to_csv("all_viridian_deletion_counts.csv", index = False)

## counts_df = pd.read_csv("all_viridian_deletion_counts.csv")

In [90]:
## what about removal of deletions after 29513 and before 694 
filtered_counts_df = deletion_counts[deletion_counts["deletion"].apply(lambda tup: all(694 <= x <= 29513 for x in tup))].reset_index(drop = True)

In [92]:
filtered_counts_df[:20]

Unnamed: 0,deletion,count,length,start,end
0,"(28271,)",1331409,1,28271,28271
1,"(28248, 28249, 28250, 28251, 28252, 28253)",1113392,6,28248,28248
2,"(22029, 22030, 22031, 22032, 22033, 22034)",1107941,6,22029,22029
3,"(28362, 28363, 28364, 28365, 28366, 28367, 283...",850483,9,28362,28362
4,"(11288, 11289, 11290, 11291, 11292, 11293, 112...",843291,9,11288,11288
5,"(21765, 21766, 21767, 21768, 21769, 21770)",820194,6,21765,21765
6,"(21633, 21634, 21635, 21636, 21637, 21638, 216...",531973,9,21633,21633
7,"(6513, 6514, 6515)",341380,3,6513,6513
8,"(11283, 11284, 11285, 11286, 11287, 11288, 112...",339810,9,11283,11283
9,"(21987, 21988, 21989, 21990, 21991, 21992, 219...",338633,9,21987,21987


In [93]:
filtered_counts_df.to_csv("coding_viridian_deletion_counts.csv", index = False)