In [1]:
import gzip

import pandas as pd

# Do not truncate wide DataFrames when they are rendered: show every column.
pd.options.display.max_columns = None

Read LOH positions

In [6]:
# Read the column names, one per line, stripping the trailing newline of each.
with open("/home/junkhann/daten/mmml_onek1k_all_patients_columns.txt", mode="r") as columns_file:
    columns_list = [line.rstrip("\n") for line in columns_file]

# Rename the second column to "pos" and keep only the first 18 names.
columns_list[1] = "pos"
columns_list = columns_list[:18]
columns_list

['#CHROM',
 'pos',
 'position_dummy',
 'REF',
 'ALT',
 'INFO',
 'sample_control',
 'sample_tumor',
 'start',
 'end',
 'genotype',
 'TCN',
 'PID',
 'normal_genotype',
 'tumor_genotype',
 'quality_score',
 'reads_normal',
 'reads_tumor']

In [7]:
# Load the gzip-compressed, tab-separated LOH table. The file is read without
# a header row; the column names are taken from columns_list.
df = pd.read_csv(
    "/home/junkhann/daten/LOH_positions_mmml.vcf.gz",
    sep="\t",
    header=None,
    names=columns_list,
    compression="gzip",
    low_memory=False,
)
# Row count of the loaded table.
df.shape[0]

28265798

In [8]:
# Preview the first five rows of the loaded table.
df.head(n=5)

Unnamed: 0,#CHROM,pos,position_dummy,REF,ALT,INFO,sample_control,sample_tumor,start,end,genotype,TCN,PID,normal_genotype,tumor_genotype,quality_score,reads_normal,reads_tumor
0,3,116221105,116221105,C,A,BRF=0.18;FR=0.5256;HP=6;HapScore=4;MGOF=9;MMLQ...,"1/0:-42.61,0,-38.37:9:99:36:18","0/1:-18.43,0,-1.47:6:15:11:9",116220095,116342540,1:0,1.0,4100314,11,1,52.130925,36.0,11
1,3,116221490,116221490,A,C,BRF=0.13;FR=0.7499;HP=1;HapScore=1;MGOF=26;MML...,"0/1:-67.26,0,-65.76:5:99:45:22","1/1:-69.7,-2.82,0:26:28:22:21",116220095,116342540,1:0,1.0,4100314,11,1,99.0,45.0,22
2,3,116221573,116221573,T,C,BRF=0.06;FR=0.75;HP=1;HapScore=1;MGOF=41;MMLQ=...,"1/0:-25.97,0,-63.87:23:99:32:10","1/1:-89,-6.33,0:41:63:28:27",116220095,116342540,1:0,1.0,4100314,11,1,99.0,32.0,28
3,3,116222047,116222047,G,A,BRF=0.29;FR=0.75;HP=4;HapScore=1;MGOF=6;MMLQ=3...,"1/0:-60.05,0,-72.45:5:99:45:20","1/1:-87.9,-7.22,0:6:72:24:24",116220095,116342540,1:0,1.0,4100314,11,1,99.0,45.0,24
4,3,116223336,116223336,A,G,BRF=0.19;FR=0.7499;HP=1;HapScore=1;MGOF=12;MML...,"0/1:-82.34,0,-89.54:12:99:57:29","1/1:-77.1,-3.12,0:6:31:24:23",116220095,116342540,1:0,1.0,4100314,11,1,99.0,57.0,24


In [9]:
# Remove the "position_dummy" column (in the previewed rows it repeats "pos").
# errors="ignore" keeps this cell re-runnable: executing it a second time, when
# the column is already gone, is a no-op instead of raising KeyError.
df = df.drop(columns=["position_dummy"], errors="ignore")

Analyze LOH positions

In [10]:
# Build a "<chromosome>-<position>" key for every row, e.g. "3-116221105".
chrom_as_text = df["#CHROM"].astype(str)
pos_as_text = df["pos"].astype(str)
df["position_code"] = chrom_as_text + "-" + pos_as_text

In [11]:
# Number of distinct chromosome-position keys in the full table.
no_distinct_positions = df.loc[:, "position_code"].nunique()
no_distinct_positions

7636659

In [12]:
# Number of distinct chromosomes present in the full table.
df.loc[:, "#CHROM"].nunique()

22

In [13]:
# Number of distinct patient IDs (PID) in the full table.
no_distinct_patients = df.loc[:, "PID"].nunique()
no_distinct_patients

241

Merge position count

In [16]:
# Load the per-position counts: a JSON object whose keys look like
# "<chrom>-<pos>" and whose values are the counts.
#
# The original call passed dtype={str}. That is a set, not a valid dtype
# argument (read_json expects a bool or a dict), and pandas appears to have
# ignored it -- the counts in the preview are still integers. dtype=False
# states the effective behavior explicitly (no dtype coercion of the values).
# convert_axes=False keeps the keys as plain strings, so pandas never tries to
# reinterpret them as numbers or dates.
ser = pd.read_json(
    "/home/junkhann/daten/LOH_position_count_mmml.json",
    typ="series",
    dtype=False,
    convert_axes=False,
)

# Turn the series into a two-column frame: the key in "pos", the count in
# "position_count".
df_position_count = ser.to_frame("position_count").reset_index(names="pos")
df_position_count.shape

(7636659, 2)

In [17]:
# Preview the first five key/count pairs.
df_position_count.head(n=5)

Unnamed: 0,pos,position_count
0,3-116221105,5
1,3-116221490,5
2,3-116221573,5
3,3-116222047,5
4,3-116223336,5


In [18]:
# Attach the per-position count to every LOH row.
#
# Both frames carry a column called "pos": the integer coordinate on the left
# and the "<chrom>-<pos>" key on the right. Joining with left_on/right_on keeps
# both, so pandas renames them to "pos_x"/"pos_y" and the original "pos" column
# disappears from df (and from the file written later). Renaming the right-hand
# key to "position_code" and joining with on= keeps "pos" intact and avoids
# carrying a redundant copy of the key.
#
# Dropping any existing "position_count" first makes the cell re-runnable.
# validate="many_to_one" raises if a key occurs more than once in the counts,
# which would otherwise silently duplicate rows.
position_counts_by_code = df_position_count.rename(columns={"pos": "position_code"})
df = df.drop(columns=["position_count"], errors="ignore").merge(
    position_counts_by_code,
    how="inner",
    on="position_code",
    validate="many_to_one",
)
# Row count after the merge; the inner join should not lose rows if every
# position_code has a count.
len(df)

28265798

Set threshold

In [33]:
# Minimum position_count a row must have to be kept.
threshold = 10

# Boolean mask of rows that meet the threshold, then the filtered frame.
meets_threshold = df["position_count"] >= threshold
df_thresholded = df[meets_threshold]
df_thresholded.shape[0]

10126763

Analyze thresholded dataframe

In [34]:
# Distinct patient IDs that remain after thresholding.
df_thresholded.loc[:, "PID"].nunique()

240

In [35]:
# Distinct chromosomes that remain after thresholding.
df_thresholded.loc[:, "#CHROM"].nunique()

17

In [36]:
# Which chromosomes remain after thresholding, in order of first appearance.
df_thresholded.loc[:, "#CHROM"].unique()

array([ 3, 14,  1,  6,  9, 16, 17, 19, 22, 10, 11,  2, 13, 15,  8, 18, 12])

In [37]:
# Distinct chromosome-position keys that remain after thresholding.
df_thresholded.loc[:, "position_code"].nunique()

641574

Write to gzip-compressed TSV

In [38]:
# Persist the thresholded rows as a gzip-compressed, tab-separated file; the
# file name records the threshold that was applied.
output_path = f"/home/junkhann/daten/LOH_pos_mmml_thresholded/LOH_positions_threshold_{threshold}.tsv.gz"
df_thresholded.to_csv(
    output_path,
    sep="\t",
    header=True,
    index=False,
    compression="gzip",
)