In [26]:
import gzip
from pathlib import Path

import pandas as pd

# Show every column when a DataFrame is rendered (None removes the column limit).
pd.options.display.max_columns = None

Read LOH positions

In [27]:
# Load the gzip-compressed, tab-separated table into `df` and show its row count.
df = pd.read_csv(
    "/home/junkhann/daten/intersect_mmml_onek1k_with_header.vcf.gz",
    sep="\t",
    compression="gzip",
    low_memory=False,  # parse whole columns at once instead of chunk-wise type inference
)
df.shape[0]

19090241

In [28]:
# Remove the unnamed column and the extra chromosome/position columns
# (presumably redundant with "#CHROM" / "pos" — TODO confirm).
df = df.drop(
    labels=["Unnamed: 29", "position_dummy", "#CHR", "POS", "POS_DUMMY"],
    axis="columns",
)

In [29]:
# Build a "<chromosome>-<position>" string key for every row.
df["position_code"] = df["#CHROM"].astype(str).str.cat(df["pos"].astype(str), sep="-")

In [30]:
# Preview the first five rows, including the derived position_code column.
df.head(5)

Unnamed: 0,#CHROM,pos,REF,ALT,INFO,sample_control,sample_tumor,start,end,genotype,TCN,PID,normal_genotype,tumor_genotype,quality_score,reads_normal,reads_tumor,CELL_TYPE,RSID,GENE,GENE_ID,A1,A2,A2_FREQ_ONEK1K,A2_FREQ_HRC,position_code
0,3,116723728,G,A,BRF=0.23;FR=0.2505;HP=1;HapScore=1;MGOF=13;MML...,"1/0:-60.85,0,-76.25:4:99:40:17","0/0:0,-2.22,-65.4:13:22:18:1",116664052,116765897,1:0,1.0,4100314,11,10,99.0,40,18,CD4 Naive/Central memory T cell,rs1464621,{'RP11-384F7.1'},{'ENSG00000243276'},G,A,0.26109,0.269156,3-116723728
1,3,116725097,G,A,BRF=0.16;FR=0.4792;HP=4;HapScore=1;MGOF=16;MML...,"1/0:-80.45,0,-60.95:7:99:46:26","1/0:-1.08,0,-61.28:16:11:21:2",116664052,116765897,1:0,1.0,4100314,11,10,99.0,46,21,CD4 Naive/Central memory T cell,rs9871824,{'RP11-384F7.1'},{'ENSG00000243276'},G,A,0.25949,0.256067,3-116725097
2,3,116725415,A,G,BRF=0.21;FR=0.4934;HP=1;HapScore=1;MGOF=12;MML...,"0/1:-80.24,0,-94.94:7:99:59:27","0/1:-1.58,0,-55.58:12:16:20:2",116664052,116765897,1:0,1.0,4100314,11,10,99.0,59,20,CD4 Naive/Central memory T cell,rs6783180,{'RP11-384F7.1'},{'ENSG00000243276'},A,G,0.25897,0.256144,3-116725415
3,3,116726085,C,T,BRF=0.22;FR=0.5001;HP=2;HapScore=1;MGOF=25;MML...,"0/1:-45.57,0,-38.67:25:99:28:13","0/1:-71.17,0,-3.57:2:36:26:23",116664052,116765897,1:0,1.0,4100314,11,1,99.0,28,26,CD4 Naive/Central memory T cell,rs75373647,{'RP11-384F7.1'},{'ENSG00000243276'},C,T,0.13415,0.144379,3-116726085
4,3,116726713,T,C,BRF=0.24;FR=0.7484;HP=5;HapScore=1;MGOF=21;MML...,"1/0:-89.35,0,-57.55:15:99:55:33","1/1:-79.1,-1.71,0:21:17:29:27",116664052,116765897,1:0,1.0,4100314,11,1,99.0,55,29,CD4 Naive/Central memory T cell,rs12630933,{'RP11-384F7.1'},{'ENSG00000243276'},T,C,0.13452,0.144503,3-116726713


Analyze LOH positions

In [31]:
# Number of distinct position codes in the table.
no_distinct_positions = df["position_code"].nunique(dropna=True)
no_distinct_positions

4124732

In [32]:
# Number of distinct values in the "#CHROM" column.
df["#CHROM"].nunique(dropna=True)

22

In [33]:
# Number of distinct patient identifiers (PID) in the table.
no_distinct_patients = df["PID"].nunique(dropna=True)
no_distinct_patients

235

In [34]:
# Count distinct (PID, position_code) pairs.
# Summing the rows that are NOT flagged as duplicates yields the same number as
# len(df.drop_duplicates([...])) but avoids materialising a de-duplicated copy of
# the entire frame, which is costly at this row count.
unique_combinations = int((~df.duplicated(subset=["PID", "position_code"])).sum())
unique_combinations

18870653

Merge position count

In [35]:
# Load the position -> count mapping from JSON as a Series, then turn it into a
# two-column frame ("pos" holds the former index, "position_count" the values).
#
# The original call passed dtype={str}: that is a *set* literal, not a valid
# `dtype` argument (read_json expects a bool or a dict), and it appears to have
# been silently ignored. dtype=False states the effective behaviour explicitly
# (no dtype inference on the values), and convert_axes=False guarantees the
# "<chrom>-<pos>" keys stay plain strings.
ser = pd.read_json(
    "/home/junkhann/daten/intersect_position_count.json",
    typ="series",
    dtype=False,
    convert_axes=False,
)
df_position_count = ser.to_frame("position_count").reset_index(names="pos")
df_position_count.shape

(4124732, 2)

In [36]:
# Preview the first five rows of the position-count table.
df_position_count.head(5)

Unnamed: 0,pos,position_count
0,3-116723728,7
1,3-116725097,7
2,3-116725415,7
3,3-116726085,3
4,3-116726713,3


In [37]:
# Attach position_count to every row by matching df["position_code"] against
# df_position_count["pos"], then show the resulting row count.
#
# validate="many_to_one" makes pandas raise a MergeError if a key occurs more than
# once in df_position_count; without it such duplicates would silently multiply rows.
#
# NOTE(review): both frames contain a column named "pos", so the merged frame is
# expected to carry suffixed "pos_x" / "pos_y" columns — confirm downstream code
# relies on (or tolerates) those names.
df = df.merge(
    df_position_count,
    how="inner",
    left_on="position_code",
    right_on="pos",
    validate="many_to_one",
)
len(df)

19090241

Set threshold

In [51]:
# Keep only rows whose position_count is at least `threshold`, then show how many remain.
threshold = 10
df_thresholded = df.loc[df["position_count"].ge(threshold)]
df_thresholded.shape[0]

7614506

Analyze thresholded dataframe

In [52]:
# Number of distinct patient identifiers left after thresholding.
df_thresholded["PID"].nunique(dropna=True)

231

In [53]:
# Number of distinct "#CHROM" values left after thresholding.
df_thresholded["#CHROM"].nunique(dropna=True)

22

In [54]:
# List the distinct "#CHROM" values in order of first appearance.
pd.unique(df_thresholded["#CHROM"])

array([ 3,  4,  1,  6,  9, 14, 16, 17, 19, 22, 10, 11, 20, 13,  2, 15,  8,
        7, 18, 12,  5, 21])

In [55]:
# Number of distinct position codes left after thresholding.
df_thresholded["position_code"].nunique(dropna=True)

490185

Write to gzip-compressed TSV

In [56]:
# Write the thresholded rows as a gzip-compressed, tab-separated file; the
# threshold value is embedded in the file name.
output_path = Path(f"/home/junkhann/daten/LOH_pos_thresholded/LOH_positions_threshold_{threshold}.tsv.gz")
# Create the target directory up front so the export cannot fail at the very end
# of the notebook merely because the folder does not exist yet.
output_path.parent.mkdir(parents=True, exist_ok=True)
df_thresholded.to_csv(output_path, sep="\t", header=True, index=False, compression="gzip")