In [1]:
import pandas as pd
from cyvcf2 import VCF
import numpy as np
from math import exp
from ast import literal_eval
from tqdm import tqdm

In [2]:
# Load the reference region table (tab-separated, no header row, so columns
# are labelled 0..N by position).
regions = pd.read_csv("/expanse/projects/gymreklab/helia/HipSTR_LR/tests/whole_genome/metadata/GRCh38.hipstr_reference_pre.bed", sep = "\t", header=None)
# Keep the rows whose column 3 equals 1 (presumably the motif period, i.e.
# homopolymers -- confirm against the BED schema).
# `.copy()` makes this an independent frame: columns are added to it later in
# the notebook, and assigning into a boolean-mask slice of `regions` is what
# triggers pandas' SettingWithCopyWarning.
regions_homopolymer = regions[regions[3] == 1].copy()
regions_homopolymer.to_csv("homopolymers.csv", sep = "\t",
                                          header=False, index=False)

In [3]:
#### Prepare regions for TRGT ####

# Build the TRGT annotation string "ID=<col 5>;MOTIFS=<col 6>;STRUC=(<col 6>)n"
# for every homopolymer region.
regions_homopolymer['TRGT_field'] = (
    "ID=" + regions_homopolymer[5]
    + ";MOTIFS=" + regions_homopolymer[6]
    + ";STRUC=(" + regions_homopolymer[6] + ")n"
)
# The start coordinate written for TRGT is column 1 shifted down by one.
regions_homopolymer['TRGT_pos'] = regions_homopolymer[1] - 1

# Export chrom, shifted start, end and the annotation string as a headerless BED.
regions_homopolymer.loc[:, [0, 'TRGT_pos', 2, 'TRGT_field']].to_csv(
    "TRGT_homopolymer_ref.bed", sep="\t", header=False, index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  regions_homopolymer['TRGT_field'] = "ID=" + regions_homopolymer[5] + ";" + "MOTIFS=" + regions_homopolymer[6] + ";" + "STRUC=(" + regions_homopolymer[6] + ")n"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  regions_homopolymer['TRGT_pos'] = regions_homopolymer[1] - 1


In [4]:
#### Reading LongTR calls ####

# Gather one row per record with INFO/PERIOD == 1 from the chr1..chr22 VCFs.
# Index [0] on the per-sample fields takes the first entry of each.
LongTR_calls = []
for chrom in range(1, 23):
    LongTR = VCF(f"/expanse/projects/gymreklab/helia/HipSTR_LR/tests/whole_genome/output/HG002_wg_HiFi_HipSTR_long_chr{chrom}.vcf.gz")
    LongTR_calls.extend(
        [record.CHROM, record.POS, record.REF, record.gt_bases[0],
         record.format("GB")[0], record.format('ALLREADS')[0]]
        for record in LongTR
        if record.INFO['PERIOD'] == 1
    )

LongTR_df = pd.DataFrame(LongTR_calls, columns=['chrom', 'pos', 'ref', 'gt', 'gb', 'allreads'])
# 'len' is the reference allele length; 'end' is the inclusive end coordinate.
LongTR_df = LongTR_df.assign(
    len=lambda calls: calls['ref'].str.len(),
    end=lambda calls: calls['pos'] + calls['len'] - 1,
)

def seen_lengths(allreads, ref_len):
    """Expand an ALLREADS summary string into one allele length per read.

    `allreads` is a ';'-separated list of 'offset|count' entries. Each entry
    contributes `count` copies of `ref_len + offset` to the result, in the
    order the entries appear. Returns NaN when `allreads` is ".".
    """
    if allreads == ".":
        return np.nan
    lengths = []
    for entry in allreads.split(";"):
        fields = entry.split("|")
        count = int(fields[1])
        if count > 0:
            lengths.extend([ref_len + int(fields[0])] * count)
    return lengths
# Expand every call's ALLREADS string into the list of per-read allele lengths
# (seen_lengths returns NaN where ALLREADS is ".").
LongTR_df['alllengths'] = LongTR_df.apply(lambda row: seen_lengths(row['allreads'], row['len']), axis = 1)

In [8]:
#### assembly alleles #####

def diff_from_assembly(assembly_alleles, GT, alllengths=None, max_len_diff=5):
    """Compare a '|'-separated genotype call with the assembly alleles of a locus.

    Parameters
    ----------
    assembly_alleles : sequence of str
        Allele sequences from the assembly; exactly two are required.
    GT : str
        Called allele sequences joined by "|"; exactly two are required.
    alllengths : iterable of int, optional
        Per-read allele lengths. When it is None or a missing value (NaN),
        no per-read differences are computed and `diff` is an empty list.
    max_len_diff : int, optional
        Largest tolerated difference (in bp) between the sorted called lengths
        and the sorted assembly lengths. Defaults to 5.

    Returns
    -------
    tuple
        ``(diff, GT_len, assembly_len)`` where ``diff`` holds, for every read
        length, its signed difference to the closer of the two assembly
        alleles (ties go to ``assembly_alleles[0]``), and ``GT_len`` /
        ``assembly_len`` are the sorted allele lengths.
        ``(nan, nan, nan)`` is returned when either side does not have exactly
        two alleles or when the lengths differ by more than `max_len_diff`.
    """
    GT_len = sorted(len(allele) for allele in GT.split("|"))
    assembly_len = sorted(len(allele) for allele in assembly_alleles)
    if len(GT_len) != 2 or len(assembly_len) != 2:
        return np.nan, np.nan, np.nan
    if any(abs(called - truth) > max_len_diff
           for called, truth in zip(GT_len, assembly_len)):
        return np.nan, np.nan, np.nan

    # A NaN placeholder for "no reads" is a float: the previous `!= None` test
    # let it through and then failed when iterating over it. `is not None`
    # also avoids an element-wise comparison if an array is passed.
    has_reads = alllengths is not None and not (
        isinstance(alllengths, float) and np.isnan(alllengths))

    diff = []
    if has_reads:
        first_len = len(assembly_alleles[0])
        second_len = len(assembly_alleles[1])
        for readlen in alllengths:
            diff1 = readlen - first_len
            diff2 = readlen - second_len
            # Attribute the read to whichever assembly allele it is closer to.
            diff.append(diff1 if abs(diff1) <= abs(diff2) else diff2)
    return diff, GT_len, assembly_len

# Load the per-chromosome assembly allele tables (chr1..chr22). Frames are
# collected in a list and concatenated once: calling pd.concat inside the loop
# re-copies everything read so far on every iteration, and seeding it with an
# empty frame degrades the column dtypes.
assembly_frames = []
for chrom in range(1, 23):
    assembly = pd.read_csv(f"assembly/homopolymers_assembly_alleles_chr{chrom}.csv", header=None,
                           sep="\t", skiprows=1)
    # Columns 3 and 4 are stored as text; parse them back into Python objects
    # (the code below relies on them supporting len()).
    assembly[3] = assembly[3].apply(literal_eval)
    assembly[4] = assembly[4].apply(literal_eval)
    # Keep loci with a non-empty column 3 and an empty column 4.
    keep = (assembly[3].apply(len) != 0) & (assembly[4].apply(len) == 0)
    assembly = assembly[keep]
    assembly_frames.append(assembly)

assembly_all = pd.concat(assembly_frames)

# Coordinates must be integers to match the 'pos' / 'end' columns of the calls.
assembly_all[1] = assembly_all[1].astype(int)
assembly_all[2] = assembly_all[2].astype(int)

# Pair every assembly locus with the call that has the same chrom/start/end.
assembly_LongTR = pd.merge(assembly_all, LongTR_df, left_on = [0,1,2], right_on=['chrom', 'pos', 'end'])
assembly_LongTR[['assembly_diff',
                'GT_len', 'assembly_len']] = assembly_LongTR.apply(lambda row: diff_from_assembly(row[3],
                                                                                                  row['gt'], row['alllengths']),
                                                                   axis = 1, result_type = 'expand')
# diff_from_assembly returns NaN for loci it rejects; drop those rows (and any
# other row containing a missing value).
assembly_LongTR = assembly_LongTR.dropna()
assembly_LongTR


Unnamed: 0,0,1,2,3,4,chrom,pos,ref,gt,gb,allreads,len,end,alllengths,assembly_diff,GT_len,assembly_len
0,chr1,262684,262694,"[TTTTTTTTTTTTT, TTTTTTTTTTTTT]",[],chr1,262684,TTTTTTTTTTT,TTTTTTTTTTT|TTTTTTTTTTT,0|0,-1|1;0|23;1|1;2|1,11,262694,"[10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 1...","[-3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -...","[11, 11]","[13, 13]"
1,chr1,267778,267793,"[AAAAAAAAAAAAAAAAA, AAAAAAAAAAAAAAAAA]",[],chr1,267778,AAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAA|AAAAAAAAAAAAAAAAAAAA,0|4,-1|9;0|15;1|4;3|6;4|11;5|7;6|1,16,267793,"[15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 1...","[-2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -...","[16, 20]","[17, 17]"
2,chr1,591734,591751,"[AAAAAAAAAAAAAAAAAAAA, AAAAAAAAAAAAAAAAAAAA]",[],chr1,591734,AAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAA|AAAAAAAAAAAAAAAAAAAA,1|2,1|3;2|5;3|1;12|1,18,591751,"[19, 19, 19, 20, 20, 20, 20, 20, 21, 30]","[-1, -1, -1, 0, 0, 0, 0, 0, 1, 10]","[19, 20]","[20, 20]"
3,chr1,597686,597699,"[AAAAAAAAAAAAAA, AAAAAAAAAAAAAA]",[],chr1,597686,AAAAAAAAAAAAAA,AAAAAAAAAAAAAA|AAAAAAAAAAAAA,0|-1,-1|2;0|9;1|1,14,597699,"[13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15]","[-1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]","[13, 14]","[14, 14]"
4,chr1,604402,604419,"[AAAAAAAAAAAAAAAAAA, AAAAAAAAAAAAAAAAAA]",[],chr1,604402,AAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAA|AAAAAAAAAAAAAAAAA,0|-1,-2|1;-1|4;0|16;1|3,18,604419,"[16, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 1...","[-2, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[17, 18]","[18, 18]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
677799,chr22,50780398,50780419,"[TTTTTTTTTTTTTTTTTTTTTT, TTTTTTTTTTTTTTTTTTTTTTT]",[],chr22,50780398,TTTTTTTTTTTTTTTTTTTTTT,TTTTTTTTTTTTTTTTTTTTTT|TTTTTTTTTTTTTTTTTTTTTTT,0|1,-2|1;-1|2;0|13;1|12,22,50780419,"[20, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 2...","[-2, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[22, 23]","[22, 23]"
677800,chr22,50781344,50781362,"[AAAAAAAAAAAAAAAAAAA, AAAAAAAAAAAAAAAAAAA]",[],chr22,50781344,AAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAA|AAAAAAAAAAAAAAAAAAA,0|0,-1|7;0|15;1|7;2|1,19,50781362,"[18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19, 1...","[-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0,...","[19, 19]","[19, 19]"
677801,chr22,50790606,50790627,"[AAAAAAAAAAAAAAAAAAAA, AAAAAAAAAAAAAAAAAAAAAAA...",[],chr22,50790606,AAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA...,28|-2,-3|4;-2|8;-1|3;0|1;2|1;20|1;23|1;24|2;25|1;26|...,22,50790627,"[19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 2...","[-1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,...","[20, 50]","[20, 49]"
677802,chr22,50796314,50796328,"[CCCCCCCCCCCCC, CCCCCCCCCCCCCC]",[],chr22,50796314,CCCCCCCCCCCCCCC,CCCCCCCCCCCCC|CCCCCCCCCCCCCC,-2|-1,-8|1;-6|6;-5|1;-4|1;-3|2;-2|6;-1|9;0|1,15,50796328,"[7, 9, 9, 9, 9, 9, 9, 10, 11, 12, 12, 13, 13, ...","[-6, -4, -4, -4, -4, -4, -4, -3, -2, -1, -1, 0...","[13, 14]","[13, 14]"


In [9]:
### Learning Stutter error params ###


LOG_THRESH = np.log(0.001)

def log_sum_exp(log_vals):
    """Return log(sum(exp(v) for v in log_vals)).

    The largest value is subtracted before exponentiating and added back at
    the end, so the exponentials stay in a representable range.
    """
    peak = max(log_vals)
    scaled_sum = sum(exp(value - peak) for value in log_vals)
    return peak + np.log(scaled_sum)

def EMStutterGenotyper(motif_len, df):
    """Estimate stutter-model parameters from per-read length differences.

    A single pass is made over `df['assembly_diff']` (one list of signed bp
    differences per row). Each difference is tallied as:

    * equal (0 bp),
    * in-frame (a multiple of `motif_len`), counted up or down, or
    * out-of-frame (not a multiple of `motif_len`), counted up or down.

    Counts are kept as lists of log-weights (0.0 == one observation) and
    summed with `log_sum_exp`.

    Parameters
    ----------
    motif_len : int
        Repeat unit length in bp; expected to be a positive integer.
    df : pandas.DataFrame
        Must provide an 'assembly_diff' column of iterables of ints.

    Returns
    -------
    list of float
        ``[in_pgeom, in_pup, in_pdown, out_pgeom, out_pup, out_pdown]``,
        each rounded to two decimals. The geometric parameters are capped at
        0.999.
    """
    # Every tally starts with one pseudo-observation, and the size tallies with
    # an extra log(1.1). With no data the geometric estimates are therefore
    # 2 / 2.1 rather than undefined.
    in_log_up = [0.0]
    in_log_down = [0.0]
    in_log_eq = [0.0]
    in_log_diffs = [0.0, np.log(1.1)]
    out_log_up = [0.0]
    out_log_down = [0.0]
    out_log_diffs = [0.0, np.log(1.1)]

    # Only the 'assembly_diff' column is needed, so iterate it directly
    # instead of materialising every row with iterrows().
    for read_diffs in tqdm(df['assembly_diff']):
        for bp_diff in read_diffs:
            if bp_diff == 0:
                in_log_eq.append(0.0)
                continue

            magnitude = abs(bp_diff)
            if bp_diff % motif_len != 0:
                # Out-of-frame: discount the whole repeat units contained in
                # the difference. This has to be an integer (truncating)
                # division -- true division produced fractional sizes such as
                # 3 - 3/2 = 1.5 instead of 3 - 1 = 2.
                eff_diff = magnitude - magnitude // motif_len
                out_log_diffs.append(np.log(eff_diff))
                if bp_diff > 0:
                    out_log_up.append(0.0)
                else:
                    out_log_down.append(0.0)
            else:
                # In-frame: size is the number of repeat units gained or lost
                # (exact, because bp_diff is a multiple of motif_len).
                eff_diff = magnitude // motif_len
                in_log_diffs.append(np.log(eff_diff))
                if bp_diff > 0:
                    in_log_up.append(0.0)
                else:
                    in_log_down.append(0.0)

    in_log_total_up     = log_sum_exp(in_log_up)
    in_log_total_down   = log_sum_exp(in_log_down)
    in_log_total_eq     = log_sum_exp(in_log_eq)
    in_log_total_diffs  = log_sum_exp(in_log_diffs)
    out_log_total_up    = log_sum_exp(out_log_up)
    out_log_total_down  = log_sum_exp(out_log_down)
    out_log_total_diffs = log_sum_exp(out_log_diffs)
    out_log_total       = log_sum_exp([out_log_total_up, out_log_total_down])

    # Geometric parameter = (number of stutter events) / (sum of their sizes).
    in_pgeom_hat        = min(0.999, exp(log_sum_exp([in_log_total_up, in_log_total_down]) - in_log_total_diffs))
    out_pgeom_hat       = min(0.999, exp(out_log_total - out_log_total_diffs))

    # Direction probabilities are fractions of all tallied observations
    # (equal + in-frame + out-of-frame).
    log_total           = log_sum_exp([log_sum_exp([in_log_total_up, in_log_total_down, in_log_total_eq]), out_log_total])
    in_pup_hat          = exp(in_log_total_up    - log_total)
    in_pdown_hat        = exp(in_log_total_down  - log_total)
    out_pup_hat         = exp(out_log_total_up   - log_total)
    out_pdown_hat       = exp(out_log_total_down - log_total)

    params = [in_pgeom_hat, in_pup_hat, in_pdown_hat, out_pgeom_hat, out_pup_hat, out_pdown_hat]
    return [round(x,2) for x in params]

# Fit one parameter set per 10 bp bin of 'len': [0, 10), [10, 20), ...,
# [40, 50); the last bin, keyed (50, 60), takes every locus with len >= 50.
params = {}
for i in range(0, 60, 10):
    in_bin = assembly_LongTR['len'] >= i
    if i < 50:
        in_bin = in_bin & (assembly_LongTR['len'] < i + 10)
    df = assembly_LongTR[in_bin]
    print(i,len(df))
    params[(i, i+10)] = EMStutterGenotyper(1, df)
    


0it [00:00, ?it/s]
0it [00:00, ?it/s]

0 0
10 449901


449901it [00:34, 13101.14it/s]
173it [00:00, 1729.12it/s]

20 181991


181991it [00:14, 12395.79it/s]
1017it [00:00, 10167.68it/s]

30 33949


33949it [00:02, 12155.23it/s]
2310it [00:00, 11482.17it/s]

40 8258


8258it [00:00, 11928.69it/s]
1155it [00:00, 11549.63it/s]

50 2499


2499it [00:00, 11658.68it/s]


In [11]:
# Display the parameter lists computed per length bin, keyed by (i, i + 10).
params

{(0, 10): [0.95, 0.2, 0.2, 0.95, 0.2, 0.2],
 (10, 20): [0.92, 0.1, 0.15, 0.95, 0.0, 0.0],
 (20, 30): [0.86, 0.13, 0.23, 0.95, 0.0, 0.0],
 (30, 40): [0.8, 0.14, 0.28, 0.95, 0.0, 0.0],
 (40, 50): [0.75, 0.14, 0.31, 0.95, 0.0, 0.0],
 (50, 60): [0.71, 0.15, 0.34, 0.95, 0.0, 0.0]}

In [25]:
# Keep the loci whose sorted called allele lengths differ from the sorted
# assembly allele lengths, and report how many there are.
lengths_disagree = assembly_LongTR['GT_len'] != assembly_LongTR['assembly_len']
diff = assembly_LongTR.loc[lengths_disagree]
print(len(diff))


125001


In [29]:
# Join the mismatching loci back onto the reference regions (same columns
# 0/1/2) and export the region columns as a headerless BED. Columns 3 and 4
# exist on both sides of the merge, so the ones coming from `regions` carry
# the '_x' suffix.
regions_different = regions.merge(diff, on=[0, 1, 2])
regions_different.loc[:, [0, 1, 2, '3_x', '4_x', 5, 6]].to_csv(
    "../../tests/homopolymer_fix/wrong_regions.bed",
    sep="\t", index=False, header=False)

Unnamed: 0,0,1,2,3_x,4_x,5,6,3_y,4_y,chrom,...,ref,gt,gb,allreads,len,end,alllengths,assembly_diff,GT_len,assembly_len
0,chr1,262684,262694,1,11.0,Human_STR_90,T,"[TTTTTTTTTTTTT, TTTTTTTTTTTTT]",[],chr1,...,TTTTTTTTTTT,TTTTTTTTTTT|TTTTTTTTTTT,0|0,-1|1;0|23;1|1;2|1,11,262694,"[10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 1...","[-3, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -...","[11, 11]","[13, 13]"
1,chr1,267778,267793,1,16.0,Human_STR_91,A,"[AAAAAAAAAAAAAAAAA, AAAAAAAAAAAAAAAAA]",[],chr1,...,AAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAA|AAAAAAAAAAAAAAAAAAAA,0|4,-1|9;0|15;1|4;3|6;4|11;5|7;6|1,16,267793,"[15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 1...","[-2, -2, -2, -2, -2, -2, -2, -2, -2, -1, -1, -...","[16, 20]","[17, 17]"
2,chr1,591734,591751,1,18.0,Human_STR_115,A,"[AAAAAAAAAAAAAAAAAAAA, AAAAAAAAAAAAAAAAAAAA]",[],chr1,...,AAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAA|AAAAAAAAAAAAAAAAAAAA,1|2,1|3;2|5;3|1;12|1,18,591751,"[19, 19, 19, 20, 20, 20, 20, 20, 21, 30]","[-1, -1, -1, 0, 0, 0, 0, 0, 1, 10]","[19, 20]","[20, 20]"
3,chr1,597686,597699,1,14.0,Human_STR_119,A,"[AAAAAAAAAAAAAA, AAAAAAAAAAAAAA]",[],chr1,...,AAAAAAAAAAAAAA,AAAAAAAAAAAAAA|AAAAAAAAAAAAA,0|-1,-1|2;0|9;1|1,14,597699,"[13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15]","[-1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]","[13, 14]","[14, 14]"
4,chr1,604402,604419,1,18.0,Human_STR_128,A,"[AAAAAAAAAAAAAAAAAA, AAAAAAAAAAAAAAAAAA]",[],chr1,...,AAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAA|AAAAAAAAAAAAAAAAA,0|-1,-2|1;-1|4;0|16;1|3,18,604419,"[16, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 1...","[-2, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[17, 18]","[18, 18]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124996,chr9,138286860,138286875,1,16.0,Human_STR_1635526,A,"[AAAAAAAAAAAAAAAA, AAAAAAAAAAAAAAAA]",[],chr9,...,AAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAA|AAAAAAAAAAAAAAA,0|-1,-1|2;0|3,16,138286875,"[15, 15, 16, 16, 16]","[-1, -1, 0, 0, 0]","[15, 16]","[16, 16]"
124997,chr9,138303883,138303900,1,18.0,Human_STR_1635532,T,"[TTTTTTTTTTTTTTTTTTT, TTTTTTTTTTTTTTTTTTT]",[],chr9,...,TTTTTTTTTTTTTTTTTT,TTTTTTTTTTTTTTTTTT|TTTTTTTTTTTTTTTTTTT,0|1,-1|1;0|10;1|16;2|1,18,138303900,"[17, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 1...","[-2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0...","[18, 19]","[19, 19]"
124998,chr9,138309958,138309973,1,16.0,Human_STR_1635533,T,"[TTTTTTTTTTTTTTTT, TTTTTTTTTTTTTTTT]",[],chr9,...,TTTTTTTTTTTTTTTT,TTTTTTTTTTTTTTTT|TTTTTTTTTTTTTTT,0|-1,-1|5;0|18;2|1;3|1,16,138309973,"[15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 1...","[-1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[15, 16]","[16, 16]"
124999,chr9,138313855,138313868,1,14.0,Human_STR_1635536,A,"[AAAAAAAAAAAAAAAAAAAAAAAAAA, AAAAAAAAAAAAAAAAA...",[],chr9,...,AAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAAAA|AAAAAAAAAAAAAAAAAAAA...,11|12,10|2;11|7;12|8;13|2,14,138313868,"[24, 24, 25, 25, 25, 25, 25, 25, 25, 26, 26, 2...","[-2, -2, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, ...","[25, 26]","[26, 27]"


In [13]:
# Write one line per homopolymer region: chrom, start, end, the first four
# parameters of the region's length bin, then the constants 0.00001, 0.00001
# and 1.
# Only columns 0-2 are needed, so they are zipped directly instead of using
# iterrows(), which builds a Series for every row of a large table.
with open("homopolymers_stutter.bed", 'w') as f:
    for chrom, start, end in zip(regions_homopolymer[0],
                                 regions_homopolymer[1],
                                 regions_homopolymer[2]):
        # Round the inclusive region length down to its 10 bp bin; everything
        # from 50 bp upwards shares the (50, 60) entry of `params`.
        len_ = min(int((end - start + 1) / 10) * 10, 50)
        param = params[(len_,len_+10)]
        f.write(f"{chrom}\t{start}\t{end}\t{param[0]}\t{param[1]}\t{param[2]}\t{param[3]}\t0.00001\t0.00001\t1")
        f.write("\n")
# No explicit f.close(): the with-statement has already closed the file.

In [12]:
# Display the rows of the reference table selected by column 3 == 1.
regions_homopolymer

Unnamed: 0,0,1,2,3,4,5,6,TRGT_field
3,chr1,28589,28603,1,15.0,Human_STR_4,T,ID=Human_STR_4;MOTIFS=T;STRUC=(T)n
5,chr1,31720,31733,1,14.0,Human_STR_6,A,ID=Human_STR_6;MOTIFS=A;STRUC=(A)n
6,chr1,33450,33464,1,15.0,Human_STR_7,A,ID=Human_STR_7;MOTIFS=A;STRUC=(A)n
7,chr1,33521,33541,1,21.0,Human_STR_8,T,ID=Human_STR_8;MOTIFS=T;STRUC=(T)n
9,chr1,36352,36364,1,13.0,Human_STR_10,A,ID=Human_STR_10;MOTIFS=A;STRUC=(A)n
...,...,...,...,...,...,...,...,...
1638933,chrY,57188708,57188720,1,13.0,Human_STR_1619159,A,ID=Human_STR_1619159;MOTIFS=A;STRUC=(A)n
1638934,chrY,57188868,57188878,1,11.0,Human_STR_1619160,A,ID=Human_STR_1619160;MOTIFS=A;STRUC=(A)n
1638938,chrY,57200838,57200851,1,14.0,Human_STR_1619164,A,ID=Human_STR_1619164;MOTIFS=A;STRUC=(A)n
1638939,chrY,57201000,57201019,1,20.0,Human_STR_1619165,A,ID=Human_STR_1619165;MOTIFS=A;STRUC=(A)n


In [45]:
# Read the calls stored in the wrong_homopolymer_chr*.vcf.gz files
# (chr1..chr22), keeping records with INFO/PERIOD == 1. Index [0] on the
# per-sample fields takes the first entry of each.
LongTR_new_calls = []

for chrom in range(1, 23):
    print(chrom)
    new_longTR = VCF(f"wrong_homopolymer_chr{chrom}.vcf.gz")
    LongTR_new_calls.extend(
        [record.CHROM, record.POS, record.REF, record.gt_bases[0],
         record.format("GB")[0], record.format('ALLREADS')[0]]
        for record in new_longTR
        if record.INFO['PERIOD'] == 1
    )

LongTR_new_df = pd.DataFrame(
    LongTR_new_calls,
    columns=['chrom', 'pos', 'ref', 'gt', 'gb', 'allreads'],
)
LongTR_new_df

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22


Unnamed: 0,chrom,pos,ref,gt,gb,allreads
0,chr1,262684,TTTTTTTTTTT,TTTTTTTTTTT|TTTTTTTTTTT,0|0,-1|1;0|23;1|1;2|1
1,chr1,267778,AAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAA|AAAAAAAAAAAAAAAAAAAA,0|4,-1|9;0|15;1|4;3|6;4|11;5|7;6|1
2,chr1,591734,AAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAA|AAAAAAAAAAAAAAAAAAAA,1|2,1|3;2|5;3|1;12|1
3,chr1,597686,AAAAAAAAAAAAAA,AAAAAAAAAAAAAA|AAAAAAAAAAAAAA,0|0,-1|2;0|9;1|1
4,chr1,604402,AAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAA|AAAAAAAAAAAAAAAAAA,0|0,-2|1;-1|4;0|16;1|3
...,...,...,...,...,...,...
124996,chr22,50662172,AAAAAAAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAAAA|AAAAAAAAAAAAAAAAAAAAA...,-2|1,-2|7;-1|3;0|5;1|3;2|2
124997,chr22,50669907,AAAAAAAAAAAAAAA,AAAAAAAAAAAAAAA|AAAAAAAAAAAAAAA,0|0,-2|1;-1|2;0|6;1|3
124998,chr22,50759009,AAAAAAAAAAAAAAA,AAAAAAAAAAAAAAA|AAAAAAAAAAAAAAA,0|0,-2|1;-1|7;0|22;1|10
124999,chr22,50771046,TTTTTTTTTTTTTTTT,TTTTTTTTTTTTTTT|TTTTTTTTTTTTTTTT,-1|0,-2|2;-1|7;0|11;1|4;2|1


In [49]:
# Pair every locus of assembly_LongTR with the call at the same chrom/pos in
# LongTR_new_df; columns present on both sides get '_x' (old) / '_y' (new).
assembly_LongTR_new = assembly_LongTR.merge(LongTR_new_df, on=['chrom', 'pos'])

# Re-run the comparison against the assembly for the new genotype. No read
# lengths are passed, so the first returned value (stored in 'x') carries no
# per-read differences; 'assembly_len' is overwritten with the recomputed value.
assembly_LongTR_new[['x', 'GT_len_new', 'assembly_len']] = assembly_LongTR_new.apply(
    lambda locus: diff_from_assembly(locus[3], locus['gt_y']),
    axis=1, result_type='expand')

# Show the loci whose new called lengths equal the assembly lengths.
assembly_LongTR_new.loc[assembly_LongTR_new['assembly_len'] == assembly_LongTR_new['GT_len_new']]

Unnamed: 0,0,1,2,3,4,chrom,pos,ref_x,gt_x,gb_x,...,alllengths,assembly_diff,GT_len,assembly_len,ref_y,gt_y,gb_y,allreads_y,x,GT_len_new
3,chr1,597686,597699,"[AAAAAAAAAAAAAA, AAAAAAAAAAAAAA]",[],chr1,597686,AAAAAAAAAAAAAA,AAAAAAAAAAAAAA|AAAAAAAAAAAAA,0|-1,...,"[13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15]","[-1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]","[13, 14]","[14, 14]",AAAAAAAAAAAAAA,AAAAAAAAAAAAAA|AAAAAAAAAAAAAA,0|0,-1|2;0|9;1|1,[],"[14, 14]"
4,chr1,604402,604419,"[AAAAAAAAAAAAAAAAAA, AAAAAAAAAAAAAAAAAA]",[],chr1,604402,AAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAA|AAAAAAAAAAAAAAAAA,0|-1,...,"[16, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 1...","[-2, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[17, 18]","[18, 18]",AAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAA|AAAAAAAAAAAAAAAAAA,0|0,-2|1;-1|4;0|16;1|3,[],"[18, 18]"
5,chr1,605778,605788,"[AAAAAAAAAAA, AAAAAAAAAAA]",[],chr1,605778,AAAAAAAAAAA,AAAAAAAAAAA|AAAAAAAAAA,0|-1,...,"[10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 1...","[-1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[10, 11]","[11, 11]",AAAAAAAAAAA,AAAAAAAAAAA|AAAAAAAAAAA,0|0,-1|3;0|19;1|2,[],"[11, 11]"
6,chr1,613404,613420,"[AAAAAAGAAAAAAAAAA, AAAAAAGAAAAAAAAAA]",[],chr1,613404,AAAAAAGAAAAAAAAAA,AAAAAAGAAAAAAAAAA|AAAAAAGAAAAAAAAA,0|-1,...,"[16, 16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 1...","[-1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[16, 17]","[17, 17]",AAAAAAGAAAAAAAAAA,AAAAAAGAAAAAAAAAA|AAAAAAGAAAAAAAAAA,0|0,-1|4;0|20;1|3,[],"[17, 17]"
13,chr1,711497,711507,"[TTTTTTTTTTT, TTTTTTTTTTT]",[],chr1,711497,TTTTTTTTTTT,TTTTTTTTTTT|TTTTTTTTTTTT,0|1,...,"[10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 1...","[-1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[11, 12]","[11, 11]",TTTTTTTTTTT,TTTTTTTTTTT|TTTTTTTTTTT,0|0,-1|2;0|29;1|4,[],"[11, 11]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124942,chr22,50497080,50497099,"[AAAAAAAAAAAAAAAAAAAAA, AAAAAAAAAAAAAAAAAAAAAA]",[],chr22,50497080,AAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAA|AAAAAAAAAAAAAAAAAAAA,2|0,...,"[20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 2...","[-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0,...","[20, 22]","[21, 22]",AAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAAA|AAAAAAAAAAAAAAAAAAAAA,2|1,0|7;1|6;2|12;3|2,[],"[21, 22]"
124943,chr22,50511286,50511307,"[TTTTTTTTTTTTTTTTTTTTTT, TTTTTTTTTTTTTTTTTTTTTT]",[],chr22,50511286,TTTTTTTTTTTTTTTTTTTTTT,TTTTTTTTTTTTTTTTTTTTTT|TTTTTTTTTTTTTTTTTTTTT,0|-1,...,"[21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 2...","[-1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,...","[21, 22]","[22, 22]",TTTTTTTTTTTTTTTTTTTTTT,TTTTTTTTTTTTTTTTTTTTTT|TTTTTTTTTTTTTTTTTTTTTT,0|0,-1|4;0|9;1|3;2|1,[],"[22, 22]"
124946,chr22,50598004,50598024,"[AAAAAAAAAAAAAAAAAAAAA, AAAAAAAAAAAAAAAAAAAAAAAA]",[],chr22,50598004,AAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAA|AAAAAAAAAAAAAAAAAAAAAAA,0|2,...,"[20, 20, 20, 21, 21, 21, 21, 22, 23, 23, 23, 2...","[-1, -1, -1, 0, 0, 0, 0, 1, -1, -1, -1, 0, 0, 1]","[21, 23]","[21, 24]",AAAAAAAAAAAAAAAAAAAAA,AAAAAAAAAAAAAAAAAAAAA|AAAAAAAAAAAAAAAAAAAAAAAA,0|3,-1|3;0|4;1|1;2|3;3|2;4|1,[],"[21, 24]"
124950,chr22,50669907,50669921,"[AAAAAAAAAAAAAAA, AAAAAAAAAAAAAAA]",[],chr22,50669907,AAAAAAAAAAAAAAA,AAAAAAAAAAAAAAA|AAAAAAAAAAAAAAAA,0|1,...,"[13, 14, 14, 15, 15, 15, 15, 15, 15, 16, 16, 16]","[-2, -1, -1, 0, 0, 0, 0, 0, 0, 1, 1, 1]","[15, 16]","[15, 15]",AAAAAAAAAAAAAAA,AAAAAAAAAAAAAAA|AAAAAAAAAAAAAAA,0|0,-2|1;-1|2;0|6;1|3,[],"[15, 15]"


In [50]:
# Number of calls read from the wrong_homopolymer_chr*.vcf.gz files.
len(LongTR_new_df)

125001

In [None]:
``