In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import pysam
import os
import ast
import numpy as np
import json
import collections
from tqdm import tqdm
from Bio.Seq import Seq
import more_itertools as mit

In [15]:
def find_sequence(sequence, subsequence):
    positions = []
    start = 0
    while True:
        start = sequence.find(subsequence, start)
        if start == -1:
            break
        positions.append(start)
        start += 1
    return positions

In [16]:
def orientationFinder(df):
    df2 = df.copy()
    df2['Orientation']='NONE'
    for row in df2.index:
        if df2.at[row,'TE_Hits'] == 'NONE':
            continue
        else:
            elementAnnotation = str(df2.at[row,'Element_Annotation'])
            sense=0
            antisense=0
            hitList = ast.literal_eval(str(df2.at[row,'TE_Hits']))
            for hit in hitList:
                if str(hit.split()[9]) == elementAnnotation:
                    if str(hit.split()[8])=='+':
                        sense+=1
                    else:
                        antisense+=1
                else:
                    continue
            
            if sense>0 and antisense ==0:
                df2.at[row,'Orientation']='+'
            elif sense==0 and antisense>0:
                df2.at[row,'Orientation']='-'
            elif sense>antisense:
                df2.at[row,'Orientation']='+'
            elif antisense>sense:
                df2.at[row,'Orientation']='-'
            else:
                continue
    return(df2)
    

In [17]:
def tailCounter(df):
    
    df2= df.copy()
    df2['FILTER_RESULTS']='Good_Row'
    df2['Tail_Begins']='No_Tail_Detected'
    df2['Tail_Type']='No_Tail_Type'
    df2['Tail_Length']=0
    df2['Tail_Seed_Hits']=0
    
    for row in df2.index:
        

        sequence = str(df2.at[row,'Sequence']).upper()
        
        tTail = 'TTTTT'
        tTailFlag=0
        trunningTotal = 0
        tTailList=[]
        
        if tTail in sequence:
            taillength=0
            findings = [int(x) for x in find_sequence(sequence, tTail)]
            if len(findings)>0:
                tTailFlag=1
                tmin = min(findings)
                
                iterable = findings
                df2.at[row,'Tail_Seed_Hits']=len(findings)
                groupings = [list(group) for group in mit.consecutive_groups(iterable)]
                flag=0
                for group in groupings:
                    if flag==0:

                        taillength+= len(tTail) + (len(group)-1)
                        groupEnd = max(group)+len(tTail)
                        flag=1

                    else:

                        if min(group)-groupEnd<=4:
                            taillength+= (len(tTail) + (len(group)-1) + abs(min(group)-groupEnd))
                            groupEnd = max(group)+len(tTail)
                        else:
                            continue

                tTailLength = taillength
                
            else:
                pass
                
        
        aTail = 'AAAAA'
        aTailFlag=0
        arunningTotal = 0
        aTailList=[]
        if aTail in sequence:
            taillength=0
            reverseSequence = str(Seq(sequence).reverse_complement())
            findings = [int(x) for x in find_sequence(reverseSequence, tTail)]
            df2.at[row,'Tail_Seed_Hits']=len(findings)
            if len(findings)>0:
                aTailFlag=1
                amin = min(findings)
                iterable = findings
                groupings = [list(group) for group in mit.consecutive_groups(iterable)]
                flag=0
                for group in groupings:
                    if flag==0:

                        taillength+= len(aTail) + (len(group)-1)
                        groupEnd = max(group)+len(aTail)
                        flag=1

                    else:

                        if min(group)-groupEnd<=4:
                            taillength+= (len(aTail) + (len(group)-1) + abs(min(group)-groupEnd))
                            groupEnd = max(group)+len(aTail)
                        else:
                            continue

                aTailLength = taillength
                
            else:
                pass
        
        if tTailFlag>0 and aTailFlag==0:
            
            df2.at[row,'Tail_Begins']=tmin
            df2.at[row,'Tail_Type']='Possible_T-Tail'
            df2.at[row,'Tail_Length']=tTailLength
            
        elif tTailFlag==0 and aTailFlag>0:
            
            df2.at[row,'Tail_Begins']=amin
            df2.at[row,'Tail_Type']='Possible_A-Tail'
            df2.at[row,'Tail_Length']=aTailLength
        
        
        elif aTailFlag>0 and tTailFlag>0:
            
            orientation = str(df2.at[row,'Orientation'])
            
            if tTailLength<aTailLength and amin<tmin:
                df2.at[row,'Tail_Begins']=amin
                df2.at[row,'Tail_Type']='Possible_A-Tail*_and_Possible_T-Tail'
                df2.at[row,'Tail_Length']=aTailLength
                
            elif tTailLength>aTailLength and tmin<amin:
                df2.at[row,'Tail_Begins']=tmin
                df2.at[row,'Tail_Type']='Possible_A-Tail_and_Possible_T-Tail*'
                df2.at[row,'Tail_Length']=tTailLength
                
            else:
                if orientation !='None':
                    
                    if tmin>=50 and amin<50:
                        df2.at[row,'Tail_Begins']=amin
                        df2.at[row,'Tail_Type']='Possible_A-Tail*_and_Possible_T-Tail'
                        df2.at[row,'Tail_Length']=aTailLength
                    elif tmin<50 and amin>=50:
                        df2.at[row,'Tail_Begins']=tmin
                        df2.at[row,'Tail_Type']='Possible_A-Tail_and_Possible_T-Tail*'
                        df2.at[row,'Tail_Length']=tTailLength
                    
                        
                    elif aTailLength<tTailLength and orientation == '-':
                        df2.at[row,'Tail_Begins']=tmin
                        df2.at[row,'Tail_Type']='Possible_A-Tail_and_Possible_T-Tail*'
                        df2.at[row,'Tail_Length']=tTailLength

                    elif aTailLength>tTailLength and orientation == '+':
                        df2.at[row,'Tail_Begins']=amin
                        df2.at[row,'Tail_Type']='Possible_A-Tail*_and_Possible_T-Tail'
                        df2.at[row,'Tail_Length']=aTailLength
                        
                    elif orientation == '+':
                        df2.at[row,'Tail_Begins']=amin
                        df2.at[row,'Tail_Type']='Possible_A-Tail*_and_Possible_T-Tail'
                        df2.at[row,'Tail_Length']=aTailLength
                    
                    
                    elif orientation == '-':
                        df2.at[row,'Tail_Begins']=tmin
                        df2.at[row,'Tail_Type']='Possible_A-Tail_and_Possible_T-Tail*'
                        df2.at[row,'Tail_Length']=tTailLength
                    
                    else:
                        continue
                else:
                    continue
                    
                
        
        else:
            continue
            
    return(df2)
            

In [18]:
def aluLinker(df):
    df2 = df.copy()
    
    for row in df2.index:
        
        threePrimeFlags=0
        
        if df2.at[row,'TE_Designation'] == 'SINE/Alu' and str(df2.at[row,'Tail_Type'])!='No_Tail_Type':
            
            tailDesignation = str(df2.at[row,'Tail_Type'])
            tailStartSite = int(df2.at[row,'Tail_Begins'])
            elements = ast.literal_eval(str(df2.at[row,'TE_Hits']))
            
            if len(elements)==1:
                for element in elements:

                    splitElement = element.split()

                    if abs(int(splitElement[12])-tailStartSite)>=120 and abs(int(splitElement[12])-tailStartSite)<=150:
                        threePrimeFlags+=1
                    else:
                        continue
            else:
                pass
                        
            if threePrimeFlags>0:
                df2.at[row,'FILTER_RESULTS']=['Alu_Linker_Region_Warning']
            else:
                continue
            
            
        else:
            continue
    return(df2)

In [75]:
def finalQuickCheck(df):
    
    df2 = df.copy()
    
    for row in df2.index:
            
            if str(df2.at[row,'TE_Designation']) =='LINE/L1':
                if str(df2.at[row,'Element_Annotation']) == 'L1HS' or str(df2.at[row,'Element_Annotation']) == 'L1PA1' or str(df2.at[row,'Element_Annotation']) == 'L1PA2':
                    continue
                else:
                    if str(df2.at[row,'FILTER_RESULTS']) == 'Good_Row':
                        df2.at[row,'FILTER_RESULTS']=['OLDER_LINE_SUBFAMILY']
                    else:
                        newList = ast.literal_eval(str(df2.at[row,'FILTER_RESULTS']))
                        newList.append('OLDER_LINE_SUBFAMILY')
                        df2.at[row,'FILTER_RESULTS']=newList
                    
                    
            elif str(df2.at[row,'TE_Designation']) =='SINE/Alu':
                
                if 'AluY' in str(df2.at[row,'Element_Annotation']):
                    continue
                else:
                    if str(df2.at[row,'FILTER_RESULTS']) == 'Good_Row':
                        df2.at[row,'FILTER_RESULTS']=['OLDER_ALU_SUBFAMILY']
                    else:
                        newList = ast.literal_eval(str(df2.at[row,'FILTER_RESULTS']))
                        newList.append('OLDER_ALU_SUBFAMILY')
                        df2.at[row,'FILTER_RESULTS']=newList
                        
                
                aluList = ast.literal_eval(str(df2.at[row,'TE_Hits']))
                if len(aluList)==1:
                    continue
                else:
                    
                    aluHitList=[]
                    for hit in aluList:
                        splitHit = hit.split()
                        if 'Alu' in str(splitHit[9]):
                            aluHitList.append(splitHit[9])
                        else:
                            continue
                            
                    if len(set(aluHitList))>1:
                        if str(df2.at[row,'FILTER_RESULTS']) == 'Good_Row':
                            df2.at[row,'FILTER_RESULTS']=['MULTI-ALU_Hits']
                        else:
                            newList = ast.literal_eval(str(df2.at[row,'FILTER_RESULTS']))
                            newList.append('MULTI-ALU_Hits')
                            df2.at[row,'FILTER_RESULTS']=newList

                    else:
                        continue

    return(df2)

In [76]:
def finalQuickCheck2(df):
    
    df2 = df.copy()
    
    for row in df2.index:
            
            if str(df2.at[row,'TE_Designation']) =='LINE/L1':
                if int(df2.at[row,'Sequence_Length']) >10000:
                    if str(df2.at[row,'FILTER_RESULTS']) == 'Good_Row':
                        df2.at[row,'FILTER_RESULTS']=['LINE_>10kLen']
                    else:
                        newList = ast.literal_eval(str(df2.at[row,'FILTER_RESULTS']))
                        newList.append('LINE_>10kLen')
                        df2.at[row,'FILTER_RESULTS']=newList
                else:
                    pass
                    
                if float(df2.at[row,'Element_Divergence']) >15.0:
                    if str(df2.at[row,'FILTER_RESULTS']) == 'Good_Row':
                        df2.at[row,'FILTER_RESULTS']=['LINE_DIVERGENCE']
                    else:
                        newList = ast.literal_eval(str(df2.at[row,'FILTER_RESULTS']))
                        newList.append('LINE_DIVERGENCE')
                        df2.at[row,'FILTER_RESULTS']=newList
                else:
                    continue
                    
                    
            elif str(df2.at[row,'TE_Designation']) =='SINE/Alu':
                
                if int(df2.at[row,'Sequence_Length']) >500:
                    if str(df2.at[row,'FILTER_RESULTS']) == 'Good_Row':
                        df2.at[row,'FILTER_RESULTS']=['ALU_>500Len']
                    else:
                        newList = ast.literal_eval(str(df2.at[row,'FILTER_RESULTS']))
                        newList.append('ALU_>500Len')
                        df2.at[row,'FILTER_RESULTS']=newList
                else:
                    pass
                    
                if float(df2.at[row,'Element_Divergence']) >6.0:
                    if str(df2.at[row,'FILTER_RESULTS']) == 'Good_Row':
                        df2.at[row,'FILTER_RESULTS']=['ALU_DIVERGENCE']
                    else:
                        newList = ast.literal_eval(str(df2.at[row,'FILTER_RESULTS']))
                        newList.append('ALU_DIVERGENCE')
                        df2.at[row,'FILTER_RESULTS']=newList
                else:
                    continue
                        
                        
            elif str(df2.at[row,'TE_Designation']) =='Retroposon/SVA':
                if int(df2.at[row,'Sequence_Length']) >10000:
                    if str(df2.at[row,'FILTER_RESULTS']) == 'Good_Row':
                        df2.at[row,'FILTER_RESULTS']=['SVA_>10kLen']
                    else:
                        newList = ast.literal_eval(str(df2.at[row,'FILTER_RESULTS']))
                        newList.append('SVA_>10kLen')
                        df2.at[row,'FILTER_RESULTS']=newList
                else:
                    pass
                    
                if float(df2.at[row,'Element_Divergence']) >15.0:
                    if str(df2.at[row,'FILTER_RESULTS']) == 'Good_Row':
                        df2.at[row,'FILTER_RESULTS']=['SVA_DIVERGENCE']
                    else:
                        newList = ast.literal_eval(str(df2.at[row,'FILTER_RESULTS']))
                        newList.append('SVA_DIVERGENCE')
                        df2.at[row,'FILTER_RESULTS']=newList
                else:
                    continue
            
            else:
                continue

    return(df2)

In [77]:
def tailCounterCheck(df):
    df2 = df.copy()
    
    for row in df2.index:
        if df2.at[row,'Tail_Type']!='No_Tail_Type':
            flag=0

            if int(df2.at[row,'Tail_Begins'])/int(df2.at[row,'Sequence_Length'])>.5 or int(df2.at[row,'Tail_Begins'])>70:
                flag+=1
            else:
                pass

            if flag ==0:
                continue
            else:
                flagg='Bad_Tail_Position'
                if df2.at[row,'FILTER_RESULTS']!='Good_Row':
                    tempFlagList = ast.literal_eval(str(df2.at[row,'FILTER_RESULTS']))
                    tempFlagList.append(flagg)
                    df2.at[row,'FILTER_RESULTS'] = tempFlagList
                else:
                    df2.at[row,'FILTER_RESULTS'] = [flagg]
        else:
            continue
    
    return(df2)

In [78]:
def cleanDataTEPercentage(df):
    
    df2 = df.copy() 
    df2['Element_Annotation']='No_Element_Annotation'
    df2['Element_Divergence']=0.0
    df2['TE_Proportion']='NONE'
    df2['TE_Percentage']=0.0
    
    #For each locus
    for row in tqdm(df2.index):
        
        if df2.at[row,'TE_Hits'] == 'NONE' or int(df2.at[row,'Sequence_Length'])>50000:
            continue
        else:
            

            columnName = 'Sequence'

            tempDict={int(x):0 for x in range(1,len(df2.at[row,columnName])+1)}    
            tempDict2={}
            tempDict3={}
            tempDivDict={}

            teHitList = ast.literal_eval(str(df2.at[row,'TE_Hits']))

            for hit in teHitList:

                splitHit = hit.split()
                focusElement = str(splitHit[9])
                tempDict3[focusElement]=str(splitHit[10])

                if focusElement in tempDict2.keys():
                    tempDivDict[focusElement].append(float(splitHit[1]))
                    pass
                else:
                    tempDivDict[focusElement]=[float(splitHit[1])]
                    tempDict2[focusElement]={x:0 for x in range(1,len(df2.at[row,columnName])+1)}
                    pass

                for coordinate in range(int(splitHit[5]), int(splitHit[6])+1):
                    tempDict[coordinate]+=1
                    tempDict2[focusElement][coordinate]+=1

            tePercentage = len([x for x in tempDict.values() if x >0])/len(tempDict)

            df2.at[row,'TE_Percentage']=float(tePercentage)

            #print(tempDict3)
            #print(tempDivDict)

            if len(tempDict2)==1:
                mykey = [x for x in tempDict2.keys()][0]
                df2.at[row,'TE_Designation']=str(tempDict3[mykey])
                df2.at[row,'TE_Proportion']={mykey:tePercentage}
                df2.at[row,'Element_Annotation']=mykey
                df2.at[row,'Element_Divergence']=np.median(tempDivDict[mykey][0])

            else:

                tempDict4 = {x:len([y for y in tempDict2[x].values() if y>0])/len(tempDict) for x in tempDict2.keys()}
                #print(tempDict4)
                maxKey = max(tempDict4, key=tempDict4.get)
                df2.at[row,'TE_Designation']=tempDict3[str(maxKey)]
                df2.at[row,'TE_Proportion']=tempDict4
                df2.at[row,'Element_Annotation']=maxKey
                df2.at[row,'Element_Divergence']=np.median(tempDivDict[maxKey])
        
    return(df2)

In [79]:
def repeatmaskerPatternFilter(df):
    df2 = df.copy()
    annotationList=[]
    df2['Unique_Element_Count']='One_Element'
    for row in df2.index:
        if df2.at[row,'TE_Proportion'] == 'NONE':
            continue
        else:

            prodict = ast.literal_eval(str(df2.at[row,'TE_Proportion']))

            if str(df2.at[row,'TE_Designation']) == 'SINE/Alu':
                aluList = [x for x in prodict.keys() if 'ALU' in str(x).upper()]
                if len(aluList)==1:
                    continue
                else:
                    df2.at[row,'Unique_Element_Count']="More_Than_One_Element"
            else:
                continue 
    
    
    for row in df2.index:
        if df2.at[row,'TE_Proportion'] == 'NONE':
            continue
        else:
        
            if df2.at[row,'Unique_Element_Count'] == 'One_Element':

                tempDict = ast.literal_eval(str(df2.at[row,'TE_Proportion']))
                dictLength = len([x for x in tempDict.keys()])

                if str(df2.at[row,'TE_Designation']) == 'SINE/Alu':
                    if len(ast.literal_eval(str(df2.at[row,'TE_Hits'])))>dictLength:
                        df2.at[row,'Unique_Element_Count'] = 'One_Element_ODD'
                    else:
                        continue



                elif str(df2.at[row,'TE_Designation']) == 'LINE/L1':
                    if len(ast.literal_eval(str(df2.at[row,'TE_Hits'])))>dictLength:
                        df2.at[row,'Unique_Element_Count'] = 'One_Element_ODD'
                    else:
                        continue

                else:
                    if len(ast.literal_eval(str(df2.at[row,'TE_Hits'])))>dictLength:
                        df2.at[row,'Unique_Element_Count'] = 'One_Element_ODD'
                    else:
                        continue


            else:
                continue
    return(df2)

In [80]:
def findTwinPriming(df):
    df2 = df.copy()
    df2['Twin_Priming_Flag']='NONE'
    
    for row in df2.index:
        if df2.at[row,'TE_Designation']=='LINE/L1':
            allhits = ast.literal_eval(str(df2.at[row,'TE_Hits']))
            orientations = []
            for hit in allhits:
                splitHit=hit.split()
                if str(splitHit[10]) == 'LINE/L1':
                    orientations.append(splitHit[8])
                else:
                    continue
            if len(set(orientations))>1:
                df2.at[row,'Twin_Priming_Flag']='FLAG'
            else:
                continue
            
            
        else:
            continue
    return(df2)

In [81]:
def simpleRepeatCheck(df):
    df2 = df.copy()
    
    for row in df2.index:
        if df2.at[row,'TE_Designation']=='Simple_repeat':
            tempDict = {x:float(y) for x,y in ast.literal_eval(str(df2.at[row,'TE_Proportion'])).items() if ')n' not in x}
            if len(tempDict)>0:
                maxKey = str(max(tempDict, key=tempDict.get)).upper()
                if 'ALU' in maxKey and len(tempDict) == 1:
                    #print(row)
                    df2.at[row,'TE_Designation']= 'SINE/Alu'
                    df2.at[row,'Element_Annotation']= maxKey

                elif'L1' in maxKey and len(tempDict) == 1:
                    #print(row)
                    df2.at[row,'TE_Designation']= 'LINE/L1'
                    df2.at[row,'Element_Annotation']= maxKey

                elif 'SVA' in maxKey and len(tempDict) == 1:
                    #print(row)
                    df2.at[row,'TE_Designation']= 'Retroposon/SVA'
                    df2.at[row,'Element_Annotation']= maxKey

                else:
                    continue
            else:
                continue
        else:
            continue
    return(df2)

In [105]:
goodTEs=['SINE/Alu','LINE/L1','Retroposon/SVA','LTR/ERVK']

In [106]:
arthurhg38List=[]
fasta_sequences = SeqIO.parse(open('/home/mark/Desktop/MEI_Group/HGSVC3/Merged_Callsets/Manuscript/ArthurSequences/hg38/arthurhg38_Sequences.fasta'),'fasta')
for fasta in fasta_sequences:
    name, sequence = fasta.id, str(fasta.seq)
    arthurhg38List.append([name, sequence])

arthurhg38DF = pd.DataFrame(data=arthurhg38List, columns=['ID','Sequence']).set_index("ID")
arthurhg38DF['Sequence_Length']=[len(x) for x in arthurhg38DF['Sequence']]

rmdirectory = '/home/mark/Desktop/MEI_Group/HGSVC3/Merged_Callsets/Manuscript/ArthurSequences/hg38/arthurhg38_Sequences.fasta.out'
callDict={}
with open(rmdirectory, 'r') as file:
    lines_after_header = file.readlines()[3:]
    for line in lines_after_header:
        goodline = ' '.join(line.split())
        if str(goodline.split(" ")[4]) in callDict.keys() and float(goodline.split(" ")[1])<=20.0:
            callDict[str(goodline.split(" ")[4])]['Annotations'].append(goodline)
        elif float(goodline.split(" ")[1])<=20.0:
            callDict[str(goodline.split(" ")[4])]={'Annotations':[]}
            callDict[str(goodline.split(" ")[4])]['Annotations'].append(goodline)
        else:
            continue
file.close()
arthurhg38DF['TE_Hits']='NONE'
for row in arthurhg38DF.index:
    if row in callDict.keys():
        arthurhg38DF.at[row,'TE_Hits']=callDict[row]['Annotations']
    else:
        continue
        
insDF_Filtered= orientationFinder(simpleRepeatCheck(cleanDataTEPercentage(arthurhg38DF)))
insDF_Filtered2 = tailCounter(insDF_Filtered)
insDF_Filtered3 = aluLinker(insDF_Filtered2)
insDF_Filtered4 = tailCounterCheck(insDF_Filtered3)
insDF_Filtered5 =finalQuickCheck(insDF_Filtered4)
insDF_Filtered6 = finalQuickCheck2(insDF_Filtered5)
insDF_Filtered7 = repeatmaskerPatternFilter(insDF_Filtered6)
arthurhg38DFFinal = findTwinPriming(insDF_Filtered7)

for row in arthurhg38DFFinal.index:
    if arthurhg38DFFinal.at[row,'TE_Designation'] == 'LTR/ERVK':
        arthurhg38DFFinal.at[row,'FILTER_RESULTS'] = 'Good_Row'
    else:
        if arthurhg38DFFinal.at[row,'TE_Designation'] == 'SINE/Alu' and arthurhg38DFFinal.at[row,'Unique_Element_Count'] == 'More_Than_One_Element':
            arthurhg38DFFinal.at[row,'FILTER_RESULTS'] = 'BAD_Row'

        else:
            continue
        
finalGoodPALMERhg38 = arthurhg38DFFinal[(arthurhg38DFFinal['TE_Designation'].isin(goodTEs)) & (arthurhg38DFFinal['FILTER_RESULTS']=='Good_Row') & (arthurhg38DFFinal['Tail_Type']!='No_Tail_Type')].copy()
findBadPALMERhg38 = arthurhg38DFFinal.loc[[x for x in arthurhg38DFFinal.index if x not in finalGoodPALMERhg38.index]].copy()

100%|█████████████████████████████████████| 1267/1267 [00:00<00:00, 3027.60it/s]


In [107]:
print(len(finalGoodPALMERhg38))
print(len(findBadPALMERhg38))

816
451


In [117]:
#finalGoodPALMERhg38.to_csv('/home/mark/Desktop/MEI_Group/HGSVC3/Merged_Callsets/Manuscript/ArthurSequences/hg38_putativeGoodCalls_07-24-2024.csv')
#findBadPALMERhg38.to_csv('/home/mark/Desktop/MEI_Group/HGSVC3/Merged_Callsets/Manuscript/ArthurSequences/hg38_putativeBadCalls_07-24-2024.csv')

In [120]:
for element in set(finalGoodPALMERhg38['TE_Designation']):
    tempDF = finalGoodPALMERhg38[finalGoodPALMERhg38['TE_Designation']==element].copy()
    elementName=element.replace("/",'_')
    
    #with open('/home/mark/Desktop/MEI_Group/HGSVC3/Merged_Callsets/Manuscript/ArthurSequences/goodCallSequences/'+str(elementName)+"_sequences.fasta", 'a+') as file:
        for row in tempDF.index:
            elementName2 = str(tempDF.at[row,'Element_Annotation'])
            file.write('>'+str(row)+"_"+elementName2+'\n')
            if str(tempDF.at[row,'Orientation']) == '-':
                file.write(str(tempDF.at[row,'Sequence'])+"\n")
            else:
                sequence2 = str(Seq(tempDF.at[row,'Sequence']).reverse_complement())
                file.write(sequence2+"\n")
    file.close()

In [108]:
arthurhs1List=[]
fasta_sequences = SeqIO.parse(open('/home/mark/Desktop/MEI_Group/HGSVC3/Merged_Callsets/Manuscript/ArthurSequences/hs1/arthurhs1_Sequences.fasta'),'fasta')
for fasta in fasta_sequences:
    name, sequence = fasta.id, str(fasta.seq)
    arthurhs1List.append([name, sequence])

arthurhs1DF = pd.DataFrame(data=arthurhs1List, columns=['ID','Sequence']).set_index("ID")
arthurhs1DF['Sequence_Length']=[len(x) for x in arthurhs1DF['Sequence']]

rmdirectory = '/home/mark/Desktop/MEI_Group/HGSVC3/Merged_Callsets/Manuscript/ArthurSequences/hs1/arthurhs1_Sequences.fasta.out'
callDict={}
with open(rmdirectory, 'r') as file:
    lines_after_header = file.readlines()[3:]
    for line in lines_after_header:
        goodline = ' '.join(line.split())
        if str(goodline.split(" ")[4]) in callDict.keys() and float(goodline.split(" ")[1])<=20.0:
            callDict[str(goodline.split(" ")[4])]['Annotations'].append(goodline)
        elif float(goodline.split(" ")[1])<=20.0:
            callDict[str(goodline.split(" ")[4])]={'Annotations':[]}
            callDict[str(goodline.split(" ")[4])]['Annotations'].append(goodline)
        else:
            continue
file.close()
arthurhs1DF['TE_Hits']='NONE'
for row in arthurhs1DF.index:
    if row in callDict.keys():
        arthurhs1DF.at[row,'TE_Hits']=callDict[row]['Annotations']
    else:
        continue
        
insDF_Filtered= orientationFinder(simpleRepeatCheck(cleanDataTEPercentage(arthurhs1DF)))
insDF_Filtered2 = tailCounter(insDF_Filtered)
insDF_Filtered3 = aluLinker(insDF_Filtered2)
insDF_Filtered4 = tailCounterCheck(insDF_Filtered3)
insDF_Filtered5 =finalQuickCheck(insDF_Filtered4)
insDF_Filtered6 = finalQuickCheck2(insDF_Filtered5)
insDF_Filtered7 = repeatmaskerPatternFilter(insDF_Filtered6)
arthurhs1DFFinal = findTwinPriming(insDF_Filtered7)

for row in arthurhs1DFFinal.index:
    if arthurhs1DFFinal.at[row,'TE_Designation'] == 'LTR/ERVK':
        arthurhs1DFFinal.at[row,'FILTER_RESULTS'] = 'Good_Row'
    else:
        if arthurhs1DFFinal.at[row,'TE_Designation'] == 'SINE/Alu' and arthurhs1DFFinal.at[row,'Unique_Element_Count'] == 'More_Than_One_Element':
            arthurhs1DFFinal.at[row,'FILTER_RESULTS'] = 'BAD_Row'

        else:
            continue
        
finalGoodPALMERhs1 = arthurhs1DFFinal[(arthurhs1DFFinal['TE_Designation'].isin(goodTEs)) & (arthurhs1DFFinal['FILTER_RESULTS']=='Good_Row') & (arthurhs1DFFinal['Tail_Type']!='No_Tail_Type')].copy()
findBadPALMERhs1 = arthurhs1DFFinal.loc[[x for x in arthurhs1DFFinal.index if x not in finalGoodPALMERhs1.index]].copy()

100%|█████████████████████████████████████| 1290/1290 [00:00<00:00, 2884.87it/s]


In [109]:
print(len(finalGoodPALMERhs1))
print(len(findBadPALMERhs1))

775
515


In [110]:
collections.Counter(finalGoodPALMERhs1['TE_Designation'])

Counter({'SINE/Alu': 601, 'LINE/L1': 144, 'Retroposon/SVA': 23, 'LTR/ERVK': 7})

In [116]:
#finalGoodPALMERhs1.to_csv('/home/mark/Desktop/MEI_Group/HGSVC3/Merged_Callsets/Manuscript/ArthurSequences/hs1_putativeGoodCalls_07-24-2024.csv')
#findBadPALMERhs1.to_csv('/home/mark/Desktop/MEI_Group/HGSVC3/Merged_Callsets/Manuscript/ArthurSequences/hs1_putativeBadCalls_07-24-2024.csv')

In [121]:
for element in set(finalGoodPALMERhs1['TE_Designation']):
    tempDF = finalGoodPALMERhs1[finalGoodPALMERhs1['TE_Designation']==element].copy()
    elementName=element.replace("/",'_')
    
    #with open('/home/mark/Desktop/MEI_Group/HGSVC3/Merged_Callsets/Manuscript/ArthurSequences/goodCallSequences/'+str(elementName)+"_hs1_sequences.fasta", 'a+') as file:
        for row in tempDF.index:
            elementName2 = str(tempDF.at[row,'Element_Annotation'])
            file.write('>'+str(row)+"_"+elementName2+'\n')
            if str(tempDF.at[row,'Orientation']) == '-':
                file.write(str(tempDF.at[row,'Sequence'])+"\n")
            else:
                sequence2 = str(Seq(tempDF.at[row,'Sequence']).reverse_complement())
                file.write(sequence2+"\n")
    file.close()

## Konkel Lab

In [127]:
mergedhg38 =pd.read_csv('/home/mark/Desktop/MEI_Group/HGSVC3/Merged_Callsets/Manuscript/hg38_allCalls_07-24-2024.csv').set_index("Unnamed: 0")
konkelhg38 = mergedhg38[(mergedhg38['Caller_Count']==1) & (mergedhg38['Konkel_Lab']==1)].copy()

In [133]:
konkelhg38.loc['chr1-11770152-INS-1586']['Konkel_Lab_INFO']

'SV_Length:1587;TE_Designation:Retroposon/SVA;RM_Annotation:SVA_F;Tail_Type:Possible_T-Tail;Orientation:-'

In [135]:
for element in set(konkelhg38['TE_Designation']):
    tempDF = konkelhg38[konkelhg38['TE_Designation']==element].copy()
    elementName=element.replace("/",'_')
    
    
    #with open('/home/mark/Desktop/MEI_Group/HGSVC3/Merged_Callsets/Manuscript/KonkelSequences/goodCallSequences/'+str(elementName)+"_hg38_sequences.fasta", 'a+') as file:
        for row in tempDF.index:
            elementName = str(konkelhg38.at[row,'Konkel_Lab_INFO'].split("RM_Annotation:")[1].split(";")[0])
            orientation = str(konkelhg38.at[row,'Konkel_Lab_INFO'].split("Orientation:")[1])
            file.write('>'+str(row)+'_'+str(elementName)+'\n')
            if orientation == '-':
                file.write(str(tempDF.at[row,'ALT'])+"\n")
            else:
                sequence2 = str(Seq(tempDF.at[row,'ALT']).reverse_complement())
                file.write(sequence2+"\n")
    file.close()

In [128]:
mergedhs1 =pd.read_csv('/home/mark/Desktop/MEI_Group/HGSVC3/Merged_Callsets/Manuscript/hs1_allCalls_07-24-2024.csv').set_index("Unnamed: 0")
konkelhs1 = mergedhs1[(mergedhs1['Caller_Count']==1) & (mergedhs1['Konkel_Lab']==1)].copy()

In [136]:
for element in set(konkelhs1['TE_Designation']):
    tempDF = konkelhs1[konkelhs1['TE_Designation']==element].copy()
    elementName=element.replace("/",'_')
    
    
    #with open('/home/mark/Desktop/MEI_Group/HGSVC3/Merged_Callsets/Manuscript/KonkelSequences/goodCallSequences/'+str(elementName)+"_hs1_sequences.fasta", 'a+') as file:
        for row in tempDF.index:
            elementName = str(konkelhs1.at[row,'Konkel_Lab_INFO'].split("RM_Annotation:")[1].split(";")[0])
            orientation = str(konkelhs1.at[row,'Konkel_Lab_INFO'].split("Orientation:")[1])
            file.write('>'+str(row)+'_'+str(elementName)+'\n')
            if orientation == '-':
                file.write(str(tempDF.at[row,'ALT'])+"\n")
            else:
                sequence2 = str(Seq(tempDF.at[row,'ALT']).reverse_complement())
                file.write(sequence2+"\n")
    file.close()

## Filter out the bad calls

In [153]:
mergedhs1 =pd.read_csv('/home/mark/Desktop/MEI_Group/HGSVC3/Merged_Callsets/Manuscript/hs1_allCalls_07-24-2024.csv').set_index("Unnamed: 0")
mergedhg38 =pd.read_csv('/home/mark/Desktop/MEI_Group/HGSVC3/Merged_Callsets/Manuscript/hg38_allCalls_07-24-2024.csv').set_index("Unnamed: 0")

In [154]:
print(len(mergedhs1))
print(len(mergedhg38))

14208
13991


In [155]:
#hg38 bad calls
from Bio import SeqIO
badhg38Calls=[]

badPALMERCalls = pd.read_csv('/home/mark/Desktop/MEI_Group/HGSVC3/Merged_Callsets/Manuscript/ArthurSequences/hg38_putativeBadCalls_07-24-2024.csv').set_index("ID")
for row in badPALMERCalls.index:
    badhg38Calls.append(row)

input_file='/home/mark/Desktop/MEI_Group/HGSVC3/Merged_Callsets/Manuscript/ArthurSequences/PALMER_UniqueSequences_Checked/LINE_L1_hg38_Duplications_sequences.fasta'
fasta_sequences = SeqIO.parse(open(input_file),'fasta')
for fasta in fasta_sequences:
    name, sequence = fasta.id, str(fasta.seq)
    badhg38Calls.append('_'.join(name.split("_")[:2]))
    
input_file='/home/mark/Desktop/MEI_Group/HGSVC3/Merged_Callsets/Manuscript/ArthurSequences/PALMER_UniqueSequences_Checked/Retroposon_SVA_hg38_Duplications_sequences.fasta'
fasta_sequences = SeqIO.parse(open(input_file),'fasta')
for fasta in fasta_sequences:
    name, sequence = fasta.id, str(fasta.seq)
    badhg38Calls.append('_'.join(name.split("_")[:2]))
    
input_file='/home/mark/Desktop/MEI_Group/HGSVC3/Merged_Callsets/Manuscript/ArthurSequences/PALMER_UniqueSequences_Checked/SINE_Alu_hg38_Duplications_sequences.fasta'
fasta_sequences = SeqIO.parse(open(input_file),'fasta')
for fasta in fasta_sequences:
    name, sequence = fasta.id, str(fasta.seq)
    badhg38Calls.append('_'.join(name.split("_")[:2]))
    

input_file='/home/mark/Desktop/MEI_Group/HGSVC3/Merged_Callsets/Manuscript/KonkelSequences/Konkel_Unique_Sequences_Checked/LINE_L1_hg38_Duplications_sequences.fasta'
fasta_sequences = SeqIO.parse(open(input_file),'fasta')
for fasta in fasta_sequences:
    name, sequence = fasta.id, str(fasta.seq)
    badhg38Calls.append(name.split("_")[0])
    
    
for row in mergedhg38.index:
    if int(mergedhg38.at[row,'PALMER'])==1:
        if 'HERV' in str(mergedhg38.at[row,'TE_Designation']):
            continue
        else:
            if 'N' in str(mergedhg38.at[row,'PALMER_INFO'].split(";")[6]).upper():
                #print(mergedhs1.at[row,'PALMER_INFO'].split(";")[6])
                badhg38Calls.append(row)
            else:
                continue
        
    else:
        continue

In [156]:
print(len(badhg38Calls))

984


In [157]:
#hs1 bad calls
badhs1Calls=[]

badPALMERCalls = pd.read_csv('/home/mark/Desktop/MEI_Group/HGSVC3/Merged_Callsets/Manuscript/ArthurSequences/hs1_putativeBadCalls_07-24-2024.csv').set_index("ID")
for row in badPALMERCalls.index:
    badhs1Calls.append(row)

input_file='/home/mark/Desktop/MEI_Group/HGSVC3/Merged_Callsets/Manuscript/ArthurSequences/PALMER_UniqueSequences_Checked/LINE_L1_hs1_Duplications_sequences.fasta'
fasta_sequences = SeqIO.parse(open(input_file),'fasta')
for fasta in fasta_sequences:
    name, sequence = fasta.id, str(fasta.seq)
    badhs1Calls.append('_'.join(name.split("_")[:2]))
    
input_file='/home/mark/Desktop/MEI_Group/HGSVC3/Merged_Callsets/Manuscript/ArthurSequences/PALMER_UniqueSequences_Checked/SINE_Alu_hs1_Duplications_sequences.fasta'
fasta_sequences = SeqIO.parse(open(input_file),'fasta')
for fasta in fasta_sequences:
    name, sequence = fasta.id, str(fasta.seq)
    badhs1Calls.append('_'.join(name.split("_")[:2]))
    
    
input_file='/home/mark/Desktop/MEI_Group/HGSVC3/Merged_Callsets/Manuscript/KonkelSequences/Konkel_Unique_Sequences_Checked/LINE_L1_hs1_Duplications_Bad_sequences.fasta'
fasta_sequences = SeqIO.parse(open(input_file),'fasta')
for fasta in fasta_sequences:
    name, sequence = fasta.id, str(fasta.seq)
    badhs1Calls.append(name.split("_")[0])
    
for row in mergedhs1.index:
    if int(mergedhs1.at[row,'PALMER'])==1:
        if 'HERV' in str(mergedhs1.at[row,'TE_Designation']):
            continue
        else:
            if 'N' in str(mergedhs1.at[row,'PALMER_INFO'].split(";")[6]).upper():
                #print(mergedhs1.at[row,'PALMER_INFO'].split(";")[6])
                badhs1Calls.append(row)
            else:
                continue
        
    else:
        continue

In [158]:
len(badhs1Calls)

998

In [159]:
def filterCalls(df, mylist):
    df2 = df.copy()
    goodRows=[]
    for row in df2.index:
        if row in mylist:
            continue
        else:
            goodRows.append(row)
    df3 = df2.loc[goodRows].copy()
    return(df3)

In [160]:
filteredhs1=filterCalls(mergedhs1,badhs1Calls)
filteredhg38=filterCalls(mergedhg38,badhg38Calls)

In [161]:
print(len(filteredhs1))
print(len(filteredhg38))

13300
13083


In [162]:
import collections
def findCallerCounts(df):
    df2 = df.copy()
    callerCounts=[]
    both=0
    allcounts=0
    pavrows=[]
    palmerrows=[]
    for row in df2.index:
        allcounts+=1
        if df2.at[row,'Caller_Count']==2:
            callerCounts.append('BOTH')
            both+=1
        else:
            if int(df2.at[row,'Konkel_Lab'])==0:
                callerCounts.append("PALMER")
                palmerrows.append(row)
            else:
                callerCounts.append("PAV")
                pavrows.append(row)
    print('Shared Call Percentage: '+str(both/allcounts))
    print('PAV: '+str(collections.Counter(df2.loc[pavrows]['TE_Designation'])))
    print('PALMER:' +str(collections.Counter(df2.loc[palmerrows]['TE_Designation'])))
    print('\n')
    return(collections.Counter(callerCounts))

In [163]:
#hs1

In [164]:
findCallerCounts(mergedhs1)

Shared Call Percentage: 0.8415681306306306
PAV: Counter({'SINE/Alu': 496, 'LINE/L1': 253, 'Retroposon/SVA': 211, 'snRNA': 1})
PALMER:Counter({'SINE/Alu': 1023, 'LINE/L1': 235, 'Retroposon/SVA': 25, 'HERVK': 7})




Counter({'BOTH': 11957, 'PALMER': 1290, 'PAV': 961})

In [165]:
findCallerCounts(filteredhs1)

Shared Call Percentage: 0.8981954887218045
PAV: Counter({'SINE/Alu': 496, 'LINE/L1': 248, 'Retroposon/SVA': 211, 'snRNA': 1})
PALMER:Counter({'SINE/Alu': 308, 'LINE/L1': 61, 'Retroposon/SVA': 22, 'HERVK': 7})




Counter({'BOTH': 11946, 'PAV': 956, 'PALMER': 398})

In [166]:
#hg38

In [167]:
findCallerCounts(mergedhg38)

Shared Call Percentage: 0.8526910156529197
PAV: Counter({'SINE/Alu': 363, 'LINE/L1': 233, 'Retroposon/SVA': 197, 'snRNA': 1})
PALMER:Counter({'SINE/Alu': 962, 'LINE/L1': 279, 'Retroposon/SVA': 19, 'HERVK': 7})




Counter({'BOTH': 11930, 'PALMER': 1267, 'PAV': 794})

In [168]:
findCallerCounts(filteredhg38)

Shared Call Percentage: 0.9105709699610182
PAV: Counter({'SINE/Alu': 363, 'LINE/L1': 231, 'Retroposon/SVA': 197, 'snRNA': 1})
PALMER:Counter({'SINE/Alu': 283, 'LINE/L1': 73, 'Retroposon/SVA': 15, 'HERVK': 7})




Counter({'BOTH': 11913, 'PAV': 792, 'PALMER': 378})

In [None]:
print(len(filteredhs1))
print(len(filteredhg38))

In [173]:
#filteredhg38.to_csv('/home/mark/Desktop/MEI_Group/HGSVC3/Merged_Callsets/Manuscript/hg38_allCalls_Cleaned-07-29-2024.csv')

In [174]:
#filteredhs1.to_csv('/home/mark/Desktop/MEI_Group/HGSVC3/Merged_Callsets/Manuscript/hs1_allCalls_Cleaned-07-29-2024.csv')