In [None]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from functools import partial
from os import listdir, mkdir

# Notebook-wide matplotlib defaults: large sans-serif text, normal weights,
# and ASCII minus signs on the axes.
rcParams = {
    'font.size': 30,
    'font.weight': 'normal',
    'font.family': 'sans-serif',
    'axes.unicode_minus': False,
    'axes.labelweight': 'normal',
}
plt.rcParams.update(rcParams)

# NOTE(review): both calls below silence *every* warning category for the whole
# kernel session, including pandas deprecation / chained-assignment warnings.
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

def Load_PAF(filepath, sample):
    """Read a headerless, tab-separated PAF file and keep full-length alignments.

    Parameters
    ----------
    filepath : str
        Path of the PAF file to read.
    sample : str
        Sample file name; the "_FD.paf" suffix is removed to form the
        ``Sample`` column.

    Returns
    -------
    pandas.DataFrame
        One row per alignment whose ``QLen`` equals ``AlignLength``, with the
        18 input columns plus the derived columns ``PIdent``, ``Sample``,
        ``S_Align``, ``Read_Name``, ``Read_Tag``, ``MisMatches`` and
        ``Read_ID``. The returned frame is an independent copy.
    """
    header = ['Query','QLen','QStart','QEnd','Orientation','Subject','SLen','SStart','SEnd',
              'Matches','AlignLength','MAPQ','TP', 'MM', 'GN', 'GO', 'CG', 'CS']
    int_cols = ['QLen','QStart','QEnd','SLen','SStart',
                'SEnd','Matches','AlignLength','MAPQ']

    df = pd.read_csv(filepath, sep = "\t", names = header)
    df[int_cols] = df[int_cols].astype('int')
    df['PIdent'] = df['Matches']/df['AlignLength']*100.0
    df['Sample'] = sample.replace("_FD.paf","")
    df['S_Align'] = df['SEnd'] - df['SStart']

    # Keep only alignments whose block length equals the query length.
    # The explicit .copy() makes the filtered frame own its data, so the column
    # assignments below are not chained assignments on a slice of the original.
    df = df[df['QLen'] == df['AlignLength']].copy()

    # The query name is split into everything but its last two characters and
    # its final character (presumably a "<read>/<1|2>" mate suffix - confirm).
    df['Read_Name'] = df['Query'].str[0:-2]
    df['Read_Tag'] = df['Query'].str[-1]
    df['MisMatches'] = df['AlignLength']-df['Matches']
    df['Read_ID'] = df['Sample']+"_"+df['Query']

    return df

def Coverage_Stats(group, slen, pe = False):
    """Summarise depth and breadth of coverage of a subject by a set of alignments.

    Parameters
    ----------
    group : pandas.DataFrame
        Alignments to pile up. With ``pe=False`` the columns ``SStart`` and
        ``SEnd`` are read; with ``pe=True`` the tuple-keyed columns
        ``('SStart','1')``, ``('SStart','2')``, ``('SEnd','1')`` and
        ``('SEnd','2')`` are read and both mates contribute.
    slen : int
        Length of the coverage vector (number of subject positions).
    pe : bool, default False
        Whether ``group`` holds paired-end rows.

    Returns
    -------
    pandas.Series
        ``Avg_Depth_Coverage`` (mean depth over all ``slen`` positions),
        ``Breadth_Coverage`` (percentage of positions with depth > 0) and
        ``Num_Reads`` (number of rows in ``group``).

    Notes
    -----
    ``Avg_Depth_Coverage`` is the mean per-position depth. The previous
    implementation returned the summed depth under this label; callers that
    divided by ``slen`` themselves must drop that division.
    """
    coverage = np.zeros(slen)

    if (pe):
        Sstarts = group[('SStart','1')].tolist() + group[('SStart','2')].tolist()
        Sends = group[('SEnd','1')].tolist() + group[('SEnd','2')].tolist()
    else:
        Sstarts = group['SStart'].tolist()
        Sends = group['SEnd'].tolist()

    # Each alignment adds one unit of depth over its half-open span [start, end).
    for start, end in zip(Sstarts, Sends):
        coverage[start:end] += 1

    length = len(coverage)
    if length == 0:
        # A zero-length subject has no positions to cover; avoid dividing by zero.
        avg_depth, breadth = 0.0, 0.0
    else:
        avg_depth = coverage.sum()/length
        breadth = np.count_nonzero(coverage > 0)/length*100.0
    num_reads = len(group)

    return pd.Series({'Avg_Depth_Coverage':avg_depth,
                      'Breadth_Coverage':breadth,
                      'Num_Reads':num_reads})

def Return_Best_Alignment(grp):
    """Keep the row(s) of ``grp`` with the fewest mismatches.

    Parameters
    ----------
    grp : pandas.DataFrame
        Alignments to choose from; must contain a ``MisMatches`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame holding every row whose ``MisMatches`` equals the group
        minimum, with an added boolean column ``Only`` that is True when the
        minimum is reached by a single row and False when it is tied.
        ``grp`` itself is not modified. An empty ``grp`` yields an empty frame.
    """
    min_mismatch = grp['MisMatches'].min()
    # .copy() so that adding 'Only' is not an assignment on a slice of grp.
    out = grp[grp['MisMatches'] == min_mismatch].copy()
    out['Only'] = len(out) <= 1
    return out

def Process_CS_String(CS, v):
    """Turn a ``cs:Z:`` difference string into a 0/1 vector of length ``int(v)``.

    The vector starts as all ones. Every numeric run in the string zeroes that
    many consecutive positions and moves the cursor past them; every other
    ':'-delimited token moves the cursor by one and leaves a 1 behind.
    Text following '+' or '-' is attached to the preceding token and ignored,
    so it never moves the cursor.

    Parameters
    ----------
    CS : str
        Difference string, optionally prefixed with "cs:Z:" (presumably the
        minimap2 short-form cs tag - confirm against the aligner options).
    v : int
        Length of the vector to build.

    Returns
    -------
    numpy.ndarray
        Float array of length ``int(v)`` containing zeros and ones.
    """
    mask = np.ones(int(v))
    cursor = 0

    # '*' becomes ':' so each substitution is its own token; '+' and '-' both
    # become '&' so indel text hangs off the token in front of it.
    normalized = CS.replace("cs:Z:","")
    for old, new in (("*",":"), ("+","&"), ("-","&")):
        normalized = normalized.replace(old, new)

    # The piece before the first ':' is skipped.
    for token in normalized.split(":")[1:]:
        head = token.split("&")[0]
        if not head.isnumeric():
            cursor += 1
            continue
        run = int(head)
        mask[cursor:cursor+run] = 0
        cursor += run

    assert len(mask) == v, "Length Mismatch"

    return mask

def Count_MisMatches_Paired_End(row, slen=2000):
    """Count subject positions flagged by either mate of a read pair.

    For each mate (tags '1' and '2') the vector returned by
    ``Process_CS_String`` is added onto a length-``slen`` array over the slice
    ``[SStart, SEnd)``; the vector is reversed first when the mate's
    orientation is '-'. Mates whose orientation is neither '+' nor '-'
    contribute nothing.

    Parameters
    ----------
    row : pandas.Series
        Row keyed by ``(field, tag)`` tuples for the fields ``SStart``,
        ``SEnd``, ``Orientation`` and ``CS``.
    slen : int, default 2000
        Length of the accumulation array.

    Returns
    -------
    int
        Number of positions whose accumulated value is greater than zero.
    """
    coverage = np.zeros(slen)
    tags = ('1', '2')

    # Collect coordinates/strand for both mates, then their cs strings.
    spans = [(row[('SStart', t)], row[('SEnd', t)], row[('Orientation', t)]) for t in tags]
    cs_strings = [row[('CS', t)] for t in tags]

    # Both vectors are built before either one is added to the array.
    masks = [Process_CS_String(cs, stop - begin)
             for cs, (begin, stop, _strand) in zip(cs_strings, spans)]

    for (begin, stop, strand), mask in zip(spans, masks):
        if strand == '+':
            coverage[begin:stop] += mask
        elif strand == '-':
            coverage[begin:stop] += mask[::-1]

    return int(np.count_nonzero(coverage > 0))
    
def Merge_Paired_Ends(df_all, slen=2000):
    """Pair up mate alignments and compute per-pair overlap and mismatch counts.

    Parameters
    ----------
    df_all : pandas.DataFrame
        Alignments with the columns ``Subject``, ``Read_Name``, ``Read_Tag``,
        ``SStart``, ``SEnd``, ``MisMatches``, ``CS`` and ``Orientation``.
    slen : int, default 2000
        Length of the accumulation array handed to
        ``Count_MisMatches_Paired_End``; every ``SEnd`` must fit inside it.

    Returns
    -------
    pandas.DataFrame
        One row per (``Subject``, ``Read_Name``) that has values for both read
        tags, with the pivoted per-tag columns plus ``Overlap``,
        ``Overlap_Flag`` and ``MisMatches_Total``. The index is reset so
        ``Subject`` and ``Read_Name`` are ordinary columns.
    """
    df = df_all.pivot_table(index = ['Subject','Read_Name'], columns = ['Read_Tag'], aggfunc = 'first',
                            values = ['SStart','SEnd','MisMatches','CS','Orientation'])
    # Rows missing either mate carry NaN after the pivot and are dropped here.
    df = df.dropna()

    coord_cols = [('SStart','1'),('SStart','2'),
                  ('SEnd','1'),('SEnd','2')]
    df[coord_cols] = df[coord_cols].astype(int)

    # Later start minus earlier end: negative exactly when the two spans intersect.
    df['Overlap'] = (df[[('SStart','1'),('SStart','2')]].max(axis = 1) -
                     df[[('SEnd','1'),('SEnd','2')]].min(axis = 1))
    df['Overlap_Flag'] = df['Overlap'] < 0

    if df.empty:
        # apply(axis=1) on an empty frame returns a DataFrame, which cannot be
        # assigned to a single column; create the empty column directly.
        df['MisMatches_Total'] = 0
    else:
        df['MisMatches_Total'] = df.apply(Count_MisMatches_Paired_End, args=(slen,), axis = 1)

    df = df.reset_index()
    return df



In [None]:
# Root directory of the 16S read alignments.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
filedir = '/Users/harihara/Mount-2/hotspring_metagenome/single_cell_analysis_with_Gabe_Birzu/16S_Read_Alignments/'

# Sample file names are taken from the Alpha directory. listdir() returns
# entries in arbitrary order and includes non-PAF files (e.g. .DS_Store), so
# the list is restricted to *.paf and sorted to make reruns reproducible.
samples = sorted(f for f in listdir(filedir+'Alpha_Alleles/paf_files/') if f.endswith('.paf'))

In [None]:
# Per-sample frames are collected in lists and concatenated once after the
# loop: DataFrame.append was removed in pandas 2.0, and growing a frame inside
# a loop re-copies all previous rows on every iteration.
all_abundance_frames = []
paired_end_frames = []

for s in samples:
    # The same file name is expected under each of the three allele directories.
    df_alpha = Load_PAF(filedir+'Alpha_Alleles/paf_files/'+s, s)
    df_beta = Load_PAF(filedir+'Beta_Alleles/paf_files/'+s, s)
    df_gamma = Load_PAF(filedir+'Gamma_Alleles/paf_files/'+s, s)

    Temp = pd.concat([df_alpha, df_beta, df_gamma])
    all_abundance_frames.append(Temp)

    df_paired_end = Merge_Paired_Ends(Temp)
    df_paired_end['Sample'] = s.replace("_FD.paf","")

    # NOTE(review): Merge_Paired_Ends already resets the index, so this second
    # reset_index() only adds an 'index' column; kept so the output columns
    # stay the same for downstream cells.
    paired_end_frames.append(df_paired_end.reset_index())

    # Progress: file name, alignments kept, reads belonging to complete pairs.
    print(s, len(Temp), len(df_paired_end)*2)

# pd.concat raises on an empty list, so fall back to an empty frame.
df_all_abundances = (pd.concat(all_abundance_frames, ignore_index = True)
                     if all_abundance_frames else pd.DataFrame())
df_paired_end_read_abundances = (pd.concat(paired_end_frames)
                                 if paired_end_frames else pd.DataFrame())