# Quality control and big trends in processed data
For new sequencing files (45-2-x)

In [1]:
import pandas as pd
import regex as re
import os
from functools import reduce
import numpy as np

pd.set_option('display.max_colwidth', None)

In [77]:
# Set up
workdir = '/Users/jw38/Onedrive/prime_editing_efficiencies'
experiment = 'G1_020_'

prc_dir_string = 'prc_data/20210712_MergedRL_SNP'
prc_dir = os.path.join(workdir,prc_dir_string)
lib_dir = os.path.join(workdir,'prc_data')
metadata_path = os.path.join(workdir,'files','G1_000_screenID_conversion.txt')

## Get files and metadata

### count tables
First, we want to import all the files that match our filename pattern: \
    "Goose_45-number-number-(pegRNA or target)_counts.txt"

In [97]:
# Get file list
file_list = [f for f in os.listdir(prc_dir) if re.match(r'G1_010_45-[0-9]{1,2}-[0-9]{1,2}-(pegRNA|target)_counts.txt', f)]
print("Number of files: {}".format(len(file_list)))

# Extract information from filename
file_df = pd.DataFrame(file_list, columns = ['filename'])
file_df[['experiment','step','screen','extension']] = file_df['filename'].str.split("_" , expand=True)
file_df[['ID','type']] = file_df['screen'].str.rsplit('-', 1, expand = True)
file_df['replicate'] = file_df['ID'].str.split('-', expand = True)[1]
file_df = file_df.drop(columns=['extension','screen','experiment','step'])
# extend path for filename
file_df['path'] = prc_dir_string + "/" + file_df['filename'].astype(str)

file_df


Number of files: 42


Unnamed: 0,filename,ID,type,replicate,path
0,G1_010_45-2-7-target_counts.txt,45-2-7,target,2,prc_data/20210712_MergedRL_SNP/G1_010_45-2-7-target_counts.txt
1,G1_010_45-2-9-pegRNA_counts.txt,45-2-9,pegRNA,2,prc_data/20210712_MergedRL_SNP/G1_010_45-2-9-pegRNA_counts.txt
2,G1_010_45-2-10-target_counts.txt,45-2-10,target,2,prc_data/20210712_MergedRL_SNP/G1_010_45-2-10-target_counts.txt
3,G1_010_45-1-2-target_counts.txt,45-1-2,target,1,prc_data/20210712_MergedRL_SNP/G1_010_45-1-2-target_counts.txt
4,G1_010_45-1-9-pegRNA_counts.txt,45-1-9,pegRNA,1,prc_data/20210712_MergedRL_SNP/G1_010_45-1-9-pegRNA_counts.txt
5,G1_010_45-1-12-pegRNA_counts.txt,45-1-12,pegRNA,1,prc_data/20210712_MergedRL_SNP/G1_010_45-1-12-pegRNA_counts.txt
6,G1_010_45-1-7-target_counts.txt,45-1-7,target,1,prc_data/20210712_MergedRL_SNP/G1_010_45-1-7-target_counts.txt
7,G1_010_45-2-8-pegRNA_counts.txt,45-2-8,pegRNA,2,prc_data/20210712_MergedRL_SNP/G1_010_45-2-8-pegRNA_counts.txt
8,G1_010_45-2-11-target_counts.txt,45-2-11,target,2,prc_data/20210712_MergedRL_SNP/G1_010_45-2-11-target_counts.txt
9,G1_010_45-1-3-target_counts.txt,45-1-3,target,1,prc_data/20210712_MergedRL_SNP/G1_010_45-1-3-target_counts.txt


The metadata for this data can be infered from the library number which is stored in another file. So, we will extract the library number from the filename and then create a table with the metadata.

In [98]:
metadata = pd.read_csv(metadata_path, delimiter = '\t')
metadata

Unnamed: 0,ID,cell_line,target,prime_editor,read
0,45-1-1,293T,FANCF,PE2,R1
1,45-1-2,293T,FANCF,PE2,R2
2,45-1-3,293T,FANCF,PE2,R3
3,45-1-4,293T,HEK3,PE2,R1
4,45-1-5,293T,HEK3,PE2,R2
5,45-1-6,293T,HEK3,PE2,R3
6,45-1-7,HAP1,FANCF,PE2,R1
7,45-1-8,HAP1,FANCF,PE2,R2
8,45-1-9,HAP1,FANCF,PE2,R3
9,45-1-10,HAP1,HEK3,PE2,R1


In [99]:
meta_df = pd.merge(file_df, metadata, on='ID', how='left')
meta_df['name'] = meta_df['target'] + "_" + meta_df['cell_line'] + "_" + meta_df['prime_editor'] + "_" + meta_df['type']+ "_" + meta_df['replicate'] + "_counts_" + meta_df['read']
meta_df['screenname'] = meta_df['target'] + "_" + meta_df['cell_line'] + "_" + meta_df['prime_editor'] + "_" + meta_df['type'] + "_" + meta_df['replicate']

# save to file
meta_df.to_csv(os.path.join(workdir, 'files','G1_020_prc-data_countfiles.txt'), sep = '\t')

meta_df

Unnamed: 0,filename,ID,type,replicate,path,cell_line,target,prime_editor,read,name,screenname
0,G1_010_45-2-7-target_counts.txt,45-2-7,target,2,prc_data/20210712_MergedRL_SNP/G1_010_45-2-7-target_counts.txt,HAP1,FANCF,PE2,R1,FANCF_HAP1_PE2_target_2_counts_R1,FANCF_HAP1_PE2_target_2
1,G1_010_45-2-9-pegRNA_counts.txt,45-2-9,pegRNA,2,prc_data/20210712_MergedRL_SNP/G1_010_45-2-9-pegRNA_counts.txt,HAP1,FANCF,PE2,R3,FANCF_HAP1_PE2_pegRNA_2_counts_R3,FANCF_HAP1_PE2_pegRNA_2
2,G1_010_45-2-10-target_counts.txt,45-2-10,target,2,prc_data/20210712_MergedRL_SNP/G1_010_45-2-10-target_counts.txt,HAP1,HEK3,PE2,R1,HEK3_HAP1_PE2_target_2_counts_R1,HEK3_HAP1_PE2_target_2
3,G1_010_45-1-2-target_counts.txt,45-1-2,target,1,prc_data/20210712_MergedRL_SNP/G1_010_45-1-2-target_counts.txt,293T,FANCF,PE2,R2,FANCF_293T_PE2_target_1_counts_R2,FANCF_293T_PE2_target_1
4,G1_010_45-1-9-pegRNA_counts.txt,45-1-9,pegRNA,1,prc_data/20210712_MergedRL_SNP/G1_010_45-1-9-pegRNA_counts.txt,HAP1,FANCF,PE2,R3,FANCF_HAP1_PE2_pegRNA_1_counts_R3,FANCF_HAP1_PE2_pegRNA_1
5,G1_010_45-1-12-pegRNA_counts.txt,45-1-12,pegRNA,1,prc_data/20210712_MergedRL_SNP/G1_010_45-1-12-pegRNA_counts.txt,HAP1,HEK3,PE2,R3,HEK3_HAP1_PE2_pegRNA_1_counts_R3,HEK3_HAP1_PE2_pegRNA_1
6,G1_010_45-1-7-target_counts.txt,45-1-7,target,1,prc_data/20210712_MergedRL_SNP/G1_010_45-1-7-target_counts.txt,HAP1,FANCF,PE2,R1,FANCF_HAP1_PE2_target_1_counts_R1,FANCF_HAP1_PE2_target_1
7,G1_010_45-2-8-pegRNA_counts.txt,45-2-8,pegRNA,2,prc_data/20210712_MergedRL_SNP/G1_010_45-2-8-pegRNA_counts.txt,HAP1,FANCF,PE2,R2,FANCF_HAP1_PE2_pegRNA_2_counts_R2,FANCF_HAP1_PE2_pegRNA_2
8,G1_010_45-2-11-target_counts.txt,45-2-11,target,2,prc_data/20210712_MergedRL_SNP/G1_010_45-2-11-target_counts.txt,HAP1,HEK3,PE2,R2,HEK3_HAP1_PE2_target_2_counts_R2,HEK3_HAP1_PE2_target_2
9,G1_010_45-1-3-target_counts.txt,45-1-3,target,1,prc_data/20210712_MergedRL_SNP/G1_010_45-1-3-target_counts.txt,293T,FANCF,PE2,R3,FANCF_293T_PE2_target_1_counts_R3,FANCF_293T_PE2_target_1


### Library count tables
Proceed as above, but with adaoted pattern

In [100]:
# Get file list
libfile_list = [f for f in os.listdir(lib_dir) if re.match(r'Library_.+_counts.txt', f)]
print("Number of files: {}".format(len(libfile_list)))

# # Extract information from filename
libfile_df = pd.DataFrame(libfile_list, columns = ['filename'])
libfile_df[['prefix','library_full','filextension']] = libfile_df['filename'].str.split("_" , expand=True)
libfile_df[['library','read']] = libfile_df['library_full'].str.rsplit('-', 1, expand = True)
libfile_df['read'] = libfile_df['read'].replace({'a': 'R1', 'b': 'R2'})
libfile_df = libfile_df.drop(columns=['filextension', 'prefix','library_full'])

# name library
libfile_df['name'] = libfile_df['library'] + "_library_" + libfile_df['read']

# # extend path for filename
libfile_df['path'] = 'prc_data/' + libfile_df['filename'].astype(str)

libfile_df.head()

Number of files: 4


Unnamed: 0,filename,library,read,name,path
0,Library_FANCF-b_counts.txt,FANCF,R2,FANCF_library_R2,prc_data/Library_FANCF-b_counts.txt
1,Library_HEK3-a_counts.txt,HEK3,R1,HEK3_library_R1,prc_data/Library_HEK3-a_counts.txt
2,Library_HEK3-b_counts.txt,HEK3,R2,HEK3_library_R2,prc_data/Library_HEK3-b_counts.txt
3,Library_FANCF-a_counts.txt,FANCF,R1,FANCF_library_R1,prc_data/Library_FANCF-a_counts.txt


## Read count tables and libraries into dictionary

In [101]:
# Create a dictionary with the count tables
ctable_dict = {}

for i in range(len(meta_df.index)):
    name = meta_df.iloc[i]['name']
    path = os.path.join(workdir, meta_df.iloc[i]['path'])
    # read in
    ctable = pd.read_csv(path,delimiter='\t')
    # add to dict
    ctable_dict[name] = ctable

ctable_dict.keys()

dict_keys(['FANCF_HAP1_PE2_target_2_counts_R1', 'FANCF_HAP1_PE2_pegRNA_2_counts_R3', 'HEK3_HAP1_PE2_target_2_counts_R1', 'FANCF_293T_PE2_target_1_counts_R2', 'FANCF_HAP1_PE2_pegRNA_1_counts_R3', 'HEK3_HAP1_PE2_pegRNA_1_counts_R3', 'FANCF_HAP1_PE2_target_1_counts_R1', 'FANCF_HAP1_PE2_pegRNA_2_counts_R2', 'HEK3_HAP1_PE2_target_2_counts_R2', 'FANCF_293T_PE2_target_1_counts_R3', 'FANCF_HAP1_PE2_pegRNA_1_counts_R2', 'HEK3_293T_PE2_target_1_counts_R3', 'HEK3_293T_PE-FeLV_pegRNA_1_counts_R1', 'HEK3_HAP1_PE2_target_2_counts_R3', 'HEK3_293T_PE-FeLV_pegRNA_1_counts_R3', 'HEK3_293T_PE2_target_1_counts_R2', 'HEK3_HAP1_PE2_pegRNA_1_counts_R1', 'FANCF_293T_PE2_target_1_counts_R1', 'HEK3_293T_PE-FeLV_pegRNA_1_counts_R2', 'HEK3_HAP1_PE2_pegRNA_1_counts_R2', 'HEK3_293T_PE2_target_1_counts_R1', 'FANCF_293T_PE2_pegRNA_1_counts_R3', 'HEK3_HAP1_PE2_pegRNA_2_counts_R2', 'FANCF_HAP1_PE2_target_2_counts_R2', 'HEK3_293T_PE-FeLV_target_1_counts_R1', 'HEK3_293T_PE2_pegRNA_1_counts_R3', 'FANCF_HAP1_PE2_target_1_c

In [102]:
# Create a dictionary with the library count tables
libctable_dict = {}

for i in range(len(libfile_df.index)):
    name = libfile_df.iloc[i]['name']
    path = os.path.join(workdir, libfile_df.iloc[i]['path'])
    lib = libfile_df.iloc[i]['library']
    # read in
    libctable = pd.read_csv(path,delimiter='\t')
    # add to dict
    libctable_dict[name] = ctable

libctable_dict.keys()

dict_keys(['FANCF_library_R2', 'HEK3_library_R1', 'HEK3_library_R2', 'FANCF_library_R1'])

# Preprocessing: Merge replicates and generate ratios from pegRNA and target read counts

1. outer join of dataframes based on 'screen' name by "sequence_original"
2. new column with count_pegRNA = (count_norm.x + count_norm.y + count_norm) / 3
3. Select name, sequence_original, count_pegRNA, count_peg_R1 = count_norm.x, count_peg_R2 = count_norm.y, count_peg_R3 = count_norm, VF_value = VF_value.x

In [103]:
meta_df

Unnamed: 0,filename,ID,type,replicate,path,cell_line,target,prime_editor,read,name,screenname
0,G1_010_45-2-7-target_counts.txt,45-2-7,target,2,prc_data/20210712_MergedRL_SNP/G1_010_45-2-7-target_counts.txt,HAP1,FANCF,PE2,R1,FANCF_HAP1_PE2_target_2_counts_R1,FANCF_HAP1_PE2_target_2
1,G1_010_45-2-9-pegRNA_counts.txt,45-2-9,pegRNA,2,prc_data/20210712_MergedRL_SNP/G1_010_45-2-9-pegRNA_counts.txt,HAP1,FANCF,PE2,R3,FANCF_HAP1_PE2_pegRNA_2_counts_R3,FANCF_HAP1_PE2_pegRNA_2
2,G1_010_45-2-10-target_counts.txt,45-2-10,target,2,prc_data/20210712_MergedRL_SNP/G1_010_45-2-10-target_counts.txt,HAP1,HEK3,PE2,R1,HEK3_HAP1_PE2_target_2_counts_R1,HEK3_HAP1_PE2_target_2
3,G1_010_45-1-2-target_counts.txt,45-1-2,target,1,prc_data/20210712_MergedRL_SNP/G1_010_45-1-2-target_counts.txt,293T,FANCF,PE2,R2,FANCF_293T_PE2_target_1_counts_R2,FANCF_293T_PE2_target_1
4,G1_010_45-1-9-pegRNA_counts.txt,45-1-9,pegRNA,1,prc_data/20210712_MergedRL_SNP/G1_010_45-1-9-pegRNA_counts.txt,HAP1,FANCF,PE2,R3,FANCF_HAP1_PE2_pegRNA_1_counts_R3,FANCF_HAP1_PE2_pegRNA_1
5,G1_010_45-1-12-pegRNA_counts.txt,45-1-12,pegRNA,1,prc_data/20210712_MergedRL_SNP/G1_010_45-1-12-pegRNA_counts.txt,HAP1,HEK3,PE2,R3,HEK3_HAP1_PE2_pegRNA_1_counts_R3,HEK3_HAP1_PE2_pegRNA_1
6,G1_010_45-1-7-target_counts.txt,45-1-7,target,1,prc_data/20210712_MergedRL_SNP/G1_010_45-1-7-target_counts.txt,HAP1,FANCF,PE2,R1,FANCF_HAP1_PE2_target_1_counts_R1,FANCF_HAP1_PE2_target_1
7,G1_010_45-2-8-pegRNA_counts.txt,45-2-8,pegRNA,2,prc_data/20210712_MergedRL_SNP/G1_010_45-2-8-pegRNA_counts.txt,HAP1,FANCF,PE2,R2,FANCF_HAP1_PE2_pegRNA_2_counts_R2,FANCF_HAP1_PE2_pegRNA_2
8,G1_010_45-2-11-target_counts.txt,45-2-11,target,2,prc_data/20210712_MergedRL_SNP/G1_010_45-2-11-target_counts.txt,HAP1,HEK3,PE2,R2,HEK3_HAP1_PE2_target_2_counts_R2,HEK3_HAP1_PE2_target_2
9,G1_010_45-1-3-target_counts.txt,45-1-3,target,1,prc_data/20210712_MergedRL_SNP/G1_010_45-1-3-target_counts.txt,293T,FANCF,PE2,R3,FANCF_293T_PE2_target_1_counts_R3,FANCF_293T_PE2_target_1


In [105]:
# keep the merged dataframes in a dictionary, too.
merged_counts = {}

# select count dataframes based on screen
for name in meta_df.screenname.unique():
    print(name)
    # get count tables that are part of this screen from meta_df
    subtables_0 = meta_df.loc[meta_df['screenname'].isin([name])]['name'].to_list()
    
    # look them up in dictionary
    subtables = [ctable_dict[item] for item in subtables_0]
    
    # concatenate those dataframes
    final = reduce(lambda left,right: pd.merge(left,right,on='sequence_original'), subtables)
    
    # calculate the average count for the count norm columns
    count_norm_cols = [col for col in final if col.startswith('count_norm')]
    if 'RNA' in name:
        final['count_pegRNA'] = final[count_norm_cols].mean(axis=1)
        # subset the dataframe to only contain the columns of interest
        try:
            final = final[['name','sequence_original','count_pegRNA','count_norm_x','count_norm_y','count_norm','VF_value']]
        except:
            # some libraries don't have the VF_value
            final = final[['name','sequence_original','count_pegRNA','count_norm_x','count_norm_y','count_norm']]
        # rename column names (x: first , y: 2nd , no: 3rd in list of dfs when doing the merge) with read number from name
        final = final.rename(columns = {'count_norm_x': 'count_peg_' + subtables_0[0].split("_")[-1],
                                    'count_norm_y': 'count_peg_' + subtables_0[1].split("_")[-1],
                                    'count_norm': 'count_peg_' + subtables_0[2].split("_")[-1]})
    else:
        final['count_target'] = final[count_norm_cols].mean(axis=1)
        # subset the dataframe to only contain the columns of interest
        final = final[['name','sequence_original','count_target','count_norm_x','count_norm_y','count_norm']]
        # rename column names (x: first , y: 2nd , no: 3rd in list of dfs when doing the merge) with read number from name
        final = final.rename(columns = {'count_norm_x': 'count_target_' + subtables_0[0].split("_")[-1],
                                    'count_norm_y': 'count_target_' + subtables_0[1].split("_")[-1],
                                    'count_norm': 'count_target_' + subtables_0[2].split("_")[-1]})
    # store them into dict
    merged_counts[name] = final
    print("{} files \t {}".format(len(subtables),name, ))

# display the first as an example
print(merged_counts.keys())
merged_counts[next(iter(merged_counts))].head()

FANCF_HAP1_PE2_target_2
3 files 	 FANCF_HAP1_PE2_target_2
FANCF_HAP1_PE2_pegRNA_2
3 files 	 FANCF_HAP1_PE2_pegRNA_2
HEK3_HAP1_PE2_target_2
3 files 	 HEK3_HAP1_PE2_target_2
FANCF_293T_PE2_target_1
3 files 	 FANCF_293T_PE2_target_1
FANCF_HAP1_PE2_pegRNA_1
3 files 	 FANCF_HAP1_PE2_pegRNA_1
HEK3_HAP1_PE2_pegRNA_1
3 files 	 HEK3_HAP1_PE2_pegRNA_1
FANCF_HAP1_PE2_target_1
3 files 	 FANCF_HAP1_PE2_target_1
HEK3_293T_PE2_target_1
3 files 	 HEK3_293T_PE2_target_1
HEK3_293T_PE-FeLV_pegRNA_1
3 files 	 HEK3_293T_PE-FeLV_pegRNA_1
FANCF_293T_PE2_pegRNA_1
3 files 	 FANCF_293T_PE2_pegRNA_1
HEK3_HAP1_PE2_pegRNA_2
3 files 	 HEK3_HAP1_PE2_pegRNA_2
HEK3_293T_PE-FeLV_target_1
3 files 	 HEK3_293T_PE-FeLV_target_1
HEK3_293T_PE2_pegRNA_1
3 files 	 HEK3_293T_PE2_pegRNA_1
HEK3_HAP1_PE2_target_1
3 files 	 HEK3_HAP1_PE2_target_1
dict_keys(['FANCF_HAP1_PE2_target_2', 'FANCF_HAP1_PE2_pegRNA_2', 'HEK3_HAP1_PE2_target_2', 'FANCF_293T_PE2_target_1', 'FANCF_HAP1_PE2_pegRNA_1', 'HEK3_HAP1_PE2_pegRNA_1', 'FANCF_HAP1_PE2_t

Unnamed: 0,name,sequence_original,count_target,count_target_R1,count_target_R2,count_target_R3
0,WT,CTTCTGCAGCACCTGGATCG,992305.257748,993197.504543,990143.055873,993575.212826
1,Rand20_3_18,CAACCGGCGCCGCGCCGGCC,53.217113,51.196769,66.720117,41.734453
2,Rand30_3_6,GGGTGCCGTGGCCCGGACACGGCGCGTTCT,49.677727,48.719506,68.626406,31.68727
3,Rand20_2_1,ATGGATGCGCGAGGAAGCGCGTCCAGGTCC,61.733758,47.893752,76.251562,61.055959
4,Rand30_4_2,CCTTGGCGGTGCGGCAGCTCCAGCTCAGGG,53.758863,47.893752,83.241289,30.141549


## Calculating the insertion frequencies and augmenting the resulting data frame

In [113]:
# Cut off the pegRNA or target from the screen names and look for unique screen names
screennames = list(np.unique([x.rsplit('_',2)[0] +"_" +x.rsplit('_',1)[1] for x in list(meta_df.screenname.unique())]))
screennames

['FANCF_293T_PE2_1',
 'FANCF_HAP1_PE2_1',
 'FANCF_HAP1_PE2_2',
 'HEK3_293T_PE-FeLV_1',
 'HEK3_293T_PE2_1',
 'HEK3_HAP1_PE2_1',
 'HEK3_HAP1_PE2_2']

In [114]:
def merge_counts(df1, df2,joinby):
    """
    Merges two dataframes with an full outer join based on the joinby column.
    Expects a name column in both dfs --> name_x and name_y
    """
    # merge
    df = pd.merge(df1,df2,on=joinby)
    df = df.rename(columns = {'name_x':'name'})
    df = df.drop(columns = ['name_y'])
    return df

def add_metrics(df):
    """
    Calculates metrics based on original sequence and name,
    """
    # add quality metrics
    df['percIns'] = df['count_target']/df['count_pegRNA'] * 100
    df['length'] = df['sequence_original'].str.len()
    df['N1'] = df['sequence_original'].str[0]
    df['N2'] = df['sequence_original'].str[1]
    df['N3'] = df['sequence_original'].str[2]
    df['N4'] = df['sequence_original'].str[3]
    df['N-1'] = df['sequence_original'].str[-1]
    df['hairpins'] = df['name'].str.contains(pat = 'Hairpins')
    df['percGC'] = (df['sequence_original'].str.count('G') + df['sequence_original'].str.count('C')/df['length']*100)
    
    return df

In [116]:
qc_dfs = {}

for screen in screennames:
    # the screen is the base name, add pegRNA or target for the corresponding libraries
    # get the df from the merged counts dictionary
    df1 = merged_counts[screen.rsplit("_",1)[0] + "_pegRNA" + "_" + screen.rsplit("_",1)[1]]
    df2 = merged_counts[screen.rsplit("_",1)[0] + "_target" + "_" + screen.rsplit("_",1)[1]]
    
    # merge pegRNA and target counts to calculate ratio
    df = merge_counts(df1, df2,'sequence_original')
    # determine metrics, mostly based on original sequence
    df = add_metrics(df)
    df['experiment'] = screen
    
    # keep rows that pass the filter for count number and are not WT
    n_filter = 20
    dflen0 = df.shape[0]
    df = df[df.count_pegRNA > n_filter]
    dflen1 = df.shape[0]
    df = df[df.name != 'WT']
    
    # store into dict
    print("{} \t length {} \t filtered_count {} \t filtered_wt {}".format(screen, df.shape[0],dflen0-dflen1,dflen1-df.shape[0]))
    qc_dfs[screen] = df
    
    # save to file
    df.to_csv(os.path.join(workdir,'prc_data','qcd_data',screen+'_qc-df.tsv'), sep = '\t', index=False)
    
qc_dfs[next(iter(qc_dfs))].head()

FANCF_293T_PE2_1 	 length 2515 	 filtered_count 151 	 filtered_wt 0
FANCF_HAP1_PE2_1 	 length 2006 	 filtered_count 660 	 filtered_wt 0
FANCF_HAP1_PE2_2 	 length 2512 	 filtered_count 154 	 filtered_wt 0
HEK3_293T_PE-FeLV_1 	 length 2550 	 filtered_count 116 	 filtered_wt 0
HEK3_293T_PE2_1 	 length 2549 	 filtered_count 117 	 filtered_wt 0
HEK3_HAP1_PE2_1 	 length 2084 	 filtered_count 582 	 filtered_wt 0
HEK3_HAP1_PE2_2 	 length 2543 	 filtered_count 123 	 filtered_wt 0


Unnamed: 0,name,sequence_original,count_pegRNA,count_peg_R3,count_peg_R2,count_peg_R1,count_target,count_target_R2,count_target_R3,count_target_R1,percIns,length,N1,N2,N3,N4,N-1,hairpins,percGC,experiment
0,ELMI003807,CCCCCCCCTCCGAGAGCACCT,5478.113318,5701.297621,5605.362689,5127.679644,1.008537,1.612793,0.949106,0.463714,0.01841,21,C,C,C,C,T,False,64.904762,FANCF_293T_PE2_1
1,ELMI002903,CCCCCAACCACTCCATGT,3690.89464,3650.733954,3736.795766,3685.1542,3.210067,3.51882,3.63824,2.47314,0.086973,18,C,C,C,C,T,False,56.555556,FANCF_293T_PE2_1
2,ELMI002279,CGCCCGCGGTCCTGCACATGGCCACTC,3658.863523,3613.721905,3801.707083,3561.161582,4.518831,5.424848,4.112793,4.018852,0.123504,27,C,G,C,C,C,False,55.148148,FANCF_293T_PE2_1
3,ELMI003062,GATCCCCTGCACCCCACACTC,3330.482705,3467.115736,3316.900684,3207.431696,8.575198,8.797051,10.281981,6.646563,0.257476,21,G,A,T,C,C,False,59.142857,FANCF_293T_PE2_1
4,ELMI001684,TTCCCCCCACCACCT,3415.343656,3392.610961,3488.307131,3365.112875,1.961022,1.026323,2.847318,2.009426,0.057418,15,T,T,C,C,T,False,66.666667,FANCF_293T_PE2_1
