In [None]:
# DataFrame from R2C2 Consensus fasta
## Example usage: build a DataFrame from the input fasta, write all headers to a csv, and write the filtered reads to a fasta
python dataframe_R2C2.py -i R2C2_Consensus.fasta -o output

## More details below

In [None]:
## Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from fileio_R2C2 import parse_args_R2C2, read_fasta, df_to_csv, df_to_fasta

In [None]:
## Column names of our DataFrame, per C3POa: the five header fields
## followed by the read sequence itself.

cols = [
    'readName',
    'averageQuality',
    'originalReadLength',
    'numberOfRepeats',
    'subreadLength',
    'sequence',
]

In [None]:
## Locations used by this notebook: where results are written and which
## fasta file is read.
output_path = 'output/'
input_fasta = 'R2C2_Consensus.fasta'

## Parse the fasta file; the result is used below as a mapping from
## header line to sequence (presumably a dict -- see read_fasta).
reads = read_fasta(input_fasta)

In [None]:
## Create a dataframe from the dict above
def reads_to_df(reads, columns=None):
    """Build a DataFrame with one row per read from a header -> sequence mapping.

    The first character of every key is dropped (presumably the leading '>'
    of a fasta header -- confirm against read_fasta) and the remainder is
    split on '_' from the right, so a read name that itself contains
    underscores stays in one piece instead of producing extra fields.

    Args:
        reads: mapping whose keys are header strings and whose values are
            the sequences.
        columns: column names for the resulting frame, header fields first
            and the sequence column last. Defaults to the notebook-level
            ``cols`` list.

    Returns:
        A DataFrame created with dtype 'string'; the columns
        averageQuality, originalReadLength, numberOfRepeats and
        subreadLength are converted with ``pd.to_numeric`` when present.
    """
    if columns is None:
        columns = cols
    print('construct a dataframe from reads dict, columns are:')
    # The header holds every column except the trailing sequence column, so
    # it must be cut into len(columns) - 1 fields, i.e. len(columns) - 2 splits.
    max_splits = max(len(columns) - 2, 0)
    readsData = [
        header[1:].rsplit('_', max_splits) + [sequence]
        for header, sequence in reads.items()
    ]
    print(columns)
    df = pd.DataFrame(readsData, columns=columns, dtype='string')
    numeric_columns = (
        'averageQuality',
        'originalReadLength',
        'numberOfRepeats',
        'subreadLength',
    )
    for name in numeric_columns:
        # Only convert fields that exist, so a custom ``columns`` list that
        # omits one of them does not raise a KeyError.
        if name in df.columns:
            df[name] = pd.to_numeric(df[name])
    return df

## Build the DataFrame from the reads parsed above
df = reads_to_df(reads)
## The readName and sequence columns stay strings, while the other four
## columns are converted to numbers inside reads_to_df

In [None]:
## Write the header fields of every read to a csv: all columns except the
## last one, which holds the sequence
headers_only = df.iloc[:, :-1]
headers_csv_path = output_path + "R2C2_Consensus_headers.csv"
df_to_csv(headers_only, headers_csv_path)

In [None]:
## Filter the dataframe, e.g. numberOfRepeats >= 3
def filter_df(df, min_repeats=3):
    """Return the rows of ``df`` whose numberOfRepeats is at least ``min_repeats``.

    Args:
        df: DataFrame with a numeric 'numberOfRepeats' column.
        min_repeats: inclusive lower bound on numberOfRepeats. Defaults to
            3, the value that used to be hard-coded here.

    Returns:
        The rows selected by the boolean mask; ``df`` itself is not modified.
    """
    filter_ = df['numberOfRepeats'] >= min_repeats
    return df[filter_]

## Combine ~, &, |, ^ to express your filter criteria. Wrap each comparison in
## parentheses, because these operators bind more tightly than >=, e.g.
# (df['numberOfRepeats'] >= 3) & (df['averageQuality'] >= 50)

In [None]:
## Apply the filter, then write the surviving reads to a fasta file
## NOTE(review): filter_df keeps numberOfRepeats >= 3 (inclusive), although
## the file name below says "greater_3" -- confirm the intended naming
dff = filter_df(df)
filtered_fasta_path = output_path + "R2C2_Consensus_filtered_numberOfRepeats_greater_3.fasta"
df_to_fasta(dff, filtered_fasta_path)