# Find unclassified nanopore sequences
## Python codes to extract unclassified sequences from nanopore fastq.gz (basecalled gunzipped) files using WIMP classification in csv
### Author: Hyunjin Shim; Date: 20221130

In [1]:
import os
import glob
import pandas as pd
import gzip

In [2]:
# change path to import nanopore classification csv files
path = '../Data/WIMP/'
files_list = glob.glob(path + 'classification_wimp_*')
files_list

['../Data/WIMP/classification_wimp_Saliva1_R2.csv',
 '../Data/WIMP/classification_wimp_Stool2_R1.csv',
 '../Data/WIMP/classification_wimp_Saliva1_R1.csv',
 '../Data/WIMP/classification_wimp_Stool2_R2.csv',
 '../Data/WIMP/classification_wimp_Saliva3_R1.csv',
 '../Data/WIMP/classification_wimp_Saliva3_R2.csv',
 '../Data/WIMP/classification_wimp_Stool1_R1.csv',
 '../Data/WIMP/classification_wimp_Saliva2_R2.csv',
 '../Data/WIMP/classification_wimp_Stool1_R2.csv',
 '../Data/WIMP/classification_wimp_Saliva2_R1.csv']

In [3]:
# change file name individually and run the entire for loop
file_now = files_list[1]
file_name = '_'.join(file_now.split('/')[-1].split('.')[0:1])
file_name

'classification_wimp_Stool2_R1'

In [4]:
wimp_df = pd.read_csv(file_now)
wimp_df_unclassified = wimp_df[wimp_df['exit_status']=='Unclassified']
wimp_df_unclassified.shape

(14088, 9)

In [5]:
wimp_df_unclassified_filename = wimp_df_unclassified['filename']
wimp_df_unclassified_filename

22744     ALR972_pass_477b7245_6-0072.fastq
22745     ALR972_pass_477b7245_6-0072.fastq
22746     ALR972_pass_477b7245_6-0072.fastq
22747     ALR972_pass_477b7245_6-0072.fastq
22748     ALR972_pass_477b7245_6-0072.fastq
                        ...                
36827    ALR972_pass_477b7245_28-0060.fastq
36828    ALR972_pass_477b7245_28-0060.fastq
36829    ALR972_pass_477b7245_28-0060.fastq
36830    ALR972_pass_477b7245_28-0060.fastq
36831    ALR972_pass_477b7245_28-0060.fastq
Name: filename, Length: 14088, dtype: object

In [None]:
wimp_df_unclassified_readid = wimp_df_unclassified['readid']
wimp_df_unclassified_readid

22744    951ad3e5-b0f6-441a-9ab1-615fe5e13766
22745    173e35b3-042d-4302-bd91-df97cd35ffa9
22746    646fc3f0-3dfd-43cb-93cd-deb16b849baf
22747    b675abc5-8361-400e-87d6-990af0ee22f8
22748    a356e374-ee84-4e59-879e-8c6ec8af0f93
                         ...                 
36827    a99c6e06-e448-4c2e-bb27-e3f6cfbb8a50
36828    e505b7aa-d998-472c-bafe-909a291da706
36829    63ac47ca-7919-47f3-a266-9a29e61b1fe1
36830    7fadd41c-698e-4753-a5fc-17516c3965a5
36831    941ae3ac-8f04-4289-8ce4-fda693863e83
Name: readid, Length: 14088, dtype: object

In [13]:
#wimp_df_unclassified_filename_0 = '_'.join(wimp_df_unclassified_filename[0].split('-')[0].split('_')[7:11])
#wimp_df_unclassified_filename_0 = '_'.join(wimp_df_unclassified_filename[0].split('-')[0].split('_')[7:11])
wimp_df_unclassified_filename_0 = wimp_df_unclassified_filename[0]

KeyError: 0

In [None]:
# change path to import nanopore fasta files
path = '../Data/pass_fastq/'
gzips_list = glob.glob(path + '*.gz')
#gzips_list

In [None]:
for i, row in wimp_df_unclassified.iterrows():
    wimp_df_unclassified_filename = '_'.join(row['filename'].split('-')[0].split('_')[7:11])
    wimp_df_unclassified_readid = row['readid']
    #'AGE883_pass_85baa9f1_11'
    for f in gzips_list:
        f_0 = f.find(wimp_df_unclassified_filename)
        if f_0 != -1:
            f_found = f
            #f_found_name = '_'.join(f.split('/')[-1].split('_'))
            #'AGE883_pass_85baa9f1_11.fastq.gz'
            with gzip.open(f_found,'rb') as f_fasta: # gives out bytes
                file_content=f_fasta.readlines()
                for line in file_content:
                    f_decode = line.decode("utf-8") # bytes into string
                    readid_0 = f_decode.find(wimp_df_unclassified_readid)
                    if readid_0 != -1:
                        readid_found = f_decode
                        readid_seq = file_content[file_content.index(line)+1].decode("utf-8") # next line that is a sequence
                        with open(file_name+'.fasta', 'a') as file:
                            file.write(">" + wimp_df_unclassified_readid + '|' + wimp_df_unclassified_filename + '\n' + readid_seq)
                            

### code test

In [None]:
for f in gzips_list:
    f_0 = f.find(wimp_df_unclassified_filename_0)
    if f_0 != -1:
        f_found = f
f_found

In [None]:
f_found_name = '_'.join(f_found.split('/')[-1].split('_'))
f_found_name

In [None]:
with gzip.open(f_found,'rb') as f: # gives out bytes
    file_content=f.readlines()
print(file_content[0])

In [None]:
for f in file_content:
    f_decode = f.decode("utf-8") # bytes into string
    readid_0 = f_decode.find(wimp_df_unclassified_readid[0])
    if readid_0 != -1:
        readid_found = f_decode
        readid_seq = file_content[file_content.index(f)+1].decode("utf-8") # next line that is a sequence
readid_found
readid_seq
#f_index


In [None]:
file_content[257]

In [None]:
with open('readme.fasta', 'w') as f:
    f.write(">" + wimp_df_unclassified_readid[0] + '|' + wimp_df_unclassified_filename_0 + '\n' + readid_seq)