# Find unclassified nanopore sequences
## Python code to extract unclassified sequences from nanopore fastq.gz files (basecalled, gzip-compressed) using the WIMP classification stored as CSV
### Author: Hyunjin Shim; Date: 20221130

In [1]:
import os
import glob
import pandas as pd
import gzip

In [2]:
# Directory holding the WIMP classification CSV files; edit to match your layout.
path = '../Data/WIMP/'
# Every entry in that directory whose name starts with 'classification_wimp_'.
# glob returns the matches in no particular order.
files_list = glob.glob('{}classification_wimp_*'.format(path))
files_list

['../Data/WIMP/classification_wimp_Saliva1_R2.csv',
 '../Data/WIMP/classification_wimp_Stool2_R1.csv',
 '../Data/WIMP/classification_wimp_Saliva1_R1.csv',
 '../Data/WIMP/classification_wimp_Stool2_R2.csv',
 '../Data/WIMP/classification_wimp_Saliva3_R1.csv',
 '../Data/WIMP/classification_wimp_Saliva3_R2.csv',
 '../Data/WIMP/classification_wimp_Stool1_R1.csv',
 '../Data/WIMP/classification_wimp_Saliva2_R2.csv',
 '../Data/WIMP/classification_wimp_Stool1_R2.csv',
 '../Data/WIMP/classification_wimp_Saliva2_R1.csv']

In [45]:
# Pick one classification CSV by its position in files_list. Change the index
# by hand for each sample and then re-run the extraction for loop.
file_now = files_list[5]
# Output stem: the part of the path after the last '/', cut at its first '.'.
file_name = file_now.rsplit('/', 1)[-1].partition('.')[0]
file_name

'classification_wimp_Saliva3_R2'

In [46]:
# Load the selected classification table.
wimp_df = pd.read_csv(file_now)
# Keep only the rows whose 'exit_status' column equals 'Unclassified'.
wimp_df_unclassified = wimp_df.loc[wimp_df['exit_status'].eq('Unclassified')]
# (rows, columns) of the filtered table.
wimp_df_unclassified.shape

(6301, 9)

In [47]:
# 'filename' column of the unclassified rows (one source-file name per read).
wimp_df_unclassified_filename = wimp_df_unclassified.loc[:, 'filename']
wimp_df_unclassified_filename

0       fastq_pass_ATJ376_pass_562f436b_24-0019.fastq
1       fastq_pass_ATJ376_pass_562f436b_24-0019.fastq
2       fastq_pass_ATJ376_pass_562f436b_24-0019.fastq
3       fastq_pass_ATJ376_pass_562f436b_24-0019.fastq
4       fastq_pass_ATJ376_pass_562f436b_24-0019.fastq
                            ...                      
6296    fastq_pass_ATJ376_pass_562f436b_13-0004.fastq
6297    fastq_pass_ATJ376_pass_562f436b_13-0004.fastq
6298    fastq_pass_ATJ376_pass_562f436b_13-0004.fastq
6299    fastq_pass_ATJ376_pass_562f436b_13-0004.fastq
6300    fastq_pass_ATJ376_pass_562f436b_13-0004.fastq
Name: filename, Length: 6301, dtype: object

In [48]:
# 'readid' column of the unclassified rows (one read identifier per read).
wimp_df_unclassified_readid = wimp_df_unclassified.loc[:, 'readid']
wimp_df_unclassified_readid

0       1e970ba1-7f51-4287-bfec-dedc76d8a395
1       5cfa055b-68a4-4aa4-a369-7dc45e76d3a6
2       fdf899a4-fcd5-4b53-8d83-f3b54ee226b8
3       71912e26-7bbb-47aa-ae04-736e3540e0b8
4       34f4364f-3702-4618-8a67-e1191a47d2a8
                        ...                 
6296    4f730c11-c77f-4e2a-853a-ec7ecd653a71
6297    a207e896-6751-476e-bf33-6f910f9bf8c0
6298    757cb35e-c648-4e67-a526-6e48d5a98173
6299    1f8f8577-2131-45c3-a3d1-895ce04e2f4b
6300    1c92002a-f997-4cb9-a779-474c580689d6
Name: readid, Length: 6301, dtype: object

In [49]:
#wimp_df_unclassified_filename_0 = '_'.join(wimp_df_unclassified_filename[0].split('-')[0].split('_')[7:11])
#wimp_df_unclassified_filename_0 = '_'.join(wimp_df_unclassified_filename[0].split('-')[0].split('_')[7:11])
#wimp_df_unclassified_filename_0 = wimp_df_unclassified_filename

In [50]:
# Directory holding the gzip-compressed nanopore read files (*.gz); edit to
# match your layout. Note that this rebinds the `path` used for the CSV files.
path = '../Data/pass_fastq/'
# Every '*.gz' entry in that directory, in no particular order.
gzips_list = glob.glob('{}*.gz'.format(path))
# (evaluate gzips_list here to inspect the matches)

In [51]:
# Extract the sequence of every unclassified read and append it, in FASTA
# form ('>readid|source_key' followed by the sequence line), to
# file_name + '.fasta'.
#
# Uses names defined by earlier cells:
#   wimp_df_unclassified  - table with 'filename' and 'readid' columns
#   gzips_list            - paths of the gzip-compressed read files
#   file_name             - stem of the output file
#
# Each archive is decompressed once per source key instead of once per read,
# and the sequence is taken from the line that directly follows the matching
# header while streaming (no list.index() search, no IndexError when a header
# is the last line). Records are written in the same order as before: table
# row order, then gzips_list order, then line order.
#
# The Series wimp_df_unclassified_filename / wimp_df_unclassified_readid are
# deliberately NOT rebound here, so later cells can still index them.
#
# The output is opened in append mode, as before: re-running this cell for
# the same file_name appends the records again.

# Step 1: note which read ids are wanted from which source file.
row_keys = []          # (source_key, read_id) for every row, in row order
wanted_by_source = {}  # source_key -> {read id as bytes: read id as str}
for _, row in wimp_df_unclassified.iterrows():
    # Drop the '-NNNN' suffix and keep underscore-separated fields 2..5, e.g.
    # 'fastq_pass_ATJ376_pass_562f436b_24-0019.fastq' -> 'ATJ376_pass_562f436b_24'
    source_key = '_'.join(row['filename'].split('-')[0].split('_')[2:6])
    read_id = row['readid']
    row_keys.append((source_key, read_id))
    wanted_by_source.setdefault(source_key, {})[read_id.encode('utf-8')] = read_id

# Step 2: stream every matching archive once and collect the wanted sequences.
sequences = {}  # (source_key, read_id) -> [sequence line, ...]
for source_key, wanted_ids in wanted_by_source.items():
    for gz_path in gzips_list:
        # Substring test on the whole path, as before; a key can therefore
        # match more than one archive.
        if source_key not in gz_path:
            continue
        with gzip.open(gz_path, 'rb') as fastq_gz:  # yields bytes lines
            pending_id = None  # read id whose sequence is on the next line
            for raw_line in fastq_gz:
                if pending_id is not None:
                    sequences.setdefault((source_key, pending_id), []).append(
                        raw_line.decode('utf-8')
                    )
                    pending_id = None
                elif raw_line.startswith(b'@'):
                    # NOTE(review): assumes the read id is the first
                    # whitespace-delimited token after '@' in the header line
                    # - confirm against your FASTQ files.
                    header_fields = raw_line[1:].split(None, 1)
                    if header_fields:
                        pending_id = wanted_ids.get(header_fields[0])

# Step 3: write the records in table row order.
missing_reads = 0
with open(file_name + '.fasta', 'a') as fasta_out:
    for source_key, read_id in row_keys:
        found = sequences.get((source_key, read_id), [])
        if not found:
            missing_reads += 1
        for seq_line in found:
            # A sequence on the very last line of a file may lack its newline.
            if not seq_line.endswith('\n'):
                seq_line += '\n'
            fasta_out.write('>' + read_id + '|' + source_key + '\n' + seq_line)

# Surface reads that could not be located instead of dropping them silently.
if missing_reads:
    print('{} of {} unclassified reads were not found in any matching archive'.format(
        missing_reads, len(row_keys)))
                            

### Code test: step-by-step check of the extraction on the first unclassified read

In [None]:
# Source-file key of the first unclassified read, built the same way as in the
# extraction loop: drop the '-NNNN' suffix, keep underscore fields 2..5, e.g.
# 'fastq_pass_ATJ376_pass_562f436b_24-0019.fastq' -> 'ATJ376_pass_562f436b_24'.
# It is computed here from the table itself because this name is not assigned
# anywhere else (its earlier assignments are commented out), so the cell would
# otherwise fail with NameError.
wimp_df_unclassified_filename_0 = '_'.join(
    wimp_df_unclassified['filename'].iloc[0].split('-')[0].split('_')[2:6]
)

# Start from None so the final expression is defined even without a match.
f_found = None
for f in gzips_list:
    # Substring test on the whole path; the last matching archive wins.
    if wimp_df_unclassified_filename_0 in f:
        f_found = f
f_found

In [None]:
# Base name of the matched archive: the part of its path after the last '/'.
f_found_name = f_found.rsplit('/', 1)[-1]
f_found_name

In [None]:
# Load every line of the matched archive. Binary mode is used, so each element
# of file_content is a bytes object that still ends with its newline.
with gzip.open(f_found, mode='rb') as fastq_gz:
    file_content = list(fastq_gz)
# Show the first raw line.
print(file_content[0])

In [None]:
# Read id of the first unclassified row, taken from the table itself rather
# than from wimp_df_unclassified_readid, which may have been rebound to a
# single string by an earlier cell (indexing it with [0] would then give one
# character that matches almost every line).
first_readid = wimp_df_unclassified['readid'].iloc[0]

# Start from None so both names are defined even when the read is not found.
readid_found = None
readid_seq = None
for line_no, raw_line in enumerate(file_content):
    f_decode = raw_line.decode("utf-8")  # bytes into string
    # enumerate supplies this line's own position, so the sequence is always
    # the line right after the match; file_content.index() would return the
    # first equal line instead. The bounds check avoids an IndexError when the
    # match is the last line.
    if first_readid in f_decode and line_no + 1 < len(file_content):
        readid_found = f_decode
        readid_seq = file_content[line_no + 1].decode("utf-8")  # next line that is a sequence
readid_found
readid_seq


In [None]:
# Inspect the element at index 257 of file_content (a raw, undecoded bytes
# line when file_content was read with gzip in 'rb' mode).
file_content[257]

In [None]:
# Write a one-record FASTA file for the first unclassified read.
# The read id and source-file key are taken from the table directly:
# wimp_df_unclassified_filename_0 is not assigned by any executed code
# elsewhere, and wimp_df_unclassified_readid may have been rebound to a plain
# string by an earlier cell. readid_seq is the sequence line found by the
# previous test cell.
first_row = wimp_df_unclassified.iloc[0]
# 'fastq_pass_ATJ376_pass_562f436b_24-0019.fastq' -> 'ATJ376_pass_562f436b_24'
wimp_df_unclassified_filename_0 = '_'.join(
    first_row['filename'].split('-')[0].split('_')[2:6]
)
# Mode 'w' overwrites readme.fasta on every run.
with open('readme.fasta', 'w') as f:
    f.write(">" + first_row['readid'] + '|' + wimp_df_unclassified_filename_0 + '\n' + readid_seq)