# Extract species labels from raw labels

This is an explanation of the  
**extract_labels.py**    
file.  

Using the regex defined in the config file, we:
1. extract the proper label for each row
2. create parquet and csv files of the unique labels
3. encode the labels (convert the string to an integer) in the main data

Then we will save the output as parquet files.

## Step 0: import required libraries

In [1]:
import sys
sys.path.append('..') # this is to allow the script to read from the parent folder

from scripts.global_funcs import load_data_config
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
import dask_cudf

## Step 1: load config file data

In [2]:
# load the project settings with the helper imported from scripts.global_funcs;
# the result is used below as a mapping of setting name -> value
configs = load_data_config()

In [3]:
# list every available setting together with its current value
for name in configs:
    print(f"{name}: {configs[name]}")

clean_fasta_file: /media/jcosme/Data/MarRef_parquet_10_cats
output_dir: /media/jcosme/Data
project_name: MarRef_parquet_10_cats
base_col_names: ['seq', 'label']
label_col_name: label
input_col_name: seq
label_regex: (?:[^a-zA-Z0-9]+)([a-zA-Z]+[0-9]+)(?:[^a-zA-Z0-9]+)
k_mer: 1
possible_gene_values: ['A', 'C', 'G', 'T']
max_seq_len: 150
data_splits: {'train': 0.9, 'val': 0.05, 'test': 0.05}
random_seed: 42
fasta_sep: >
unq_labs_dir: /media/jcosme/Data/MarRef_parquet_10_cats/data/unq_labels
unq_labs_dir_csv: /media/jcosme/Data/MarRef_parquet_10_cats/data/unq_labels.csv
data_dir: /media/jcosme/Data/MarRef_parquet_10_cats/data/MarRef_parquet_10_cats
nvtab_dir: /media/jcosme/Data/MarRef_parquet_10_cats/nvtab
dask_dir: /media/jcosme/Data/MarRef_parquet_10_cats/dask
tensorboard_dir: /media/jcosme/Data/MarRef_parquet_10_cats/tensorboard
model_checkpoints_dir: /media/jcosme/Data/MarRef_parquet_10_cats/checkpoints/model_checkpoints
model_checkpoints_parent_dir: /media/jcosme/Data/MarRef_parquet_1

In [4]:
# copy the settings this notebook relies on into plain python variables
# -- locations on disk
clean_fasta_filepath, output_dir = configs['clean_fasta_file'], configs['output_dir']
unq_labs_dir, unq_labs_dir_csv = configs['unq_labs_dir'], configs['unq_labs_dir_csv']
data_dir, dask_dir = configs['data_dir'], configs['dask_dir']
# -- project naming and label parsing
project_name = configs['project_name']
label_col_name, label_regex = configs['label_col_name'], configs['label_regex']

## Step 2: start the cluster

In [5]:
# start a local CUDA-backed Dask cluster; its workers use dask_dir as their local directory
cluster = LocalCUDACluster(local_directory=dask_dir)
# connect a client to that cluster
client = Client(cluster)
# leaving `client` as the last expression makes the notebook display its summary
# (including the dashboard link)
client

2022-05-23 19:05:42,378 - distributed.diskutils - INFO - Found stale lock file and directory '/media/jcosme/Data/MarRef_parquet_10_cats/dask/dask-worker-space/worker-6r0m3uzf', purging
2022-05-23 19:05:42,379 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize


0,1
Connection method: Cluster object,Cluster type: dask_cuda.LocalCUDACluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 1
Total threads: 1,Total memory: 31.21 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:39833,Workers: 1
Dashboard: http://127.0.0.1:8787/status,Total threads: 1
Started: Just now,Total memory: 31.21 GiB

0,1
Comm: tcp://127.0.0.1:43093,Total threads: 1
Dashboard: http://127.0.0.1:32827/status,Memory: 31.21 GiB
Nanny: tcp://127.0.0.1:45343,
Local directory: /media/jcosme/Data/MarRef_parquet_10_cats/dask/dask-worker-space/worker-ygleuq5t,Local directory: /media/jcosme/Data/MarRef_parquet_10_cats/dask/dask-worker-space/worker-ygleuq5t
GPU: NVIDIA GeForce RTX 3080 Laptop GPU,GPU memory: 16.00 GiB


### Optional:
Click the link above to open the Dask Dashboard, which lets you watch the progress of your job.  
**Note:** this only works when running in a Jupyter notebook.

## Step 3: define label extraction function

In [6]:
# this function will be applied to every partition of the data
def extract_labels(df, *, column=None, regex=None):
    """Return a copy of ``df`` whose label column holds only the extracted label.

    Each raw label string is matched against ``regex`` and replaced by the
    text captured by the pattern's first group.

    Parameters
    ----------
    df : DataFrame
        One partition of the data. It is left unmodified.
    column : str, optional
        Name of the column to rewrite. Defaults to the notebook-level
        ``label_col_name``.
    regex : str, optional
        Pattern containing at least one capture group. Defaults to the
        notebook-level ``label_regex``.

    Returns
    -------
    DataFrame
        A new frame with the same columns as ``df``.
    """
    # fall back to the notebook-level settings so that
    # `df.map_partitions(extract_labels)` keeps working unchanged
    if column is None:
        column = label_col_name
    if regex is None:
        regex = label_regex
    # str.extract yields one column per capture group; column 0 is the first group
    extracted = df[column].str.extract(regex).loc[:, 0]
    # assign() builds a new frame instead of writing into the partition we were handed
    return df.assign(**{column: extracted})

## Step 4: data transformations

In [7]:
# first we read the parquet data with dask_cudf
# (note: the config key is named clean_fasta_file, but the path is read as parquet here),
# then repartition it; partition_size="10M" asks for partitions of roughly 10 MB each
df = dask_cudf.read_parquet(clean_fasta_filepath).repartition(partition_size="10M")



In [8]:
# here is a data sample
df.head()

Unnamed: 0,seq,label
0,GAAAGCGAGTACGACCCGTTCGGCGCCGCCCACAGCTCGACCTCGA...,label|708|MMP00165213-3860/1
1,AGCATCGGAACAAGATCCGCAAGGGCGAGTTCGAGCAGGCCGTCGC...,label|708|MMP00165213-3858/1
2,TCGAGCGGCTACCTGGCCGAGACGTTCCGGGCGGGGATCGAGAATG...,label|708|MMP00165213-3856/1
3,GCGCCCCGGTATGTCGAGATGGTTCGTAGGCTCGTCCAGCACGAAG...,label|708|MMP00165213-3854/1
4,GCGTCCGTCATCGCCGGCGGGAAGATCGCGATCTCGCTGCTCTCGG...,label|708|MMP00165213-3852/1


In [9]:
# next, we apply the extract_labels function defined above to every partition of the data,
# so that the label column holds only the extracted label
df = df.map_partitions(extract_labels)

In [10]:
# here is a data sample
df.head()

Unnamed: 0,seq,label
0,GAAAGCGAGTACGACCCGTTCGGCGCCGCCCACAGCTCGACCTCGA...,MMP00165213
1,AGCATCGGAACAAGATCCGCAAGGGCGAGTTCGAGCAGGCCGTCGC...,MMP00165213
2,TCGAGCGGCTACCTGGCCGAGACGTTCCGGGCGGGGATCGAGAATG...,MMP00165213
3,GCGCCCCGGTATGTCGAGATGGTTCGTAGGCTCGTCCAGCACGAAG...,MMP00165213
4,GCGTCCGTCATCGCCGGCGGGAAGATCGCGATCTCGCTGCTCTCGG...,MMP00165213


In [11]:
# collect the distinct labels: sort the rows by label, keep only the label
# column, drop the duplicates, and turn the resulting series into a dataframe
labels_in_order = df.sort_values(label_col_name)[label_col_name]
unq_labs_df = labels_in_order.unique().to_frame()

In [12]:
# here is a sample of the unique labels
unq_labs_df.head()

Unnamed: 0,label
0,MMP00010387
1,MMP00165213
2,MMP02202185
3,MMP02603914
4,MMP02744159


In [13]:
%%time
# this might take some time
# we save the unique labels as parquet at unq_labs_dir...
# (the return value is bound to `_`, presumably to keep the cell output quiet)
_ = unq_labs_df.to_parquet(unq_labs_dir)
# ...and as one .csv file (single_file=True) without the index column (index=False).
_ = unq_labs_df.to_csv(unq_labs_dir_csv, index=False, single_file=True)

CPU times: user 107 ms, sys: 7.13 ms, total: 114 ms
Wall time: 723 ms


In [14]:
# next, we encode the labels:
# 1) categorize() turns the label column into a categorical column
# 2) .cat.codes replaces every label with the integer code of its category
# NOTE(review): in the samples shown in this notebook the code (1 for MMP00165213)
# equals the label's position in the sorted unique-labels table saved earlier --
# confirm this holds in general before using that file to decode the integers.
df = df.categorize(columns=[label_col_name])
df[label_col_name] = df[label_col_name].cat.codes

In [15]:
# here is a sample of encoded data
df.head()

Unnamed: 0,seq,label
0,GAAAGCGAGTACGACCCGTTCGGCGCCGCCCACAGCTCGACCTCGA...,1
1,AGCATCGGAACAAGATCCGCAAGGGCGAGTTCGAGCAGGCCGTCGC...,1
2,TCGAGCGGCTACCTGGCCGAGACGTTCCGGGCGGGGATCGAGAATG...,1
3,GCGCCCCGGTATGTCGAGATGGTTCGTAGGCTCGTCCAGCACGAAG...,1
4,GCGTCCGTCATCGCCGGCGGGAAGATCGCGATCTCGCTGCTCTCGG...,1


## Step 5: save the data

In [16]:
%%time
# the final step is to save the cleaned, label-encoded data as parquet at data_dir.
# this might take some time
_ = df.to_parquet(data_dir)

CPU times: user 27.7 ms, sys: 16.9 ms, total: 44.5 ms
Wall time: 986 ms


## Step 6: cleanup

In [17]:
# we drop our references to both dataframes
del df, unq_labs_df

# then we shut down the Dask scheduler and workers the client is connected to
client.shutdown()

# finally we close the client itself
client.close()

## finished!