# Make k-mers

This is an explanation of the  
**make_kmers.py**    
file.  

Using variables from the config file, we
1. split the raw gene sequence into k-mer subsets

note:
+ 1-mer is the fastest; we just split each letter individually
+ 2+-mer is slow because it requires a sliding window, with one pass over the data per window position. The smaller the k, the more window positions there are, so the longer it will take (e.g. 2-mer will take longer than 10-mer).

Then we will save the output as parquet files.

## Step 0: import required libraries

In [1]:
import sys
sys.path.append('..') # this is to allow the script to read from the parent folder

from scripts.global_funcs import load_data_config
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
import dask_cudf
import numpy as np

## Step 1: load config file data

In [2]:
# load the project settings via load_data_config (imported from scripts.global_funcs);
# the result is used below as a key -> value mapping
configs = load_data_config()

In [3]:
# list every config entry so the settings used for this run are visible below
for key in configs:
    val = configs[key]
    print(f"{key}: {val}")

clean_fasta_file: /media/jcosme/Data/MarRef_parquet_10_cats
output_dir: /media/jcosme/Data
project_name: small_mer_1
base_col_names: ['seq', 'label']
label_col_name: label
input_col_name: seq
label_regex: (?:[^a-zA-Z0-9]+)([a-zA-Z]+[0-9]+)(?:[^a-zA-Z0-9]+)
k_mer: 1
possible_gene_values: ['A', 'C', 'G', 'T']
max_seq_len: 150
data_splits: {'train': 0.9, 'val': 0.05, 'test': 0.05}
random_seed: 42
fasta_sep: >
unq_labs_dir: /media/jcosme/Data/small_mer_1/data/unq_labels
unq_labs_dir_csv: /media/jcosme/Data/small_mer_1/data/unq_labels.csv
data_dir: /media/jcosme/Data/small_mer_1/data/small_mer_1
nvtab_dir: /media/jcosme/Data/small_mer_1/nvtab
dask_dir: /media/jcosme/Data/small_mer_1/dask
tensorboard_dir: /media/jcosme/Data/small_mer_1/tensorboard
model_checkpoints_dir: /media/jcosme/Data/small_mer_1/checkpoints/model_checkpoints
model_checkpoints_parent_dir: /media/jcosme/Data/small_mer_1/checkpoints
model_weights_dir: /media/jcosme/Data/small_mer_1/model_weights.h5


In [4]:
# let's copy the config entries used below into plain python variables
# -- column names
input_col_name = configs['input_col_name']
label_col_name = configs['label_col_name']
# -- locations on disk
data_dir = configs['data_dir']
dask_dir = configs['dask_dir']
# -- k-mer settings
k_mer = configs['k_mer']
max_seq_len = configs['max_seq_len']
# sorted so the letter order does not depend on how the config lists them
possible_gene_values = sorted(configs['possible_gene_values'])

## Step 2: start the cluster

In [5]:
# start a local dask_cuda cluster; its workers keep their working files under dask_dir
cluster = LocalCUDACluster(local_directory=dask_dir)
# connect a client to that cluster
client = Client(cluster)
# bare expression: in a notebook this displays the client/cluster summary
client

2022-05-24 13:23:25,009 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize


0,1
Connection method: Cluster object,Cluster type: dask_cuda.LocalCUDACluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 1
Total threads: 1,Total memory: 31.21 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:42655,Workers: 1
Dashboard: http://127.0.0.1:8787/status,Total threads: 1
Started: Just now,Total memory: 31.21 GiB

0,1
Comm: tcp://127.0.0.1:35989,Total threads: 1
Dashboard: http://127.0.0.1:36231/status,Memory: 31.21 GiB
Nanny: tcp://127.0.0.1:40003,
Local directory: /media/jcosme/Data/small_mer_1/dask/dask-worker-space/worker-4r_abh0f,Local directory: /media/jcosme/Data/small_mer_1/dask/dask-worker-space/worker-4r_abh0f
GPU: NVIDIA GeForce RTX 3080 Laptop GPU,GPU memory: 16.00 GiB


### optional: 
click the link above to open up the Dask Dashboard, which will allow you to see the progress of your job.  
**note:** this will only work in a Jupyter notebook

## Step 3: define function

In [6]:
# replacement strings for the 1-mer case: each gene letter followed by a space,
# in the same order as possible_gene_values
replace_gene_values = [letter + ' ' for letter in possible_gene_values]

def add_whitespace(df):
    """Put a space after every gene letter in the sequence column (1-mer case).

    Each entry of ``possible_gene_values`` is replaced, as a literal string
    (``regex=False``), by the matching entry of ``replace_gene_values``.
    The ``input_col_name`` column of ``df`` is overwritten and ``df`` is returned.
    """
    spaced = df[input_col_name].str.replace(
        possible_gene_values,
        replace_gene_values,
        regex=False,
    )
    df[input_col_name] = spaced
    return df

def get_kmers(df):
    """Replace each sequence with its overlapping k-mers, separated by spaces.

    A window of width ``k_mer`` (module-level setting) is slid one character
    at a time over the ``input_col_name`` column. Windows that would be
    shorter than ``k_mer`` (because a sequence ends before the window does)
    are blanked out instead of being emitted as truncated fragments.

    Parameters
    ----------
    df : dataframe partition
        Must contain the string column named by ``input_col_name``.
        Note that the passed-in frame is modified: a scratch ``'temp'``
        column is added and the sequence column is overwritten. Use the
        returned frame.

    Returns
    -------
    The partition without the scratch column, with ``input_col_name``
    holding the space-joined k-mers.
    """
    if len(df) == 0:
        # nothing to slide over; also avoids taking max() of an empty column
        return df

    longest = df[input_col_name].str.len().max()

    # scratch column that accumulates the windows, one per loop iteration
    df['temp'] = ' '
    # a sequence of length n has n - k_mer + 1 windows, so the last valid start
    # index is n - k_mer *inclusive*; the "+ 1" makes the exclusive stop cover it
    # (without it the final k-mer of the longest sequences is never produced)
    for i in np.arange(0, longest - k_mer + 1):
        window = df[input_col_name].str[i: i+k_mer].fillna(' ')
        # rows whose sequence ends inside this window give a short fragment;
        # blank those so only full-width k-mers are kept
        too_short = window.str.len() < k_mer
        window[too_short] = ' '
        df['temp'] = df['temp'] + ' ' + window
    # tidy up the padding and blanked windows accumulated above
    df['temp'] = df['temp'].str.normalize_spaces()
    df[input_col_name] = df['temp']
    df = df.drop(columns=['temp'])
    return df

## Step 4: data transformations

In [7]:
# first we read the parquet dataset stored at data_dir as a dask_cudf dataframe
df = dask_cudf.read_parquet(data_dir)

In [8]:
# here is a data sample: the first rows, before any transformation
df.head()

Unnamed: 0,seq,label
0,GGGCGGCCGAGACCGTCGCCTCGGCCATGGTTTCGAACACCAGCCA...,1
1,AGCCGAGCAGCCGGTCGATGCTGGCCAGCGTGTAGGTGAAGGTGCC...,1
2,GGAGCGGGCCGCCGGCAAGGAGGCCGCGCGCGGACTCGATCACGGC...,1
3,CGATCGACCGCCGCTGCCGACGCACATCACCGAGCTGGCCGAACTC...,1
4,CCGGGCGCTGACCGACATCCGGGAATTGCTGCTGGCCGGCGCCGAC...,1


In [9]:
# next, we apply the matching function defined above to every partition
if k_mer == 1:
    # single letters: a plain string replacement is enough
    df = df.map_partitions(add_whitespace)
elif k_mer > 1:
    # longer k-mers need the sliding window
    df = df.map_partitions(get_kmers)
else:
    # without this branch a k_mer of 0 or less would skip the transformation
    # silently and the un-tokenised sequences would be saved as if they were done
    raise ValueError(f"k_mer must be a positive integer, got {k_mer!r}")




In [10]:
# preview the first rows after the k-mer transformation
df.head()



Unnamed: 0,seq,label
0,G G G C G G C C G A G A C C G T C G C C T C G ...,1
1,A G C C G A G C A G C C G G T C G A T G C T G ...,1
2,G G A G C G G G C C G C C G G C A A G G A G G ...,1
3,C G A T C G A C C G C C G C T G C C G A C G C ...,1
4,C C G G G C G C T G A C C G A C A T C C G G G ...,1


In [11]:
# split each space-separated sequence string on whitespace into a list of tokens
df[input_col_name] = df[input_col_name].str.split()

In [12]:
# preview the first rows now that each sequence is a list of tokens
df.head()

Unnamed: 0,seq,label
0,"[G, G, G, C, G, G, C, C, G, A, G, A, C, C, G, ...",1
1,"[A, G, C, C, G, A, G, C, A, G, C, C, G, G, T, ...",1
2,"[G, G, A, G, C, G, G, G, C, C, G, C, C, G, G, ...",1
3,"[C, G, A, T, C, G, A, C, C, G, C, C, G, C, T, ...",1
4,"[C, C, G, G, G, C, G, C, T, G, A, C, C, G, A, ...",1


## Step 5: save the data

In [13]:
%%time
# the final step is to save the cleaned data.
# this might take some time
# NOTE(review): data_dir is the same directory the data was read from in step 4,
# so this writes the transformed data over its own input; re-running the notebook
# would then presumably start from already-transformed data — confirm this is intended
# (the result is assigned to `_` so the notebook does not display it)
_ = df.to_parquet(data_dir)

CPU times: user 10.8 ms, sys: 12 ms, total: 22.8 ms
Wall time: 504 ms


## Step 6: cleanup

In [14]:
# we delete our reference to the dataframe
del df

# then we shut down the Dask cluster through the client
client.shutdown()

# finally we close the client itself
client.close()

## finished!