# Make k-mers

This is an explanation of the  
**make_kmers.py**    
file.  

Using variables from the config file, we
1. split the raw gene sequence into k-mer subsets

note:
+ 1-mer is the fastest; we just split each letter individually
+ 2-mer and larger are slow because they require a sliding window. The smaller the k, the longer it will take (e.g., 2-mer will take longer than 10-mer).

Then we will save the output as parquet files.

## Step 0: import required libraries

In [1]:
import sys
sys.path.append('..') # this is to allow the script to read from the parent folder

from scripts.global_funcs import load_data_config
from dask.distributed import Client
from dask_cuda import LocalCUDACluster
import dask_cudf
import numpy as np

## Step 1: load config file data

In [2]:
# Load the project configuration. The cells below read it as a key/value
# mapping (configs.items(), configs['...']).
configs = load_data_config()

In [3]:
# Echo each configuration entry as "key: value" so the settings in effect
# for this run are visible in the notebook output.
for key in configs:
    val = configs[key]
    print(f"{key}: {val}")

clean_fasta_file: /media/jcosme/Data/MarRef_parquet_10_cats
output_dir: /media/jcosme/Data
project_name: MarRef_parquet_10_cats
base_col_names: ['seq', 'label']
label_col_name: label
input_col_name: seq
label_regex: (?:[^a-zA-Z0-9]+)([a-zA-Z]+[0-9]+)(?:[^a-zA-Z0-9]+)
k_mer: 1
possible_gene_values: ['A', 'C', 'G', 'T']
max_seq_len: 150
data_splits: {'train': 0.9, 'val': 0.05, 'test': 0.05}
random_seed: 42
fasta_sep: >
unq_labs_dir: /media/jcosme/Data/MarRef_parquet_10_cats/data/unq_labels
unq_labs_dir_csv: /media/jcosme/Data/MarRef_parquet_10_cats/data/unq_labels.csv
data_dir: /media/jcosme/Data/MarRef_parquet_10_cats/data/MarRef_parquet_10_cats
nvtab_dir: /media/jcosme/Data/MarRef_parquet_10_cats/nvtab
dask_dir: /media/jcosme/Data/MarRef_parquet_10_cats/dask
tensorboard_dir: /media/jcosme/Data/MarRef_parquet_10_cats/tensorboard
model_checkpoints_dir: /media/jcosme/Data/MarRef_parquet_10_cats/checkpoints/model_checkpoints
model_checkpoints_parent_dir: /media/jcosme/Data/MarRef_parquet_1

In [4]:
# Let's pull the settings this notebook needs out of the config mapping.
# Column names:
input_col_name = configs['input_col_name']
label_col_name = configs['label_col_name']
# Locations on disk:
data_dir = configs['data_dir']
dask_dir = configs['dask_dir']
# Tokenisation parameters:
k_mer = configs['k_mer']
max_seq_len = configs['max_seq_len']
# Sorted so the gene values are always handled in the same order.
possible_gene_values = sorted(configs['possible_gene_values'])

## Step 2: start the cluster

In [5]:
# Start a local Dask CUDA cluster; dask_dir is handed to it as the
# local_directory argument.
cluster = LocalCUDACluster(local_directory=dask_dir)
# Connect a client to that cluster.
client = Client(cluster)
# Bare expression: in a notebook this renders the client summary, including
# the dashboard link.
client

2022-05-23 17:19:15,381 - distributed.preloading - INFO - Import preload module: dask_cuda.initialize


0,1
Connection method: Cluster object,Cluster type: dask_cuda.LocalCUDACluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 1
Total threads: 1,Total memory: 31.21 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:37531,Workers: 1
Dashboard: http://127.0.0.1:8787/status,Total threads: 1
Started: Just now,Total memory: 31.21 GiB

0,1
Comm: tcp://127.0.0.1:33667,Total threads: 1
Dashboard: http://127.0.0.1:41227/status,Memory: 31.21 GiB
Nanny: tcp://127.0.0.1:33339,
Local directory: /media/jcosme/Data/MarRef_parquet_10_cats/dask/dask-worker-space/worker-ue57nu7l,Local directory: /media/jcosme/Data/MarRef_parquet_10_cats/dask/dask-worker-space/worker-ue57nu7l
GPU: NVIDIA GeForce RTX 3080 Laptop GPU,GPU memory: 16.00 GiB


### optional: 
Click the dashboard link above to open the Dask Dashboard, which lets you watch the progress of your job.  
**Note:** this will only work in a Jupyter notebook.

## Step 3: define function

In [6]:
# Replacement text for each gene value: the same value followed by a single
# space. Entry i here pairs with possible_gene_values[i].
replace_gene_values = [gene_val + ' ' for gene_val in possible_gene_values]

def add_whitespace(df):
    """Put a space after every known gene value in the sequence column.

    Each entry of ``possible_gene_values`` is replaced by the matching entry
    of ``replace_gene_values`` (the value plus one trailing space), using
    literal, non-regex matching. Because the space is appended after every
    match, the resulting string also ends with a space. Characters that are
    not listed in ``possible_gene_values`` are not targeted by the
    replacement.

    Parameters
    ----------
    df
        DataFrame partition holding the ``input_col_name`` column. In this
        notebook the function is applied through ``map_partitions``.

    Returns
    -------
    The same ``df`` object, with ``input_col_name`` overwritten in place.
    """
    sequences = df[input_col_name]
    spaced = sequences.str.replace(possible_gene_values, replace_gene_values, regex=False)
    df[input_col_name] = spaced
    return df

def get_kmers(df):
    """Rewrite the sequence column as space-separated, overlapping k-mers.

    A window of ``k_mer`` characters is slid across every sequence in
    ``df[input_col_name]`` one position at a time. The windows are joined
    with single spaces, so ``"ACGT"`` with ``k_mer == 2`` becomes
    ``"AC CG GT"``. Windows that run past the end of a shorter sequence
    (fewer than ``k_mer`` characters) are replaced by a blank and vanish
    when the whitespace is normalised.

    Parameters
    ----------
    df
        DataFrame partition holding the ``input_col_name`` column. In this
        notebook the function is applied through ``map_partitions``.

    Returns
    -------
    The partition with ``input_col_name`` replaced by the joined k-mers.
    The input ``df`` is modified in place along the way (a scratch ``temp``
    column is added and ``input_col_name`` is overwritten); the returned
    object is the result of dropping that scratch column. An empty partition
    is returned unchanged.
    """
    # An empty partition has no longest sequence, so the window count below
    # could not be computed; there is nothing to tokenise anyway.
    if len(df) == 0:
        return df

    # Scratch column that accumulates the joined k-mers. It is created from
    # the sequence column and then blanked.
    # NOTE(review): presumably the copy exists to give the column a string
    # dtype before the scalar assignment - confirm before simplifying.
    df['temp'] = df[input_col_name].copy()
    df['temp'] = ' '

    longest = int(df[input_col_name].str.len().max())
    # A sequence of length L contains L - k_mer + 1 windows, so the last
    # valid start offset is L - k_mer *inclusive*. The previous bound of
    # L - k_mer (exclusive) dropped the final k-mer of every sequence.
    for start in range(longest - k_mer + 1):
        window = df[input_col_name].str[start: start + k_mer].fillna(' ')
        # Partial windows at the tail of shorter sequences are not k-mers.
        too_short = window.str.len() < k_mer
        window[too_short] = ' '
        df['temp'] = df['temp'] + ' ' + window

    # Collapse the blanks left by the seed value and the masked windows.
    df['temp'] = df['temp'].str.normalize_spaces()
    df[input_col_name] = df['temp']
    df = df.drop(columns=['temp'])
    return df

def split_whitespace(df):
    """Split column ``0`` on whitespace and assign the pieces back into ``df``.

    The pieces produced by ``str.split(expand=True)`` are written through
    the ``.loc`` column slice ``0:150``.

    NOTE(review): this helper is only referenced from commented-out calls in
    this notebook. It reads column ``0`` rather than ``input_col_name`` and
    hard-codes ``150`` (presumably the configured ``max_seq_len``) - confirm
    the intended columns before re-enabling it.

    Parameters
    ----------
    df
        DataFrame that must contain a column labelled ``0``.

    Returns
    -------
    The same ``df`` object after the assignment.
    """
    pieces = df[0].str.split(expand=True)
    df.loc[:, 0:150] = pieces
    return df

## Step 4: data transformations

In [7]:
# First, open the parquet dataset stored at data_dir as a dask_cudf DataFrame.
df = dask_cudf.read_parquet(data_dir)

In [8]:
# here is a data sample
df.head()

Unnamed: 0,seq,label
0,GAAAGCGAGTACGACCCGTTCGGCGCCGCCCACAGCTCGACCTCGA...,1
1,AGCATCGGAACAAGATCCGCAAGGGCGAGTTCGAGCAGGCCGTCGC...,1
2,TCGAGCGGCTACCTGGCCGAGACGTTCCGGGCGGGGATCGAGAATG...,1
3,GCGCCCCGGTATGTCGAGATGGTTCGTAGGCTCGTCCAGCACGAAG...,1
4,GCGTCCGTCATCGCCGGCGGGAAGATCGCGATCTCGCTGCTCTCGG...,1


In [9]:
# Next, tokenise the sequence column according to the configured k-mer size.
# The chosen helper is applied to every partition through map_partitions.
if k_mer == 1:
    # 1-mers need no sliding window: a space is inserted after each base.
    df = df.map_partitions(add_whitespace)
elif k_mer > 1:
    df = df.map_partitions(get_kmers)
else:
    # A k_mer below 1 used to fall through silently, leaving the sequences
    # untokenised for the split step that follows. Fail loudly instead.
    raise ValueError(f"k_mer must be a positive integer, got {k_mer!r}")



In [10]:
df.head()



Unnamed: 0,seq,label
0,G A A A G C G A G T A C G A C C C G T T C G G ...,1
1,A G C A T C G G A A C A A G A T C C G C A A G ...,1
2,T C G A G C G G C T A C C T G G C C G A G A C ...,1
3,G C G C C C C G G T A T G T C G A G A T G G T ...,1
4,G C G T C C G T C A T C G C C G G C G G G A A ...,1


In [25]:
# Split the space-separated tokens into one column per token. The number of
# splits is limited to max_seq_len - k_mer.
max_splits = max_seq_len - k_mer
df_out = df[input_col_name].str.split(n=max_splits, expand=True)

In [26]:
df_out.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,140,141,142,143,144,145,146,147,148,149
0,G,A,A,A,G,C,G,A,G,T,...,,,,,,,,,,
1,A,G,C,A,T,C,G,G,A,A,...,,,,,,,,,,
2,T,C,G,A,G,C,G,G,C,T,...,C,G,C,,,,,,,
3,G,C,G,C,C,C,C,G,G,T,...,,,,,,,,,,
4,G,C,G,T,C,C,G,T,C,A,...,,,,,,,,,,


In [27]:
# Prefix every split column with the input column name (column 0 becomes
# "<input_col_name>_0", and so on), which also turns the labels into strings.
column_renames = {
    a_col: f"{input_col_name}_{a_col}"
    for a_col in df_out.columns
}
df_out = df_out.rename(columns=column_renames)

In [28]:
df_out.head()

Unnamed: 0,seq_0,seq_1,seq_2,seq_3,seq_4,seq_5,seq_6,seq_7,seq_8,seq_9,...,seq_140,seq_141,seq_142,seq_143,seq_144,seq_145,seq_146,seq_147,seq_148,seq_149
0,G,A,A,A,G,C,G,A,G,T,...,,,,,,,,,,
1,A,G,C,A,T,C,G,G,A,A,...,,,,,,,,,,
2,T,C,G,A,G,C,G,G,C,T,...,C,G,C,,,,,,,
3,G,C,G,C,C,C,C,G,G,T,...,,,,,,,,,,
4,G,C,G,T,C,C,G,T,C,A,...,,,,,,,,,,


In [29]:
# Copy the label column from the tokenised frame onto the split-token table.
df_out[label_col_name] = df[label_col_name]

In [30]:
df_out.head()

Unnamed: 0,seq_0,seq_1,seq_2,seq_3,seq_4,seq_5,seq_6,seq_7,seq_8,seq_9,...,seq_141,seq_142,seq_143,seq_144,seq_145,seq_146,seq_147,seq_148,seq_149,label
0,G,A,A,A,G,C,G,A,G,T,...,,,,,,,,,,1
1,A,G,C,A,T,C,G,G,A,A,...,,,,,,,,,,1
2,T,C,G,A,G,C,G,G,C,T,...,G,C,,,,,,,,1
3,G,C,G,C,C,C,C,G,G,T,...,,,,,,,,,,1
4,G,C,G,T,C,C,G,T,C,A,...,,,,,,,,,,1


## Step 5: save the data

In [15]:
%%time
# The final step is to save the split-token table as parquet.
# This might take some time.
# NOTE(review): data_dir is also the path df was read from with
# read_parquet, so this writes into the dataset it is reading - confirm
# that in-place replacement is intended and safe.
# NOTE(review): the writer rejects non-string column names ("parquet must
# have string column names"), so the rename cell has to run before this one.
_ = df_out.to_parquet(data_dir)

Key:       ('to-parquet-f90fa9020bfa438de5616655b39f9a6c', 0)
Function:  subgraph_callable-e7425658-09a1-4d99-b999-1bf6d41b
args:      ((0,), {'piece': ('/media/jcosme/Data/MarRef_parquet_10_cats/data/MarRef_parquet_10_cats/part.0.parquet', [0], [])})
kwargs:    {}
Exception: "ValueError('parquet must have string column names')"

Key:       ('to-parquet-f90fa9020bfa438de5616655b39f9a6c', 1)
Function:  subgraph_callable-e7425658-09a1-4d99-b999-1bf6d41b
args:      ((1,), {'piece': ('/media/jcosme/Data/MarRef_parquet_10_cats/data/MarRef_parquet_10_cats/part.1.parquet', [0], [])})
kwargs:    {}
Exception: "ValueError('parquet must have string column names')"



ValueError: parquet must have string column names

## Step 6: cleanup

In [16]:
# Drop the notebook's references to both dataframes.
del df, df_out

# Then shut the Dask cluster down through the client.
client.shutdown()

# Finally, close the client connection itself.
client.close()

## finished!