# Construct k-mers for given sequences

In [None]:
# Notebook parameters.

# Parquet file the input sequences dataframe is read from.
SEQUENCES_DATAFRAME_PATH = (
    '../../data/combined/clustered/final/consensus_seq/Galson_2016.parquet'
)
# Parquet file the resulting k-mer counts dataframe is written to.
KMERS_DATAFRAME_OUTPUT_PATH = (
    '../../data/features_data/kmers/Galson_2016.parquet'
)
# Length of the k-mers to count.
K = 3
# Positional index of the dataframe column that holds the sequences.
SEQ_COL_IDX = 0

## Load data

In [None]:
import pandas as pd

# Read the input dataframe from the configured parquet file.
sequences_df = pd.read_parquet(path=SEQUENCES_DATAFRAME_PATH)

# Print the column/dtype summary, then display the first rows.
sequences_df.info()
sequences_df.head()

In [None]:
# Select the sequence column by position and keep only its raw values.
sequence_column = sequences_df.iloc[:, SEQ_COL_IDX]
sequences = sequence_column.values
sequences

## Construct k-mers

In [None]:
import itertools

# One-letter amino acid codes, in alphabetical order.
AMINO_ACIDS = list('ACDEFGHIKLMNPQRSTVWY')

# Every string of length K over AMINO_ACIDS, in itertools.product order.
KMER_VOCABULARY = list(map(''.join, itertools.product(AMINO_ACIDS, repeat=K)))

print(f'There is {len(KMER_VOCABULARY)} possible {K}-mers when considering {len(AMINO_ACIDS)} possible amino acids.')

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Count character n-grams of exactly length K, restricted to the fixed
# KMER_VOCABULARY and without lowercasing the input.
vectorizer = CountVectorizer(
    analyzer='char',
    ngram_range=(K, K),
    lowercase=False,
    vocabulary=KMER_VOCABULARY,
)

# One row per sequence, one column per vocabulary k-mer; counts are stored
# as unsigned 16-bit integers.
kmer_counts = vectorizer.transform(sequences)
kmer_data = kmer_counts.astype(np.uint16)

print(f'Constructed {K}-mers data shape: {kmer_data.shape}')

## Save k-mers data

In [None]:
# Column labels for the count matrix, taken from the vectorizer.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older versions.
if hasattr(vectorizer, 'get_feature_names_out'):
    features_names = list(vectorizer.get_feature_names_out())
else:
    features_names = list(vectorizer.get_feature_names())

# Dense dataframe of k-mer counts: rows share the index of sequences_df,
# columns are the k-mer feature names.
res_df = pd.DataFrame(data=kmer_data.toarray(), index=sequences_df.index, columns=features_names)

res_df.info()
res_df.head()

In [None]:
# Write the k-mer counts dataframe to the configured output parquet file.
res_df.to_parquet(KMERS_DATAFRAME_OUTPUT_PATH)

## Basic k-mers overview

In [None]:
# Total count of each k-mer over all rows; show the largest totals first.
kmer_totals = res_df.sum()
kmer_totals.sort_values(ascending=False).head()

In [None]:
# Mean count of each k-mer per row; show the largest means first.
kmer_means = res_df.mean()
kmer_means.sort_values(ascending=False).head()