# Construct k-mers for given sequences

In [1]:
# Default configuration for this notebook. Every name below is reassigned by
# the "# Parameters" cell that follows.

# Zero-based position of the sequence column (used with `DataFrame.iloc`).
SEQ_COL_IDX = 0
# Length k of the k-mers to construct.
K = 3

# Parquet file the sequences are read from.
SEQUENCES_DATAFRAME_PATH = '../../data/combined/clustered/final/consensus_seq/Galson_2016.parquet'
# Parquet file the resulting k-mer table is written to.
KMERS_DATAFRAME_OUTPUT_PATH = '../../data/features_data/kmers/Galson_2016.parquet'

In [2]:
# Parameters
# Run-specific values; these reassign the defaults set in the cell above.
K = 3
SEQ_COL_IDX = 5  # position 5 is the `cdr3` column in the loaded frame's schema
SEQUENCES_DATAFRAME_PATH = "data/combined/clustered/final/mode_seq/Galson_2016_single.parquet"
KMERS_DATAFRAME_OUTPUT_PATH = "data/features_data_g2016/kmers_cdr3/Galson_2016_single.parquet"


## Load data

In [3]:
import pandas as pd

# Read the input sequences table from the configured parquet file.
sequences_df = pd.read_parquet(SEQUENCES_DATAFRAME_PATH)

# Print the schema summary (columns, non-null counts, dtypes), then let the
# cell's last expression render the first rows as a preview.
sequences_df.info()
sequences_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1682432 entries, 0 to 1682431
Data columns (total 26 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   num_errors     1682432 non-null  object
 1   redundancy     1682432 non-null  int64 
 2   name           1682432 non-null  int64 
 3   seq            1682432 non-null  object
 4   v              1682432 non-null  object
 5   cdr3           1682432 non-null  object
 6   original_name  1682432 non-null  object
 7   errors         1682432 non-null  object
 8   j              1682432 non-null  object
 9   data           1682432 non-null  object
 10  Longitudinal   1682432 non-null  object
 11  Chain          1682432 non-null  object
 12  Author         1682432 non-null  object
 13  Isotype        1682432 non-null  object
 14  Age            1682432 non-null  object
 15  Size_igblastn  1682432 non-null  int64 
 16  Disease        1682432 non-null  object
 17  Link           1682432 non-

Unnamed: 0,num_errors,redundancy,name,seq,v,cdr3,original_name,errors,j,data,...,Disease,Link,BSource,BType,Subject,Species,Vaccine,Size,cdr3_len,Cluster_ID
0,0,1,63311,SLRLSCAASGFIFSSYVMSWVRQAPGKGLEWVSAIIGSGGTTFYAD...,IGHV3-23*01,AKDPVEVATLFPHQDDVTNWFDP,108143,[None],IGHJ5*02,"{""fwh1"": {""24"": ""A"", ""25"": ""A"", ""26"": ""S"", ""20...",...,,https://genomemedicine.biomedcentral.com/artic...,PBMC,HepB+B-cells,Subject-2492,human,HepB,231143,23,0
1,1,1,105558,SLRLSCAASEFTFSSYAVSWVRQAPGKGLEWVSAGSGTGGIKYYAD...,IGHV3-23*01,ARGPWGYCGGGDCPFPSYNWFDP,105427,"[('56', 'G')]",IGHJ5*02,"{""fwh1"": {""24"": ""A"", ""25"": ""A"", ""26"": ""S"", ""20...",...,,https://genomemedicine.biomedcentral.com/artic...,PBMC,Unsorted-B-Cells,Subject-2277,human,HepB,198956,23,1
2,0,1,89152,SLRLSCAASGFTFSGFAMGWVRQAPGKGLEWVSSLSDSGANRYYAD...,IGHV3-23*01,AKELGGGWFFGEVVSPRHNWFDP,121789,[None],IGHJ5*02,"{""fwh1"": {""24"": ""A"", ""25"": ""A"", ""26"": ""S"", ""20...",...,,https://genomemedicine.biomedcentral.com/artic...,PBMC,Unsorted-B-Cells,Subject-2335,human,HepB,168255,23,2
3,1,1,641,SLRLSCAAFGFTFSNHAMNWVRQAPGKGLEWVSGIRGGGQSSFYAD...,IGHV3-23*01,AREIGYCTVSGPNPKKRGCWFDP,98214,"[('63', 'Q')]",IGHJ5*02,"{""fwh1"": {""24"": ""A"", ""25"": ""A"", ""26"": ""F"", ""20...",...,,https://genomemedicine.biomedcentral.com/artic...,PBMC,Unsorted-B-Cells,Subject-2752,human,HepB,171405,23,3
4,0,1,25941,SLRLSCAGSGFTFSSFAMSWVRQAPGKGLEWVSAISASGASTYYAD...,IGHV3-23*01,AKDMRREGILRTMIRGVSCWFDP,86341,[None],IGHJ5*02,"{""fwh1"": {""24"": ""A"", ""25"": ""G"", ""26"": ""S"", ""20...",...,,https://genomemedicine.biomedcentral.com/artic...,PBMC,HepB+B-cells,Subject-2277,human,HepB,121330,23,4


In [4]:
# Select the sequence column by position (SEQ_COL_IDX) and materialise it as a
# NumPy array. `.to_numpy()` is used instead of `.values` because pandas
# recommends it: `.values` may hand back an extension array rather than an
# ndarray depending on the column's dtype.
sequence_column = sequences_df.iloc[:, SEQ_COL_IDX]
sequences = sequence_column.to_numpy()
sequences

array(['AKDPVEVATLFPHQDDVTNWFDP', 'ARGPWGYCGGGDCPFPSYNWFDP',
       'AKELGGGWFFGEVVSPRHNWFDP', ..., 'APPGEYGMDV', 'VRENRWGYDD',
       'VKDNWNYDSV'], dtype=object)

## Construct k-mers

In [5]:
import itertools

# One-letter amino-acid codes making up the k-mer alphabet (20 symbols,
# alphabetical order).
AMINO_ACIDS = list('ACDEFGHIKLMNPQRSTVWY')

# Every length-K string over the alphabet. itertools.product yields the tuples
# in the order of AMINO_ACIDS, so the vocabulary runs from 'A'*K to 'Y'*K.
KMER_VOCABULARY = list(map(''.join, itertools.product(AMINO_ACIDS, repeat=K)))

print(f'There is {len(KMER_VOCABULARY)} possible {K}-mers when considering {len(AMINO_ACIDS)} possible amino acids.')

There is 8000 possible 3-mers when considering 20 possible amino acids.


In [6]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Count character n-grams of exactly length K, restricted to the fixed k-mer
# vocabulary; lowercase=False keeps the sequence text's case as-is.
vectorizer = CountVectorizer(
    analyzer='char',
    ngram_range=(K, K),
    vocabulary=KMER_VOCABULARY,
    lowercase=False,
)

# The vocabulary is supplied up front, so transform is called without a prior
# fit; the resulting counts are cast to unsigned 16-bit integers.
kmer_data = (
    vectorizer
    .transform(sequences)
    .astype(np.uint16)
)

print(f'Constructed {K}-mers data shape: {kmer_data.shape}')

Constructed 3-mers data shape: (1682432, 8000)


## Save k-mers data

In [7]:
# Column labels are taken from the vectorizer so they always match the column
# order of the count matrix. `get_feature_names()` was deprecated in
# scikit-learn 1.0 and removed in 1.2, so prefer `get_feature_names_out()`
# when it exists and fall back to the legacy method on older installations.
# The lookup is done once and the result is actually used for the columns
# (previously `features_names` was computed and then ignored).
if hasattr(vectorizer, 'get_feature_names_out'):
    features_names = list(vectorizer.get_feature_names_out())
else:
    features_names = vectorizer.get_feature_names()

# NOTE: `toarray()` densifies the sparse counts into a full
# (n_sequences x n_kmers) uint16 array; this is the bulk of the notebook's
# memory use. The row index mirrors `sequences_df` so rows stay aligned.
res_df = pd.DataFrame(data=kmer_data.toarray(), index=sequences_df.index, columns=features_names)

# Print the frame summary, then render the first rows as the cell output.
res_df.info()
res_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1682432 entries, 0 to 1682431
Columns: 8000 entries, AAA to YYY
dtypes: uint16(8000)
memory usage: 25.1 GB


Unnamed: 0,AAA,AAC,AAD,AAE,AAF,AAG,AAH,AAI,AAK,AAL,...,YYM,YYN,YYP,YYQ,YYR,YYS,YYT,YYV,YYW,YYY
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Make sure the destination directory exists so the write cannot fail on a
# missing folder after the expensive k-mer construction.
from pathlib import Path

Path(KMERS_DATAFRAME_OUTPUT_PATH).parent.mkdir(parents=True, exist_ok=True)

# Persist the k-mer count table to the configured parquet file.
res_df.to_parquet(KMERS_DATAFRAME_OUTPUT_PATH)

## Basic k-mers overview

In [9]:
# Column-wise sum of the k-mer count table, shown largest first (top rows only).
kmer_totals = res_df.sum()
kmer_totals.sort_values(ascending=False).head()

ARD    241775
FDY    230037
MDV    214355
YYY    212891
ARG    187222
dtype: int64

In [10]:
# Column-wise mean of the k-mer count table (average count per row), shown
# largest first (top rows only).
kmer_means = res_df.mean()
kmer_means.sort_values(ascending=False).head()

ARD    0.143706
FDY    0.136729
MDV    0.127408
YYY    0.126538
ARG    0.111281
dtype: float64