# Construct k-mers for given sequences

In [1]:
# Notebook configuration. Note: the "Parameters" cell that follows reassigns
# all four names, so the values below only apply when that cell is absent.

# Parquet file that is read into the sequences dataframe.
SEQUENCES_DATAFRAME_PATH = '../../data/combined/clustered/final/consensus_seq/Galson_2016.parquet'
# Parquet file the resulting k-mer count dataframe is written to.
KMERS_DATAFRAME_OUTPUT_PATH = '../../data/features_data/kmers/Galson_2016.parquet'
# Positional (iloc) index of the dataframe column holding the sequences.
SEQ_COL_IDX = 0
# k-mer length: used both to build the vocabulary and as the n-gram size.
K = 3

In [2]:
# Parameters
# These assignments override the configuration values set in the first cell.
# NOTE(review): this cell looks like it was injected by a notebook
# parameterization tool (e.g. papermill) — confirm before editing by hand.
SEQUENCES_DATAFRAME_PATH = (
    "data/combined/clustered/final/mode_seq/Galson_2015a_single.parquet"
)
KMERS_DATAFRAME_OUTPUT_PATH = (
    "data/features_data/kmers/cdr3_Galson_2015a_single.parquet"
)
K = 3
# Positional column index; in the frame loaded below, position 5 is `cdr3`.
SEQ_COL_IDX = 5


## Load data

In [3]:
import pandas as pd

# Read the whole input parquet file (all columns) into memory.
sequences_df = pd.read_parquet(SEQUENCES_DATAFRAME_PATH)

# Print the column/dtype summary, then display the first rows as the cell output.
sequences_df.info()
sequences_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5698104 entries, 0 to 5698103
Data columns (total 26 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   num_errors     object
 1   redundancy     int64 
 2   name           int64 
 3   seq            object
 4   v              object
 5   cdr3           object
 6   original_name  object
 7   errors         object
 8   j              object
 9   data           object
 10  Longitudinal   object
 11  Chain          object
 12  Author         object
 13  Isotype        object
 14  Age            object
 15  Size_igblastn  int64 
 16  Disease        object
 17  Link           object
 18  BSource        object
 19  BType          object
 20  Size           int64 
 21  Species        object
 22  Vaccine        object
 23  Subject        object
 24  cdr3_len       int64 
 25  Cluster_ID     int64 
dtypes: int64(6), object(20)
memory usage: 1.1+ GB


Unnamed: 0,num_errors,redundancy,name,seq,v,cdr3,original_name,errors,j,data,...,Disease,Link,BSource,BType,Size,Species,Vaccine,Subject,cdr3_len,Cluster_ID
0,1,1,134263,SVKVSCKASGYSFTSKGISWVRQAPGQGLEWMGWISTNSGDTNYAQ...,IGHV1-18*03,ARDVDHRFDH,98191,"[('37', 'K')]",IGHJ1*01,"{""fwh1"": {""24"": ""K"", ""25"": ""A"", ""26"": ""S"", ""20...",...,,https://www.sciencedirect.com/science/article/...,PBMC,Unsorted-B-Cells,320541,human,HepB,Subject-1776,10,0
1,0,1,88640,SVKVSCKASGYNSATFGLCWVRQAPGEGLEWIGWISGYNGNAYYVP...,IGHV1-18*03,ARRAPFGFDH,128329,[None],IGHJ1*01,"{""fwh1"": {""24"": ""K"", ""25"": ""A"", ""26"": ""S"", ""20...",...,,https://www.sciencedirect.com/science/article/...,PBMC,Unsorted-B-Cells,257563,human,HepB,Subject-1070,10,1
2,1,1,68354,SVKVSCKASGYPFTINGISWVRQAPGQGLEWMGWISANSGNTIYAY...,IGHV1-18*03,ARDRNYRFDH,67869,"[('69', 'Y')]",IGHJ1*01,"{""fwh1"": {""24"": ""K"", ""25"": ""A"", ""26"": ""S"", ""20...",...,,https://www.sciencedirect.com/science/article/...,PBMC,Unsorted-B-Cells,210056,human,HepB,Subject-1032,10,2
3,2,1,35076,SVKVSCKASGYTFTRNGISWVRQAPGQGLEWMGWISTNSGNTKYAQ...,IGHV1-18*03,ARDRSHSFVD,114706,"[('101', 'R'), ('123', 'S')]",IGHJ1*01,"{""fwh1"": {""24"": ""K"", ""25"": ""A"", ""26"": ""S"", ""20...",...,,https://www.sciencedirect.com/science/article/...,PBMC,Unsorted-B-Cells,297611,human,HepB,Subject-1848,10,3
4,1,1,9399,SVKVPCKASGYSFTSNGISWVRQAPGQGLEWLGWISINSGNTNYAQ...,IGHV1-18*03,ARDRLHSLDH,137143,"[('22', 'P')]",IGHJ1*01,"{""fwh1"": {""24"": ""K"", ""25"": ""A"", ""26"": ""S"", ""20...",...,,https://www.sciencedirect.com/science/article/...,PBMC,Unsorted-B-Cells,297611,human,HepB,Subject-1848,10,4


In [4]:
# Select the sequence column by position (not by name) and expose its
# underlying array for the vectorizer.
sequence_column = sequences_df.iloc[:, SEQ_COL_IDX]
sequences = sequence_column.values
sequences

array(['ARDVDHRFDH', 'ARRAPFGFDH', 'ARDRNYRFDH', ..., 'ASGGPNMDV',
       'AEGTSNMDV', 'ARVRGYRDG'], dtype=object)

## Construct k-mers

In [5]:
import itertools

# One-letter codes of the 20 amino acids considered, in alphabetical order.
AMINO_ACIDS = list('ACDEFGHIKLMNPQRSTVWY')
# Every string of length K over AMINO_ACIDS, in itertools.product order
# (i.e. AAA, AAC, ..., YYY for K = 3).
KMER_VOCABULARY = list(map(''.join, itertools.product(AMINO_ACIDS, repeat=K)))

vocabulary_size = len(KMER_VOCABULARY)
alphabet_size = len(AMINO_ACIDS)
print(f'There is {vocabulary_size} possible {K}-mers when considering {alphabet_size} possible amino acids.')

There is 8000 possible 3-mers when considering 20 possible amino acids.


In [6]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Character-level n-gram counter restricted to n-grams of exactly K characters
# that appear in the fixed vocabulary; input case is left untouched.
vectorizer = CountVectorizer(
    vocabulary=KMER_VOCABULARY,
    analyzer='char',
    ngram_range=(K, K),
    lowercase=False,
)

# The vocabulary is fixed up front, so only transform() is called (no fit);
# counts are then cast to uint16.
kmer_data = vectorizer.transform(sequences).astype(np.uint16)

print(f'Constructed {K}-mers data shape: {kmer_data.shape}')

Constructed 3-mers data shape: (5698104, 8000)


## Save k-mers data

In [7]:
# Column labels for the k-mer count table, taken from the vectorizer so they
# follow the same order as the columns of `kmer_data`.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older versions.
# The names are already plain strings, so no per-name joining is needed.
if hasattr(vectorizer, 'get_feature_names_out'):
    features_names = list(vectorizer.get_feature_names_out())
else:
    features_names = vectorizer.get_feature_names()

# NOTE: `.toarray()` materialises the full dense count matrix in memory,
# which is very large for big inputs (rows x vocabulary size x 2 bytes).
res_df = pd.DataFrame(data=kmer_data.toarray(), index=sequences_df.index, columns=features_names)

# Print the dtype/memory summary, then display the first rows as the cell output.
res_df.info()
res_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5698104 entries, 0 to 5698103
Columns: 8000 entries, AAA to YYY
dtypes: uint16(8000)
memory usage: 84.9 GB


Unnamed: 0,AAA,AAC,AAD,AAE,AAF,AAG,AAH,AAI,AAK,AAL,...,YYM,YYN,YYP,YYQ,YYR,YYS,YYT,YYV,YYW,YYY
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Write the k-mer count dataframe (including its index) to the configured
# output parquet path.
res_df.to_parquet(KMERS_DATAFRAME_OUTPUT_PATH)

## Basic k-mers overview

In [9]:
# Total count of every k-mer over all sequences; show the largest totals.
kmer_totals = res_df.sum()
kmer_totals.sort_values(ascending=False).head()

YYY    1774358
FDY    1179093
MDV    1101436
ARD    1031142
GMD     768731
dtype: int64

In [10]:
# Average count of every k-mer per sequence; show the largest averages.
kmer_means = res_df.mean()
kmer_means.sort_values(ascending=False).head()

YYY    0.311394
FDY    0.206927
MDV    0.193299
ARD    0.180962
GMD    0.134910
dtype: float64