## Quick start with Sylphy library

This notebook demonstrates how to use the Sylphy library to encode protein sequences, either by using the encoder classes directly or by using the factory to instantiate an encoder.

The encoder strategies available on Sylphy are:

- OneHot Encoder
- Ordinal Encoder
- Frequency Encoder
- k-Mers Encoder
- Physicochemical Encoder
- FFT Encoder

In [9]:
import warnings

warnings.filterwarnings("ignore")

In [5]:
import pandas as pd

from sylphy.sequence_encoder import (
    FFTEncoder,
    FrequencyEncoder,
    KMersEncoders,
    OneHotEncoder,
    OrdinalEncoder,
    PhysicochemicalEncoder,
)

- Simulating fake data

In [2]:
# Three short toy protein sequences; identifiers are assigned 1, 2, 3 in order.
data_sequences = [
    {"id_seq": seq_id, "sequence": residues}
    for seq_id, residues in enumerate(
        ("AFTGTGGGSSGHYT", "LPLPLKKLKMMNVN", "SASDRRDDQWSED"),
        start=1,
    )
]

# One row per record, with columns `id_seq` and `sequence`.
df_sequences = pd.DataFrame(data=data_sequences)
df_sequences

Unnamed: 0,id_seq,sequence
0,1,AFTGTGGGSSGHYT
1,2,LPLPLKKLKMMNVN
2,3,SASDRRDDQWSED


- Using Sylphy to produce a numerical representation of the sequences, starting with the one-hot encoder

In [3]:
# One-hot encode the `sequence` column of the toy frame.
# With max_length=20 the recorded output has columns p_0 ... p_399 plus the
# original `sequence` column (presumably 20 positions x 20 residues -- TODO confirm).
one_hot_encoder = OneHotEncoder(
    dataset=df_sequences,
    sequence_column="sequence",
    max_length=20,
    debug=True,
)

# Run the encoding, then display the resulting table.
one_hot_encoder.run_process()
one_hot_encoder.coded_dataset

Unnamed: 0,p_0,p_1,p_2,p_3,p_4,p_5,p_6,p_7,p_8,p_9,...,p_391,p_392,p_393,p_394,p_395,p_396,p_397,p_398,p_399,sequence
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,AFTGTGGGSSGHYT
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,LPLPLKKLKMMNVN
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,SASDRRDDQWSED


- Applying ordinal encoder

In [4]:
# Ordinal encoding of the `sequence` column: one integer per position.
# With max_length=20 the recorded output has columns p_0 ... p_19, and the
# positions past the end of each sequence hold 0.
# NOTE(review): the first residue "A" of sequence 1 is also shown as 0, so the
# trailing zeros may be indistinguishable from a real residue -- confirm.
ordinal_encoder = OrdinalEncoder(
    dataset=df_sequences,
    sequence_column="sequence",
    max_length=20,
    debug=True,
)

# Run the encoding, then display the resulting table.
ordinal_encoder.run_process()
ordinal_encoder.coded_dataset

Unnamed: 0,p_0,p_1,p_2,p_3,p_4,p_5,p_6,p_7,p_8,p_9,...,p_11,p_12,p_13,p_14,p_15,p_16,p_17,p_18,p_19,sequence
0,0,4,16,5,16,5,5,5,15,15,...,6,19,16,0,0,0,0,0,0,AFTGTGGGSSGHYT
1,10,12,10,12,10,9,9,10,9,11,...,8,17,8,0,0,0,0,0,0,LPLPLKKLKMMNVN
2,15,0,15,2,14,14,2,2,13,18,...,3,2,0,0,0,0,0,0,0,SASDRRDDQWSED


- Frequency encoder

In [7]:
# Frequency encoding of the `sequence` column. No max_length is passed here;
# the recorded output has one `freq_<residue>` column per amino acid letter
# plus the original `sequence` column.
frequency_encoder = FrequencyEncoder(
    dataset=df_sequences,
    sequence_column="sequence",
    debug=True,
)

# Run the encoding, then display the resulting table.
frequency_encoder.run_process()
frequency_encoder.coded_dataset

Unnamed: 0,freq_A,freq_C,freq_D,freq_E,freq_F,freq_G,freq_H,freq_I,freq_N,freq_K,...,freq_M,freq_P,freq_Q,freq_R,freq_S,freq_T,freq_V,freq_W,freq_Y,sequence
0,0.071429,0.0,0.0,0.0,0.071429,0.357143,0.071429,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.142857,0.214286,0.0,0.0,0.071429,AFTGTGGGSSGHYT
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.214286,...,0.142857,0.142857,0.0,0.0,0.0,0.0,0.071429,0.0,0.0,LPLPLKKLKMMNVN
2,0.076923,0.0,0.307692,0.076923,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.076923,0.153846,0.230769,0.0,0.0,0.076923,0.0,SASDRRDDQWSED


- k-Mers demo

In [11]:
# k-mer encoding of the `sequence` column using k-mers of length 4.
# The recorded output has one column per 4-letter k-mer (AFTG, ASDR, ...) and
# fractional weights rather than raw counts
# (presumably a normalized weighting scheme -- TODO confirm against the library).
kmer_encoder = KMersEncoders(
    dataset=df_sequences,
    sequence_column="sequence",
    size_kmer=4,
    debug=True,
)

# Run the encoding, then display the resulting table.
kmer_encoder.run_process()
kmer_encoder.coded_dataset

Unnamed: 0,AFTG,ASDR,DDQW,DQWS,DRRD,FTGT,GGGS,GGSS,GHYT,GSSG,...,RDDQ,RRDD,SASD,SDRR,SGHY,SSGH,TGGG,TGTG,WSED,sequence
0,0.301511,0.0,0.0,0.0,0.0,0.301511,0.301511,0.301511,0.301511,0.301511,...,0.0,0.0,0.0,0.0,0.301511,0.301511,0.301511,0.301511,0.0,AFTGTGGGSSGHYT
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,LPLPLKKLKMMNVN
2,0.0,0.316228,0.316228,0.316228,0.316228,0.0,0.0,0.0,0.0,0.0,...,0.316228,0.316228,0.316228,0.316228,0.0,0.0,0.0,0.0,0.316228,SASDRRDDQWSED


- Physicochemical property encoders demo

In [12]:
# Physicochemical encoding of the `sequence` column: one float per position.
# With max_length=20 the recorded output has columns p_0 ... p_19, and the
# positions past the end of each sequence hold 0.0.
# No property is selected explicitly, so the encoder's default is used
# (which property that is cannot be told from this notebook -- TODO confirm).
property_encoder = PhysicochemicalEncoder(
    dataset=df_sequences,
    sequence_column="sequence",
    max_length=20,
    debug=True,
)

# Run the encoding, then display the resulting table.
# `property_encoder.coded_dataset` is reused as the input of the FFT cell.
property_encoder.run_process()
property_encoder.coded_dataset

Unnamed: 0,p_0,p_1,p_2,p_3,p_4,p_5,p_6,p_7,p_8,p_9,...,p_11,p_12,p_13,p_14,p_15,p_16,p_17,p_18,p_19,sequence
0,4.35,4.66,4.35,3.97,4.35,3.97,3.97,3.97,4.5,4.5,...,4.63,4.6,4.35,0.0,0.0,0.0,0.0,0.0,0.0,AFTGTGGGSSGHYT
1,4.17,4.44,4.17,4.44,4.17,4.36,4.36,4.17,4.36,4.52,...,4.75,3.95,4.75,0.0,0.0,0.0,0.0,0.0,0.0,LPLPLKKLKMMNVN
2,4.5,4.35,4.5,4.76,4.38,4.38,4.76,4.76,4.37,4.7,...,4.29,4.76,0.0,0.0,0.0,0.0,0.0,0.0,0.0,SASDRRDDQWSED


- FFT-based encoder demo

In [13]:
# FFT-based encoding. Unlike the other encoders, the input is not the raw
# sequence frame but the already-encoded table held in
# `property_encoder.coded_dataset`, so the physicochemical cell must run first.
# The recorded output has columns p_0 ... p_15 plus `sequence`.
fft_encoder = FFTEncoder(
    dataset=property_encoder.coded_dataset,
    sequence_column="sequence",
    debug=True,
)

# Run the transform, then display the resulting table.
fft_encoder.run_process()
fft_encoder.coded_dataset

Unnamed: 0,p_0,p_1,p_2,p_3,p_4,p_5,p_6,p_7,p_8,p_9,p_10,p_11,p_12,p_13,p_14,p_15,sequence
0,60.14,42.53899,7.194392,13.781904,8.675338,5.334341,7.632733,1.629274,7.380257,2.769735,3.696036,2.658813,2.624597,2.885261,2.397628,4.557881,AFTGTGGGSSGHYT
1,61.13,43.633426,8.413949,12.792859,8.424005,4.696131,6.815929,1.607118,5.928246,0.647388,5.047883,2.30874,4.161243,4.432522,2.16976,4.820055,LPLPLKKLKMMNVN
2,59.01,44.382344,13.159132,9.530566,10.575677,1.117287,7.881682,3.268138,4.266954,4.217733,2.390644,6.135589,2.163807,4.098288,4.361004,1.147902,SASDRRDDQWSED


### Alternatively, using the factory function

In [15]:
from sylphy.sequence_encoder.factory import create_encoder

In [18]:
# Build the encoder through the factory: the first argument ("onehot") selects
# the encoder, and the remaining keyword arguments are the encoder settings.
# NOTE(review): the direct-constructor cells pass `debug=...`, while this call
# passes `debug_mode=False` -- confirm which keyword is intended here.
one_hot = create_encoder(
    "onehot",
    dataset=df_sequences,
    sequence_column="sequence",
    max_length=20,
    debug_mode=False,
)

# Run the encoding, then display the resulting table; the recorded output has
# the same p_0 ... p_399 columns as the direct OneHotEncoder cell.
one_hot.run_process()
one_hot.coded_dataset

Unnamed: 0,p_0,p_1,p_2,p_3,p_4,p_5,p_6,p_7,p_8,p_9,...,p_391,p_392,p_393,p_394,p_395,p_396,p_397,p_398,p_399,sequence
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,AFTGTGGGSSGHYT
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,LPLPLKKLKMMNVN
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,SASDRRDDQWSED
