In [26]:
import numpy as np
import pandas as pd
from stringphylo.kebab_wraps import spectrum_kernel, gappy_pair_kernel, mismatch_kernel, compute_kernel

In [33]:
from rpy2.robjects.packages import importr, PackageNotInstalledError
from rpy2.robjects.vectors import StrVector

# Load the R packages through rpy2 so their functions are callable from Python.
# NOTE(review): `kebabs` is not referenced directly in the visible cells — the
# kernel constructors come from stringphylo.kebab_wraps; confirm it is needed here.
kebabs = importr('kebabs')
# Biostrings provides the DNAStringSet constructor used by compute_kernel.
biostrings = importr('Biostrings')

In [51]:
alphabet = ["A", "G", "T", "C"]
n_OTUs = 300
seq_length = 300  # number of bases in each simulated representative sequence
rng = np.random.default_rng(1234256)

# Simulate one random DNA string per OTU. Each base is drawn independently from
# `alphabet` (rng.choice without `p` samples uniformly). The index is derived
# from n_OTUs so that the labels always match the number of sequences.
repr_seqs = pd.Series(
    ["".join(rng.choice(alphabet, size=seq_length).tolist()) for _ in range(n_OTUs)],
    index=[f"OTU{x}" for x in range(n_OTUs)],
)
repr_seqs

OTU0      AAGTAAGAAACAACTGACTGCCCATAATTTTTTATCCTCCCCAGAC...
OTU1      TGCACTACACACATACTTTTGTCAGACAAATAATTAAACACGGTGC...
OTU2      GGGTAGATAGAACGATGTGCCAAGAAGTAACGGGGGCTTTTCGATA...
OTU3      AATCTGGAAAGTTCCGCGTATTCGTGGTGCAACCCGTAACGGTGGC...
OTU4      TTGTTTACCCCTTCATCACGATTAAATTGTTTGTACACACTTCTAG...
                                ...                        
OTU295    TTCAGACACTCAAGAGCTTTTGCAGACTGCGGCCCCAGTAGATCCG...
OTU296    CACCATACAAAATGCCAACTGAGCCCTATTATCGCCATTTAAAACT...
OTU297    CATTAGCTTTAACGAAACGACAGAATTTACGAACATGATCGAAAAA...
OTU298    AGCACAGCCGTGCAAACTTGAAGAGTACTACCTTAGAACGCAGATG...
OTU299    CCTGGACTATTCGTGTTAAGAAATATTTCGAGTCTCAGCAGAGTAA...
Length: 300, dtype: object

In [71]:
# Build a gappy pair kernel with its default parameters; the cell output shows
# the rpy2 wrapper around the resulting R 'GappyPairKernel' object.
gappy_pair_kernel()

<rpy2.robjects.functions.SignatureTranslatedFunction object at 0x7fb500517eb0> [RTYPES.CLOSXP]
R classes: ('GappyPairKernel',)

In [70]:
def compute_kernel(kern_obj, repr_strings):
    """Evaluate a kebabs string kernel on a set of representative sequences.

    Args:
        kern_obj (rpy2.robjects.functions.SignatureTranslatedFunction): R kernel
            object from the kebabs package, as produced by spectrum_kernel,
            gappy_pair_kernel or mismatch_kernel.
        repr_strings (list, np.ndarray or pd.Series): the representative
            sequences, one string per OTU.

    Returns:
        pd.DataFrame: the Q matrix (p x p for p OTUs), with rows and columns
        labelled by OTU name. Names come from the index of repr_strings when it
        is a pd.Series; otherwise they are generated as OTU0, OTU1, ...
    """
    # NOTE(review): this definition shadows the compute_kernel imported from
    # stringphylo.kebab_wraps at the top of the notebook.
    labels = (
        repr_strings.index.to_numpy()
        if isinstance(repr_strings, pd.Series)
        else [f"OTU{i}" for i in range(len(repr_strings))]
    )

    # Wrap the Python strings as an R character vector, then as a Biostrings
    # DNAStringSet, which is what the kernel object is called on.
    dna_set = biostrings.DNAStringSet(StrVector(repr_strings))
    kernel_values = np.array(kern_obj(dna_set))

    return pd.DataFrame(kernel_values, index=labels, columns=labels)

# Example: Q matrix of the mismatch kernel (k=3) on the simulated sequences.
compute_kernel(mismatch_kernel(k=3), repr_seqs)

Unnamed: 0,OTU0,OTU1,OTU2,OTU3,OTU4,OTU5,OTU6,OTU7,OTU8,OTU9,...,OTU290,OTU291,OTU292,OTU293,OTU294,OTU295,OTU296,OTU297,OTU298,OTU299
OTU0,1.000000,0.988786,0.952741,0.980025,0.986722,0.988377,0.971753,0.985794,0.986520,0.978447,...,0.979211,0.986914,0.990372,0.995050,0.961659,0.990229,0.990972,0.977468,0.973639,0.991404
OTU1,0.988786,1.000000,0.976018,0.989990,0.983780,0.993003,0.988152,0.993095,0.993511,0.988013,...,0.990985,0.985434,0.984677,0.988991,0.983923,0.991789,0.984149,0.987635,0.983109,0.994297
OTU2,0.952741,0.976018,1.000000,0.980415,0.953144,0.974097,0.990895,0.972364,0.973923,0.986536,...,0.981381,0.951272,0.948682,0.955256,0.990416,0.967653,0.949084,0.981931,0.977906,0.977772
OTU3,0.980025,0.989990,0.980415,1.000000,0.981551,0.990251,0.988901,0.988861,0.984345,0.984466,...,0.991410,0.984518,0.982573,0.979801,0.977591,0.987331,0.979895,0.983142,0.984180,0.990798
OTU4,0.986722,0.983780,0.953144,0.981551,1.000000,0.982862,0.969644,0.988676,0.981850,0.971512,...,0.978203,0.991163,0.986257,0.988230,0.955338,0.988404,0.975957,0.964426,0.958492,0.985693
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
OTU295,0.990229,0.991789,0.967653,0.987331,0.988404,0.991580,0.981817,0.992886,0.990445,0.984204,...,0.990221,0.990215,0.988817,0.989282,0.972779,1.000000,0.987742,0.980393,0.979592,0.992757
OTU296,0.990972,0.984149,0.949084,0.979895,0.975957,0.989556,0.969150,0.979498,0.981766,0.973841,...,0.980725,0.983853,0.990846,0.988017,0.957925,0.987742,1.000000,0.979156,0.982108,0.987338
OTU297,0.977468,0.987635,0.981931,0.983142,0.964426,0.986545,0.987563,0.978500,0.985416,0.986971,...,0.987934,0.965103,0.971205,0.976851,0.983507,0.980393,0.979156,1.000000,0.986288,0.989038
OTU298,0.973639,0.983109,0.977906,0.984180,0.958492,0.987769,0.986177,0.977717,0.979697,0.985333,...,0.985095,0.967857,0.974380,0.972260,0.979079,0.979592,0.982108,0.986288,1.000000,0.985301
