In [13]:
import h5py

from embedding_sequence.pfcgr.pfcgr import PFCGR

In [2]:
# Create the PFCGR encoder used by every example in this cell.
pfcgr = PFCGR(k=4)  # Using k=4 for demonstration

# Example DNA sequence (also reused by later cells in this notebook)
test_sequence = "ATCGATCGATCGATCGAAATCGATCGATCGATCGAAATCGATCGATCGATCGAA"

# Test input validation: an empty sequence is expected to be rejected.
try:
    pfcgr.generate_pfcgr("")  # Should raise ValueError
except ValueError as e:
    print(f"Caught expected error: {e}")
else:
    # Without this branch a missing validation check would go unnoticed,
    # because the try-body would simply fall through with no output.
    print("WARNING: empty sequence was accepted; expected ValueError")

# Generate traditional FCGR (a single 2-D matrix)
fcgr_matrix = pfcgr.generate_fcgr(test_sequence)
print(f"FCGR matrix shape: {fcgr_matrix.shape}")

# Generate PFCGR in different formats
# 'channels': one stacked array exposing .shape
pfcgr_channels = pfcgr.generate_pfcgr(test_sequence, return_format='channels')
print(f"PFCGR channels shape: {pfcgr_channels.shape}")

# Test with presence mask (include_mask=True)
pfcgr_with_mask = pfcgr.generate_pfcgr(test_sequence, return_format='channels', include_mask=True)
print(f"PFCGR with mask shape: {pfcgr_with_mask.shape}")

# Test different normalization methods
pfcgr_zscore = pfcgr.generate_pfcgr(test_sequence, normalization='zscore')
print(f"PFCGR with z-score normalization shape: {pfcgr_zscore.shape}")

# 'separate': a mapping whose keys are listed below
pfcgr_separate = pfcgr.generate_pfcgr(test_sequence, return_format='separate')
print(f"PFCGR separate matrices: {list(pfcgr_separate.keys())}")

# 'tabular': a flat collection whose length is reported as the feature count
pfcgr_tabular = pfcgr.generate_pfcgr(test_sequence, return_format='tabular')
print(f"PFCGR tabular features: {len(pfcgr_tabular)} features")

# Get feature names
feature_names = pfcgr.get_feature_names()
print(f"First 5 feature names: {feature_names[:5]}")

# Test with sequence containing non-ACGT characters (here a run of 'N')
mixed_sequence = "ATCGATNNNGATCGATCGAA"
pfcgr_mixed = pfcgr.generate_pfcgr(mixed_sequence)
print(f"PFCGR from mixed sequence shape: {pfcgr_mixed.shape}")

Caught expected error: Sequence cannot be empty
FCGR matrix shape: (16, 16)
PFCGR channels shape: (5, 16, 16)
PFCGR with mask shape: (6, 16, 16)
PFCGR with z-score normalization shape: (5, 16, 16)
PFCGR separate matrices: ['frequency', 'mean', 'std', 'skewness', 'kurtosis']
PFCGR tabular features: 1280 features
First 5 feature names: ['AAAA_frequency', 'AAAA_mean', 'AAAA_std', 'AAAA_skewness', 'AAAA_kurtosis']
PFCGR from mixed sequence shape: (5, 16, 16)


In [4]:
# Display the matrix returned by pfcgr.generate_fcgr(test_sequence) in the PFCGR demo cell.
fcgr_matrix

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.16666667, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.16666667, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 

In [10]:
from embedding_sequence.fcgr.fcgr import FCGR

# Build a standalone FCGR encoder with the same k as the PFCGR instance
# (presumably to compare its output with pfcgr.generate_fcgr — confirm intent).
fcgr = FCGR(k=4)
# The FCGR instance is called directly on the sequence; test_sequence is
# defined in the PFCGR demo cell, so that cell must have been run first.
matrix = fcgr(test_sequence)
print(matrix.shape)

(16, 16)


In [11]:
# Display the matrix produced by the standalone FCGR encoder.
# NOTE(review): the recorded values here look like whole-number counts, while
# fcgr_matrix from PFCGR holds fractional values — confirm the two are expected
# to differ only by normalization.
matrix

array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  2.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  3.,
         0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  2.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  9.,  0.,  0., 12.,  0.,  0.,  0.,
         0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  2.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.],
       [ 0.,  0.,  0., 12.,  0

In [27]:
import numpy as np

# Raw string: the original literal mixed escaped backslashes with invalid
# escapes (\A, \T, \d), which only worked because Python passes unknown
# escapes through and which trigger a warning on modern interpreters.
# The resulting path value is unchanged.
# NOTE(review): this is a machine-specific absolute path; consider moving it
# to a configurable data directory.
DATA_H5_PATH = r"C:\Users\Admin\Temp\1200_1800\fold_1\test\data.h5"

# Reset first so a failed load cannot leave a stale array from an earlier run.
vectors = None
with h5py.File(DATA_H5_PATH, "r") as f:
    # Report how many labels the file holds.
    print(f['labels'].shape)
    # Read the whole 'vectors' dataset into memory and convert it to float32.
    vectors = f['vectors'][:].astype(np.float32)

(24164,)


In [29]:
# Inspect the first entry of the loaded array as a quick sanity check of its contents.
vectors[0]

array([[0., 0., 0., ..., 4., 1., 2.],
       [0., 0., 0., ..., 0., 4., 3.],
       [0., 0., 0., ..., 4., 2., 2.],
       ...,
       [0., 0., 0., ..., 0., 0., 2.],
       [0., 0., 0., ..., 1., 1., 1.],
       [0., 0., 0., ..., 1., 1., 3.]], dtype=float32)