# Example of extracting features from a DNA sequence.

Here I'm using a 2kb region from which I want to extract features of 100bp each, using a sliding window where each feature has a 20% overlap with the next feature, i.e., they share 20bp.

## Libraries

In [None]:
from Bio import SeqIO
import gzip
import re

# Unzip and load the RNA sequence from a genome


In [70]:
# Path to file
genome_path = '../input/Genomes/Brapassp_chinensisvar_communisPCGlu_712_v2.0.fa.gz'
# Unzip and read
with gzip.open(genome_path, "rt") as seq:
    sequence = seq.read()

# Extract a 2kb region

In [None]:
 # Remove anything that is not a base pair
base_pairs = re.sub("[^ATCG]", "", sequence) 
region_2kb = base_pairs[:2000]
# len(region_2kb)
# region_2kb

# Extract features

In [68]:
# Select a window size and percentage overlap
window_size = 100
overlap = 0.2
step_size = int(window_size * overlap)

# Define a list to store the features
features = []

# Slide the window along the sequence and extract the features
for region in range(0, len(region_2kb) - window_size + 1, step_size):
    feature = region_2kb[region:region + window_size]
    features.append(feature)


# See example of features

In [71]:
features

['CAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCT',
 'CCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACC',
 'ACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAA',
 'AAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTA',
 'CTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCC',
 'CCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAAC',
 'AACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAA',
 'TAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCT',
 'CCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACC',
 'ACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAA',
 'AAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTA',
 'CTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCC',
 'CCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAAC',
 'AACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAA',
 'TAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCT',
 'CCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACC',
 'ACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAA',
 'AAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTA',
 'CTAAACCC