In [2]:
# Find the offset of a substring with str.find. This is a plain substring
# search (not a regular expression); find returns -1 when there is no match.
t = 'There would have been a time for such a word'
t.find('word')

40

In [8]:
#naive matching algorithm - try all lineups of pattern against string
def naive(p, t):
    '''Return the list of offsets in t at which pattern p occurs.

    Every possible alignment of p against t is tried from left to right,
    and an alignment is recorded only when no character position disagrees.
    '''
    patLen = len(p)
    hits = []
    for offset in range(len(t) - patLen + 1):
        # any() stops at the first disagreeing position, so a bad alignment
        # is abandoned as soon as one mismatch is seen.
        mismatch = any(t[offset + k] != p[k] for k in range(patLen))
        if not mismatch:
            hits.append(offset)
    return hits



In [4]:
# Download phix.fa into the working directory (it is read by readGenome below).
# NOTE(review): this line hardcodes a Windows-only install path for wget and
# passes --no-check-certificate, which disables TLS certificate verification.
!"C:\Program Files (x86)\GnuWin32\bin\wget" --no-check-certificate https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/phix.fa

SYSTEM_WGETRC = c:/progra~1/wget/etc/wgetrc
syswgetrc = C:\Program Files (x86)\GnuWin32/etc/wgetrc
--2022-02-04 15:53:28--  https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/phix.fa
Resolving d28rh4a8wq0iu5.cloudfront.net... 52.84.93.54, 52.84.93.3, 52.84.93.164, ...
Connecting to d28rh4a8wq0iu5.cloudfront.net|52.84.93.54|:443... connected.
  Unable to locally verify the issuer's authority.
HTTP request sent, awaiting response... 200 OK
Length: 5528 (5.4K) [application/octet-stream]
Saving to: `phix.fa'

     0K .....                                                 100% 5.71M=0.001s

2022-02-04 15:53:29 (5.71 MB/s) - `phix.fa' saved [5528/5528]



In [5]:
def readGenome(filename):
    '''Read a FASTA-style file and return its sequence as one string.

    Lines beginning with '>' are skipped. Every other line is appended with
    its trailing whitespace (including the newline) removed.
    '''
    pieces = []
    with open(filename, 'r') as fastaFile:
        for row in fastaFile:
            if row.startswith('>'):
                continue
            pieces.append(row.rstrip())
    return ''.join(pieces)

In [6]:
# Load the downloaded file into a single sequence string used by the cells below.
genome = readGenome('phix.fa')

In [9]:
# Sanity-check naive() on a small example: 'AG' occurs at offsets 0, 5 and 9.
t = 'AGCTTAGATAGC'
p = 'AG'
naive(p, t)

[0, 5, 9]

In [11]:
# generate random reads from the genome
import random
def generateReads(genome, numReads, readLen):
    '''Generate reads from random positions in the given genome.

    Args:
        genome: the sequence to sample from (any sliceable sequence).
        numReads: how many reads to draw.
        readLen: length of every read.

    Returns:
        A list of numReads slices of genome, each exactly readLen long.

    Raises:
        ValueError: raised by random.randint when readLen is larger than
            len(genome) and at least one read is requested.
    '''
    # Largest offset at which a full-length read still fits inside the genome.
    lastStart = len(genome) - readLen
    reads = []
    for _ in range(numReads):
        # randint is inclusive at both ends, so no "- 1" adjustment is needed.
        # Subtracting 1 would allow start == -1 (an empty or truncated slice)
        # and would make the final window unreachable.
        start = random.randint(0, lastStart)
        reads.append(genome[start:start + readLen])
    return reads

In [14]:
# Seed the RNG so the simulated reads are identical on every run of this cell
# (the seed value itself is arbitrary).
random.seed(0)
reads = generateReads(genome, 100, 100)

# Count the simulated reads that occur verbatim somewhere in the genome.
numMatched = 0
for r in reads:
    matches = naive(r, genome)
    if matches:
        numMatched += 1

print('%d/ %d reads matched exactly!' % (numMatched, len(reads)))


100/ 100 reads matched exactly!


In [15]:
# Download a FASTQ file of real reads into the working directory.
# NOTE(review): this line hardcodes a Windows-only install path for wget and
# passes --no-check-certificate, which disables TLS certificate verification.
!"C:\Program Files (x86)\GnuWin32\bin\wget" --no-check-certificate https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/ERR266411_1.first1000.fastq

SYSTEM_WGETRC = c:/progra~1/wget/etc/wgetrc
syswgetrc = C:\Program Files (x86)\GnuWin32/etc/wgetrc
--2022-02-04 16:04:32--  https://d28rh4a8wq0iu5.cloudfront.net/ads1/data/ERR266411_1.first1000.fastq
Resolving d28rh4a8wq0iu5.cloudfront.net... 52.84.93.54, 52.84.93.77, 52.84.93.3, ...
Connecting to d28rh4a8wq0iu5.cloudfront.net|52.84.93.54|:443... connected.
  Unable to locally verify the issuer's authority.
HTTP request sent, awaiting response... 200 OK
Length: 254384 (248K) [audio/mpeg]
Saving to: `ERR266411_1.first1000.fastq'

     0K .......... .......... .......... .......... .......... 20%  257K 1s
    50K .......... .......... .......... .......... .......... 40%  368K 0s
   100K .......... .......... .......... .......... .......... 60%  490K 0s
   150K .......... .......... .......... .......... .......... 80%  692K 0s
   200K .......... .......... .......... .......... ........  100%  498K=0.6s

2022-02-04 16:04:34 (413 KB/s) - `ERR266411_1.first1000.fastq' saved [254384/254384]

In [16]:
def readFastq(filename):
    '''Read a FASTQ file and return (sequences, qualities) as two lists.

    The file is consumed four lines at a time. The second line of each group
    is kept as the sequence and the fourth as the quality string, both with
    trailing whitespace removed; the first and third lines are ignored.
    Reading stops at the first group whose sequence line is empty, which is
    also what happens at end of file.
    '''
    records = []
    with open(filename) as handle:
        while True:
            # readline() returns '' once the file is exhausted, so a group
            # read past the end of the file has an empty sequence line.
            _header, bases, _separator, phred = [handle.readline() for _ in range(4)]
            bases = bases.rstrip()
            if not bases:
                break
            records.append((bases, phred.rstrip()))
    sequences = [seq for seq, _qual in records]
    qualities = [qual for _seq, qual in records]
    return sequences, qualities

In [17]:
# Keep only the read sequences; the quality strings are discarded.
phix_reads, _ = readFastq('ERR266411_1.first1000.fastq')

In [18]:
# Count how many of the real reads occur verbatim (full length) in the genome.
numMatched = 0
n = 0  # stays 0 if there are no reads; otherwise ends as the number of reads
for n, r in enumerate(phix_reads, start=1):
    matches = naive(r, genome)
    if matches:
        numMatched += 1

print('%d / %d reads matched the genome!' % (numMatched, n))

7 / 1000 reads matched the genome!


In [19]:
# reattempt using only the first 30 bases of each read
numMatched = 0
n = 0
for r in phix_reads:
    r = r[:30]                       # first thirty bases
    matches = naive(r, genome)
    n += 1
    if len(matches) > 0:
        numMatched += 1
        
print('%d / %d reads matched the genome!' % (numMatched, n))

459 / 1000 reads matched the genome!


In [22]:
# also get reverse complements
# from previous lecture but also 'N' added to dictionary

def reverseComplement(s):
    '''Return the reverse complement of s.

    A and T are swapped, C and G are swapped, and N maps to itself. Any other
    character raises KeyError.
    '''
    pairing = {'A': 'T', 'C': 'G', 'T': 'A', 'G': 'C', 'N': 'N'}
    # Complement each base in reading order, then flip the whole sequence.
    swapped = [pairing[nt] for nt in s]
    swapped.reverse()
    return ''.join(swapped)

In [23]:
# Match the 30-base prefix of each read against the genome in both
# orientations: as given, and reverse-complemented.
numMatched = 0
n = 0  # stays 0 if there are no reads; otherwise ends as the number of reads
for n, r in enumerate(phix_reads, start=1):
    r = r[:30]  # keep just the 30-base prefix
    # Forward-strand offsets first, then reverse-complement offsets.
    matches = naive(r, genome) + naive(reverseComplement(r), genome)
    if matches:
        numMatched += 1

print('%d / %d reads matched the genome!' % (numMatched, n))

932 / 1000 reads matched the genome!
