# Regular Expressions

Regular expressions are a powerful way to search text strings for matches to a specific pattern. Some examples are included below, but to be honest you're probably going to have to reference a table of what each character means to build these search strings for a while.

In [1]:
# Stand-in for the lines of a text file that has been read in;
# every entry keeps its trailing newline, just as file lines would
multiLineEx = [
    'Hello!-this: is line one\n',
    'line two line two\n',
    'third line, mighty fine\n',
    'Hey-this: is line four\n',
    'this is the -- fifth -- line\n',
    'Hi!- this: is line six\n',
    'Hi!-this: is line seven\n',
]

# print() adds its own newline after each entry's '\n', hence the blank
# line between entries in the output
for entry in multiLineEx:
    print(entry)

Hello!-this: is line one

line two line two

third line, mighty fine

Hey-this: is line four

this is the -- fifth -- line

Hi!- this: is line six

Hi!-this: is line seven



In [2]:
import re   # Import regex library

# Show only the lines that begin with "th".
# In a regex, ^ anchors the match to the start of the string.
startsWithTh = re.compile('^(th.*)')

for rawLine in multiLineEx:
    stripped = rawLine.rstrip()  # drop the trailing \n kept in the example list
    if startsWithTh.search(stripped):
        print(stripped)

third line, mighty fine
this is the -- fifth -- line


In [3]:
# . matches any single character (except a newline) - a wildcard.
# * repeats the token just before it zero or more times.
# So, .* = any number of any characters
# Patterns are written as raw strings (r'...') so that regex escapes such as \S
# reach the re module untouched instead of being read as Python string escapes.
# Example of searching for f*** anywhere in line
for line in multiLineEx:
    line = line.rstrip() # Included \n chars in example list

    # An f anywhere in the line, followed by anything (including nothing)
    if re.search(r'f.*', line):
        print('First query: ' + line)

    # Similarly, could look only for f*** --; the * applies only to the token
    # right before it (the .), and the rest of the pattern is matched as written,
    # so a search string can be built up piece by piece from there
    if re.search(r'f.* --', line):
        print('Second query: ' + line)

    # Can put together a more complicated search. This searches for, in plain language,
    # strings that start with H, followed by any number of any character, followed by a -,
    # followed by one or more non-whitespace characters, followed by a colon!
    # NOTE especially that this does not pick up line six due to space after - !
    # It does however pick up line seven, which is nearly identical except for lack
    # of space. So, regex are pretty powerful, if weird to read.
    if re.search(r'^H.*-\S+:', line):
        print('Third query: ' + line)

Third query: Hello!-this: is line one
First query: third line, mighty fine
First query: Hey-this: is line four
Third query: Hey-this: is line four
First query: this is the -- fifth -- line
Second query: this is the -- fifth -- line
Third query: Hi!-this: is line seven


In [4]:
# Now, instead of simple searching and returning a bool, try extracting some data!
exString = '''The fine flower bloomed near the flooded riverbank; it took 21 days, 2 gentle hands, 
              nurturing its growth, and 230 carefully-applied drops of the purest water.'''

# re.findall returns a list holding every match it finds.
# [0-9]+ = one or more characters in the range 0-9 in a row, i.e. each run of digits
digitRuns = re.findall('[0-9]+', exString)
print(digitRuns)

# Inside [] the characters are listed back to back: no commas are needed, and a
# space between entries would itself become a character that is searched for.
vowelRuns = re.findall('[aeiou]+', exString)     # one or more lowercase vowels in a row
vowelPairs = re.findall('[aeiou]{2}', exString)  # exactly two lowercase vowels in a row
print(vowelRuns)
print(vowelPairs)

# Matching is case sensitive; when nothing in the string matches,
# findall simply returns an empty list
upperVowelRuns = re.findall('[AEIOU]+', exString)
print(upperVowelRuns)

['21', '2', '230']
['e', 'i', 'e', 'o', 'e', 'oo', 'e', 'ea', 'e', 'oo', 'e', 'i', 'e', 'a', 'i', 'oo', 'a', 'e', 'e', 'a', 'u', 'u', 'i', 'i', 'o', 'a', 'a', 'e', 'u', 'a', 'ie', 'o', 'o', 'e', 'u', 'e', 'a', 'e']
['oo', 'ea', 'oo', 'oo', 'ie']
[]


In [6]:
# Example of what greedy matching means. By default + and * are greedy: they
# consume as much text as they can while still letting the whole pattern match.
# Adding a ? after them (+? or *?) makes them non-greedy: they consume as little
# as possible. This comes into play when there are two or more possible subsets
# of the string that would match a search, like below.
# Patterns are raw strings (r'...') so that escapes such as \S are passed to the
# re module as written rather than being treated as Python string escapes.
exStr = 'From: boaty@gmail.com: subject: boats'

# If I do nothing and allow greedy matching, it grabs until the last colon
# So, it looks for longest result, it does not stop at the first colon it finds
print('Greedy: ' + str(re.findall(r'^F.+:', exStr)))
print('Non-greedy: ' + str(re.findall(r'^F.+?:', exStr)))

# If I want up through the second colon, can just refine search; note
# that it would still grab through the last colon if I leave the first
# .+ as greedy!
print('Through second colon: ' + str(re.findall(r'^F.+?: .+?:', exStr)))

# Grab only email address; \S+ = at least one non-whitespace.
# The parentheses form a capture group: findall returns only the text matched
# inside them, so the trailing colon is required for a match but not returned.
print('Email: ' + str(re.findall(r'(\S+@\S+):', exStr)))

# Use parentheses to tell findall which part to extract; can match a longer
# string and then only extract the subset in parentheses, if desired.
print('Email, with more specific search: ' + str(re.findall(r'^F.*?: (\S+@\S+):', exStr)))

# Get only 'gmail.com'. For [], if the first character is ^, that means
# NOT, i.e., here get only those characters which are NOT spaces
print('Email, only domain: ' + str(re.findall(r'@([^ ]*):', exStr)))

Greedy: ['From: boaty@gmail.com: subject:']
Non-greedy: ['From:']
Through second colon: ['From: boaty@gmail.com:']
Email: ['boaty@gmail.com']
Email, with more specific search: ['boaty@gmail.com']
Email, only domain: ['gmail.com']


In [7]:
# Example of escaping to find an actual character that is otherwise used for searches in regex,
# e.g. the $
exStr = 'Girl scout cookies cost $10.00, which is quite high I think.'

# Escape the $ with \ to match a literal dollar sign. The pattern is a raw string
# (r'...') so Python hands the backslash to the re module unchanged.
# [0-9.]+ = one or more digits or periods in a row; inside [] the . is a literal
# period rather than the any-character wildcard, which is how the decimal amount
# gets picked up.
prices = re.findall(r'\$[0-9.]+', exStr)
print(prices)

['$10.00']


In [8]:
# Running through HW from course; good example of reading a file as well using with
import re

# Read every line of the homework file; the with block closes the file afterwards
with open('regex-homework.txt') as f:
    content = f.readlines()

# Pull each run of digits out of every line and total them as integers
sumVals = sum(int(val)
              for line in content
              for val in re.findall('[0-9]+', line))

print(sumVals)

299461


# Accessing web data, super basic internet architecture

Starting with a little bit about internet architecture.

A socket is one endpoint of a two-way communication channel between two applications - for example, between a program on your computer and a web server on the internet. To create a connection, you need to establish which web server you're trying to communicate with, and which application on that server. Each server is identified by a name (its host name) and a number (its IP address), and different applications listen on different ports on the server. Common TCP ports include 80 (HTTP), 443 (HTTPS), and 22 (SSH). Python has a library for handling sockets.

In [9]:
# Create a socket
import socket
# (host name, port) of the server to talk to; port 80 is where the web server listens
serverAddress = ('data.pr4e.org', 80)

# AF_INET + SOCK_STREAM selects an IPv4 TCP socket
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.connect(serverAddress)

HTTP - Hypertext Transfer Protocol - is the dominant application layer protocol on the internet. It's used to retrieve HTML, images, documents, etc. Basically, it is a set of rules that allows browsers or other applications to retrieve web documents from servers over the internet. URL = uniform resource locator. The http:// bit is telling the browser to use the HTTP protocol! Hypertext refers to the ability to link to other pages. Browsers basically send a GET request to the location specified by the URL, get the webpage HTML and associated images/other data/documents, and display that webpage. This is the request/response cycle. RFC standards give detailed explanations of how things work on the internet and, specifically, how these protocols are used to interact with web servers/etc. Responses to GET requests include both the actual webpage and associated files as well as metadata such as the type of file(s) and the date that these files were last modified. The code below will demonstrate how to actually communicate with a server using the socket created in the previous cell.

In [10]:
# The request is a GET line followed by a blank line (\r\n\r\n) that ends the headers.
# Sockets carry bytes, so the str is encoded (UTF-8 by default) before sending.
cmd = 'GET http://data.pr4e.org/romeo.txt HTTP/1.0\r\n\r\n'.encode()
# sendall() keeps writing until the whole request has gone out;
# plain send() is allowed to transmit only part of the buffer
sock.sendall(cmd)

# Note that with HTTP we send first/server receives first, then we receive/server sends
chunks = []
try:
    while True:
        data = sock.recv(512)
        # If we get no data back, that means end of file
        if len(data) < 1:
            break
        chunks.append(data)
finally:
    # Release the socket even if receiving fails part-way through
    sock.close()

# Decode the bytes to str only once everything has arrived: a multi-byte UTF-8
# character can straddle two recv() chunks, and decoding chunk by chunk would
# then raise UnicodeDecodeError (and print() would add a newline mid-stream)
print(b''.join(chunks).decode())

HTTP/1.1 200 OK
Date: Sun, 23 Jun 2019 20:19:28 GMT
Server: Apache/2.4.18 (Ubuntu)
Last-Modified: Sat, 13 May 2017 11:22:22 GMT
ETag: "a7-54f6609245537"
Accept-Ranges: bytes
Content-Length: 167
Cache-Control: max-age=0, no-cache, no-store, must-revalidate
Pragma: no-cache
Expires: Wed, 11 Jan 1984 05:00:00 GMT
Content-Type: text/plain
X-Cache: MISS from atlwifi5.atlanta-airport.com
Via: 1.1 atlwifi5.atlanta-airport.com (squid/4.0.21)
Connection: close


But soft what light through yonder window breaks
It is the east and Juliet is the sun
Arise fair sun and kill the envious moon
Who is already sick and pale with grief



In [11]:
# Course homework
import socket

# Fetch intro-short.txt over a raw TCP socket and print the full HTTP response
# (headers and body). The with block closes the socket even if an error occurs.
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as mysock:
    mysock.connect(('data.pr4e.org', 80))
    cmd = 'GET http://data.pr4e.org/intro-short.txt HTTP/1.0\r\n\r\n'.encode()
    # sendall() keeps writing until the whole request has gone out;
    # plain send() is allowed to transmit only part of the buffer
    mysock.sendall(cmd)

    chunks = []
    while True:
        data = mysock.recv(512)
        if len(data) < 1:  # empty result = server has finished and closed the connection
            break
        chunks.append(data)

# Decode once at the end so a multi-byte UTF-8 character split across two
# recv() chunks cannot cause a UnicodeDecodeError
print(b''.join(chunks).decode(), end='')

HTTP/1.1 200 OK
Date: Sun, 23 Jun 2019 20:19:34 GMT
Server: Apache/2.4.18 (Ubuntu)
Last-Modified: Sat, 13 May 2017 11:22:22 GMT
ETag: "1d3-54f6609240717"
Accept-Ranges: bytes
Content-Length: 467
Cache-Control: max-age=0, no-cache, no-store, must-revalidate
Pragma: no-cache
Expires: Wed, 11 Jan 1984 05:00:00 GMT
Content-Type: text/plain
X-Cache: MISS from atlwifi5.atlanta-airport.com
Via: 1.1 atlwifi5.atlanta-airport.com (squid/4.0.21)
Connection: close

Why should you learn to write programs?

Writing programs (or programming) is a very creative 
and rewarding activity.  You can write programs for 
many reasons, ranging from making your living to solving
a difficult data analysis problem to having fun to helping
someone else solve a problem.  This book assumes that 
everyone needs to know how to program, and that once 
you know how to program you will figure out what you want 
to do with your newfound skills.  


# Retrieving data from websites and APIs

Certain Python packages make retrieving web data relatively simple. In particular, the use of urllib and BeautifulSoup will be demonstrated.

In [21]:
import urllib.request, urllib.parse, urllib.error

# urlopen returns a file-like response object; the with block makes sure the
# underlying connection is closed once all lines have been read
with urllib.request.urlopen('https://www.ncbi.nlm.nih.gov/pubmed/19053884') as fhand:
    for line in fhand:
        # Each line arrives as bytes: decode to str and trim surrounding whitespace
        print(line.decode().strip())

<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">
<head xmlns:xi="http://www.w3.org/2001/XInclude"><meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<!-- meta -->
<meta name="robots" content="index,nofollow,noarchive" /><meta property="og:image" content="http://www.ncbi.nlm.nih.gov/coreutils/img/pubmed256blue.png" /><meta property="og:image:secure_url" content="https://www.ncbi.nlm.nih.gov/coreutils/img/pubmed256blue.png" />
<meta name="ncbi_app" content="entrez" /><meta name="ncbi_db" content="pubmed" /><meta name="ncbi_report" content="abstract" /><meta name="ncbi_format" content="html" /><meta name="ncbi_pagesize" content="20" /><meta name="ncbi_sortorder" content="default" /><meta name="ncbi_pageno" content="1" /><meta name="ncbi_resultcount" content="1" /><meta name="ncbi_op" content="

</script></h4><div id="send_to_menu" class="tabPopper send_to"><fieldset><legend>Choose Destination</legend><ul class="column_list"><li><input type="radio" name="EntrezSystem2.PEntrez.PubMed.Pubmed_ResultsPanel.Pubmed_DisplayBar.SendTo" sid="1" value="File" id="dest_File" /><label for="dest_File">File</label></li><li><input type="radio" name="EntrezSystem2.PEntrez.PubMed.Pubmed_ResultsPanel.Pubmed_DisplayBar.SendTo" sid="2" value="AddToClipboard" id="dest_AddToClipboard" /><label for="dest_AddToClipboard">Clipboard</label></li><li><input type="radio" name="EntrezSystem2.PEntrez.PubMed.Pubmed_ResultsPanel.Pubmed_DisplayBar.SendTo" sid="3" value="AddToCollections" id="dest_AddToCollections" /><label for="dest_AddToCollections">Collections</label></li><li><input type="radio" name="EntrezSystem2.PEntrez.PubMed.Pubmed_ResultsPanel.Pubmed_DisplayBar.SendTo" sid="4" value="Mail" id="dest_Mail" /><label for="dest_Mail">E-mail</label></li><li><input type="radio" name="EntrezSystem2.PEntrez.PubM

HTML is super forgiving of syntax errors and inconsistencies. Trying to scrape data or specific text from web pages is apparently really tough to do with regex or string splitting for this reason; you have to account for a huge number of variations. This is where BeautifulSoup comes in.

In [20]:
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup

url = 'https://www.ncbi.nlm.nih.gov/pubmed/19053884'
# Download the whole page as bytes and hand it to the parser
html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html, 'html.parser')

# Collect every <a> (anchor) tag, then print each one's href attribute;
# get() yields None for an anchor that has no href
tags = soup.find_all('a')
for anchor in tags:
    print(anchor.get('href'))

/guide/browsers/#enablejs
/
#maincontent
#navcontent
/static/header_footer_ajax/submenu/#resources
/guide/all/
#chemicals-bioassays
/biosystems/
/pcassay
/pccompound
https://pubchem.ncbi.nlm.nih.gov/search/search.cgi
/pcsubstance
/guide/chemicals-bioassays/
#dna-rna
https://blast.ncbi.nlm.nih.gov/Blast.cgi
https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=Download
/books/NBK25501/ 
/genbank/
/WebSub/?tool=genbank
/projects/Sequin/
/genbank/tbl2asn2/
/tools/gbench/
/genomes/FLU/
/nuccore
/popset
/tools/primer-blast/
/sutils/static/prosplign/prosplign.html
/refseq/
/refseq/rsg/
/Traces/sra/sra.cgi?
/sutils/splign/
/Traces/trace.cgi
/unigene
/guide/dna-rna/
#data-software
https://blast.ncbi.nlm.nih.gov/Blast.cgi
https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=Download
/Structure/CN3D/cn3d.shtml
/Structure/cdd/wrpsb.cgi
/books/NBK25501/ 
/WebSub/?tool=genbank
/projects/Sequin/
/genbank/tbl2asn2/
/sutils/protmap.cgi
/tools/gbench/
/t

Code for Coursera assignment.

In [27]:
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup

url = 'http://py4e-data.dr-chuck.net/comments_179245.html'
html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html, 'html.parser')

# Retrieve all of the span tags; the first child of each one is converted
# with int() and the results are added up
tags = soup('span')
sumval = sum(int(tag.contents[0]) for tag in tags)

sumval

2795

In [28]:
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup

url = 'http://py4e-data.dr-chuck.net/known_by_Jordanna.html'

hopCount = 7     # how many times a link is followed
linkIndex = 17   # zero-based index of the anchor tag to follow on each page

# On each pass: load the current page, collect its anchor tags, and move on
# to the href of the anchor at linkIndex
for i in range(hopCount):
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    tags = soup('a')
    url = tags[linkIndex].get('href', None)

# The URL reached after the final hop
print(url)

http://py4e-data.dr-chuck.net/known_by_Kyle.html
