In [1]:
# Sample header line (sender address followed by a timestamp) parsed by the cells below.
data = 'From stephen.marquard@uct.ac.za Sat Jan 5 09:14:16 2008'

In [3]:
# str.find returns the index of the first '@' in data (or -1 if there is none).
atpos = data.find('@')
print(atpos)

21


In [4]:
# The second argument to find is the index to start searching from,
# so this locates the first space at or after the '@'.
sppos = data.find(' ', atpos)
print(sppos)

31


In [5]:
# Retrieve the host: slice from just after the '@' up to (not including) the space.
host = data[atpos+1 : sppos]
print(host)

uct.ac.za


In [13]:
# Double-split pattern, step 1: split the line on whitespace into a list of words.
words = data.split()
print(words)

['From', 'stephen.marquard@uct.ac.za', 'Sat', 'Jan', '5', '09:14:16', '2008']


In [14]:
# The second word (index 1) of the line is the full email address.
email = words[1]
print(email)

stephen.marquard@uct.ac.za


In [15]:
# Double-split pattern, step 2: split the address on '@'; the piece at index 1 is the host.
pieces = email.split('@')
print(pieces[1])

uct.ac.za


In [16]:
import re
# '@' matches a literal at-sign; '[^ ]' matches any single non-space character and
# '*' repeats it zero or more times, greedily. Because the pattern contains one
# group (...), findall returns only the text captured by that group for each match.
y = re.findall('@([^ ]*)', data)
print(y)

['uct.ac.za']


In [18]:
# '^From ' requires the literal text "From " at the beginning of the string
# '.'      matches any single character (except newline)
# '[^ ]'   matches any single character except a space
# '*'      means zero or more repetitions, matched greedily
y = re.findall('^From .*@([^ ]*)', data)
print(y)

['uct.ac.za']


In [21]:
# Collect every X-DSPAM-Confidence value found in the mailbox file and report the largest.
numlist = list()
# The with-block closes the file when the loop finishes, even if a line fails to parse.
with open('DATA/mbox-short.txt') as hand:
    for line in hand:
        line = line.rstrip()
        # Capture the number that follows the header name; lines without the header give [].
        stuff = re.findall('^X-DSPAM-Confidence: ([0-9.]+)', line)
        if len(stuff) != 1:
            continue
        num = float(stuff[0])
        numlist.append(num)
# Note: max() raises ValueError if no matching line was found (numlist is empty).
print('Maximum: ', max(numlist))

Maximum:  0.9907


### Meta-characters: . ^ $ * + ? { [ ] \ | ( ) 

* . (a period) -- matches any single character except newline '\n'
* \w -- (lowercase w) matches a "word" character: a letter or digit or underbar [a-zA-Z0-9_]. Note that although "word" is the mnemonic for this, it only matches a single word char, not a whole word. \W (upper case W) matches any non-word character.
* \b -- boundary between word and non-word
* \s -- (lowercase s) matches a single whitespace character -- space, newline, return, tab, form feed [ \n\r\t\f]. \S (upper case S) matches any non-whitespace character.
* \t, \n, \r -- tab, newline, return
* \d -- decimal digit [0-9] (some older regex utilities do not support \d, but they all support \w and \s)
* ^ = start, $ = end -- match the start or end of the string
* \ -- inhibit the "specialness" of a character. So, for example, use \. to match a period or \\ to match a backslash. If you are unsure whether a character has special meaning, such as '@', you can put a backslash in front of it, \@, to make sure it is treated just as a character.
* \+ -- 1 or more occurrences of the pattern to its left, e.g. 'i+' = one or more i's
* \* -- 0 or more occurrences of the pattern to its left
* ? -- match 0 or 1 occurrences of the pattern to its left
* \+ and \* are said to be greedy: each matches as much of the text as it can
* Square brackets can be used to indicate a set of chars, so [abc] matches 'a' or 'b' or 'c'. The codes \w, \s etc. work inside square brackets too with the one exception that dot (.) just means a literal dot.
* (More square-bracket features) You can also use a dash to indicate a range, so [a-z] matches all lowercase letters. To use a dash without indicating a range, put the dash last, e.g. [abc-]. An up-hat (^) at the start of a square-bracket set inverts it, so [^ab] means any char except 'a' or 'b'.
* The "group" feature of a regular expression allows you to pick out parts of the matching text. Suppose for the emails problem that we want to extract the username and host separately. To do this, add parentheses ( ) around the username and host in the pattern, like this: r'([\w.-]+)@([\w.-]+)'. In this case, the parentheses do not change what the pattern will match; instead they establish logical "groups" inside of the match text. On a successful search, match.group(1) is the match text corresponding to the 1st left parenthesis, and match.group(2) is the text corresponding to the 2nd left parenthesis. The plain match.group() is still the whole match text as usual.

In [31]:
# Extract dollar amounts: a literal '$' followed by one or more digits, '.', ',' or
# whitespace characters. The pattern is a raw string so that '\$' and '\s' reach the
# regex engine unchanged; in a normal string literal they are invalid escape
# sequences, which recent Python versions warn about.
monies = list()
# The with-block closes the file once every line has been scanned.
with open('DATA/data.txt') as hand:
    for line in hand:
        x = re.findall(r'\$[0-9.,\s]+', line)
        # One list of matches is appended per input line, so monies is a list of lists.
        monies.append(x)
print(monies)

[['$49,383. 20 ', '$550. 20 ', '$550. 20 ', '$550. 20 ', '$1,200. 00 ', '$2,234. 00 ', '$3984. 20 ', '$12,426. 32 ', '$12,426. 32 ', '$356. 20 ', '$15,333. 00 ', '$888. 00 ', '$250. 00 ', '$29,253. 52 ', '$590. 10 ', '$590. 10 ', '$15,556. 00 ', '$16,146. 10 ', '$24,455 ', '$2,358 ', '$218 ', '$1,177 ', '$2,268 ', '$21,946 ', '$2,490 ', '$43,093 ', '$86,186, ', '$1,500 ', '$1,500 ', '$38,770 ', '$1460 6 ', '$40,230 ', '$15,979 ', '$2,049 ', '$350 ', '$ 64,380 ', '$25,677 ', '$90,057 ', '$364,146 ', '$2,077 ', '$3,001 ', '$2,695 4 ', '$358 ', '$291 ', '$2,838 ', '$1,395 ', '$218 ', '$1,177 ', '$1,755 ', '$4,338 ', '$4,536 ', '$2,298 ', '$1,123 ', '$86,185 ', '$1,500 ', '$237,502 2 ', '$4,098 ', '$52,589 ', '$700 ', '$123,163 ', '$51,354 ', '$5,598 ', '$180,115 ', '$2,472 ', '$364,146 ', '$20,466. 96 ', '$2,472. 00. ', '$12,946. 52 ', '$520. 20 ', '$520. 20 ', '$520. 20 ', '$520. 20 ', '$12,426. 32 ', '$12,426. 32 ', '$12,426. 32 ', '$49,383. 20 ', '$550. 20 ', '$550. 20 ', '$550. 20 ', 