# Working With Text

In [1]:
# Sample sentence. Note the trailing space: splitting on ' ' yields an empty final token.
text1 = "Ethics are built right into the ideals and objectives of the United Nations "

len(text1) # The length of text1 in characters (the trailing space is counted)

76

In [2]:
text2 = text1.split(' ') # Return a list of the words in text1, separating by ' '.

len(text2) # Number of items in the list (this counts the empty string left by the trailing space)

14

In [3]:
text2

['Ethics',
 'are',
 'built',
 'right',
 'into',
 'the',
 'ideals',
 'and',
 'objectives',
 'of',
 'the',
 'United',
 'Nations',
 '']

<br>
List comprehension allows us to find specific words:

In [4]:
[w for w in text2 if len(w) > 3] # Words that are greater than 3 letters long in text2

['Ethics',
 'built',
 'right',
 'into',
 'ideals',
 'objectives',
 'United',
 'Nations']

In [5]:
[w for w in text2 if w.istitle()] # Title-cased words in text2 (first letter uppercase, remaining letters lowercase)

['Ethics', 'United', 'Nations']

In [6]:
[w for w in text2 if w.endswith('s')] # Words in text2 that end in 's'

['Ethics', 'ideals', 'objectives', 'Nations']

<br>
We can find unique words using `set()`.

In [7]:
text3 = 'To be or not to be'
text4 = text3.split(' ')

len(text4)

6

In [8]:
len(set(text4))

5

In [9]:
set(text4)

{'To', 'be', 'not', 'or', 'to'}

In [10]:
len({w.lower() for w in text4}) # lowercase each word first so that 'To' and 'to' count as one

4

In [11]:
{w.lower() for w in text4}

{'be', 'not', 'or', 'to'}

### Processing free-text

- s.lower();s.upper();s.title()
- s.split(t)
- s.splitlines()
- s.join(t)
- s.strip();s.rstrip()
- s.find(t); s.rfind(t)
- s.replace(u,v)


### File operations
f = open(filename, mode) <br>
f.readline(); f.read(); f.read(n) <br>
for line in f: doSomething(line) <br>
f.seek(n) resetting the reading position <br>
f.write(message) <br>
f.close() <br>

In [12]:
# Adjacent string literals are joined by the parser, so no backslash continuation is needed.
text5 = (
    '"Ethics are built right into the ideals and objectives of the United Nations" '
    '#UNSG @ NY Society for Ethical Culture bit.ly/2guVelr'
)
text6 = text5.split(' ')

text6

['"Ethics',
 'are',
 'built',
 'right',
 'into',
 'the',
 'ideals',
 'and',
 'objectives',
 'of',
 'the',
 'United',
 'Nations"',
 '#UNSG',
 '@',
 'NY',
 'Society',
 'for',
 'Ethical',
 'Culture',
 'bit.ly/2guVelr']

<br>
Finding hashtags:

In [13]:
[w for w in text6 if w.startswith('#')]

['#UNSG']

<br>
Finding callouts:

In [14]:
# startswith('@') is also true for a token that is just '@', so the bare '@' is returned
[w for w in text6 if w.startswith('@')]

['@']

In [15]:
# Same tweet as before, now prefixed with two real callouts (@UN and @UN_Women).
text7 = (
    '@UN @UN_Women "Ethics are built right into the ideals and objectives of the United Nations" '
    '#UNSG @ NY Society for Ethical Culture bit.ly/2guVelr'
)
text8 = text7.split(' ')

<br>

We can use regular expressions to help us with more complex parsing. 

For example `'@[A-Za-z0-9_]+'` will return all words that: 
* start with `'@'` and are followed by at least one: 
    * capital letter (`'A-Z'`)
    * lowercase letter (`'a-z'`) 
    * number (`'0-9'`)
    * or underscore (`'_'`)

## Regular Expressions
. :wildcard, matches a single character <br>
^ :start of a string <br>
$ :end of a string <br>
[] :matches one of the set of characters within [] <br>
[a-z] :matches one of the range of characters a, b, ...z <br>
[^abc] :matches a character that is not a,b, or, c <br>
a|b :matches either a or b, where a and b are strings <br>
() :Scoping for operators <br>
\ :Escape character for special characters (\t, \n, \b) <br>
\b :Matches word boundary <br>
\d :Any digit, equivalent to [0-9] <br>
\D :Any non-digit, equivalent to [^0-9] <br>
\s :Any whitespace character, equivalent to [ \t\n\r\f\v] <br>
\S :Any non-whitespace, equivalent to [^ \t\n\r\f\v] <br>
\w :Alphanumeric character, equivalent to [a-zA-Z0-9_] <br>
\W :Any non-alphanumeric character, equivalent to [^a-zA-Z0-9_] <br>
'*' :matches zero or more occurrences <br>
'+' :matches one or more occurrences <br>
? :matches zero or one occurrences <br>
{n} : exactly n repetitions, n >= 0 <br>
{n,} : at least n repetitions <br>
{,n} : at most n repetitions <br>
{m,n} : at least m and at most n repetitions <br>

In [16]:
import re # import re - a module that provides support for regular expressions

# Keep the tokens in which '@' is followed by at least one letter, digit or underscore.
# re.search() scans the whole token, so the match need not begin at its first character.
[w for w in text8 if re.search('@[A-Za-z0-9_]+', w)]

['@UN', '@UN_Women']

In [17]:
# Same filter using the \w shorthand for a word character.
# The pattern is a raw string so that '\w' reaches the regex engine untouched; in a
# normal string literal '\w' is an invalid escape sequence and triggers a warning
# on recent Python versions.
[w for w in text8 if re.search(r'@\w+', w)]

['@UN', '@UN_Women']

#### Finding specific characters

In [18]:
# Collect every vowel in the string, in order of appearance
text100 = 'ouagadougou'
re.compile(r'[aeiou]').findall(text100)

['o', 'u', 'a', 'a', 'o', 'u', 'o', 'u']

In [19]:
# Collect every character that is NOT a vowel ('^' negates the character set)
re.compile(r'[^aeiou]').findall(text100)

['g', 'd', 'g']

In [20]:
# Tweet text split over two adjacent literals to keep the line length readable.
text10 = (
    '"Ethics are built right into the ideals and objectives of the United Nations" '
    '#UNSG @ NY Society for Ethical Culture bit.ly/2guVelr @UN @UN_Women'
)

In [21]:
# Tokenize on single spaces, then keep the tokens that begin with '#'.
text11 = text10.split(' ')
[token for token in text11 if token.startswith('#')]

['#UNSG']

In [22]:
tweet = "@nltk Text analysis is awesome! #regex #pandas #python"
# split() with no argument breaks on any run of whitespace; keep the '#'-prefixed tokens.
print([token for token in tweet.split() if token.startswith('#')])

['#regex', '#pandas', '#python']


In [23]:
# Eight ways of writing 23 October 2002, one per line (every line ends with '\n').
dateStr = ''.join(line + '\n' for line in [
    '23-10-2002',
    '23/10/2002',
    '23/10/02',
    '10/23/2002',
    '23 Oct 2002',
    '23 October 2002',
    'Oct 23, 2002',
    'October 23, 2002',
])

In [26]:
# Numeric dates: two digits, '/' or '-', two digits, '/' or '-', then a four-digit year.
re.findall(r'\d{2}[/-]\d{2}[/-]\d{4}', dateStr)

['23-10-2002', '23/10/2002', '10/23/2002']

In [27]:
# \d{2,4} lets the year be two to four digits, so '23/10/02' is now found as well.
re.findall(r'\d{2}[/-]\d{2}[/-]\d{2,4}', dateStr)

['23-10-2002', '23/10/2002', '23/10/02', '10/23/2002']

In [29]:
# \d{1,2} additionally accepts one-digit day/month fields; dateStr contains none,
# so the same four dates are found as with \d{2}.
re.findall(r'\d{1,2}[/-]\d{1,2}[/-]\d{2,4}', dateStr)

['23-10-2002', '23/10/2002', '23/10/02', '10/23/2002']

In [30]:
# Pitfall: when the pattern contains one capturing group (...), findall() returns only
# the text captured by that group -- here the month abbreviation -- not the whole match.
re.findall(r'\d{2} (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) \d{4}', dateStr)

['Oct']

In [31]:
# (?:...) groups the alternatives without capturing, so findall() returns the full match.
re.findall(r'\d{2} (?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) \d{4}', dateStr)

['23 Oct 2002']

In [32]:
# [a-z]* after the three-letter abbreviation allows the rest of a full month name ('October').
re.findall(r'\d{2} (?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{4}', dateStr)

['23 Oct 2002', '23 October 2002']

In [37]:
# Two optional groups cover both orders: a leading 'dd ' before the month ('23 Oct 2002')
# or a 'dd, ' after it ('Oct 23, 2002').
re.findall(r'(?:\d{2} )?(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* (?:\d{2}, )?\d{4}', dateStr)

['23 Oct 2002', '23 October 2002', 'Oct 23, 2002', 'October 23, 2002']

In [38]:
# Same pattern with \d{1,2}, so the day may be written with one or two digits.
re.findall(r'(?:\d{1,2} )?(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* (?:\d{1,2}, )?\d{4}', dateStr)

['23 Oct 2002', '23 October 2002', 'Oct 23, 2002', 'October 23, 2002']