# Basics

In [1]:
sen = 'Ethics are built right into the ideals and objectives of the United Nations '

In [2]:
len(sen)

76

In [3]:
# Splitting on a literal ' ' keeps the empty string that follows the trailing
# space in `sen`, so the resulting list ends with ''.
text2 = sen.split(' ')

In [4]:
len(text2)

14

In [5]:
text2

['Ethics',
 'are',
 'built',
 'right',
 'into',
 'the',
 'ideals',
 'and',
 'objectives',
 'of',
 'the',
 'United',
 'Nations',
 '']

In [6]:
[w for w in text2 if len(w) >3]

['Ethics',
 'built',
 'right',
 'into',
 'ideals',
 'objectives',
 'United',
 'Nations']

In [7]:
[w for w in text2 if w.istitle()]

['Ethics', 'United', 'Nations']

In [8]:
[w for w in text2 if w.endswith('s')]

['Ethics', 'ideals', 'objectives', 'Nations']

In [9]:
text3 = 'To be or not to be'

In [10]:
text4 = text3.split()

In [11]:
text4

['To', 'be', 'or', 'not', 'to', 'be']

In [12]:
len(set(text4))  # gives 5, not 4: set membership is case-sensitive, so 'To' and 'to' are counted separately

5

In [13]:
set(text4)

{'To', 'be', 'not', 'or', 'to'}

In [14]:
# Lower-case every token first so 'To' and 'to' collapse into one entry.
len({word.lower() for word in text4})

4

# Some points
1. s.startswith(t)
2. s.endswith(t)
3. t in s - for searching substrings
4. s.isupper(), s.islower(), s.istitle() - Return True or False
5. s.isalpha(), s.isdigit(), s.isalnum() - Return True or False
6. s.lower(), s.upper(), s.title() - String Operations
7. s.split(t) - Splitting at particular character
8. s.splitlines() - Splitting at newline character
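9. s.join(t) - joins the strings in t, using s as the separator
10. s.strip() - removes whitespace (spaces, tabs, newlines) from both ends of the string
11. s.rstrip() - removes whitespace (spaces, tabs, newlines) from the end of the string
12. s.find(t) - index of the first occurrence of substring t in s, or -1 if not found
13. s.rfind(t) - index of the last occurrence of substring t in s, or -1 if not found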
14. s.replace(u,v) - u(which needs to be replaced), v(the replacement)

In [15]:
# Split the city name on every 'ou'; a separator at either end of the string
# leaves an empty string at that end of the result.
text5 = "ouagadougou"
text6 = text5.split(sep="ou")

In [16]:
text6

['', 'agad', 'g', '']

In [17]:
'ou'.join(text6)

'ouagadougou'

In [18]:
[c for c in text5]

['o', 'u', 'a', 'g', 'a', 'd', 'o', 'u', 'g', 'o', 'u']

In [19]:
text9 = 'A quick brown fox jumped over the lazy dog'

In [20]:
text9.find('o')

10

In [21]:
text9.rfind('o')

40

In [22]:
text9.replace('o','O')

'A quick brOwn fOx jumped Over the lazy dOg'

# Regular Expressions

In [23]:
 text5 = '"Ethics are built right into the ideals and objectives of the United Nations #UNSG @ NY Society for Ethical Culture bit.ly/2guVelr'

In [24]:
txt6 = text5.split()

In [25]:
[w for w in txt6 if w.startswith('#')]

['#UNSG']

In [26]:
# Sample tweet, written as two adjacent string literals that Python joins into
# a single string (no backslash continuation inside the literal).
text7 = (
    '@UN @UN_Women "Ethics are built right into the ideals and objectives of the United Nations" '
    '#UNSG @ NY Society for Ethical Culture bit.ly/2guVelr'
)
# Tokenise on single spaces.
text8 = text7.split(' ')

In [27]:
[w for w in text8 if w.startswith('@')]

['@UN', '@UN_Women', '@']

We can use regular expressions to help us with more complex parsing. 

For example `'@[A-Za-z0-9_]+'` will return all words that: 
* start with `'@'` and are followed by at least one: 
* capital letter (`'A-Z'`)
* lowercase letter (`'a-z'`) 
* number (`'0-9'`)
* or underscore (`'_'`)

In [28]:
import re

In [29]:
[w for w in text8 if re.search('@[A-Za-z0-9_]+', w)]

['@UN', '@UN_Women']

# Meta-characters
1. '.' : is a wildcard, matches a single character (any character, but just once)
2. ^ : start of the string
3. $ : end of the string
4. [ ] : matches one of the set of characters within [ ]
5. [a-z] : matches one of the range of characters a,b,...,z
6. [^abc] : matches a character that is not a,b or c.
7. a|b : matches either a or b, where a and b are strings
8. () : scoping for operators
9. \ : Escape character for special characters (\t, \n, \b)
10. \b : matches word boundary
11. \d : any digit equivalent to [0-9]
12. \D : any non digit equivalent to [^0-9]
13. \s : any whitespace, equivalent to [ \t\n\r\f\v]
14. \S : any non-whitespace, equivalent to [^ \t\n\r\f\v]
15. \w : any alphanumeric, equivalent to [a-zA-Z0-9_]
16. \W : any non-alphanumeric, equivalent to [^a-zA-Z0-9_]
17.  `*` : matches zero or more occurrences
18.  `+` : matches one or more occurrences
19.  `?` : matches zero or one occurrence
20. {n}: exactly n repetitions, n>=0
21. {n,}: at least n repetitions
22. {,n}: at most n repetitions
23. {m,n}: at least m repetitions and at most n repetitions

# Applying Them

In [30]:
# Sample tweet, written as two adjacent string literals that Python joins into
# a single string (no backslash continuation inside the literal).
example = (
    '@UN @UN_Women "Ethics are built right into the ideals and objectives of the United Nations" '
    '#UNSG @ NY Society for Ethical Culture bit.ly/2guVelr'
)
# Tokenise on single spaces.
final = example.split(' ')

In [31]:
# Raw string so that \w reaches the regex engine unchanged; in a normal string
# literal '\w' is an invalid escape sequence that newer Python versions warn about.
[w for w in final if re.search(r'@\w+', w)]

['@UN', '@UN_Women']

In [32]:
capital = 'ouagadougou'
re.findall(r'[aeiou]', capital)

['o', 'u', 'a', 'a', 'o', 'u', 'o', 'u']

In [33]:
re.findall(r'[^aeiou]', capital)

['g', 'd', 'g']

# Handling Dates

In [34]:
datestr = '23-10-2002\n23/10/2002\n23/12/02\n10/23/2002\n23 Oct 2002\n23 October 2002\nOct 23, 2002\nOctober 23, 2002\n'

In [35]:
re.findall(r'\d{1,2}[/-]\d{1,2}[/-]\d{2,4}', datestr)

['23-10-2002', '23/10/2002', '23/12/02', '10/23/2002']

In [36]:
# The parenthesised month alternation is a capturing group, so findall returns
# only the month text. 'Jun' replaces the truncated 'Ju', which could never be
# followed directly by the required space in a date such as "23 Jun 2002".
re.findall(r'\d{1,2} (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) \d{4}', datestr)

['Oct']

In regular expressions, parentheses do more than scope the `|` operator: they also create a capturing group. When a pattern contains a capturing group, `re.findall` returns only the text matched by that group instead of the whole match. That is why the result above is just `'Oct'` (the part matched by the Jan|Feb|...|Dec alternation) rather than the full date. To group the alternatives without capturing them, use a non-capturing group `(?:...)`.

In [37]:
# (?:...) groups the month alternatives without capturing, so findall returns
# the whole matched date. 'Jun' replaces the truncated 'Ju', which prevented
# dates such as "23 Jun 2002" from matching.
re.findall(r'\d{1,2} (?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) \d{4}', datestr)

['23 Oct 2002']

In [38]:
# [a-z]* after the three-letter prefix also accepts full month names
# ("Oct" and "October"). The month list uses 'Jun' rather than the truncated
# 'Ju' so that every alternative is a proper three-letter abbreviation.
re.findall(r'\d{1,2} (?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{4}', datestr)

['23 Oct 2002', '23 October 2002']

In [39]:
# Optional "DD " before the month and optional "DD, " after it cover both
# "23 Oct 2002" and "Oct 23, 2002" styles. The month list uses 'Jun' rather
# than the truncated 'Ju' so that every alternative is a proper abbreviation.
re.findall(r'(?:\d{1,2} )?(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* (?:\d{1,2}, )?\d{4}', datestr)

['23 Oct 2002', '23 October 2002', 'Oct 23, 2002', 'October 23, 2002']

# Working with text data in pandas

In [40]:
import pandas as pd

# One reminder per weekday; each sentence embeds one or more clock times.
time_sentences = [
    "Monday: The doctor's appointment is at 2:45pm.",
    "Tuesday: The dentist's appointment is at 11:30 am.",
    "Wednesday: At 7:00pm, there is a basketball game!",
    "Thursday: Be back home by 11:15 pm at the latest.",
    "Friday: Take the train at 08:10 am, arrive at 09:00am.",
]

# Single-column frame; the dict key supplies the column name.
df = pd.DataFrame({'text': time_sentences})
df

Unnamed: 0,text
0,Monday: The doctor's appointment is at 2:45pm.
1,Tuesday: The dentist's appointment is at 11:30...
2,"Wednesday: At 7:00pm, there is a basketball game!"
3,Thursday: Be back home by 11:15 pm at the latest.
4,"Friday: Take the train at 08:10 am, arrive at ..."


In [41]:
df['text'].str.len()

0    46
1    50
2    49
3    49
4    54
Name: text, dtype: int64

In [42]:
df.text.str.split().str.len()

0     7
1     8
2     8
3    10
4    10
Name: text, dtype: int64

In [43]:
df.text.str.contains('appointment')

0     True
1     True
2    False
3    False
4    False
Name: text, dtype: bool

In [44]:
df.text.str.count(r'\d') # how many times a digit occurs in each string

0    3
1    4
2    3
3    4
4    8
Name: text, dtype: int64

In [45]:
df.text.str.findall(r'\d') # all digits in the string

0                   [2, 4, 5]
1                [1, 1, 3, 0]
2                   [7, 0, 0]
3                [1, 1, 1, 5]
4    [0, 8, 1, 0, 0, 9, 0, 0]
Name: text, dtype: object

In [47]:
df.text.str.findall(r'(\d?\d):(\d\d)')  # capture an (hour, minute) pair for every time in each string

0               [(2, 45)]
1              [(11, 30)]
2               [(7, 00)]
3              [(11, 15)]
4    [(08, 10), (09, 00)]
Name: text, dtype: object

In [48]:
# regex=True is required for the pattern to be interpreted as a regular
# expression: recent pandas versions treat `pat` as a literal string by default,
# which would leave the weekday names untouched.
df.text.str.replace(r'\w+day\b', '???', regex=True)

0          ???: The doctor's appointment is at 2:45pm.
1       ???: The dentist's appointment is at 11:30 am.
2          ???: At 7:00pm, there is a basketball game!
3         ???: Be back home by 11:15 pm at the latest.
4    ???: Take the train at 08:10 am, arrive at 09:...
Name: text, dtype: object

In [53]:
# Replace each weekday name with its first three letters. A callable
# replacement only works in regex mode, so regex=True is passed explicitly
# (recent pandas versions default to literal matching and reject a callable).
df.text.str.replace(r'\w+day\b', lambda x: x.group()[:3], regex=True)

0          Mon: The doctor's appointment is at 2:45pm.
1       Tue: The dentist's appointment is at 11:30 am.
2          Wed: At 7:00pm, there is a basketball game!
3         Thu: Be back home by 11:15 pm at the latest.
4    Fri: Take the train at 08:10 am, arrive at 09:...
Name: text, dtype: object

In [54]:
# One column per capture group (hour, minute); only the first time found in
# each row is extracted.
df.text.str.extract(r'(\d?\d):(\d\d)')

Unnamed: 0,0,1
0,2,45
1,11,30
2,7,0
3,11,15
4,8,10


In [62]:
# The outer group captures the whole time; the inner groups capture the hour,
# the minute and the am/pm marker. Every match in a row gets its own output row.
df.text.str.extractall(r'((\d?\d):(\d\d) ?([ap]m))')

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3
Unnamed: 0_level_1,match,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,2:45pm,2,45,pm
1,0,11:30 am,11,30,am
2,0,7:00pm,7,0,pm
3,0,11:15 pm,11,15,pm
4,0,08:10 am,8,10,am
4,1,09:00am,9,0,am


In [64]:
# Same pattern with named groups, (?P<name>...); the group names become the
# column names of the result.
df.text.str.extractall(r'(?P<Time>(?P<Hour>\d?\d):(?P<Minute>\d\d) ?(?P<Period>[ap]m))')

Unnamed: 0_level_0,Unnamed: 1_level_0,Time,Hour,Minute,Period
Unnamed: 0_level_1,match,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,2:45pm,2,45,pm
1,0,11:30 am,11,30,am
2,0,7:00pm,7,0,pm
3,0,11:15 pm,11,15,pm
4,0,08:10 am,8,10,am
4,1,09:00am,9,0,am
