In [1]:
# Examples of using Python's built-in 're' module to work with regular
# expressions: substitution, matching, splitting, finding and searching.
import re                         #  Import 're' module.
re.sub(r'K', r'L', 'King Arthur') #  Replace every 'K' in the string with 'L'.


'Ling Arthur'

In [2]:
re.match(r'abc', 'abcdef')        # match() only looks at the start of the string; here 'abc' spans (0, 3).

<re.Match object; span=(0, 3), match='abc'>

In [3]:
# .group() returns the matched text itself rather than the Match object.
re.match(r'abc','abcdef').group()

'abc'

In [4]:

re.match(r'\w+', 'Hello world!') # Match the leading word: \w+ stops at the first non-word character (the space).


<re.Match object; span=(0, 5), match='Hello'>

In [5]:
# Character class of lowercase letters, digits and spaces only:
# the match ends just before the first comma.
re.match(r'[a-z0-9 ]+', 'lowercase and nums like 8, but no commas.')


<re.Match object; span=(0, 25), match='lowercase and nums like 8'>

In [6]:
re.split(r'\s+', 'This is a test.') # Split on runs of whitespace (simple tokenization); punctuation stays attached, e.g. 'test.'.


['This', 'is', 'a', 'test.']

In [7]:
re.findall(r'\w+', "Let's write regex!") # Find all runs of word characters; the apostrophe splits "Let's" into 'Let' and 's'.


['Let', 's', 'write', 'regex']

In [8]:
# Split into sentences on '.', '?' or '!'.
# Note the rough edges of this simple pattern: the delimiters are dropped,
# later pieces keep their leading space, and the final '?' leaves a
# trailing empty string in the result.
re.split(r'[.?!]', "Hello world! Let's write regex. Isn't this great?")


['Hello world', " Let's write regex", " Isn't this great", '']

In [9]:
# Find all capitalized words: an uppercase letter A-Z followed by zero or
# more word characters, so the single letter 'I' counts as well.
re.findall(r'[A-Z]\w*', 'Hello world, I love Hong Kong.')


['Hello', 'I', 'Hong', 'Kong']

In [10]:
# Find all runs of digits (here, the two years).
re.findall(r'\d+', 'The novel 1984 was published in 1949.')


['1984', '1949']

In [11]:
# Match digits and words (but not anything else, e.g. punctuation).
# Use a raw string for the pattern: '\d' and '\w' are not valid Python string
# escapes, and newer Python versions warn about them in a plain '...' literal.
re.findall(r'\d+|\w+', 'He has 12 cats.')


['He', 'has', '12', 'cats']

In [12]:
# \w already covers digits, so this yields the same tokens as '\d+|\w+'.
# Raw string: passes the backslash to the regex engine unchanged and avoids
# Python's invalid-escape warning for '\w' in a plain string literal.
re.findall(r'\w+', 'He has 12 cats.')

['He', 'has', '12', 'cats']

In [13]:
# Scan the text for the first location where the pattern occurs and keep
# the resulting Match object in `m` for the cells that follow.
m = re.search(
    pattern=r'coconuts',
    string='I love coconuts.',
)


In [14]:
print(m.start(), m.end())       # Print start and end indices of the match held in m.
# Coming up in a later cell: find a square-bracketed word, i.e. brackets
# containing only word characters (no space or anything else).



7 15


In [15]:
# Show the text that the search matched.
m.group()

'coconuts'

In [16]:
# Find a square-bracketed word. '[wind bla]' is skipped because \w+ cannot
# match the space inside it, so the first hit is '[nice]'.
re.search(r'\[\w+\]', 'Hello [wind bla] this is [nice].')

<re.Match object; span=(25, 31), match='[nice]'>

In [17]:
# This file shows some tokenization examples using the NLTK package.
from nltk.tokenize import word_tokenize, sent_tokenize, regexp_tokenize, TweetTokenizer


In [19]:
import nltk
# Download the 'punkt' tokenizer package into the local nltk_data directory.
# NOTE(review): presumably a prerequisite for word_tokenize / sent_tokenize
# used in the following cells -- confirm against the NLTK documentation.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/Ben/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [20]:
# Split into word tokens; the '!' becomes a token of its own.
word_tokenize("Hi there!")

['Hi', 'there', '!']

In [21]:
# Split into sentences; each sentence keeps its closing punctuation.
sent_tokenize('Hello world. I love HK!')

['Hello world.', 'I love HK!']

In [22]:
# Make set of unique tokens: the repeated 'I' and 'love' appear only once,
# and the '.' is a token like any other.
set(word_tokenize('I love HK. I love NYC'))

{'.', 'HK', 'I', 'NYC', 'love'}

In [23]:
# Tokenize based on a regular expression. Tokens are: a run of word
# characters, '#' followed by one digit, '?', or '!'. Text matching none of
# these alternatives (the spaces, the ':') is left out of the result.
regexp_tokenize('SOLDIER #1: Found them?', r'(\w+|#\d|\?|!)')

['SOLDIER', '#1', 'Found', 'them', '?']

In [24]:
# NOTE(review): a leftover note here read "remove ? mark"; this pattern has
# no '?' in it, so it presumably refers to another cell -- confirm or delete.
# Find hashtags in tweets.
regexp_tokenize('This is a great #NLP exercise.', r'#\w+')

['#NLP']

In [26]:
# Find hashtags (#), $-prefixed tags and @-mentions: one of the symbols
# '#', '$' or '@' followed by one or more word characters.
regexp_tokenize('great #NLP exercise $sp500 from @blabla.', r'[#$@]\w+')


['#NLP', '$sp500', '@blabla']

In [27]:
tknzr = TweetTokenizer()    # Create a TweetTokenizer instance (no arguments) to reuse below.

In [28]:
# Tokenize each tweet separately; '@blabla' and '#NLP' are kept as single tokens.
[tknzr.tokenize(t) for t in ['thanks @blabla', '#NLP is fun!']]

[['thanks', '@blabla'], ['#NLP', 'is', 'fun', '!']]

In [3]:
from nltk.stem import PorterStemmer

# Stem several variants of one word with a single PorterStemmer instance;
# per the output below, all six inputs reduce to 'program'.
# NOTE(review): 'programer' / 'programing' / 'programers' look like
# misspellings (single 'm'); left as-is because the recorded output was
# produced from exactly these strings.
ps = PorterStemmer()
ps_s = [
    ps.stem(word)
    for word in ('program', 'program', 'programs',
                 'programer', 'programing', 'programers')
]

ps_s

['program', 'program', 'program', 'program', 'program', 'program']

In [12]:
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()
# NOTE(review): the lemmatizer presumably needs the WordNet corpus; if it is
# missing, run once:
#   import nltk
#   nltk.download('wordnet')

# Lemmatize a handful of plural nouns plus one word expected to stay as-is.
wnls = [
    wnl.lemmatize(word)
    for word in ('dogs', 'churches', 'aardwolves', 'abaci', 'hardrock')
]

In [13]:
# Display the lemmas: the plurals map to their singular forms, while
# 'hardrock' comes back unchanged.
wnls

['dog', 'church', 'aardwolf', 'abacus', 'hardrock']