In [1]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the Hacker News dataset into a DataFrame; the CSV path is relative to the working directory.
hn = pd.read_csv('hacker_news.csv')

In [3]:
# Keep the post titles as a standalone Series for the regex exercises below.
titles = hn["title"]

In [5]:
# Only the last expression of a cell is displayed, so count() is evaluated here
# but its value is not shown; the output below is the result of .shape.
titles.count()
titles.shape

(20099,)

#### Lookaround

  *  Inside the parentheses, the first character of a lookaround is always ?.
  *  If the lookaround is a lookbehind, the next character will be <, which you can think of as an arrow head pointing behind the match.
  *  The next character indicates whether the lookaround is positive (=) or negative (!).


In [11]:
test_cases = [
    'Red_Green_Blue',
    'Yellow_Green_Red',
    'Red_Green_Red',
    'Yellow_Green_Blue',
    'Green',
]


def run_test_cases(pattern):
    """Search every string in the global `test_cases` for `pattern` and print the outcome.

    For each string, prints the match object of the first hit, or the text
    "NO MATCH" when the pattern is not found.
    """
    for candidate in test_cases:
        found = re.search(pattern, candidate)
        print(found if found else "NO MATCH")

In [13]:
# Match "Green" only where "_Blue" follows; the lookahead text is not part of the match.
run_test_cases(r"Green(?=_Blue)") # positive lookahead

<re.Match object; span=(4, 9), match='Green'>
NO MATCH
NO MATCH
<re.Match object; span=(7, 12), match='Green'>
NO MATCH


In [14]:
# Match "Green" only where "_Red" does not follow, so a bare "Green" matches as well.
run_test_cases(r"Green(?!_Red)") # negative lookahead

<re.Match object; span=(4, 9), match='Green'>
NO MATCH
NO MATCH
<re.Match object; span=(7, 12), match='Green'>
<re.Match object; span=(0, 5), match='Green'>


In [15]:
# Match "Green" only where "Red_" comes immediately before it.
run_test_cases(r"(?<=Red_)Green") # positive lookbehind

<re.Match object; span=(4, 9), match='Green'>
NO MATCH
<re.Match object; span=(4, 9), match='Green'>
NO MATCH
NO MATCH


In [16]:
# Match "Green" only where "Yellow_" does not come immediately before it.
run_test_cases(r"(?<!Yellow_)Green") # negative lookbehind

<re.Match object; span=(4, 9), match='Green'>
NO MATCH
<re.Match object; span=(4, 9), match='Green'>
NO MATCH
<re.Match object; span=(0, 5), match='Green'>


In [18]:
# The lookahead is not anchored to the end of the string, so it requires at least
# five following characters (not exactly five); longer tails would match too.
run_test_cases(r"Green(?=.{5})") # positive lookahead: match Green followed by at least five characters

<re.Match object; span=(4, 9), match='Green'>
NO MATCH
NO MATCH
<re.Match object; span=(7, 12), match='Green'>
NO MATCH


In [21]:
def first_10_matches(pattern):
    """Return the first ten entries of the global `titles` that contain a match for `pattern`."""
    is_match = titles.str.contains(pattern)
    matching_titles = titles[is_match]
    return matching_titles.head(10)

In [22]:
# First attempt: a standalone "C"/"c" followed by one character that is neither "." nor "+".
# Because a following character is required, a title that ends in "C" is not matched.
first_10_matches(r"\b[Cc]\b[^.+]")

365                      The new C standards are worth it
444           Moz raises $10m Series C from Foundry Group
521          Fuchsia: Micro kernel written in C by Google
1307            Show HN: Yupp, yet another C preprocessor
1326                     The C standard formalized in Coq
1365                          GNU C Library 2.23 released
1429    Cysignals: signal handling (SIGINT, SIGSEGV, )...
1620                        SDCC  Small Device C Compiler
1949    Rewriting a Ruby C Extension in Rust: How a Na...
2195    MyHTML  HTML Parser on Pure C with POSIX Threa...
Name: title, dtype: object

In [36]:
pattern = r"(?<!(?:Series ))\b[Cc]\b(?![\.\+])"
# Identify titles that mention C as a standalone word:
#   - the negative lookbehind rejects "Series C";
#   - the negative lookahead rejects "C++" and "C." (as in "C.E.O");
#   - lookarounds consume no characters, so a "C" at the very end of a title is not excluded.

In [37]:
# Preview the first ten titles matched by the refined pattern.
first_10_matches(pattern)

365                      The new C standards are worth it
521          Fuchsia: Micro kernel written in C by Google
1307            Show HN: Yupp, yet another C preprocessor
1326                     The C standard formalized in Coq
1365                          GNU C Library 2.23 released
1429    Cysignals: signal handling (SIGINT, SIGSEGV, )...
1620                        SDCC  Small Device C Compiler
1949    Rewriting a Ruby C Extension in Rust: How a Na...
2195    MyHTML  HTML Parser on Pure C with POSIX Threa...
2589    Phalcon  PHP framework delivered as a C extension
Name: title, dtype: object

In [38]:
# Count the titles that mention C: summing the boolean mask counts its True values.
is_c_title = titles.str.contains(pattern)
c_mentions = is_c_title.sum()
c_mentions

102

#### backreferences
* (Hello)(Goodbye)\2\1 matches HelloGoodbyeGoodbyeHello
* (\w)\1 matches the same word character twice in a row (e.g. the "ee" in "eel")

In [74]:
# Sentences used to try out a naive "repeated word" pattern.
test_cases = [
    "I'm going to read a book book.",
    "Green is my favorite color.",
    "My name is Aaron.",
    "No doubles here.",
    "I have a pet eel.",
    "Niantic (Pokemon Go) appears to be hosting the entire world on one server",
    "Google's self-driving car is the victim in a serious crash",
]

# Group 1 captures a word and \1 demands the same text again after one space.
# Nothing anchors the end of \1, so the repeat may be only the start of a longer word.
naive_repeat = re.compile(r"(\b\w+\b) \1")
for sentence in test_cases:
    print(naive_repeat.search(sentence))

<re.Match object; span=(20, 29), match='book book'>
None
None
None
None
<re.Match object; span=(60, 65), match='on on'>
<re.Match object; span=(7, 10), match='s s'>


In [83]:
# Repeated-word pattern: a whole word captured in group 1, one whitespace character,
# then the same text again (\1). The word boundaries on both ends stop the repeat
# from being just the start of a longer word (e.g. "on one" or "s self").
# The name is spelled `pattern_repeated` on assignment so the uses below resolve on a fresh kernel.
pattern_repeated = r"\b(\w+)\s\1\b"
repeated_words = titles[titles.str.contains(pattern_repeated)]
first_10_matches(pattern_repeated)

3102                  Silicon Valley Has a Problem Problem
3176                Wire Wire: A West African Cyber Threat
3178                         Flexbox Cheatsheet Cheatsheet
4797                            The Mindset Mindset (2015)
7276     Valentine's Day Special: Bye Bye Tinder, Flirt...
10371    Mcdonalds copying cyriak  cows cows cows in th...
11575                                    Bang Bang Control
11901          Cordless Telephones: Bye Bye Privacy (1991)
12697          Solving the the Monty-Hall-Problem in Swift
15049    Bye Bye Webrtc2SIP: WebRTC with Asterisk and A...
Name: title, dtype: object

In [84]:
# First title containing a repeated word; repeated_words already holds the filtered titles.
repeated_words.iloc[0]

'Silicon Valley Has a Problem Problem'

In [85]:
# Shape of the filtered Series, i.e. how many titles contain a repeated word.
repeated_words.shape

(11,)

#### re.sub()
* re.sub(pattern, repl, string, flags = 0)
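* The pandas equivalent is Series.str.replace(pat, repl, flags = 0, regex = True); pass regex = True explicitly, because pandas 2.0 and later default to literal (non-regex) matching.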

In [86]:
# Replace every uppercase ASCII letter with a dash.
string = 'aBcDEfGHIj'
uppercase_letter = re.compile(r"[A-Z]")
print(uppercase_letter.sub("-", string))

a-c--f---j


In [87]:
# Normalise every capitalisation of "sql" to "SQL" (re.I makes the match case-insensitive).
# regex=True is stated explicitly: pandas 2.0 changed the default of Series.str.replace
# to regex=False, so this keeps the pattern a regular expression on every pandas version.
sql_variations = pd.Series(['SQL', 'Sql', 'sql'])
sql_variations.str.replace(r'sql', 'SQL', flags = re.I, regex = True)

0    SQL
1    SQL
2    SQL
dtype: object

In [90]:
# Collapse the different spellings of "email": an "e", an optional space or hyphen,
# then "mail", matched case-insensitively.
# regex=True is required from pandas 2.0 on, where Series.str.replace defaults to
# literal matching and this character-class pattern would no longer be read as a regex.
email_variations = pd.Series(['email', 'Email', 'e Mail','e mail', 'E-mail', 'e-mail',
                        'eMail', 'E-Mail', 'EMAIL'])
email_uniform = email_variations.str.replace(r"e[\s-]?mail", "email", flags = re.I, regex = True)

email_uniform

0    email
1    email
2    email
3    email
4    email
5    email
6    email
7    email
8    email
dtype: object

In [91]:
# Standardise the spelling of "email" across all titles.
# regex=True is explicit because pandas 2.0+ defaults Series.str.replace to literal
# matching, under which this pattern would not be interpreted as a regular expression.
titles_clean = titles.str.replace(r"e[-\s]?mail", 'email', flags = re.I, regex = True)

In [92]:
# Sample URLs covering mixed-case schemes, sub-domains, hyphens, paths and query strings.
sample_urls = [
    'https://www.amazon.com/Technology-Ventures-Enterprise-Thomas-Byers/dp/0073523429',
    'http://www.interactivedynamicvideo.com/',
    'http://www.nytimes.com/2007/11/07/movies/07stein.html?_r=0',
    'http://evonomics.com/advertising-cannot-maintain-internet-heres-solution/',
    'HTTPS://github.com/keppel/pinn',
    'Http://phys.org/news/2015-09-scale-solar-youve.html',
    'https://iot.seeed.cc',
    'http://www.bfilipek.com/2016/04/custom-deleters-for-c-smart-pointers.html',
    'http://beta.crowdfireapp.com/?beta=agnipath',
    'https://www.valid.ly?param',
    'http://css-cursor.techstream.org',
]
test_urls = pd.Series(sample_urls)

In [100]:
# Capture the host: the run of word characters, dots and hyphens that follows the scheme.
# re.I lets the scheme match in any case ("HTTPS", "Http", ...).
test_urls_clean = test_urls.str.extract(r"https?\://([\w\.-]+)\b", flags=re.I)
test_urls_clean

Unnamed: 0,0
0,www.amazon.com
1,www.interactivedynamicvideo.com
2,www.nytimes.com
3,evonomics.com
4,github.com
5,phys.org
6,iot.seeed.cc
7,www.bfilipek.com
8,beta.crowdfireapp.com
9,www.valid.ly


In [105]:
# Same extraction without the trailing \b; for these sample URLs the result is unchanged.
test_urls.str.extract(r"https?\://([\w\.-]+)", flags=re.I)

Unnamed: 0,0
0,www.amazon.com
1,www.interactivedynamicvideo.com
2,www.nytimes.com
3,evonomics.com
4,github.com
5,phys.org
6,iot.seeed.cc
7,www.bfilipek.com
8,beta.crowdfireapp.com
9,www.valid.ly


In [107]:
# Apply the host pattern to every submitted URL.
# expand=False returns a Series for the single capture group instead of a one-column DataFrame.
domains = hn.url.str.extract(r"https?\://([\w\.-]+)", flags = re.I, expand = False)
domains

0        www.interactivedynamicvideo.com
1                        www.thewire.com
2                         www.amazon.com
3                        www.nytimes.com
4                        arstechnica.com
                      ...               
20094                            puri.sm
20095                         medium.com
20096                 blog.darknedgy.net
20097                         medium.com
20098                         github.com
Name: url, Length: 20099, dtype: object

In [109]:
# Five most frequent domains; value_counts() orders them from most to least common.
domains.value_counts().head(5)

github.com             1008
medium.com              825
www.nytimes.com         525
www.theguardian.com     248
techcrunch.com          245
Name: url, dtype: int64

In [110]:
# Take the first five timestamps as a small sample for developing the date/time pattern.
created_at = hn["created_at"].head(5)
print(created_at)

0     8/4/2016 11:52
1    6/23/2016 22:20
2     6/17/2016 0:01
3     9/30/2015 4:12
4    10/31/2015 9:48
Name: created_at, dtype: object


####  Use capture groups to extract the dates and times into two columns
* (.+)\s(.+)

In [113]:
# NOTE: this rebinds the global `pattern`, which earlier held the regex for C mentions;
# re-running an earlier cell that reads `pattern` after this one will use the new value.
# Two capture groups separated by one whitespace character: group 0 is the date, group 1 the time.
pattern = r"(.+)\s(.+)"
dates_times = created_at.str.extract(pattern, expand = True)
dates_times

Unnamed: 0,0,1
0,8/4/2016,11:52
1,6/23/2016,22:20
2,6/17/2016,0:01
3,9/30/2015,4:12
4,10/31/2015,9:48


In [133]:
# Three capture groups: the scheme, the host, and whatever follows an optional "/"
# (the last group may be empty when the URL has no path or query).
pattern_captureURL = r"(https?)://([\w\.-]+)/?(.*)"
test_urls.str.extract(pattern_captureURL, flags = re.I)

Unnamed: 0,0,1,2
0,https,www.amazon.com,Technology-Ventures-Enterprise-Thomas-Byers/dp...
1,http,www.interactivedynamicvideo.com,
2,http,www.nytimes.com,2007/11/07/movies/07stein.html?_r=0
3,http,evonomics.com,advertising-cannot-maintain-internet-heres-sol...
4,HTTPS,github.com,keppel/pinn
5,Http,phys.org,news/2015-09-scale-solar-youve.html
6,https,iot.seeed.cc,
7,http,www.bfilipek.com,2016/04/custom-deleters-for-c-smart-pointers.html
8,http,beta.crowdfireapp.com,?beta=agnipath
9,https,www.valid.ly,?param


In [118]:
# Display the sample URLs.
test_urls

0     https://www.amazon.com/Technology-Ventures-Ent...
1               http://www.interactivedynamicvideo.com/
2     http://www.nytimes.com/2007/11/07/movies/07ste...
3     http://evonomics.com/advertising-cannot-mainta...
4                        HTTPS://github.com/keppel/pinn
5     Http://phys.org/news/2015-09-scale-solar-youve...
6                                  https://iot.seeed.cc
7     http://www.bfilipek.com/2016/04/custom-deleter...
8           http://beta.crowdfireapp.com/?beta=agnipath
9                            https://www.valid.ly?param
10                     http://css-cursor.techstream.org
dtype: object

In [134]:
# Split every submitted URL into scheme, host and remainder.
# The groups are unnamed, so the resulting columns are labelled 0, 1 and 2.
url_parts = hn.url.str.extract(pattern_captureURL, flags = re.I)
url_parts

Unnamed: 0,0,1,2
0,http,www.interactivedynamicvideo.com,
1,http,www.thewire.com,entertainment/2013/04/florida-djs-april-fools-...
2,https,www.amazon.com,Technology-Ventures-Enterprise-Thomas-Byers/dp...
3,http,www.nytimes.com,2007/11/07/movies/07stein.html?_r=0
4,http,arstechnica.com,business/2015/10/comcast-and-other-isps-boost-...
...,...,...,...
20094,https,puri.sm,philosophy/how-purism-avoids-intels-active-man...
20095,https,medium.com,@zreitano/the-yc-application-broken-down-and-t...
20096,http,blog.darknedgy.net,technology/2016/01/01/0/
20097,https,medium.com,@benjiwheeler/how-product-hunt-really-works-d8...


In [135]:
# Name the capture groups with (?P<name>...) so that extract() uses those names
# (protocol, domain_name, path) as column labels instead of 0, 1, 2.
# NOTE: this rebinds pattern_captureURL, replacing the unnamed-group version assigned earlier.
pattern_captureURL = r"(?P<protocol>https?)://(?P<domain_name>[\w\.-]+)/?(?P<path>.*)"
hn.url.str.extract(pattern_captureURL, flags = re.I)

Unnamed: 0,protocol,domain_name,path
0,http,www.interactivedynamicvideo.com,
1,http,www.thewire.com,entertainment/2013/04/florida-djs-april-fools-...
2,https,www.amazon.com,Technology-Ventures-Enterprise-Thomas-Byers/dp...
3,http,www.nytimes.com,2007/11/07/movies/07stein.html?_r=0
4,http,arstechnica.com,business/2015/10/comcast-and-other-isps-boost-...
...,...,...,...
20094,https,puri.sm,philosophy/how-purism-avoids-intels-active-man...
20095,https,medium.com,@zreitano/the-yc-application-broken-down-and-t...
20096,http,blog.darknedgy.net,technology/2016/01/01/0/
20097,https,medium.com,@benjiwheeler/how-product-hunt-really-works-d8...
