# Data and Text Using Pandas and RegEx

Define patterns to:
- Matching
- Searching
- Extraction
- Substitution

In [5]:
import re

In [6]:
s = 'this is a simple string'

pattern = 'is'

match_st = re.search(pattern,s).start()
match_en = re.search(pattern,s).end()
s[match_st:match_en]

'is'

In [7]:
s2 = 'This is a sentence. Here is another sentence. And yet, another sentence!'

pattern = 'sentence|This'
all_stuff = re.findall(pattern, s2)
print('We found ', len(all_stuff), ' mateches')

We found  4  mateches


In [8]:
for i in all_stuff:
  print(i)

This
sentence
sentence
sentence


In [9]:
emails = '''
  kit@radbits.io
  kit.masaracchia@gmail.com
  me@tech.net
  mark@caltech.edu
  student@gmail.com
'''

# capture a whole work
pattern = re.compile('\w+')

re.findall(pattern, emails)

['kit',
 'radbits',
 'io',
 'kit',
 'masaracchia',
 'gmail',
 'com',
 'me',
 'tech',
 'net',
 'mark',
 'caltech',
 'edu',
 'student',
 'gmail',
 'com']

In [10]:
pattern = re.compile('\w+@\w+.(\w+)')

re.findall(pattern, emails)

['io', 'com', 'net', 'edu', 'com']

In [11]:
phone_nums = '''
  555-1231212
  555-6890012 Ext: 123
  221-3367891-331
  366-4509987
'''

# pattern = re.compile('\d\d\d-\d\d\d\d\d\d\d')
pattern = re.compile('\d{3}-\d{7}')
re.findall(pattern, phone_nums)

['555-1231212', '555-6890012', '221-3367891', '366-4509987']

In [12]:
# get the area code
import numpy as np

pattern = re.compile('(\d{3})-\d{7}')
area_codes = np.unique(re.findall(pattern, phone_nums))
area_codes

array(['221', '366', '555'], dtype='<U3')

In [13]:
text = 'The quick brown fox jumped over the lazy dog. The dog barked, and the fox ran away.'

three_letters = re.compile(r'\b\w{3}\b') # match three-letter words

matches = re.findall(three_letters, text)
print(matches)

['The', 'fox', 'the', 'dog', 'The', 'dog', 'and', 'the', 'fox', 'ran']


# Using RegEx with Pandas

In [14]:
import pandas as pd

In [15]:
# Activate RegEx in Pandas

my_list = ['Mark', 'Mark', 'Will']
my_series = pd.Series(my_list)
# print(my_series)

# Simple replace
my_series.replace('Mark', 'Mike')

0    Mike
1    Mike
2    Will
dtype: object

In [16]:
data = pd.Series([
  'https://google.com',
  'https://msn.com',
  'https://cnn.com',
  'http://python.org',
  'www.academymuseum.org',
])
data

0       https://google.com
1          https://msn.com
2          https://cnn.com
3        http://python.org
4    www.academymuseum.org
dtype: object

In [17]:
domains_only = r'^(https?:\/\/)|(www\.)?'

urls = data.str.replace(domains_only, '', regex=True)
urls

0           google.com
1              msn.com
2              cnn.com
3           python.org
4    academymuseum.org
dtype: object

# Strip: Deal with Whitespace

In [18]:
data_with_space = pd.Series([
  'John Doe ',
  '   Mary Jones ',
  'Will Smith    '
])

data_wo_space = data_with_space.str.strip()

print(data_wo_space)

0      John Doe
1    Mary Jones
2    Will Smith
dtype: object


In [19]:
df_contacts = pd.read_csv('./contacts_email_phone.csv')
df_contacts.head()
# df_contacts.shape

Unnamed: 0,phone,email
0,504-845-1427,jbutt@gmail.com
1,810-374-9840,josephine_darakjy@darakjy.org
2,856-264-4130,art@venere.org
3,907-921-2010,lpaprocki@hotmail.com
4,513-549-4561,donette.foller@cox.net


In [20]:
first_dot_last = df_contacts['email'].str.split('@').str[0].str.contains('\.')
df_dot = df_contacts[first_dot_last]
# df_dot.shape


In [21]:
df_dot.loc['FirstName', 'LastName'] = df_dot['email'].str.split('@').str[0].str.split('.', expand=True)
df_dot

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dot.loc['FirstName', 'LastName'] = df_dot['email'].str.split('@').str[0].str.split('.', expand=True)


Unnamed: 0,phone,email,LastName
4,513-549-4561,donette.foller@cox.net,
12,310-254-3084,kiley.caldarera@aol.com,
17,414-377-2880,gladys.rim@rim.org,
19,815-426-5657,fletcher.flosi@yahoo.com,
26,913-899-1103,chanel.caudy@caudy.org,
37,805-609-1531,rozella.ostrosky@ostrosky.com,
39,785-219-7724,kati.rulapaugh@hotmail.com,
44,907-227-6777,erick.ferencz@aol.com,
48,608-658-7940,emerson.bowley@bowley.org,
58,401-559-8961,delmy.ahle@hotmail.com,


In [22]:
# Try to parse firstname.lastname out of email addresses
# how_many = r'^(\w+\.\w+)?'


# Putting it all together

The BEST $mvie ever made about writer's block and one of the scariest tales ever made regarding cabin fever, 
The Shining took a simple concept of a      haunted hotel and built it ~up into an unforgettable, 
psychological ^horror mvie that will withstand the test of 
time despite being slated by it's original creator. scary mvie ---!!!!

In [51]:
f = open('./regex_test.txt')
messy = f.read()
print(messy)

The BEST $mvie ever made about writer's block and one of the scariest tales ever made regarding cabin fever, 
The Shining took a simple concept of a      haunted hotel and built it ~up into an unforgettable, 
psychological ^horror mvie that will withstand the test of 
time despite being slated by it's original creator. scary mvie ---!!!!


In [32]:
# re.findall('\$mvie', messy)

[]

In [56]:
# Replace
messy = re.sub('mvie', 'movie', messy)
# messy = re.sub(r'\n', '', messy)
print(messy)

The BEST movie ever made about writer's block and one of the scariest tales ever made regarding cabin fever,The Shining took a simple concept of ahaunted hotel and built it up into an unforgettable,psychological horror movie that will withstand the test oftime despite being slated by it's original creator. scary movie 


In [55]:
# Remove unwanted characters
messy = re.sub(r'[^a-zA-Z0-9\s\'\,\.]', '', messy)
print(messy)

The BEST mvie ever made about writer's block and one of the scariest tales ever made regarding cabin fever,The Shining took a simple concept of ahaunted hotel and built it up into an unforgettable,psychological horror mvie that will withstand the test oftime despite being slated by it's original creator. scary mvie 


In [57]:
# Remove the big'ol space
messy = re.sub(r'\s\s+', ' ', messy)
print(messy)

The BEST movie ever made about writer's block and one of the scariest tales ever made regarding cabin fever,The Shining took a simple concept of ahaunted hotel and built it up into an unforgettable,psychological horror movie that will withstand the test oftime despite being slated by it's original creator. scary movie 
