In [1]:
# Import the standard-library modules used by the JSON examples
import ast
import json

In [2]:
# Assign the string data to a variable.
data = "{'contact_id': 4661, 'name': 'Cecilia Velasco', 'email': 'cecilia.velasco@rodrigues.fr'}"

# json.loads() only accepts JSON text, and JSON strings must be wrapped in double quotes.
# This string uses Python-style single quotes, so the call below is expected to fail.
# The error is caught and displayed (instead of left uncaught) so the notebook
# still runs from top to bottom after a kernel restart.
try:
    converted_data = json.loads(data)
except json.JSONDecodeError as err:
    decode_error = f"JSONDecodeError: {err}"
    print(decode_error)
else:
    # Only reached if the string was valid JSON: collect the values of the row.
    row_values = list(converted_data.values())
    print(row_values)

JSONDecodeError: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)

In [3]:
# To resolve the error above, turn the single-quoted Python literal into real JSON.
# A blanket data.replace("'", '"') would also rewrite apostrophes inside values
# (e.g. a name such as O'Brien) and produce invalid JSON, so instead:
#   1. ast.literal_eval() safely parses the string as a Python literal (no code is executed);
#   2. json.dumps() re-serialises it with double quotes.
# ensure_ascii=False keeps any non-ASCII characters readable instead of \u-escaping them.
data = json.dumps(ast.literal_eval(data), ensure_ascii=False)
print(data)

{"contact_id": 4661, "name": "Cecilia Velasco", "email": "cecilia.velasco@rodrigues.fr"}


In [4]:
# Second attempt, now that `data` holds valid (double-quoted) JSON.

# Step 1: parse the JSON text into a Python dictionary.
converted_data = json.loads(data)

# Step 2: keep only the values of the dictionary (one row), in key order.
row_values = list(converted_data.values())

print(row_values)

[4661, 'Cecilia Velasco', 'cecilia.velasco@rodrigues.fr']


In [5]:
# re.findall(pattern, string) returns a list with every non-overlapping match
# of `pattern` in `string`; with one capture group it returns the group's text.

# Import the regular expression module
import re

# Sample record to search
string_data = "contact_id 4661 name Cecilia Velasco email cecilia.velasco@rodrigues.fr"

# Pull out the run of four digits. The r prefix tells Python to treat the
# pattern as a raw string, so the backslash in \d reaches the regex engine untouched.
four_digit_pattern = re.compile(r'(\d{4})')
contact_id = four_digit_pattern.findall(string_data)
print(contact_id)

['4661']


In [6]:
# Series.str.extract applies a regex to every row of a column, which is how the
# single-string examples above scale to a whole dataframe.

# Import the Pandas dependency
import pandas as pd

# Load the raw contact strings (one record per row) into a dataframe
contacts_csv_path = 'contacts_string_data.csv'
contacts_string_df = pd.read_csv(contacts_csv_path)

# Preview the first rows
contacts_string_df.head()

Unnamed: 0,contact_info
0,contact_id 4661 name Cecilia Velasco email cec...
1,contact_id 3765 name Mariana Ellis email maria...
2,contact_id 4187 name Sofie Woods email sofie.w...
3,contact_id 4941 name Jeanette Iannotti email j...
4,contact_id 2199 name Samuel Sorgatz email samu...


In [7]:
# Apply the four-digit pattern to every row of "contact_info".
# expand=False returns the single capture group as a Series, which is then
# stored in the new "contact_id" column.
contact_id_matches = contacts_string_df['contact_info'].str.extract(r'(\d{4})', expand=False)
contacts_string_df['contact_id'] = contact_id_matches
contacts_string_df.head()

Unnamed: 0,contact_info,contact_id
0,contact_id 4661 name Cecilia Velasco email cec...,4661
1,contact_id 3765 name Mariana Ellis email maria...,3765
2,contact_id 4187 name Sofie Woods email sofie.w...,4187
3,contact_id 4941 name Jeanette Iannotti email j...,4941
4,contact_id 2199 name Samuel Sorgatz email samu...,2199


In [8]:
# DIFFERENT TYPES OF CHARACTERS IN A REGEX

# 1. Literal character - A regular expression that’s composed of literal characters will match any string that exactly contains the expression as a substring

# 2. Special characters - each consists of a backslash (\) followed by another character, e.g. \d matches any digit and \s matches any whitespace character

# 3. Character sets - use a set of brackets [] to define a character set, e.g. "[ceh]at" will match: "cat", "eat", "hat". Can also specify ranges, e.g. "[A-Z]" will match any uppercase letter

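# 4. Caret (^) - when placed first inside a character set, it negates the set, i.e. it specifies the characters that we do not want to include. E.g. "[^0-9]" matches any single character that is not a digit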

# 5. Wildcard(.) - will match any single character

# 6. Escape character - indicates that the next character gets to escape its duties as a metacharacter and act like a literal character.

# 7. Special counting characters - specify the number of times that a character can appear. Includes *, +, {}, ?
    # * - the previous character can repeat any number of times, including zero
    # + - specifies that the character must appear at least once
    # {} - searches for a character that appears an exact number of times. Place two numbers inside the braces to match a number of characters within a range
    # ? - used for an optional character. The character can appear zero times or one time

# 8. Alternation character (|) - searches for either a particular string or alternate strings. Functions as a logical OR

# 9. String boundary character - matches only the beginning or the end of a string
    # ^ - represents the beginning of a string
    # $ - represents the end of a string

# 10. Capture groups - grouping in regex, is how a regular expression defines the information to extract. Use () to define a capture group.

# 11a. Non-capturing groups (?:) - specifies that we want to use the grouping structure but not capture the information
# 11b. Negative lookahead (?!) - also a non-capturing group, but it looks ahead in the text to make sure that a string doesn't exist after the match.

In [10]:
# REGEX FOR DATA TRANSFORMATION

# 1. Extract the first and last name after the word "name" from the contacts_string_data.csv
# The pattern anchors on the literal field labels instead of excluding letters:
#   \bname\s+   - the word "name" followed by whitespace
#   (.+?)       - capture group: the name itself, matched lazily so it stops at the first " email"
#   \s+email\b  - whitespace followed by the word "email"
# A negated set such as [^nameil\s+] only excludes single characters (n, a, m, e, i, l, ...),
# not the words "name"/"email", so it breaks on lowercase, accented, hyphenated or multi-part names.
name = re.findall(r'\bname\s+(.+?)\s+email\b', string_data)
name

['Cecilia Velasco']

In [15]:
# 2. Extract the name from the "contact_info" column of contact_string_df, and add it to a new column named "name".
# The pattern captures everything between the literal labels "name" and "email".
# A negated set such as [^nameil\s+] only excludes single characters, not those words,
# and mis-extracts lowercase, accented, hyphenated or multi-part names.
# expand=False returns the single capture group as a Series, ready to store as a column.
contacts_string_df['name'] = contacts_string_df['contact_info'].str.extract(r'\bname\s+(.+?)\s+email\b', expand=False)
contacts_string_df.head()

Unnamed: 0,contact_info,contact_id,name
0,contact_id 4661 name Cecilia Velasco email cec...,4661,Cecilia Velasco
1,contact_id 3765 name Mariana Ellis email maria...,3765,Mariana Ellis
2,contact_id 4187 name Sofie Woods email sofie.w...,4187,Sofie Woods
3,contact_id 4941 name Jeanette Iannotti email j...,4941,Jeanette Iannotti
4,contact_id 2199 name Samuel Sorgatz email samu...,2199,Samuel Sorgatz


In [17]:
# 3. Pull the email address out of the sample string.
# \S+@\S+ reads as: one or more non-whitespace characters, an "@", then one or
# more non-whitespace characters. Group 1 wraps the whole address.
email_address = [match.group(1) for match in re.finditer(r'(\S+@\S+)', string_data)]
email_address

['cecilia.velasco@rodrigues.fr']

In [18]:
# 4. Apply the same email pattern to every row of "contact_info" and store the
# result in a new "email" column. expand=False returns the single capture group as a Series.
email_matches = contacts_string_df['contact_info'].str.extract(r'(\S+@\S+)', expand=False)
contacts_string_df['email'] = email_matches
contacts_string_df.head()

Unnamed: 0,contact_info,contact_id,name,email
0,contact_id 4661 name Cecilia Velasco email cec...,4661,Cecilia Velasco,cecilia.velasco@rodrigues.fr
1,contact_id 3765 name Mariana Ellis email maria...,3765,Mariana Ellis,mariana.ellis@rossi.org
2,contact_id 4187 name Sofie Woods email sofie.w...,4187,Sofie Woods,sofie.woods@riviere.com
3,contact_id 4941 name Jeanette Iannotti email j...,4941,Jeanette Iannotti,jeanette.iannotti@yahoo.com
4,contact_id 2199 name Samuel Sorgatz email samu...,2199,Samuel Sorgatz,samuel.sorgatz@gmail.com
