In [1]:
import pandas as pd
import re
from faker import Faker
import warnings
warnings.filterwarnings('ignore')

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_seq_items', None)

In [3]:
faker = Faker()

n = 100000
address= pd.Series([faker.address() for i in range(n)])

In [4]:
pd.DataFrame(address).head(10)

Unnamed: 0,0
0,USCGC Bailey\nFPO AE 87727
1,"99965 Davis Path Suite 338\nWilliamsberg, OR 29575"
2,"69567 Brown Squares Suite 606\nNew Randyville, AL 73559"
3,"488 Dennis Fields\nSouth Thomaschester, OK 59340"
4,"976 Wilson Turnpike Suite 401\nLeahshire, MS 52084"
5,"2324 Gallegos Squares\nNew Ashley, VA 35068"
6,"88571 King Rapid Apt. 105\nNew Darrellshire, CT 01862"
7,"27252 Warren Rapids\nEast Williamchester, OK 01249"
8,"8255 Mckinney Fields Apt. 709\nNew Arianaborough, HI 99631"
9,"0023 Patrick Hill Suite 023\nHintonmouth, OH 68208"


In [5]:
(
    (address)
    .map(lambda x: (re.search(r'\w{2} \d{5}', x).group()))
    .str.split(' ', expand=True)
    .rename(columns={0: 'state', 1: 'zip_code'})
).head(10)

Unnamed: 0,state,zip_code
0,AE,87727
1,OR,29575
2,AL,73559
3,OK,59340
4,MS,52084
5,VA,35068
6,CT,1862
7,OK,1249
8,HI,99631
9,OH,68208


In [6]:
address.str.extract(r'(?P<state>\w{2}) (?P<zip_code>\d{5})').head(10)

Unnamed: 0,state,zip_code
0,AE,87727
1,OR,29575
2,AL,73559
3,OK,59340
4,MS,52084
5,VA,35068
6,CT,1862
7,OK,1249
8,HI,99631
9,OH,68208


In [7]:
df = pd.DataFrame(
    {
        'location': [
            'New York 10001',
            'California 90001',
            'Texas-75201',
            'Georgia 30301',
            'Oregon97205',
            'Arizona 85001',
            'Illinois 60601',
            'Florida 33101',
            'Ohio 44101',
            'Pennsylvania-19104',
        ]
    }
)

df

Unnamed: 0,location
0,New York 10001
1,California 90001
2,Texas-75201
3,Georgia 30301
4,Oregon97205
5,Arizona 85001
6,Illinois 60601
7,Florida 33101
8,Ohio 44101
9,Pennsylvania-19104


In [8]:
df['location'].str.split(r'[\s|-]*(\d{5})', expand=True)

Unnamed: 0,0,1,2
0,New York,10001,
1,California,90001,
2,Texas,75201,
3,Georgia,30301,
4,Oregon,97205,
5,Arizona,85001,
6,Illinois,60601,
7,Florida,33101,
8,Ohio,44101,
9,Pennsylvania,19104,


In [9]:
data = {
    'office_serial_number': [
        'US101-001',
        'UK201-006',
        'CA301-003',
        'AU401-004',
        'UK202-005',
        'IN302-006',
        'IR102-007',
        'AU402-006',
        'SL303-009',
        'UK203-010',
        'FR403-011',
        'US103-012',
    ]
}

df = pd.DataFrame(data)
df

Unnamed: 0,office_serial_number
0,US101-001
1,UK201-006
2,CA301-003
3,AU401-004
4,UK202-005
...,...
7,AU402-006
8,SL303-009
9,UK203-010
10,FR403-011


In [10]:
df.loc[df['office_serial_number'].str.contains('^(UK|IN|AU)\d{3}-006')]

Unnamed: 0,office_serial_number
1,UK201-006
5,IN302-006
7,AU402-006


In [11]:
# create a sample dataframe (dummy)
df = pd.DataFrame({'phone': ['+1 555-555-5555', '+44 20 7123 4567', '+81 3-1234-5678', '0049 30 12345678', '+61 2 1234 5678', '+33 1 23 45 67 89', '+86 10 1234 5678', '011 52 55 1234 5678', '+971 4 123 4567', '+49 911 12345678', '(+81) 3-1234-5678']})
df

Unnamed: 0,phone
0,+1 555-555-5555
1,+44 20 7123 4567
2,+81 3-1234-5678
3,0049 30 12345678
4,+61 2 1234 5678
...,...
6,+86 10 1234 5678
7,011 52 55 1234 5678
8,+971 4 123 4567
9,+49 911 12345678


In [12]:
# define a regular expression pattern to match the country code
pattern = r'^\+?\d{1,3}[\s-]?'   # match + or digit(s) followed by space/hyphen

# apply the pattern to the 'phone' column and replace the matches with an empty string
df['phone'] = df['phone'].apply(lambda x: re.sub(pattern, '', x))
df

Unnamed: 0,phone
0,555-555-5555
1,20 7123 4567
2,3-1234-5678
3,9 30 12345678
4,2 1234 5678
...,...
6,10 1234 5678
7,52 55 1234 5678
8,4 123 4567
9,911 12345678
