In [1]:
import pandas as pd
import numpy as np
import json

## Data

In [2]:
def get_df():
    """Build a fresh copy of the sample customer DataFrame.

    The records are parsed from the JSON literal embedded below on every
    call, so each call returns a new, independent DataFrame.

    Returns:
        pandas.DataFrame: one row per JSON record, with the columns
        Customer_ID, Phone_Number, Email, Address and Order_Amount.
        JSON ``null`` entries are decoded by ``json.loads`` as ``None``.
    """
    return pd.DataFrame(json.loads('''[
    {
        "Customer_ID": "ID_001",
        "Phone_Number": "123-456-7890",
        "Email": "john.doe@example.com",
        "Address": "123 Main St, Apt 4B",
        "Order_Amount": "USD 1,234.56"
    },
    {
        "Customer_ID": "ID 002",
        "Phone_Number": "(123) 456-7890",
        "Email": "jane.doe at example dot com",
        "Address": "456 Elm St",
        "Order_Amount": "567.89 EUR"
    },
    {
        "Customer_ID": "ID-003",
        "Phone_Number": "123.456.7890",
        "Email": "test_user@example.com",
        "Address": null,
        "Order_Amount": null
    },
    {
        "Customer_ID": "ID_004",
        "Phone_Number": "1234567890",
        "Email": "test@subdomain.example.com",
        "Address": "789 Maple Ave",
        "Order_Amount": "GBP 1,234.56"
    },
    {
        "Customer_ID": null,
        "Phone_Number": null,
        "Email": null,
        "Address": "PO Box 123",
        "Order_Amount": "999.99 INR"
    },
    {
        "Customer_ID": "ID-005",
        "Phone_Number": "123-456-7890",
        "Email": "user@example",
        "Address": "123 Main Street Apt 4B",
        "Order_Amount": "1234.56 USD"
    },
    {
        "Customer_ID": "ID 006",
        "Phone_Number": "(123) 456-7890",
        "Email": "invalid-email.com",
        "Address": "456 Elm Street",
        "Order_Amount": "999.00 EUR"
    },
    {
        "Customer_ID": "ID_007",
        "Phone_Number": "123 456 7890",
        "Email": "correct@sample.org",
        "Address": "123 First Ave, Suite 500",
        "Order_Amount": "1,111.11 GBP"
    },
    {
        "Customer_ID": "ID-008",
        "Phone_Number": "123-456-7890",
        "Email": "person@example.com",
        "Address": "PO Box 789",
        "Order_Amount": "INR 1,234.56"
    },
    {
        "Customer_ID": "ID 009",
        "Phone_Number": "123 456.7890",
        "Email": null,
        "Address": "789 Maple Ave, Apt 1A",
        "Order_Amount": null
    },
    {
        "Customer_ID": null,
        "Phone_Number": null,
        "Email": "first.last@company.co",
        "Address": null,
        "Order_Amount": "1,234.00 USD"
    },
    {
        "Customer_ID": "ID_010",
        "Phone_Number": "123.456.7890",
        "Email": "sample@website.org",
        "Address": "PO Box 321",
        "Order_Amount": "EUR 567.89"
    }
]
'''))


# Working copy used by the cells below; call get_df() again to start over
# from the unmodified sample data.
df = get_df()
df

Unnamed: 0,Customer_ID,Phone_Number,Email,Address,Order_Amount
0,ID_001,123-456-7890,john.doe@example.com,"123 Main St, Apt 4B","USD 1,234.56"
1,ID 002,(123) 456-7890,jane.doe at example dot com,456 Elm St,567.89 EUR
2,ID-003,123.456.7890,test_user@example.com,,
3,ID_004,1234567890,test@subdomain.example.com,789 Maple Ave,"GBP 1,234.56"
4,,,,PO Box 123,999.99 INR
5,ID-005,123-456-7890,user@example,123 Main Street Apt 4B,1234.56 USD
6,ID 006,(123) 456-7890,invalid-email.com,456 Elm Street,999.00 EUR
7,ID_007,123 456 7890,correct@sample.org,"123 First Ave, Suite 500","1,111.11 GBP"
8,ID-008,123-456-7890,person@example.com,PO Box 789,"INR 1,234.56"
9,ID 009,123 456.7890,,"789 Maple Ave, Apt 1A",


## Parsing Data

#### Contains

In [13]:
# Select the e-mail values that contain the substring "example".
# na=False tells str.contains to report missing e-mails as non-matches, so the
# result can be used directly as a mask without a separate .fillna(False) step.
df.loc[df['Email'].str.contains(r'example', na=False), 'Email']

Unnamed: 0,Email
0,john.doe@example.com
1,jane.doe at example dot com
2,test_user@example.com
3,test@subdomain.example.com
5,user@example
8,person@example.com


#### Match

In [15]:
# Select phone numbers written as NNN-NNN-NNNN.
# str.match anchors the pattern at the start of each value. The pattern has no
# capture groups because only a yes/no answer is needed here; pandas warns when
# str.contains is given a pattern with groups. na=False treats missing phone
# numbers as non-matches.
df.loc[df['Phone_Number'].str.match(r'[0-9]{3}-[0-9]{3}-[0-9]{4}', na=False), 'Phone_Number']

  df.loc[df['Phone_Number'].str.contains(r'([0-9]{3})-([0-9]{3})-([0-9]{4})', regex = True).fillna(False), 'Phone_Number']


Unnamed: 0,Phone_Number
0,123-456-7890
5,123-456-7890
8,123-456-7890


#### Extract

In [23]:
# Pull the first run of three ASCII letters out of each order amount.
# The single capture group yields a one-column DataFrame; rows with no match
# (including missing amounts) come back as NaN.
order_amounts = df['Order_Amount']
order_amounts.str.extract(r'([a-zA-Z]{3})')

Unnamed: 0,0
0,USD
1,EUR
2,
3,GBP
4,INR
5,USD
6,EUR
7,GBP
8,INR
9,


#### Replace

In [27]:
# Remove every run of characters that is neither a digit nor a dot, leaving
# only the numeric part of each order amount (still as text).
df['Order_Amount'].str.replace(
    pat=r'[^0-9\.]+',
    repl='',
    regex=True,
)

Unnamed: 0,Order_Amount
0,1234.56
1,567.89
2,
3,1234.56
4,999.99
5,1234.56
6,999.0
7,1111.11
8,1234.56
9,


#### Starts With

In [30]:
# Select customer IDs that begin with the literal prefix "ID-".
# na=False makes missing IDs count as non-matches, replacing the separate
# .fillna(False) step on the mask.
df.loc[df['Customer_ID'].str.startswith('ID-', na=False), 'Customer_ID']

Unnamed: 0,Customer_ID
2,ID-003
5,ID-005
8,ID-008


In [31]:
# Can be achieved with Match: the regex equivalent of startswith('ID-').
# The pattern is a raw string, and na=False treats missing IDs as non-matches.
df.loc[df['Customer_ID'].str.match(r'^ID-.*', na=False), 'Customer_ID']

Unnamed: 0,Customer_ID
2,ID-003
5,ID-005
8,ID-008


#### Ends With

In [36]:
# Select e-mail values that end with the literal suffix ".com".
# na=False makes missing e-mails count as non-matches, replacing the separate
# .fillna(False) step on the mask.
df.loc[df['Email'].str.endswith('.com', na=False), 'Email']

Unnamed: 0,Email
0,john.doe@example.com
2,test_user@example.com
3,test@subdomain.example.com
6,invalid-email.com
8,person@example.com


In [37]:
# Can be achieved with Match: the regex equivalent of endswith('.com').
# The pattern must be a raw string: in a normal string literal "\." is an
# invalid escape sequence, which newer Python versions warn about.
# na=False treats missing e-mails as non-matches.
df.loc[df['Email'].str.match(r'.*\.com$', na=False), 'Email']

Unnamed: 0,Email
0,john.doe@example.com
2,test_user@example.com
3,test@subdomain.example.com
6,invalid-email.com
8,person@example.com
