In [85]:
import pandas as pd
from openpyxl import load_workbook


In [86]:
data = pd.read_excel('C:/Users/DELL/Desktop/Kaizen/Cold_Outreach/Data/Healthcare_Leads.xlsx') # Give loc of req file

In [87]:
personal_domains = ['gmail.com', 'yahoo.com', 'hotmail.com', '.edu']


In [88]:
def filter_rows(row: dict) -> bool:
    """
    Filter rows based on specific criteria.

    This function filters rows from a dataset based on the values of
    'company_domain' and 'email_first' fields. It removes rows with
    empty 'email_first' values and those containing personal email
    domains in the 'email_first' field.

    :param row: A dictionary representing a row of data.
                It should contain keys 'company_domain' and 'email_first'.
    :return:    True if the row meets the filtering criteria, False otherwise.
    """

    # Extract company_domain and email_first from the row
    company_domain = str(row.get('company_domain', '')).strip()
    email_first = str(row.get('email_first', '')).strip()
    
    # Remove rows with empty 'email_first'
    if not email_first:
        return False

    # Extract the domain from 'email_first'
    email_domain = email_first.split('@')[-1]
    
    # Define personal email domains
    personal_domains = ['gmail.com', 'yahoo.com', 'hotmail.com']

    if company_domain:
        # Keep the row if the email domain is equal to company_domain
        # and it's not a personal email
        return email_domain == company_domain and not any(domain in email_first for domain in personal_domains)
    else:
        # If company_domain is empty, keep the row if email_first
        # is not a personal email
        return not any(domain in email_first for domain in personal_domains)  


In [89]:
filter_mask = data.apply(filter_rows, axis=1)

filtered_data = data[filter_mask]
removed_data = data[~filter_mask]

columns_to_keep = ['email_first', 'first_name', 'last_name', 'job_title', 'company_name', 'company_domain', 'city']
filtered_data = filtered_data[columns_to_keep]

filtered_data = filtered_data[filtered_data['email_first'].notnull()]


# Save filtered data and removed data to new sheets in the same Excel file
output_path = 'C:/Users/DELL/Desktop/Kaizen/Cold_Outreach/Data/Healthcare_Leads.xlsx'
with pd.ExcelWriter(output_path, mode='a', engine='openpyxl') as writer:
    filtered_data.to_excel(writer, sheet_name='Filtered_Data', index=False)
    removed_data.to_excel(writer, sheet_name='Removed_Data', index=False)

In [90]:
num_removed = len(removed_data)
num_total = len(data)
num_kept = len(filtered_data)
percent_reduction = (num_removed / num_total) * 100

print(f"Total rows: {num_total}")
print(f"Rows removed: {num_removed}")
print(f"Rows kept: {num_kept}")
print(f"Percentage reduction: {percent_reduction:.2f}%")

Total rows: 699
Rows removed: 149
Rows kept: 543
Percentage reduction: 21.32%
