In [1]:
# Instructions For Eva:
# Download this code and save it in your Python script directory on your computer. 
# Open it in your Jupyter Notebook or VS Code environment
# Run it like we have been doing
# Ensure that the file name_data.xlsx exists in the folder input_data


In [4]:

# WORKING CODE FOR PRODUCTION
# TEST WITH A SAMPLE OF THE TEST DATA
# READ THE COMMENTS IN THE CODE TO UNDERSTAND WHAT IT IS DOING
# Run this cell block for your real data

import pandas as pd
import re

def clean_business_terms(name):
    """
    Remove business terms and titles from a name string.

    Matching is case-insensitive:
      * "C/O" is removed as a whole token.
      * Business designators (Inc, Ltd, Co, Corp, LLC, DBA, CEO, CFO, Sr.Eng,
        Eng) are removed as whole words, together with one trailing period
        if present (so "Inc." does not leave a stray ".").
      * Honorifics (Dr., Prof., Mr., Ms., Mrs.) are removed only when written
        with their period, whether or not a space follows ("Mr. John" and
        "Mr.John" are both handled).

    Whitespace left behind by the removals is collapsed to single spaces.

    Args:
        name (str): Input name that might contain business terms or titles

    Returns:
        str: Cleaned name with business terms and titles removed

    Example:
        "Mr. John Smith LLC" -> "John Smith"
    """
    # Define pairs of (pattern, replacement) for cleaning.
    business_patterns = [
        # "C/O" (care of)
        (r'\bC\/O\b', ''),
        # Business entities / roles: whole word, optional trailing period
        (r'\b(?:Inc|Ltd|Co|Corp|LLC|DBA|CEO|CFO|Sr\.Eng|Eng)\b\.?', ''),
        # Honorifics ending in a period. No trailing \b here: a word boundary
        # does not exist between "." and a following space, so requiring one
        # would skip the common "Mr. John" form.
        (r'\b(?:Dr|Prof|Mr|Ms|Mrs)\.', ''),
    ]

    # Apply each cleaning pattern sequentially (case-insensitive)
    for pattern, replacement in business_patterns:
        name = re.sub(pattern, replacement, name, flags=re.IGNORECASE)

    # Collapse runs of whitespace and trim both ends
    return ' '.join(name.split())

def extract_name_suffix(name):
    """
    Extract and standardize a trailing name suffix (Jr, Sr, II, III, IV, V).

    The suffix is only recognized when it is a separate token at the end of
    the string, i.e. preceded by a comma, whitespace, or the start of the
    string. Surnames that merely end in those letters ("Petrov", "Oliv")
    are left untouched. Matching is case-insensitive.

    Args:
        name (str): Input name that might contain a suffix

    Returns:
        tuple: (name without suffix, standardized suffix). The suffix has its
        periods dropped and its case normalized ("Jr"/"Sr" capitalized,
        roman numerals upper-cased). If no suffix is found, the name is
        returned unchanged together with an empty string.

    Example:
        "John Smith Jr." -> ("John Smith", "Jr")
    """
    # Separator (start | comma | whitespace), the suffix token, optional
    # trailing periods/whitespace, then end of string.
    # "III" is listed before "II" so the longest numeral is preferred.
    suffix_pattern = r'(?:^|,\s*|\s+)(Jr|Sr|III|II|IV|V)\.*\s*$'

    suffix_match = re.search(suffix_pattern, name, re.IGNORECASE)
    if not suffix_match:
        return name, ''  # No suffix: return the original name untouched

    # Everything before the match is the name; trim leftover punctuation
    base_name = name[:suffix_match.start()].strip(' ,.')

    # Standardize the suffix spelling
    raw_suffix = suffix_match.group(1)
    if raw_suffix.lower() in ('jr', 'sr'):
        suffix = raw_suffix.capitalize()
    else:
        suffix = raw_suffix.upper()
    return base_name, suffix

def extract_name_components(name):
    """
    Split a cleaned name into first name and last name components.

    The first whitespace-separated word is the first name. The last name is
    taken from the words that follow it: if they contain a conjunction
    (" and " or " & ", case-insensitive), only the words after the last
    conjunction are considered. From those words, two-character tokens ending
    in a period (middle initials such as "N.") are ignored, and the final
    remaining word, stripped of surrounding spaces, commas and periods, is
    the last name.

    Args:
        name (str): Cleaned name string without suffix

    Returns:
        tuple: (first name, last name); either may be an empty string

    Examples:
        "John Smith" -> ("John", "Smith")
        "Ben O.and Maggi N. Goddy" -> ("Ben", "Goddy")
        "Bob A. and Mary Smith," -> ("Bob", "Smith")
    """
    parts = name.split()
    if not parts:
        return '', ''  # Handle empty input

    # First word is always treated as the first name
    first_name = parts[0]
    if len(parts) == 1:
        return first_name, ''

    # Everything after the first name, single-space separated
    remaining = ' '.join(parts[1:])

    # For joint names keep only the text after the last 'and' / '&';
    # when there is no conjunction the split returns the text unchanged.
    tail = re.split(r'\s+(?:and|&)\s+', remaining, flags=re.IGNORECASE)[-1]

    # Same cleanup on both paths: skip middle initials, strip punctuation
    candidates = [
        word for word in tail.split()
        if not (len(word) == 2 and word.endswith('.'))
    ]
    last_name = candidates[-1].strip(' ,.') if candidates else ''

    return first_name, last_name

def normalize_name(name):
    """
    Run a raw name through every cleaning stage and return its components.

    Stages, in order: business-term/title removal, suffix extraction,
    first/last name split, and removal of any periods left on the first and
    last name.

    Args:
        name (str): Raw input name

    Returns:
        tuple: (first name, last name, suffix). All three are empty strings
        when the input is not a string or is blank.

    Example:
        "John H. Smith Jr." -> ("John", "Smith", "Jr")
    """
    # Guard: only non-blank strings are processed
    if not (isinstance(name, str) and name.strip()):
        return '', '', ''

    # Stage 1: drop business terms and titles
    without_terms = clean_business_terms(name)

    # Stage 2: peel a trailing suffix off the name
    core_name, suffix = extract_name_suffix(without_terms)

    # Stage 3: split the remainder into first and last name
    given, family = extract_name_components(core_name)

    # Stage 4: remove leftover periods and surrounding whitespace
    given = given.replace('.', '').strip()
    family = family.replace('.', '').strip()

    return given, family, suffix

def process_name_file(input_file, sheet_name='Sheet1', name_column='Client_Name_AMS'):
    """
    Process an entire Excel file containing names and normalize them.

    Reads the given sheet, normalizes every value in ``name_column`` with
    ``normalize_name``, stores the results in the columns 'FirstName',
    'LastName' and 'Suffix', and writes the sheet back to the SAME workbook,
    replacing the existing sheet of that name. Progress messages are printed
    along the way.

    Args:
        input_file (str): Path to the Excel file (read and overwritten in place)
        sheet_name (str): Name of the sheet containing the data
        name_column (str): Column holding the raw names

    Returns:
        pandas.DataFrame: Processed data with normalized names, or None if
        any error occurred (the error message is printed, not raised)
    """
    output_columns = ['FirstName', 'LastName', 'Suffix']

    try:
        # Step 1: Load the Excel file
        print(f"Loading data from {input_file}...")
        name_data = pd.read_excel(input_file, sheet_name=sheet_name)
        total_rows = len(name_data)

        # Fail with a readable message instead of a bare KeyError
        if name_column not in name_data.columns:
            raise ValueError(
                f"Column '{name_column}' not found in sheet '{sheet_name}'"
            )

        # Step 2: Apply normalization to each name in the dataset
        print("Normalizing names...")
        # Missing cells are passed as '' so they yield empty components;
        # str(NaN) would otherwise be normalized as the literal name "nan".
        normalized_rows = [
            normalize_name('' if pd.isna(value) else str(value))
            for value in name_data[name_column]
        ]
        # Building the frame once also works for a sheet with zero rows
        name_data[output_columns] = pd.DataFrame(
            normalized_rows, columns=output_columns, index=name_data.index
        )

        # Step 3: Save the processed data back to Excel.
        # mode='a' keeps the workbook's other sheets; only this sheet is replaced.
        print("Saving normalized data...")
        with pd.ExcelWriter(input_file, engine='openpyxl', mode='a', if_sheet_exists='replace') as writer:
            name_data.to_excel(writer, sheet_name=sheet_name, index=False)

        print(f"Completed normalizing {total_rows} rows of data.")
        return name_data

    except Exception as e:
        # Deliberate best-effort: report the problem and signal failure with None
        print(f"Error processing file: {str(e)}")
        return None

# Entry point: runs when this cell/script is executed directly
if __name__ == "__main__":
    # Workbook holding the raw names, relative to the working directory
    input_file = 'input_data/name_data.xlsx'

    # Normalize the workbook; None signals that processing failed
    result = process_name_file(input_file)

    # Preview the first rows only when processing succeeded
    if result is not None:
        print("\nSample of processed names:")
        print(result.loc[:, ['Client_Name_AMS', 'FirstName', 'LastName', 'Suffix']].head(5))

Loading data from input_data/name_data.xlsx...
Normalizing names...
Saving normalized data...
Completed normalizing 17 rows of data.

Sample of processed names:
                 Client_Name_AMS FirstName LastName Suffix
0               Ian H. Macot, Jr       Ian    Macot     Jr
1                       Jane Doe      Jane      Doe       
2                G C Waters, Sr.         G   Waters     Sr
3       Ben O.and Maggi N. Goddy       Ben    Goddy       
4  Willy P. Pear Maria Emmy pear     Willy     pear       
