In [None]:
# Working Code 
import pandas as pd
import re

# Configuration and load step: the workbook path and worksheet that hold the raw
# client names. Later statements in this cell read the 'Client_Name_AMS' column
# from `name_data` and write results back to this same file and sheet.
input_file = 'input_data/name_data.xlsx'  # Replace with your file path
output_data = ''  # NOTE(review): not referenced in the visible cells — confirm before removing
sheet_name = 'Sheet1'
# Read the whole worksheet into a DataFrame (one row per client record).
name_data = pd.read_excel(input_file, sheet_name=sheet_name)

# Function to clean and standardize names
def parse_name_with_rules(name):
    """Split a raw client name into a title-cased first name and last name.

    Processing steps:
      1. Remove every "C/O" marker (case-insensitive) and collapse whitespace.
      2. If the final token is a generational suffix (Jr, Sr, II, III, IV, V in
         any letter case; Jr/Sr with or without a trailing period) and at least
         one other token exists, detach it. A comma that separated the suffix
         from the surname ("Macot, Jr") is dropped.
      3. The first token becomes the first name and the last remaining token
         the last name; any tokens in between are discarded.
      4. First and last name are title-cased. The suffix is appended to the
         last name afterwards so roman numerals stay upper-case ("Smith III",
         not "Smith Iii"). Jr/Sr are capitalised and keep the period only if
         the input had one.

    Args:
        name (str): Raw name text.

    Returns:
        tuple[str, str]: ``(first_name, last_name)``. Both are empty strings
        for empty input. When only one token remains besides the suffix, the
        suffix alone is returned as the last name.
    """
    # Remove "C/O" strings regardless of their position
    name = re.sub(r'\bC\/O\b', '', name, flags=re.IGNORECASE)

    # Collapse the whitespace left behind by the removal
    name = re.sub(r'\s+', ' ', name.strip())

    # Whole-token suffix forms; matched case-insensitively with fullmatch so a
    # surname that merely ends in one of these letters is never affected.
    suffix_pattern = r'(?:Jr\.?|Sr\.?|III|II|IV|V)'
    roman_suffixes = {'II', 'III', 'IV', 'V'}

    name_parts = name.split()
    suffix = ""

    if len(name_parts) > 1 and re.fullmatch(suffix_pattern, name_parts[-1], flags=re.IGNORECASE):
        raw_suffix = name_parts.pop()
        if raw_suffix.upper() in roman_suffixes:
            suffix = raw_suffix.upper()
        else:
            suffix = raw_suffix.capitalize()  # "JR." -> "Jr.", "sr" -> "Sr"

        # Drop the comma that separated the suffix from the surname.
        trimmed = name_parts[-1].rstrip(',')
        if trimmed:
            name_parts[-1] = trimmed
        elif len(name_parts) > 1:
            name_parts.pop()  # the token was a lone "," — discard it

    # First token is the first name; last token (if distinct) is the last name.
    first_name = name_parts[0].title() if name_parts else ""
    last_name = name_parts[-1].title() if len(name_parts) > 1 else ""

    # Append the suffix AFTER title-casing so it keeps its normalised form.
    if suffix:
        last_name = f"{last_name} {suffix}".strip()

    return first_name, last_name

# Apply the parser to every client name. Missing cells (NaN/None) yield empty
# strings; passing them through str() would produce the text "nan", which the
# parser would treat as a real first name ("Nan").
name_data[['FirstName', 'LastName']] = name_data['Client_Name_AMS'].apply(
    lambda x: pd.Series(parse_name_with_rules(str(x)) if pd.notna(x) else ("", ""))
)

# Save the updated data back to the same file and sheet.
# mode='a' keeps the workbook's other sheets; if_sheet_exists='replace'
# overwrites only the target sheet.
with pd.ExcelWriter(input_file, engine='openpyxl', mode='a', if_sheet_exists='replace') as writer:
    name_data.to_excel(writer, sheet_name=sheet_name, index=False)

# Display the updated data in Jupyter
display(name_data.head())


In [18]:
import pandas as pd
import re

def clean_business_terms(name):
    """Remove business and title terms from the name.

    Strips, case-insensitively and only as whole words:
      * the "C/O" marker;
      * company/role terms: Inc, Ltd, Co, Corp, LLC, DBA, CEO, CFO, Sr.Eng, Eng;
      * honorifics: Dr, Prof, Mr, Ms, Mrs.
    A period directly following a removed term is removed with it, so both
    "Dr. Smith" and "Dr Smith" become "Smith". Whitespace left behind by the
    removals is collapsed to single spaces.

    Args:
        name (str): Raw name text.

    Returns:
        str: The name without the listed terms, stripped of leading/trailing
        whitespace.
    """
    business_patterns = [
        (r'\bC\/O\b', ''),
        # The optional period sits AFTER the closing \b. A \b placed after a
        # literal "." only matches when a word character follows, so the
        # common "Dr. Smith" form (period then space) would never be removed.
        (r'\b(?:Inc|Ltd|Co|Corp|LLC|DBA|CEO|CFO|Sr\.Eng|Eng|Dr|Prof|Mr|Ms|Mrs)\b\.?', ''),
    ]

    for pattern, replacement in business_patterns:
        name = re.sub(pattern, replacement, name, flags=re.IGNORECASE)

    # Removing a term from the middle of the string leaves a doubled space.
    return re.sub(r'\s+', ' ', name).strip()

def extract_name_suffix(name):
    """Extract and standardize name suffix if present.

    Recognises a trailing generational suffix (Jr, Sr, II, III, IV, V) in any
    letter case, optionally preceded by a comma and optionally followed by a
    period. The suffix must be a separate word: a surname that merely ends in
    one of those letters ("Petrov", "Hawaii") is left intact.

    Args:
        name (str): Name text, typically already cleaned of business terms.

    Returns:
        tuple[str, str]: ``(name_without_suffix, suffix)``. The suffix has its
        periods removed and its case normalised (roman numerals upper-case,
        "Jr"/"Sr" capitalised). When no suffix is found the name is returned
        unchanged together with an empty string.
    """
    # \b before the group requires the suffix to start a word; without it the
    # case-insensitive "V"/"II" alternatives match the tail of ordinary names.
    suffix_pattern = r',?\s*\b(Jr\.?|Sr\.?|III|IV|V|II)\.?\s*$'
    suffix_match = re.search(suffix_pattern, name, re.IGNORECASE)

    if not suffix_match:
        return name, ''

    suffix = suffix_match.group(1).replace('.', '')  # Remove periods from suffix
    if suffix.upper() in {'II', 'III', 'IV', 'V'}:
        suffix = suffix.upper()
    else:
        suffix = suffix.capitalize()

    # The pattern is anchored at the end, so everything before the match is the name.
    name = name[:suffix_match.start()].strip(' ,.')
    return name, suffix

def extract_name_components(name):
    """Split a cleaned name string into ``(first_name, last_name)``.

    The first whitespace-separated token is the first name. The last name is
    derived from the remaining tokens:
      * if they contain a standalone " and " / " & " joiner, it is the final
        word of the text after the last joiner (returned as-is);
      * otherwise it is the final token that is not a two-character initial
        such as "H.", with surrounding spaces, commas and periods stripped.
    Either component is an empty string when it cannot be determined.
    """
    tokens = name.split()
    if not tokens:
        return '', ''

    given, *rest = tokens
    if not rest:
        return given, ''

    tail = ' '.join(rest)
    has_joiner = ' and ' in tail.lower() or ' & ' in tail
    if has_joiner:
        # Compound names: keep only what follows the final "and"/"&".
        segments = re.split(r'\s+(?:and|&)\s+', tail, flags=re.IGNORECASE)
        return given, segments[-1].split()[-1]

    # Walk backwards to the first token that is not a middle initial.
    surname = ''
    for token in reversed(rest):
        is_initial = len(token) == 2 and token.endswith('.')
        if is_initial:
            continue
        surname = token.strip(' ,.')
        break

    return given, surname

def normalize_name(name):
    """Normalize a raw name into ``(first_name, last_name, suffix)``.

    Pipeline: strip business/title terms, peel off a trailing suffix, split the
    remainder into first and last name, then remove periods and surrounding
    whitespace from both. Anything that is not a non-blank string produces
    three empty strings.
    """
    # Guard: non-strings and blank strings have nothing to normalize.
    if not (isinstance(name, str) and name.strip()):
        return '', '', ''

    # Steps 1-2: clean business terms, then separate the suffix.
    base_name, suffix = extract_name_suffix(clean_business_terms(name))

    # Steps 3-4: split into components and drop periods / stray whitespace.
    first_name, last_name = (
        component.replace('.', '').strip()
        for component in extract_name_components(base_name)
    )

    return first_name, last_name, suffix

def process_name_file(input_file, sheet_name='Sheet1'):
    """Process the entire Excel file of names.

    Reads ``sheet_name`` from ``input_file``, normalizes every value in the
    'Client_Name_AMS' column with ``normalize_name`` and stores the results in
    the 'FirstName', 'LastName' and 'Suffix' columns. The sheet is then written
    back to the same workbook, replacing the original sheet (other sheets are
    kept because the writer opens the file in append mode).

    Missing cells (NaN/None) produce three empty strings rather than being
    stringified to "nan" and parsed as a name.

    Args:
        input_file (str): Path to the Excel workbook; it is overwritten in place.
        sheet_name (str): Worksheet to read and replace. Defaults to 'Sheet1'.

    Returns:
        pandas.DataFrame | None: The updated frame, or ``None`` if any step
        raised (the error message is printed, not re-raised).
    """
    try:
        # Load data
        print(f"Loading data from {input_file}...")
        name_data = pd.read_excel(input_file, sheet_name=sheet_name)
        total_rows = len(name_data)

        # Process names
        print("Normalizing names...")
        output_columns = ['FirstName', 'LastName', 'Suffix']
        normalized = [
            normalize_name(str(raw)) if pd.notna(raw) else ('', '', '')
            for raw in name_data['Client_Name_AMS']
        ]
        # Building the frame explicitly (instead of Series.apply + pd.Series)
        # also works for an empty sheet, where apply would not yield columns.
        name_data[output_columns] = pd.DataFrame(
            normalized, index=name_data.index, columns=output_columns
        )

        # Save results
        print("Saving normalized data...")
        with pd.ExcelWriter(input_file, engine='openpyxl', mode='a', if_sheet_exists='replace') as writer:
            name_data.to_excel(writer, sheet_name=sheet_name, index=False)

        print(f"Completed normalizing {total_rows} rows of data.")
        return name_data

    except Exception as e:
        # Deliberate best-effort: report the failure and signal it with None.
        print(f"Error processing file: {str(e)}")
        return None

# Driver: process the default workbook and print a short preview of the result.
# The recorded cell output shows this branch executing when the cell is run.
if __name__ == "__main__":
    input_file = 'input_data/name_data.xlsx'
    result = process_name_file(input_file)
    # process_name_file returns None when it caught an error, so only preview on success.
    if result is not None:
        print("\nSample of processed names:")
        print(result[['Client_Name_AMS', 'FirstName', 'LastName', 'Suffix']].head())

Loading data from input_data/name_data.xlsx...
Normalizing names...
Saving normalized data...
Completed normalizing 17 rows of data.

Sample of processed names:
                 Client_Name_AMS FirstName LastName Suffix
0               Ian H. Macot, Jr       Ian    Macot     Jr
1                       Jane Doe      Jane      Doe       
2                G C Waters, Sr.         G   Waters     Sr
3       Ben O.and Maggi N. Goddy       Ben    Goddy       
4  Willy P. Pear Maria Emmy pear     Willy     pear       
