# Working Code

In [41]:


import pandas as pd
import re

def clean_business_terms(name):
    """
    Remove business terms and titles from a name string.
    
    Args:
        name (str): Input name that might contain business terms or titles
        
    Returns:
        str: Cleaned name with business terms and titles removed
        
    Example:
        "Mr. John Smith LLC" -> "John Smith"
    """
    # Define pairs of (pattern, replacement) for cleaning
    # Each pattern represents a business term or title we want to remove
    business_patterns = [
        (r'\bC\/O\b', ''),  # Matches "C/O" (Care of) term
        (r'\b(Inc|Ltd|Co|Corp|LLC|Dba|Dr\.|DBA|CEO|CFO|Sr\.Eng|Eng|Prof\.|Mr\.|Ms\.|Mrs\.)\b', '')  # Matches business entities and titles
    ]
    
    # Apply each cleaning pattern sequentially
    for pattern, replacement in business_patterns:
        # re.sub replaces all matches of the pattern with the replacement
        # flags=re.IGNORECASE makes the matching case-insensitive
        name = re.sub(pattern, replacement, name, flags=re.IGNORECASE)
    return name.strip()  # Remove leading/trailing whitespace

def extract_name_suffix(name):
    """
    Extract and standardize name suffixes (Jr., Sr., III, etc.).
    
    Args:
        name (str): Input name that might contain a suffix
        
    Returns:
        tuple: (name without suffix, standardized suffix)
        
    Example:
        "John Smith Jr." -> ("John Smith", "Jr")
    """
    # Pattern to match common name suffixes
    # ?: at the start means "might or might not have a comma"
    # Captures Jr., Sr., III, IV, V, II with or without periods
    suffix_pattern = r',?\s*(Jr\.?|Sr\.?|III|IV|V|II)\.?\s*$'
    
    # Try to find a suffix match at the end of the name
    suffix_match = re.search(suffix_pattern, name, re.IGNORECASE)
    
    if suffix_match:
        # Extract the matched suffix
        suffix = suffix_match.group(1)
        # Remove the suffix from the original name and clean up punctuation
        name = re.sub(suffix_pattern, '', name, flags=re.IGNORECASE).strip(' ,.')
        # Standardize suffix by removing periods
        suffix = re.sub(r'\.', '', suffix)
        return name, suffix
    return name, ''  # Return original name and empty suffix if no match

def extract_name_components(name):
    """
    Split a cleaned name into first name and last name components.
    
    Args:
        name (str): Cleaned name string without suffix
        
    Returns:
        tuple: (first name, last name)
        
    Examples:
        "John Smith" -> ("John", "Smith")
        "Ben O.and Maggi N. Goddy" -> ("Ben", "Goddy")
    """
    # Split the name into individual words
    parts = name.split()
    print(f"parts = {parts})
    
    if not parts:
        return '', ''  # Handle empty input
        
    # First word is always treated as the first name
    first_name = parts[0]
    last_name = ''
    
    if len(parts) > 1:
        # Get everything after the first name
        remaining = ' '.join(parts[1:])
          
        print(f"remaining {remaining}")
        
        # Handle compound names with 'and' or '&'
        if ' and ' in remaining.lower() or ' & ' in remaining:
            # Split on 'and' or '&' and take the last part
            last_parts = re.split(r'\s+(?:and|&)\s+', remaining, flags=re.IGNORECASE)
            if last_parts:
                # Take the last word of the last part as the last name
                last_name = last_parts[-1].split()[-1]
        else:
            # Filter out middle initials (words that are 2 chars long and end with period)
            cleaned_parts = [p for p in parts[1:] if not (len(p) == 2 and p.endswith('.'))]
            if cleaned_parts:
                # Take the last remaining word as the last name
                last_name = cleaned_parts[-1].strip(' ,.')
    
    return first_name, last_name

def split_by_delimiter(name):
    regex_pattern =  r'\s*(?:&|and|,)\s*'
    name_parts = re.split(regex_pattern, name)
    return ' '.join(str(e) for e in name_parts)
#     print("What is nameparts", name_parts)
    
#     if len(name_parts) >= 2:
# #         print(f"name_parts: {len(name_parts[0])}")
#         return name_parts[0]
    
#     else:
#         return name_parts


def remove_special_chars(name):
    name_ = re.sub(r'[^a-zA-Z\s]', '', name)
    
    return name_.strip()
    
    
def normalize_name(name):
    """
    Main normalization function that processes a name through all cleaning steps.
    
    Args:
        name (str): Raw input name
        
    Returns:
        tuple: (first name, last name, suffix)
        
    Example:
        "Mr. John H. Smith Jr." -> ("John", "Smith", "Jr")
    """
    # Input validation
    if not isinstance(name, str) or not name.strip():
        return '', '', ''
        
    # Step 1: Remove business terms and titles
    cleaned_name = clean_business_terms(name)
    
    cleaned_name = split_by_delimiter(cleaned_name) 
    
    # Step 2: Extract any suffix present
    name_without_suffix, suffix = extract_name_suffix(cleaned_name)
    
    # Step 3: Split into first and last name
    first_name, last_name = extract_name_components(name_without_suffix)
    
    # Step 4: Final cleanup - remove any remaining periods and extra spaces
    first_name = remove_special_chars(first_name)
    last_name = remove_special_chars(last_name)
    
    return first_name, last_name, suffix

def process_name_file(input_file, sheet_name):
    """
    Process an entire Excel file containing names and normalize them.
    
    Args:
        input_file (str): Path to the Excel file
        sheet_name (str): Name of the sheet containing the data
        
    Returns:
        pandas.DataFrame: Processed data with normalized names, or None if error
    """
    try:
        # Step 1: Load the Excel file
        print(f"Loading data from {input_file}...")
        name_data = pd.read_excel(input_file, sheet_name=sheet_name)
        total_rows = len(name_data)
        
        # Step 2: Apply normalization to each name in the dataset
        print("Normalizing names...")
        # Use pandas apply to process each row and create new columns
        name_data[['FirstName', 'LastName', 'Suffix']] = name_data['Client_Name_AMS'].apply(
            lambda x: pd.Series(normalize_name(str(x)))
        )
   
        
        # Combine Lastname and Suffix (only add space if Suffix is not empty)
        name_data["Full Lastname"] = name_data.apply(lambda row: f"{row['LastName']} {row['Suffix']}".strip(), axis=1)
#         name_data.drop(labels='LastName', axis=1, inplace=True)
        name_data.drop(labels='Suffix', axis=1, inplace=True)

    
        # Step 3: Save the processed data back to Excel
        print("Saving normalized data...")
        with pd.ExcelWriter(input_file, engine='openpyxl', mode='a', if_sheet_exists='replace') as writer:
            name_data.to_excel(writer, sheet_name=sheet_name, index=False)
        
#         print(f"Completed normalizing {total_rows} rows of data.")
        return name_data
        
    except Exception as e:
        print(f"Error processing file: {str(e)}")
        return None

# Main execution block
if __name__ == "__main__":
    # Specify the input file path
    input_file = 'EPIC_Contacts_2025_0212.xlsx'
    sheet_name = 'EPIC_Contacts_2025'
    # Process the file and get results
    result = process_name_file(input_file, sheet_name)
    
    # Display sample results if processing was successful
    if result is not None:
        print("\nSample of processed names:")
        print(result)




Loading data from EPIC_Contacts_2025_0212.xlsx...
Normalizing names...
Saving normalized data...

Sample of processed names:
                            Client_Name_AMS FirstName  LastName Full Lastname  \
0              Thomas Luke & Caroline Bryan    Thomas     Bryan         Bryan   
1   Davis Polk & Wardwell Group Pers Excess     Davis    Excess        Excess   
2           Stuart Rabin & Kristin Gervasio    Stuart  Gervasio      Gervasio   
3                     Frank H. McCourt, Jr.     Frank   McCourt    McCourt Jr   
4                            Scott Blattner     Scott  Blattner      Blattner   
..                                      ...       ...       ...           ...   
95                   Gregory and Susan Palm   Gregory      Palm          Palm   
96                             Rusty Turner     Rusty    Turner        Turner   
97                              Cleves Delp    Cleves      Delp          Delp   
98                              Joel Carter      Joel    Carter  

#  new code 

In [51]:
import pandas as pd
import re

def clean_business_terms(name):
    """
    Remove business terms and titles from a name string.
    """
    business_patterns = [
        (r'\bC\/O\b', ''),
        (r'\b(Inc|Ltd|Co|Corp|LLC|Dba|Dr\.|DBA|CEO|CFO|Sr\.Eng|Eng|Prof\.|Mr\.|Ms\.|Mrs\.)\b', '')
    ]
    
    for pattern, replacement in business_patterns:
        name = re.sub(pattern, replacement, name, flags=re.IGNORECASE)
    return name.strip()

def extract_name_suffix(name):
    """
    Extract and standardize name suffixes (Jr., Sr., III, etc.).
    """
    suffix_pattern = r',?\s*(Jr\.?|Sr\.?|III|IV|V|II)\.?\s*$'
    
    suffix_match = re.search(suffix_pattern, name, re.IGNORECASE)
    
    if suffix_match:
        suffix = suffix_match.group(1)
        name = re.sub(suffix_pattern, '', name, flags=re.IGNORECASE).strip(' ,. ')
        suffix = re.sub(r'\.', '', suffix)
        return name, suffix
    return name, ''

def remove_special_chars(name):
    """
    Removes any non-alphabetic characters and trims extra spaces.
    """
    name_ = re.sub(r'[^a-zA-Z\s]', '', name)
    return name_.strip()

def identify_common_last_name(name_parts):
    """
    Identify the most common last name in the name parts. This function assumes
    the name parts are already split and cleaned.
    """
    last_names = [part.split()[-1] for part in name_parts]
    
    # Identify the most common last name
    common_last_name = max(set(last_names), key=last_names.count)
    
    return common_last_name

def normalize_name(name):
    """
    Main normalization function that processes a name through all cleaning steps.
    """
    if not isinstance(name, str) or not name.strip():
        return '', '', ''
    
    # Step 1: Remove business terms and titles
    cleaned_name = clean_business_terms(name)
    
    # Step 2: Split by delimiter, but keep the parts for analysis
    name_parts = re.split(r'\s*(?:&|and|,)\s*', cleaned_name)

    # Step 3: Identify the most common last name
    common_last_name = identify_common_last_name(name_parts)

    # Step 4: Handle the case where the last name is the same across parts
    if len(name_parts) > 1 and all(common_last_name in part.split() for part in name_parts):
        # If the last name is common across all parts, take the first name and the common last name
        first_name = name_parts[0]
        last_name = common_last_name
    else:
        # Otherwise, just take the first two names as first name and last name
        first_name = name_parts[0]
        last_name = name_parts[1] if len(name_parts) > 1 else ''

    # Final cleanup
    first_name = remove_special_chars(first_name)
    last_name = remove_special_chars(last_name)
    
    return first_name, last_name

def process_name_file(input_file, sheet_name):
    """
    Process an entire Excel file containing names and normalize them.
    """
    try:
        print(f"Loading data from {input_file}...")
        name_data = pd.read_excel(input_file, sheet_name=sheet_name)
        
        print("Normalizing names...")
        name_data[['FirstName', 'LastName']] = name_data['Client_Name_AMS'].apply(
            lambda x: pd.Series(normalize_name(str(x)))
        )

        # Combine Lastname
        name_data["Full Lastname"] = name_data['LastName']
        
        print("Saving normalized data...")
        with pd.ExcelWriter(input_file, engine='openpyxl', mode='a', if_sheet_exists='replace') as writer:
            name_data.to_excel(writer, sheet_name=sheet_name, index=False)
        
        return name_data
        
    except Exception as e:
        print(f"Error processing file: {str(e)}")
        return None

# Main execution block
if __name__ == "__main__":
    input_file = 'dummy_data.xlsx'  # Replace with your actual file path
    sheet_name = 'EPIC_Contacts_2025'           # Replace with your sheet name
    result = process_name_file(input_file, sheet_name)
    
    if result is not None:
        print("\nSample of processed names:")
        print(result)


Loading data from dummy_data.xlsx...
Normalizing names...
Saving normalized data...

Sample of processed names:
                Client_Name_AMS     FirstName        LastName   Full Lastname
0           Lydia & Gary Orange         Lydia     Gary Orange     Gary Orange
1   Thomas Jacob & Caroline Kik  Thomas Jacob    Caroline Kik    Caroline Kik
2    Ben J. and Maggie E. Goddy         Ben J  Maggie E Goddy  Maggie E Goddy
3     Ben Janet & Nana E. Goddy     Ben Janet    Nana E Goddy    Nana E Goddy
4  Ben Janet  and Nana E. Goddy     Ben Janet    Nana E Goddy    Nana E Goddy


# Instructions:


- If names in the list have the same (example Lydia & Gary Orange) lastname take Lydia Orange as First and last name
 - if first name and last name and last name on the list are incositent (example Thomas Jacob & Caroline Kik), take Thomas Jacob as First and last name and ingnore the other names
 
 - if first name and last name and last name on the list are incositent (example Ben Janet  and Nana E. Goddy), take Ben Janet as First and last name and ingnore the other names