## Task 1 

In [187]:
import pandas as pd
import spacy
import re
import string

In [188]:
nlp = spacy.load('en_core_web_sm')

In [189]:
name_pattern = re.compile(
    r'^(Mr\.|Ms\.|Mrs\.|Dr\.|Miss|Mx\.|Sir|Madam)?\s*'  # Optional titles
    r'([A-Z][a-zA-Z\'\-]+,\s*)?'  # Optional last name followed by a comma and space (for "Last, First" format)
    r'[A-Z][a-zA-Z\'\-]+'  # First name
    r'(\s[A-Z][a-zA-Z\'\-]+)*'  # Optional middle names
    r'(\s(Jr\.|Sr\.|II|III))?$'  # Optional suffixes
)

In [190]:
def validate_name(df, column_name):
    def is_valid_name(text):
        # Detect "PERSON" entities
        doc = nlp(text)
        for ent in doc.ents:
            if ent.label_ == 'PERSON':
                return True
        
        # Check using reg exp
        if bool(name_pattern.match(str(text))):
            return True

        # Return False if neither verifies the name
        return False
    
    # Apply the combined validation function
    validation_results = df[column_name].apply(is_valid_name)
    
    return validation_results

In [191]:
if __name__ == "__main__":
    # Assuming that recall is more important in this case
    input_file = 'dev_data.csv'
    df = pd.read_csv(input_file)
    
    # Validate Person Names
    df['Person Name Valid'] = validate_name(df, 'Person Name')
    
    # Save df
    df.to_csv('processed_dev_data.csv', index=False)

    # Load df
    df = pd.read_csv('processed_dev_data.csv')

    validation_counts = df['Person Name Valid'].value_counts()
    print(validation_counts)

    invalid_names = df[df['Person Name Valid'] == False]['Person Name']
    print(invalid_names)

Person Name Valid
True    1000
Name: count, dtype: int64
Series([], Name: Person Name, dtype: object)


## Task 2

In [483]:
def clean_name(df, col_name, split=False, name_format="first_last", case_type="sentence", remove_punctuation=True, standardize="FN,LN"):
    # split (boolean): True, False
    # name_format options: first_last, last_first (applied only when split = True)
    # standardize options: "FN, LN", "LN,FN" (without spaces) (applied only when split = False)
    # case types: upper, lower, sentence 
    
    # Helper function to detect salutation
    def detect_salutation(name):
        salutations = ["Mr", "Mrs", "Ms", "Dr", "Prof", "Miss", "Sir", "Lady", "Mr.", "Ms.", "Dr."]
        tokens = name.split()
        if tokens[0] in salutations:
            return tokens[0], ' '.join(tokens[1:])
        return "", name

    # Helper function to split names based on the presence of a comma
    def split_name(name):
        if ',' in name:
            # Case where format is "LName, FName"
            last, first = name.split(',', 1)
            last, first = last.strip(), first.strip()
        else:
            # Assume "FName LName" format
            tokens = name.split()
            if len(tokens) == 1:
                first, last = tokens[0], ""
            else:
                first, last = tokens[0], ' '.join(tokens[1:])
        return first, last

    # Create new columns if split=True
    if split:
        df['Salutation'] = ""
        df['First Name'] = ""
        df['Last Name'] = ""

    # Process each name
    for idx, row in df.iterrows():
        original_name = row[col_name]

        # Remove punctuation according to param
        if remove_punctuation:
            original_name = original_name.translate(str.maketrans('', '', string.punctuation))

        # Handle salutations
        salutation, cleaned_name = detect_salutation(original_name)

        # Split or Standardize the names based on split param
        if split:
            # Remove salutation for splitting
            first, last = split_name(cleaned_name)
            df.at[idx, 'Salutation'] = salutation

            if name_format == "last_first":
                df.at[idx, 'First Name'] = last  # In last_first, first name goes to last
                df.at[idx, 'Last Name'] = first  # Last name goes to first
            else:
                df.at[idx, 'First Name'] = first
                df.at[idx, 'Last Name'] = last
        else:
            first, last = split_name(cleaned_name)

            if standardize == "LN,FN":
                # Split cleaned name and assume the last token is the last name
                name_tokens = cleaned_name.split()
                if len(name_tokens) > 1:
                    last = name_tokens[-1]  # Last token is treated as the last name
                    first = ' '.join(name_tokens[:-1])  # Rest are treated as first name
                else:
                    first = name_tokens[0]  # If only one token, it's the first name, no last name

                standardized_name = f"{last} {first}"
            else:
                standardized_name = f"{first} {last}"

            if salutation and salutation not in standardized_name.split():
                standardized_name = f"{salutation} {standardized_name}"

            df.at[idx, col_name] = standardized_name

        # Apply casing
        if case_type == "upper":
            df.at[idx, col_name] = df.at[idx, col_name].upper()
        elif case_type == "lower":
            df.at[idx, col_name] = df.at[idx, col_name].lower()
        elif case_type == "sentence":
            df.at[idx, col_name] = ' '.join(word.capitalize() for word in df.at[idx, col_name].split())
            
    if split and name_format == "last_first":
        df = df.rename(columns={"First Name": "Last Name", "Last Name": "First Name"})

    return df

In [484]:
def main(file_name):
    df = pd.read_csv(file_name)
    processed_df = clean_name(df, "Person Name", split=True, name_format="last_first", case_type="sentence", remove_punctuation=True, standardize="LN,FN")
    processed_df.to_csv('processed_' + file_name, index=False)

In [485]:
main("dev_data.csv")

In [486]:
pd.read_csv("processed_dev_data.csv")

Unnamed: 0,Person Name,Company Name,Currency,Email,Website,Phone,Datetime,Record ID,Address,Picklist Value,Salutation,Last Name,First Name
0,Ms. Stephanie Frazier,"Carter, Combs and Boyd","$7,450",ricksloan@example.com,http://weaver.com/,(602) 555-2345,11/15/21 3:17,00QGC00001r6q9d2AA,"7230 John Burg Apt. 392, Georgeberg, ME 75678",Banned,Ms,Frazier,Stephanie
1,Mr Mateo Cortez-gomez,Humphrey Ltd,6277 CAD,ckemp@example.net,https://www.patterson.biz/,+1-681-235-6613x943,12/5/21 1:31,00QGC00001r6q9V2AQ,"3344 Ash St, Washington, D.C. 20001",Qualified,Mr,CortezGomez,Mateo
2,Dr Ahmed Abdel Rahman,"Prime Industries, Inc.",$9812 CAD,sheri87@example.net,https://roberts.org/,831-622-8628,3/30/22 23:39,00QGC00001r6q9m2AA,"7243 Vasquez Way Suite 389, East Sabrinaberg, ...",Unqualified,Dr,Abdel Rahman,Ahmed
3,Dr. Isabella Rodriguez,Alpha Technologies,$2817 CAD,burchashley@example.net,https://richards.com/,(415) 555-0987,10/13/23 13:15,00QGC00001r6q9f2AA,"76857 Dustin Run, East Amy, TN 16418",Underqualified,Dr,Rodriguez,Isabella
4,Mr. Matthew Jensen,Blue Wave Technologies,712 AUD,merrittjustin@example.com,https://walker-cabrera.net/,(212) 555-4567 Ext 5678,8/25/22 19:46,00QGC00001r6q9m2AA,"7788 Redwood Blvd, Apt C, Seattle, WA 98101",Overqualified,Mr,Jensen,Matthew
5,"Smith, Patricia",NextGen Enterprises,$4015 USD,marcusanderson@example.net,https://best.com/,(202) 555-6666,9/8/21 1:58,00QGC00001r6q9i2AA,"1234 Elm St, Springfield, IL 62704",Qualified,,Patricia,Smith
6,Ms Sarah Clark,"Innovative Designs, Inc.",5064 AUD,christine43@example.net,http://boone.net/,(305) 555-4321,7/27/24 9:31,00QGC00001r6q9c2AA,"36134 Boyd Springs Suite 215, Port Pattyland, ...",Underqualified,Ms,Clark,Sarah
7,Christian Dyer,"Kelly, Gray and Faulkner",$3916 AUD,brandon63@example.com,https://www.martinez-davis.com/,896.599.8522,4/8/22 21:41,00QGC00001r6q9U2AQ,"133 Maureen Wall Apt. 833, Porterside, FL 29515",Qualified,,Dyer,Christian
8,Mrs Diane Walker,Henderson-Sexton,$2460 CAD,emartin@example.net,http://www.harris.com/,(312) 345-9876 Ext 1230,12/12/22 23:10,00QGC00001r6q9W2AQ,"3063 Jacob Course, West Emily, KY 60552",Overqualified,Mrs,Walker,Diane
9,Mrs. Monica Ross,"Creative Minds, Ltd.",6309 AUD,xperkins@example.org,https://allen-gonzales.com/,438.334.7182x054,6/30/21 20:31,00QGC00001r6q9l2AA,"310 Patrick Rue, South Gary, AZ 28892",Unqualified,Mrs,Ross,Monica
