In [6]:
import joblib
import gzip
import string
import pandas as pd
from collections import Counter

# Integer class codes for each race category, keyed by the category's raw name
# (these must stay in sync with the encoding used when the models were trained).
race_mapping = {
    category: code
    for code, category in enumerate((
        'white',
        'hispanic',
        'Asian_or_Pacific_Islander',
        'black',
        'Indian_or_AlaskaNative',
        'other',
    ))
}

# Class code -> display label. Underscores become spaces and the label is passed
# through str.capitalize(), which upper-cases the first character and lower-cases
# the rest, so 'Asian_or_Pacific_Islander' is displayed as
# 'Asian or pacific islander'.
race_mapping_inverse = {
    code: category.replace("_", " ").capitalize()
    for category, code in race_mapping.items()
}

# Gender class code -> display label.
gender_mapping = dict(enumerate(('Male', 'Female')))

def preprocess_name(name):
    """
    Turn a name into a 26-element letter-count feature vector.

    The input is converted with str(), stripped of surrounding whitespace and
    lower-cased. Only the ASCII letters a-z are counted; every other character
    (digits, punctuation, accented letters, inner spaces) is ignored.

    Parameters:
        name: The name to encode. Non-string values are converted with str().

    Returns:
        list[int]: Occurrence counts for 'a' through 'z', in alphabetical order.
    """
    normalized = str(name).strip().lower()
    tally = Counter(normalized)
    features = []
    for letter in string.ascii_lowercase:
        # Indexing a Counter with a missing key yields 0 (and does not insert it).
        features.append(tally[letter])
    return features

# Load both pretrained models once, at module level, so that every prediction
# call reuses them instead of re-reading the files.
#
# SECURITY NOTE(review): joblib.load unpickles the file contents, and unpickling
# can execute arbitrary code. Only load model files that come from a trusted source.
try:
    # Race model: joblib pickle stored inside a gzip file.
    race_model_path = 'knn_race_prediction_model_Nov_2024.pkl.gz'
    with gzip.open(race_model_path, 'rb') as f:
        race_model = joblib.load(f)
    print(f"Race model loaded from '{race_model_path}'")

    # Gender model: joblib pickle stored inside a gzip file.
    gender_model_path = 'svm_gender_model_Nov_2024.pkl.gz'
    with gzip.open(gender_model_path, 'rb') as f:
        gender_model = joblib.load(f)
    print(f"Gender model loaded from '{gender_model_path}'")

except FileNotFoundError as e:
    print(f"Error: Model file not found - {e}")
    # Re-raise rather than calling exit(): exit() is a helper meant for the
    # interactive interpreter, it would report a successful (0) exit status for
    # a failed load, and inside Jupyter it asks the kernel to shut down, which
    # discards all notebook state. Re-raising stops execution here and keeps
    # the traceback.
    raise
except Exception as e:
    print(f"Error while loading models: {e}")
    # Same reasoning as above: surface the original exception to the caller.
    raise

# Function for race prediction
def predict_race(surname):
    """
    Predict a race label for a surname with the module-level race model.

    The surname is encoded by preprocess_name, wrapped in a single-row
    DataFrame whose columns are the letters 'a'..'z', and passed to
    race_model.predict. The first predicted class code is then translated
    through race_mapping_inverse.

    Parameters:
        surname (str): The surname to predict race for.

    Returns:
        str: The display label stored in race_mapping_inverse for the
        predicted class code.

    Raises:
        KeyError: If the model returns a code that is not in race_mapping_inverse.
    """
    letter_columns = list(string.ascii_lowercase)
    letter_counts = preprocess_name(surname)
    # One row per name; the model is given exactly one sample here.
    surname_frame = pd.DataFrame([letter_counts], columns=letter_columns)
    predicted_codes = race_model.predict(surname_frame)
    return race_mapping_inverse[predicted_codes[0]]

# Function for gender prediction
def predict_gender(first_name):
    """
    Predict a gender label for a first name with the module-level gender model.

    The first name is encoded by preprocess_name, wrapped in a single-row
    DataFrame whose columns are the letters 'a'..'z', and passed to
    gender_model.predict. The first predicted class code is then translated
    through gender_mapping.

    Parameters:
        first_name (str): The first name to predict gender for.

    Returns:
        str: The display label stored in gender_mapping for the predicted
        class code.

    Raises:
        KeyError: If the model returns a code that is not in gender_mapping.
    """
    letter_columns = list(string.ascii_lowercase)
    letter_counts = preprocess_name(first_name)
    # One row per name; the model is given exactly one sample here.
    name_frame = pd.DataFrame([letter_counts], columns=letter_columns)
    predicted_codes = gender_model.predict(name_frame)
    return gender_mapping[predicted_codes[0]]

# Demo: run one race and one gender prediction on a sample name.
if __name__ == "__main__":
    # Sample inputs: the surname feeds the race model, the first name the gender model.
    input_first_name, input_surname = "John", "Smith"

    # Race is predicted from the surname only.
    predicted_race = predict_race(input_surname)
    print(f"Predicted Race for '{input_surname}': {predicted_race}")

    # Gender is predicted from the first name only.
    predicted_gender = predict_gender(input_first_name)
    print(f"Predicted Gender for '{input_first_name}': {predicted_gender}")

Race model loaded from 'knn_race_prediction_model_Nov_2024.pkl.gz'
Gender model loaded from 'svm_gender_model_Nov_2024.pkl.gz'
Predicted Race for 'Smith': White
Predicted Gender for 'John': Male


#### Example Use

In [7]:
# Example: predict the race label for a single surname.
predict_race('Chang')

'Asian or pacific islander'

In [8]:
# Example: predict the gender label for a single first name.
predict_gender('Emma')

'Female'