In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
df = pd.read_excel("data/feng.xlsx")

In [3]:
# Comprehensive Job Mapping (Based on provided list)
job_attributes = {
    # Students
    "Student: High School": {"rigidity": 1, "fixed_schedule": 0, "income_level": 1, "stability": 1},
    "Student: College": {"rigidity": 1, "fixed_schedule": 0, "income_level": 1, "stability": 2},

    # Medical Professions
    "Medical: Nurse": {"rigidity": 5, "fixed_schedule": 1, "income_level": 4, "stability": 5},
    "Medical: Doctor, Provider": {"rigidity": 5, "fixed_schedule": 1, "income_level": 5, "stability": 5},
    "Medical: Pharmacist": {"rigidity": 5, "fixed_schedule": 1, "income_level": 4, "stability": 5},
    "Medical: Admin": {"rigidity": 4, "fixed_schedule": 1, "income_level": 3, "stability": 5},
    "Medical: Healthcare Worker": {"rigidity": 5, "fixed_schedule": 1, "income_level": 3, "stability": 4},

    # Business & Finance
    "Business: Marketing": {"rigidity": 3, "fixed_schedule": 1, "income_level": 4, "stability": 4},
    "Business: Sales": {"rigidity": 3, "fixed_schedule": 1, "income_level": 3, "stability": 4},
    "Business: Mgt, Admin": {"rigidity": 4, "fixed_schedule": 1, "income_level": 5, "stability": 5},
    "Finance": {"rigidity": 4, "fixed_schedule": 1, "income_level": 5, "stability": 5},
    "Finance: Accountant": {"rigidity": 4, "fixed_schedule": 1, "income_level": 5, "stability": 5},
    "Finance: Banking": {"rigidity": 4, "fixed_schedule": 1, "income_level": 5, "stability": 5},

    # Self-Employed
    "Self-Employed, Entrepreneur": {"rigidity": 1, "fixed_schedule": 0, "income_level": 3, "stability": 2},
    
    # Law Enforcement
    "Law: Police Officer": {"rigidity": 5, "fixed_schedule": 1, "income_level": 4, "stability": 5},
    "Law: Lawyer": {"rigidity": 5, "fixed_schedule": 1, "income_level": 5, "stability": 5},
    "Law: Paralegal": {"rigidity": 4, "fixed_schedule": 1, "income_level": 4, "stability": 4},
    "Law: Security Officer": {"rigidity": 4, "fixed_schedule": 1, "income_level": 3, "stability": 3},

    # Retail & Hospitality
    "Retail: Sales": {"rigidity": 2, "fixed_schedule": 0, "income_level": 2, "stability": 2},
    "Retail: Mgt": {"rigidity": 3, "fixed_schedule": 1, "income_level": 3, "stability": 4},
    "Service: Restaurant": {"rigidity": 2, "fixed_schedule": 0, "income_level": 2, "stability": 2},
    "Service: Hotel": {"rigidity": 2, "fixed_schedule": 0, "income_level": 2, "stability": 2},
    
    # Education
    "Education: Teacher": {"rigidity": 4, "fixed_schedule": 1, "income_level": 3, "stability": 5},
    "Education: College Professor": {"rigidity": 4, "fixed_schedule": 1, "income_level": 5, "stability": 5},
    "Education: Teacher Asst/Aid": {"rigidity": 3, "fixed_schedule": 1, "income_level": 2, "stability": 4},

    # Tech & Engineering
    "Tech: Computer/Programmer": {"rigidity": 3, "fixed_schedule": 1, "income_level": 5, "stability": 5},
    "Tech: Engineer": {"rigidity": 4, "fixed_schedule": 1, "income_level": 5, "stability": 5},
    "Tech: Sales, Mktg": {"rigidity": 3, "fixed_schedule": 1, "income_level": 4, "stability": 4},

    # Others
    "Consultant": {"rigidity": 2, "fixed_schedule": 0, "income_level": 4, "stability": 4},
    "Transport: Pilot": {"rigidity": 5, "fixed_schedule": 1, "income_level": 5, "stability": 5},
    "Firefighter": {"rigidity": 5, "fixed_schedule": 1, "income_level": 4, "stability": 5},
    "Journalist/Media": {"rigidity": 3, "fixed_schedule": 1, "income_level": 4, "stability": 4},
    "Agriculture": {"rigidity": 2, "fixed_schedule": 0, "income_level": 2, "stability": 3},
    "Military": {"rigidity": 5, "fixed_schedule": 1, "income_level": 4, "stability": 5},
    "Unemployed": {"rigidity": 1, "fixed_schedule": 0, "income_level": 1, "stability": 1},
    "Retired": {"rigidity": 1, "fixed_schedule": 0, "income_level": 2, "stability": 5},
}

# Default category for unknown occupations
default_attributes = {"rigidity": 3, "fixed_schedule": 1, "income_level": 3, "stability": 3}


In [4]:
# Function to match occupation and assign attributes
def get_job_features(occupation):
    for key in job_attributes:
        if key.lower() in occupation.lower():
            return job_attributes[key]
    return default_attributes  # Assign default values if no match found


In [5]:
# Apply job features to dataset
job_features = df['big_occupation'].apply(get_job_features).apply(pd.Series)
job_features


Unnamed: 0,rigidity,fixed_schedule,income_level,stability
0,1,0,1,1
1,4,1,5,5
2,3,1,3,3
3,1,0,1,1
4,3,1,3,3
...,...,...,...,...
295,4,1,5,5
296,1,0,1,2
297,3,1,3,3
298,5,1,5,5


In [6]:
df = pd.concat([df, job_features], axis=1)

In [7]:
# some features engineering
df["big_age_match_start"] = abs(df['big_birthdate'].dt.year - df["match_activation_date"].dt.year)
df["little_age_match_start"] = abs(df['little_birthdate'].dt.year - df["match_activation_date"].dt.year)
df["same_gender"] = df['little_gender']==df['big_gender']



In [8]:
df.drop([
    "big_occupation", "big_gender", "little_gender", "big_birthdate",
    "match_activation_date", "little_birthdate", "late_stage_notes", "early_stage_notes",
    "program", 
], axis=1, inplace=True)

In [9]:
# Tokenize and compute Jaccard Similarity (intersection / union)
def jaccard_similarity(str1, str2):
    set1 = set(str(str1).lower().split(' '))
    set2 = set(str(str2).lower().split(' '))
    intersection = set1.intersection(set2)
    union = set1.union(set2)
    return len(intersection) / len(union)

df['race_similarity'] = df.apply(lambda row: jaccard_similarity(row['big_race_ethnicity'], row['little_participant__race_ethnicity']), axis=1)


In [10]:
df

Unnamed: 0,index,match_id,big_age,program_type,big_race_ethnicity,rationale_for_match,little_participant__race_ethnicity,avg_cadence_day,max_cadence_day,std,...,sentiment_change,sentiment_trend,rigidity,fixed_schedule,income_level,stability,big_age_match_start,little_age_match_start,same_gender,race_similarity
0,0,a1v2J0000027CXKQA2,25,Site,Asian;,Both are male and Hmong. They share similar in...,Asian,15.000000,30,21.213203,...,0.2407,Stable,1,0,1,1,18,11,True,0.000000
1,1,a1v2J0000027JFCQA2,38,Community,White or Caucasian;,L_first_name and B_first_name were matched bec...,Black or African American,76.363636,156,48.380312,...,0.0003,Stable,4,1,5,5,32,12,True,0.166667
2,2,a1v2J0000027KBoQAM,37,Site Based Facilitated,White or Caucasian;,L_first_name was really wanting to be rematche...,Black or African American,37.000000,82,24.951381,...,0.0016,Stable,3,1,3,3,31,14,True,0.166667
3,3,a1v2J0000027KCEQA2,24,Site,Asian;,"BS is a leader, positive, experienced, and smi...",Asian,35.400000,61,24.449949,...,0.0029,Stable,1,0,1,1,17,10,True,0.000000
4,4,a1v2J0000027KCbQAM,31,Site,White or Caucasian;,Conor was open to who he worked with and I tho...,Black or African American,43.857143,90,27.853357,...,-0.0004,Stable,3,1,3,3,23,11,True,0.166667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,295,a1vHt000005BXMnIAO,47,Site Based Facilitated,White or Caucasian;,B_first_name and L_first_name share the intere...,Black or African American,24.500000,49,34.648232,...,0.0033,Stable,4,1,5,5,46,12,True,0.166667
296,296,a1vUX0000009FvtYAE,23,Site,White or Caucasian;,B_first_name and L_first_name both appreciate ...,Other,23.600000,36,14.536162,...,0.0047,Stable,1,0,1,2,22,11,False,0.000000
297,297,a1vUX000000DcnRYAS,47,Community,White or Caucasian;,1. Distance: 10 miles (14 mins) 2. Shared Trai...,Black or African American,19.750000,38,17.366155,...,0.0090,Stable,3,1,3,3,47,10,True,0.166667
298,298,a1vUX000000U9QrYAK,47,Community,Asian - Other;,Distance: 9 min (5.2 mi) Shared Interests: spo...,Black or African American,24.500000,35,16.663333,...,0.0009,Stable,5,1,5,5,47,13,True,0.000000


In [11]:
df.to_excel("data/encoding.xlsx", index=False)