In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
df = pd.read_excel("data/feng.xlsx")

In [3]:
# Comprehensive Job Mapping (Based on provided list)
job_attributes = {
    # Students
    "Student: High School": {"rigidity": 1, "fixed_schedule": 0, "income_level": 1, "stability": 1},
    "Student: College": {"rigidity": 1, "fixed_schedule": 0, "income_level": 1, "stability": 2},

    # Medical Professions
    "Medical: Nurse": {"rigidity": 5, "fixed_schedule": 1, "income_level": 4, "stability": 5},
    "Medical: Doctor, Provider": {"rigidity": 5, "fixed_schedule": 1, "income_level": 5, "stability": 5},
    "Medical: Pharmacist": {"rigidity": 5, "fixed_schedule": 1, "income_level": 4, "stability": 5},
    "Medical: Admin": {"rigidity": 4, "fixed_schedule": 1, "income_level": 3, "stability": 5},
    "Medical: Healthcare Worker": {"rigidity": 5, "fixed_schedule": 1, "income_level": 3, "stability": 4},

    # Business & Finance
    "Business: Marketing": {"rigidity": 3, "fixed_schedule": 1, "income_level": 4, "stability": 4},
    "Business: Sales": {"rigidity": 3, "fixed_schedule": 1, "income_level": 3, "stability": 4},
    "Business: Mgt, Admin": {"rigidity": 4, "fixed_schedule": 1, "income_level": 5, "stability": 5},
    "Finance": {"rigidity": 4, "fixed_schedule": 1, "income_level": 5, "stability": 5},
    "Finance: Accountant": {"rigidity": 4, "fixed_schedule": 1, "income_level": 5, "stability": 5},
    "Finance: Banking": {"rigidity": 4, "fixed_schedule": 1, "income_level": 5, "stability": 5},

    # Self-Employed
    "Self-Employed, Entrepreneur": {"rigidity": 1, "fixed_schedule": 0, "income_level": 3, "stability": 2},
    
    # Law Enforcement
    "Law: Police Officer": {"rigidity": 5, "fixed_schedule": 1, "income_level": 4, "stability": 5},
    "Law: Lawyer": {"rigidity": 5, "fixed_schedule": 1, "income_level": 5, "stability": 5},
    "Law: Paralegal": {"rigidity": 4, "fixed_schedule": 1, "income_level": 4, "stability": 4},
    "Law: Security Officer": {"rigidity": 4, "fixed_schedule": 1, "income_level": 3, "stability": 3},

    # Retail & Hospitality
    "Retail: Sales": {"rigidity": 2, "fixed_schedule": 0, "income_level": 2, "stability": 2},
    "Retail: Mgt": {"rigidity": 3, "fixed_schedule": 1, "income_level": 3, "stability": 4},
    "Service: Restaurant": {"rigidity": 2, "fixed_schedule": 0, "income_level": 2, "stability": 2},
    "Service: Hotel": {"rigidity": 2, "fixed_schedule": 0, "income_level": 2, "stability": 2},
    
    # Education
    "Education: Teacher": {"rigidity": 4, "fixed_schedule": 1, "income_level": 3, "stability": 5},
    "Education: College Professor": {"rigidity": 4, "fixed_schedule": 1, "income_level": 5, "stability": 5},
    "Education: Teacher Asst/Aid": {"rigidity": 3, "fixed_schedule": 1, "income_level": 2, "stability": 4},

    # Tech & Engineering
    "Tech: Computer/Programmer": {"rigidity": 3, "fixed_schedule": 1, "income_level": 5, "stability": 5},
    "Tech: Engineer": {"rigidity": 4, "fixed_schedule": 1, "income_level": 5, "stability": 5},
    "Tech: Sales, Mktg": {"rigidity": 3, "fixed_schedule": 1, "income_level": 4, "stability": 4},

    # Others
    "Consultant": {"rigidity": 2, "fixed_schedule": 0, "income_level": 4, "stability": 4},
    "Transport: Pilot": {"rigidity": 5, "fixed_schedule": 1, "income_level": 5, "stability": 5},
    "Firefighter": {"rigidity": 5, "fixed_schedule": 1, "income_level": 4, "stability": 5},
    "Journalist/Media": {"rigidity": 3, "fixed_schedule": 1, "income_level": 4, "stability": 4},
    "Agriculture": {"rigidity": 2, "fixed_schedule": 0, "income_level": 2, "stability": 3},
    "Military": {"rigidity": 5, "fixed_schedule": 1, "income_level": 4, "stability": 5},
    "Unemployed": {"rigidity": 1, "fixed_schedule": 0, "income_level": 1, "stability": 1},
    "Retired": {"rigidity": 1, "fixed_schedule": 0, "income_level": 2, "stability": 5},
}

# Default category for unknown occupations
default_attributes = {"rigidity": 3, "fixed_schedule": 1, "income_level": 3, "stability": 3}


In [4]:
# Function to match occupation and assign attributes
def get_job_features(occupation):
    for key in job_attributes:
        if key.lower() in occupation.lower():
            return job_attributes[key]
    return default_attributes  # Assign default values if no match found


In [5]:
# Apply job features to dataset
job_features = df['big_occupation'].apply(get_job_features).apply(pd.Series)
job_features


Unnamed: 0,rigidity,fixed_schedule,income_level,stability
0,1,0,1,1
1,1,0,1,2
2,1,0,1,1
3,1,0,1,1
4,1,0,1,2
...,...,...,...,...
3259,5,1,4,5
3260,3,1,4,4
3261,3,1,4,4
3262,3,1,3,4


In [6]:
df = pd.concat([df, job_features], axis=1)

In [7]:
# some features engineering
df["big_age_match_start"] = abs(df['big_birthdate'].dt.year - df["match_activation_date"].dt.year)
df["little_age_match_start"] = abs(df['little_birthdate'].dt.year - df["match_activation_date"].dt.year)
df["same_gender"] = df['little_gender']==df['big_gender']



In [8]:
df.drop([
    "big_occupation", "big_gender", "little_gender", "big_birthdate",
    "match_activation_date", "little_birthdate", "late_stage_notes", "early_stage_notes",
    "program", 
], axis=1, inplace=True)

In [9]:
# Tokenize and compute Jaccard Similarity (intersection / union)
def jaccard_similarity(str1, str2):
    set1 = set(str(str1).lower().split(' '))
    set2 = set(str(str2).lower().split(' '))
    intersection = set1.intersection(set2)
    union = set1.union(set2)
    return len(intersection) / len(union)

df['race_similarity'] = df.apply(lambda row: jaccard_similarity(row['big_race_ethnicity'], row['little_participant__race_ethnicity']), axis=1)


In [10]:
df

Unnamed: 0,index,match_id,big_age,program_type,big_race_ethnicity,rationale_for_match,little_participant__race_ethnicity,match_length,avg_cadence_day,max_cadence_day,...,sentiment_change,sentiment_trend,rigidity,fixed_schedule,income_level,stability,big_age_match_start,little_age_match_start,same_gender,race_similarity
0,0,a1v2J0000027CWYQA2,25,Site,Asian;,"Big, little and parent were in agreement with ...",Asian,5.5,35.000000,56,...,0.2265,Stable,1,0,1,1,18,12,True,0.000000
1,1,a1v2J0000027CWfQAM,26,Site,White or Caucasian;,"Both seem to like the arts, books, and present...",Black or African American,8.5,46.200000,83,...,-0.0037,Stable,1,0,1,2,19,11,True,0.166667
2,2,a1v2J0000027CWiQAM,27,Site,Asian;,Both BS and LB share similar interests. BS has...,Asian,6.9,35.333333,65,...,0.0330,Stable,1,0,1,1,19,12,True,0.000000
3,3,a1v2J0000027CWoQAM,25,Site,Asian;,B_first_name was open to the little that he is...,Asian,7.2,43.750000,106,...,-0.5276,Declined,1,0,1,1,18,8,True,0.000000
4,4,a1v2J0000027CWpQAM,27,Site,White or Caucasian;,Distance is 8 miles (21 mins). Both are talkti...,Black or African American,7.4,45.750000,85,...,-0.0059,Stable,1,0,1,2,20,11,True,0.166667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3259,3259,a1vUX000001ZI7xYAG,38,Community,White or Caucasian;,Distance: 13 min (4.6 mi) Shared Interests: sp...,Black or African American,2.4,11.666667,28,...,0.0009,Stable,5,1,4,5,37,13,True,0.166667
3260,3260,a1vUX000001ZICnYAO,56,Community,White or Caucasian;,1. Distance: 10 miles (20 mins) 2. Shared Trai...,White or Caucasian,2.3,0.000000,0,...,-0.9954,Declined,3,1,4,4,55,12,True,0.500000
3261,3261,a1vUX000001agjJYAQ,33,Community,Asian - Other;,1. Distance: 9 miles (20 mins) 2. Shared Trait...,Black or African American,1.1,0.000000,0,...,-0.9981,Declined,3,1,4,4,33,10,True,0.000000
3262,3262,a1vUX000001avDhYAI,48,Community,White or Caucasian;,Distance: 9 min (3.7 mi) Shared interests:Spor...,Black or African American,2.3,0.000000,0,...,-0.9698,Declined,3,1,3,4,48,11,True,0.166667


In [11]:
df.to_excel("data/encoding.xlsx", index=False)