# Process data to be compatible with Mplus/LCA
- extract only indicators/covariate variables needed for LCA
- convert all data to either be continuous, integers, or binary (0/1)


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the person-level input table from the data_outputs directory.
person_data = pd.read_csv(filepath_or_buffer="data_outputs/final_person_data.csv")

In [None]:
# Require a non-missing value in each of these columns; a row missing any
# one of them is removed.
person_clean = person_data.dropna(
    subset=[
        "med_price",
        "med_rating",
        "sample_county",
        "employment",
        "diversity_food_cat",
        "pref_modes",
        "med_trav_time",
        "med_rest_inc",
        "rest:non_rest",
        "meal",
    ]
)

# Remove respondents whose detailed household income is "Prefer not to answer".
person_clean = person_clean.loc[
    person_clean["hhincome_detailed"].ne("Prefer not to answer")
]

# Keep only strictly positive median travel times (zero and negative values
# are both removed).
person_clean = person_clean.loc[person_clean["med_trav_time"].gt(0)]
# Row count at this point, i.e. before the mode filter below.
print(len(person_clean))

# Remove respondents whose preferred mode is Bike or Other
# (sample size too small).
person_clean = person_clean.loc[~person_clean["pref_modes"].isin(["Bike", "Other"])]

In [None]:
# Remove the "Unnamed: 0" column, which none of the later cells read.
person_clean = person_clean.drop(columns=["Unnamed: 0"])
person_clean

# Indicators

In [None]:
# Start the LCA table with the respondent id and the indicator columns that
# are carried over from person_clean unchanged.
# .copy() makes person_r an independent frame: new columns are assigned to
# it in the following cells, and assigning into a selection taken from
# person_clean raises SettingWithCopyWarning.
person_r = person_clean[["person_id", "pct_fast_food", "med_price", "med_rating",
                         "diversity_food_cat"]].copy()
person_r

In [None]:
# travel behavior
# Integer codes for the categorical travel-behavior indicators.
# "Other" and "Bike" share code 4.
mode_dict = {"SOV": 1, "Walk": 2, "Transit": 3, "Other": 4, "Bike":4, "HOV":5}
# Escort shares code 3 with Errand/Other; Work-related and School share
# code 4 with Work. Codes 8-10 (Meal, Change mode, non-response) are removed
# by the filter at the end of this cell.
purp_dict = {'Shop':1, 'Home':2, 'Errand/Other':3, 'Work':4, 'Escort':3,
       'Social/Recreation':5, 'Work-related':4, 'School':4, 'Meal':8,
       'Change mode':9, 'Missing: Non-response':10}
tod_dict = {"Breakfast": 1, "Lunch": 2, "Other":3, "Dinner": 4}

# Assign one column at a time: each Series is aligned to person_r by row
# index, and this form does not depend on how a given pandas version handles
# assigning a DataFrame to several new columns at once.
person_r["travel_time"] = person_clean["med_trav_time"]
person_r["rest_nonrest_trips"] = person_clean["rest:non_rest"]

# Dictionary indexing (rather than Series.map(dict)) raises KeyError on a
# label that is missing from the mapping instead of silently producing NaN.
person_r["mode"] = person_clean["pref_modes"].apply(lambda x: mode_dict[x])
person_r["trip_purp"] = person_clean["nonrest_purp"].apply(lambda x: purp_dict[x])
person_r["tod"] = person_clean["meal"].apply(lambda x: tod_dict[x])

# Keep only rows whose purpose code is below 7. .copy() gives later cells an
# independent frame to assign into rather than a filtered slice, which
# avoids SettingWithCopyWarning.
person_r = person_r[person_r["trip_purp"] < 7].copy()
person_r

# Covariates

In [None]:
# built environment
# Covariates copied from person_clean (aligned on the row index);
# med_rest_inc is stored under the shorter name rest_inc.
person_r["rest_empl_pop"] = person_clean["rest_empl_pop"]
person_r["rest_inc"] = person_clean["med_rest_inc"]
person_r["dist_from_home"] = person_clean["dist_from_home"]
# Binary flag: 0 where home_fd equals False, 1 for every other value.
# NOTE(review): a missing home_fd therefore becomes 1 - confirm this is intended.
person_r["home_fd"] = person_clean["home_fd"].ne(False).astype("int64")
person_r

In [None]:
# sociodems
# Each dictionary collapses the detailed survey labels of one variable into
# a small set of integer-coded groups.
age_dict = {'Under 5 years old':1, '5-11 years':1, '12-15 years':1, '16-17 years':2,
            '18-24 years':2, '25-34 years':3,'35-44 years':3,'45-54 years':4,'55-64 years':4, 
            '65-74 years':5, '75-84 years':5, '85 or years older':5}
gender_dict = {'Male':1, 'Female':2, 'Non-Binary':3, 'Not listed here / prefer not to answer':3, 
               'Prefer not to answer':3, 'Another':3}
race_dict = {'White Only':1, 'Asian':2, 'Hispanic':3, 'African American':4,'Other':5, 
               'Missing':5}
income_dict = {'Under $10,000':1, '$10,000-$24,999':1,'$25,000-$34,999':1,'$35,000-$49,999':2,
               '$50,000-$74,999':2,'$75,000-$99,999':3, '$100,000-$149,999':3, 
               '$150,000-$199,999':4, '$200,000-$249,999':4, '$250,000 or more':4}
job_dict = {'Employed full time (35+ hours/week, paid)':1, 
            'Employed part time (fewer than 35 hours/week, paid)':2, 'Homemaker':3,
            'Retired':4,'Employed but not currently working (e.g., on leave, furloughed 100%)':4,
            'Not currently employed':4, 'Missing: Skip Logic':4, 'Self-employed':4,
            'Unpaid volunteer or intern':4}


def _leading_int(labels):
    """Return, for each label in a Series, the integer formed by its leading digits.

    A label that does not start with a digit yields a missing value, which
    makes the integer cast raise rather than pass through silently.
    """
    return labels.str.extract(r"^(\d+)", expand=False).astype(int)


person_r["numchildren"] = person_clean["numchildren"]
# Dictionary indexing raises KeyError on any label missing from a mapping.
person_r["age"] = person_clean["age"].apply(lambda x: age_dict[x])
person_r["gender"] = person_clean["gender"].apply(lambda x: gender_dict[x])
person_r["race"] = person_clean["hh_race_category"].apply(lambda x: race_dict[x])
person_r["income"] = person_clean["hhincome_detailed"].apply(lambda x: income_dict[x])
person_r["job"] = person_clean["employment"].apply(lambda x: job_dict[x])

# Vehicles per household member. The counts are parsed from the whole
# leading run of digits: taking only the first character (x[:1]) would read
# any count of 10 or more as 1.
# NOTE(review): assumes both columns hold strings that begin with the count
# - confirm against the survey codebook.
person_r["veh_hh_ratio"] = (
    _leading_int(person_clean["vehicle_count"]) / _leading_int(person_clean["hhsize"])
)
person_r

# Turn all categorical covariates into binary variables

In [None]:
# One-hot encode the integer-coded categorical covariates as 0/1 integer
# columns.
categorical_covariates = ["age", "gender", "race", "income", "job"]
lca_data = pd.get_dummies(person_r, columns=categorical_covariates, dtype=int)

# Give the gender, race and job dummies descriptive names; the age_* and
# income_* dummies keep their generated names.
descriptive_names = {
    "gender_1": "male",
    "gender_2": "female",
    "gender_3": "gender_other",
    "race_1": "white",
    "race_2": "asian",
    "race_3": "hispanic",
    "race_4": "black",
    "race_5": "race_other",
    "job_1": "full_time",
    "job_2": "part_time",
    "job_3": "homemaker",
    "job_4": "unemployed",
}
lca_data = lca_data.rename(columns=descriptive_names)

In [None]:
# Write the LCA-ready table to disk with the default to_csv settings, so the
# row index is written as the first column.
lca_data.to_csv(path_or_buf="data_outputs/lca_data.csv")