# Clean up for HCMST_2017-2022.csv

## Loading CSV

Load the necessary libraries for the clean-up, as well as the provided dataset

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("../data/raw/how-couples-meet-and-stay-together/HCMST_2017-2022.csv")
df.head()

Unnamed: 0,caseid_new,w3_Weight,w3_Weight_LGB,w3_combo_weight,w3_attrition_adj_weight,w2_weight_genpop,w2_weight_LGB,w2_combo_weight,w2_attrition_adj_weights,w1_weight_combo,...,p20_pppa1634,p20_pppa1902,p20_pppa1903,p20_pppa1904,p20_ppp22001,p20_pppa1905,p20_pppa1648,p20_ppp20072,p20_ppp20071,p20_ppp2date2020
0,53001,0.4422,,0.495308,0.400185,0.3856,,0.43767,0.380351,0.426861,...,2.0,0.0,1.0,0.0,0.0,0.0,13.0,6.0,,20210506.0
1,71609,0.8284,,0.927891,0.879258,0.9196,,1.043778,0.953948,1.295508,...,2.0,0.0,1.0,0.0,0.0,0.0,2.0,5.0,1.0,20201118.0
2,106983,0.8255,,0.924643,0.706467,0.7748,,0.879425,0.724682,1.126573,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,4.0,2.0,20210429.0
3,121759,,,,,0.9177,,1.041622,0.793093,0.93344,...,1.0,1.0,0.0,0.0,0.0,0.0,11.0,2.0,1.0,20210507.0
4,158083,0.881,,0.986809,0.655467,0.8697,,0.98714,0.735473,0.931291,...,1.0,1.0,0.0,0.0,0.0,0.0,13.0,6.0,,20210602.0


In [3]:
df.shape

(3510, 725)

## Full List of Variables 

The next cell displays the full list of variables in the original data set. We are not going to use everything...

We'll focus on variables of interest for specific inquiries.

In the following links, we can find the encoding of each variable.

Data source: https://data.stanford.edu/hcmst2017

Variable List per year (Note w1 = 2017, w2 = 2020, w3 = 2022): https://stacks.stanford.edu/file/druid:tq903pj6286/HCMST%202017-%202022%20user%27s%20guide%20v2.3.pdf

Detailed Info on variables: https://stacks.stanford.edu/file/druid:hg921sg6829/HCMST%202017%20to%202022%20v2.2%20codebook.pdf

In [4]:
#df.columns.tolist()

## Variables of Interest

The following lists are not actually used in the rest of the notebook for data cleaning purposes. 

Still, they display the selected variables of interest (and how their names vary from year to year), so, we are leaving them in the notebook for documentation.

In [5]:
# ppage = Age
# ppeduc = Education (Highest Degree Received)
# ppgender = Gender
# ppethm =  Race / Ethnicity 
# ppincimp = Household Income
# ppwork = Current Employment Status 

subject_demographics_2017 = [
    'w1_ppage', 'w1_ppeduc', 'w1_ppgender', 'w1_ppethm', 'w1_ppincimp', 'w1_ppwork'
]
subject_demographics_2020 = [
    'w2_ppage', 'w2_ppeduc', 'w2_ppgender', 'w2_ppethm', 'w2_ppincimp', 'w2_ppwork'
]
subject_demographics_2022 = [
    'w3_ppage', 'w3_ppeduc', 'w3_ppgender', 'w3_ppethm', 'w3_ppincimp', 'w3_ppwork'
]

In [6]:
# q4 = Partner Gender
# q9 = Partner Age
# q6b = Partner's Race
# q10 = Partner's Education (Highest Degree Received)

partner_demographics_2017 = [
    'w1_q4', 'w1_q9', 'w1_q6b', 'w1_q10'
]
partner_demographics_2020 = [
    'w2_Q4', 'w2_Q9', 'w2_Q6B', 'w2_Q10'
]
partner_demographics_2022 = [
    'w3_Q4', 'w3_Q9', 'w3_Q6B', 'w3_Q10'
]

In [7]:
# same_sex_couple = same-sex couple
# married = Married
# q34/rel_qual_combo/rel_qual = Relationship quality 1 (Excellent) to 5 (Very Poor) - Potential Target

relationship_status_2017 = [
    'w1_same_sex_couple', 'w1_married', 'w1_q34'
]
relationship_status_2020 = [
    'w2_same_sex_couple', 'w2_married', 'w2_rel_qual_combo'
]
relationship_status_2022 = [
    'w3_same_sex_couple', 'w3_married', 'w3_rel_qual'
]

In [8]:
# q21b - Year current relationship started
# q21d - Year of marriage
# q21e - Year relationship ended
# relate/relationship duration = Duration of relationship in years

relationship_time_2017 = [
    'w1_q21b_year', 'w1_q21d_year', 'w1_q21e_year', 'w1_relate_duration_in2017_years'
]
relationship_time_2020 = [
    'w2_q21b_year', 'w2_q21d_year', 'w2_q21e_year', 'w2_relationship_duration'
]
relationship_time_2022 = [
    'w3_Q21B_year', 'w3_Q21D_year', 'w3_Q21E_year', 'w3_relationship_duration_yrs'
]

In [9]:
# PPT01 = # of children in the household ages 0-1
# PPT25 = # of children in the household ages 2-5
# PPT612 = # of children in the household ages 6-12
# PPT1317 = # of children in the household ages 13-17
# PPT18OV = # of children in the household ages 18-Over

children_info_2017 = [
    'w1_PPT01', 'w1_PPT25', 'w1_PPT612', 'w1_PPT1317', 'w1_PPT18OV'
]
children_info_2020 = [
    'w2_PPT01', 'w2_PPT25', 'w2_PPT612', 'w2_PPT1317', 'w2_PPT18OV'
]
children_info_2022 = [
    'w3_PPT01', 'w3_PPT25', 'w3_PPT612', 'w3_PPT1317', 'w3_PPT18OV'
]

In [10]:
# Only available in 2022 in full (Post-COVID)

# coronavirus_effect_combo = Is relationship better or worse during pandemic
# pandemic_income = has income gone up or down during pandemic
# subject/partner_had_COVID = has been sick with COVID
# corona/partner_vaccine = has been vaccinated
# COVID_agreement = subject and partner agree on approach to pandemic

covid_vars_2022 = [
    'w3_coronavirus_effect_combo', 'w3_pandemic_income', 'w3_subject_had_COVID', 'w3_partner_had_COVID', 'w3_corona_vaccine',
    'w3_partner_corona_vaccine', 'w3_COVID_agreement'
]

In [11]:
# Only some variables available per year

# sex_frequency = Frequency of sex
# flirt = how often flirt
# fight = how often fight
# monogamy = subject's commitment to monogamy
# p_monogamy = expected commitment to monogamy from partner

relationship_quality_2017 = [
    'w1_sex_frequency'
]
relationship_quality_2020 = [
    'w2_sex_frequency', 'w2_flirt', 'w2_fight'
]
relationship_quality_2022 = [
    'w3_sex_frequency', 'w3_flirt', 'w3_fight'
]

## Rename Columns

Some variables contain the same information but vary in name year to year. So, we'll need 3 renaming dictionaries (1 per year)

First, the 3 renaming maps (one per year):

In [12]:
column_renames_2017 = {
    #subject_demographics
    'w1_ppage': 'subject_age',
    'w1_ppeduc': 'subject_education',
    'w1_ppgender': 'subject_gender',
    'w1_ppethm': 'subject_ethnicity',
    'w1_ppincimp': 'subject_income_category',
    'w1_ppwork': 'subject_employment_status',
    #partner_demographics
    'w1_q4': 'partner_gender',
    'w1_q9': 'partner_age',
    'w1_q6b': 'partner_ethnicity',
    'w1_q10': 'partner_education',
    #relationship_status
    'w1_same_sex_couple': 'same_sex_couple',
    'w1_married': 'married',
    'w1_q34': 'relationship_quality',
    #relationship_time
    'w1_q21b_year': 'relationship_start_year',
    'w1_q21d_year': 'marriage_year',
    'w1_q21e_year': 'break_up_year',
    'w1_relate_duration_in2017_years': 'relationship_duration',
    #children_info
    'w1_PPT01': 'kids_0_1',
    'w1_PPT25': 'kids_2_5',
    'w1_PPT612': 'kids_6_12',
    'w1_PPT1317': 'kids_13_17',
    'w1_PPT18OV': 'kids_18_plus',
    #covidvars
    #'w1_coronavirus_effect_combo': 'rel_change_during_pandemic',
    #'w1_pandemic_income': 'inc_change_during_pandemic',
    #'w1_subject_had_COVID': 'subject_had_covid', 
    #'w1_partner_had_COVID': 'partner_had_covid', 
    #'w1_corona_vaccine': 'subject_vaccinated',
    #'w1_partner_corona_vaccine': 'partner_vaccinated', 
    #'w1_COVID_agreement': 'agree_covid_approach',
    #relationship_quality
    'w1_sex_frequency': 'sex_frequency',
    #'w3_flirt': 'flirts_with_partner',
    #'w3_fight': 'fights_with_partner',
}

In [13]:
column_renames_2020 = {
    #subject_demographics
    'w2_ppage': 'subject_age',
    'w2_ppeduc': 'subject_education',
    'w2_ppgender': 'subject_gender',
    'w2_ppethm': 'subject_ethnicity',
    'w2_ppincimp': 'subject_income_category',
    'w2_ppwork': 'subject_employment_status',
    #partner_demographics
    'w2_Q4': 'partner_gender',
    'w2_Q9': 'partner_age',
    'w2_Q6B': 'partner_ethnicity',
    'w2_Q10': 'partner_education',
    #relationship_status
    'w2_same_sex_couple': 'same_sex_couple',
    'w2_married': 'married',
    'w2_rel_qual_combo': 'relationship_quality',
    #relationship_time
    'w2_q21b_year': 'relationship_start_year',
    'w2_q21d_year': 'marriage_year',
    'w2_q21e_year': 'break_up_year',
    'w2_relationship_duration': 'relationship_duration',
    #children_info
    'w2_PPT01': 'kids_0_1',
    'w2_PPT25': 'kids_2_5',
    'w2_PPT612': 'kids_6_12',
    'w2_PPT1317': 'kids_13_17',
    'w2_PPT18OV': 'kids_18_plus',
    #covidvars
    #'w2_coronavirus_effect_combo': 'rel_change_during_pandemic',
    #'w2_pandemic_income': 'inc_change_during_pandemic',
    #'w2_subject_had_COVID': 'subject_had_covid', 
    #'w2_partner_had_COVID': 'partner_had_covid', 
    #'w2_corona_vaccine': 'subject_vaccinated',
    #'w2_partner_corona_vaccine': 'partner_vaccinated', 
    #'w2_COVID_agreement': 'agree_covid_approach',
    #relationship_quality
    'w2_sex_frequency': 'sex_frequency',
    'w2_flirt': 'flirts_with_partner',
    'w2_fight': 'fights_with_partner',
}

In [14]:
column_renames_2022 = {
    #subject_demographics
    'w3_ppage': 'subject_age',
    'w3_ppeduc': 'subject_education',
    'w3_ppgender': 'subject_gender',
    'w3_ppethm': 'subject_ethnicity',
    'w3_ppincimp': 'subject_income_category',
    'w3_ppwork': 'subject_employment_status',
    #partner_demographics
    'w3_Q4': 'partner_gender',
    'w3_Q9': 'partner_age',
    'w3_Q6B': 'partner_ethnicity',
    'w3_Q10': 'partner_education',
    #relationship_status
    'w3_same_sex_couple': 'same_sex_couple',
    'w3_married': 'married',
    'w3_rel_qual': 'relationship_quality',
    #relationship_time
    'w3_Q21B_year': 'relationship_start_year',
    'w3_Q21D_year': 'marriage_year',
    'w3_Q21E_year': 'break_up_year',
    'w3_relationship_duration_yrs': 'relationship_duration',
    #children_info
    'w3_PPT01': 'kids_0_1',
    'w3_PPT25': 'kids_2_5',
    'w3_PPT612': 'kids_6_12',
    'w3_PPT1317': 'kids_13_17',
    'w3_PPT18OV': 'kids_18_plus',
    #covidvars
    'w3_coronavirus_effect_combo': 'rel_change_during_pandemic',
    'w3_pandemic_income': 'inc_change_during_pandemic',
    'w3_subject_had_COVID': 'subject_had_covid', 
    'w3_partner_had_COVID': 'partner_had_covid', 
    'w3_corona_vaccine': 'subject_vaccinated',
    'w3_partner_corona_vaccine': 'partner_vaccinated', 
    'w3_COVID_agreement': 'agree_covid_approach',
    #relationship_quality
    'w3_sex_frequency': 'sex_frequency',
    'w3_flirt': 'flirts_with_partner',
    'w3_fight': 'fights_with_partner',
}

### Dataframe per year

We need to create dataframes per year while renaming variables in each dictionary.

We will also create a column named "Wave" that contains the year of the survey.

First, 2017:

In [15]:
df_2017 = df[['caseid_new'] + list(column_renames_2017.keys())].rename(columns=column_renames_2017)
df_2017['wave'] = '2017'
df_2017.head()

Unnamed: 0,caseid_new,subject_age,subject_education,subject_gender,subject_ethnicity,subject_income_category,subject_employment_status,partner_gender,partner_age,partner_ethnicity,...,marriage_year,break_up_year,relationship_duration,kids_0_1,kids_2_5,kids_6_12,kids_13_17,kids_18_plus,sex_frequency,wave
0,53001,48,9,2,5,13,1,1.0,46.0,1.0,...,2014.0,,3.583333,0,0,0,0,1,3.0,2017
1,71609,68,10,2,1,12,1,1.0,71.0,1.0,...,1969.0,,52.75,0,0,0,0,2,5.0,2017
2,106983,39,11,1,1,15,1,2.0,49.0,1.0,...,2002.0,,17.583334,0,0,2,0,3,3.0,2017
3,121759,54,9,1,1,16,1,2.0,59.0,4.0,...,1991.0,,27.416666,0,0,0,0,4,4.0,2017
4,158083,48,10,1,1,14,1,2.0,34.0,1.0,...,2013.0,2014.0,,0,0,0,0,1,,2017


In [16]:
df_2017.shape

(3510, 25)

Then, 2020:

In [17]:
df_2020 = df[['caseid_new'] + list(column_renames_2020.keys())].rename(columns=column_renames_2020)
df_2020['wave'] = '2020'
df_2020.head()

Unnamed: 0,caseid_new,subject_age,subject_education,subject_gender,subject_ethnicity,subject_income_category,subject_employment_status,partner_gender,partner_age,partner_ethnicity,...,relationship_duration,kids_0_1,kids_2_5,kids_6_12,kids_13_17,kids_18_plus,sex_frequency,flirts_with_partner,fights_with_partner,wave
0,53001,51.0,9.0,2.0,1.0,10.0,1.0,1.0,51.0,1.0,...,0.0,0.0,0.0,0.0,0.0,2.0,5.0,6.0,1.0,2020
1,71609,71.0,10.0,2.0,1.0,13.0,1.0,,,,...,56.0,0.0,0.0,0.0,0.0,1.0,5.0,6.0,4.0,2020
2,106983,42.0,11.0,1.0,1.0,15.0,1.0,,,,...,21.0,0.0,0.0,1.0,1.0,3.0,3.0,2.0,3.0,2020
3,121759,57.0,9.0,1.0,1.0,16.0,1.0,,,,...,31.0,0.0,0.0,0.0,0.0,2.0,4.0,6.0,1.0,2020
4,158083,52.0,10.0,1.0,1.0,18.0,1.0,,,,...,,0.0,0.0,0.0,0.0,3.0,,,,2020


In [18]:
df_2020.shape

(3510, 27)

Then, 2022:

In [19]:
df_2022 = df[['caseid_new'] + list(column_renames_2022.keys())].rename(columns=column_renames_2022)
df_2022['wave'] = '2022'
df_2022.head()

Unnamed: 0,caseid_new,subject_age,subject_education,subject_gender,subject_ethnicity,subject_income_category,subject_employment_status,partner_gender,partner_age,partner_ethnicity,...,inc_change_during_pandemic,subject_had_covid,partner_had_covid,subject_vaccinated,partner_vaccinated,agree_covid_approach,sex_frequency,flirts_with_partner,fights_with_partner,wave
0,53001,53.0,9.0,2.0,1.0,10.0,1.0,,,,...,3.0,0.0,1.0,4.0,4.0,1.0,3.0,2.0,1.0,2022
1,71609,72.0,10.0,2.0,1.0,14.0,1.0,,,,...,2.0,0.0,0.0,1.0,1.0,2.0,5.0,6.0,8.0,2022
2,106983,43.0,11.0,1.0,1.0,14.0,1.0,,,,...,2.0,0.0,0.0,1.0,1.0,1.0,3.0,2.0,3.0,2022
3,121759,,,,,,,,,,...,,,,,,,,,,2022
4,158083,53.0,10.0,1.0,1.0,18.0,1.0,,,,...,3.0,0.0,,1.0,,,,,,2022


In [20]:
df_2022.shape

(3510, 34)

## A single Dataframe

Now, we can concatenate the 3 dataframes since they have the same variables. We'll have the "wave" column to differentiate by year.

In [21]:
df_cleaned = pd.concat([df_2017, df_2020, df_2022], ignore_index=True)

In [22]:
df_cleaned.shape

(10530, 34)

In [23]:
df_cleaned.head()

Unnamed: 0,caseid_new,subject_age,subject_education,subject_gender,subject_ethnicity,subject_income_category,subject_employment_status,partner_gender,partner_age,partner_ethnicity,...,wave,flirts_with_partner,fights_with_partner,rel_change_during_pandemic,inc_change_during_pandemic,subject_had_covid,partner_had_covid,subject_vaccinated,partner_vaccinated,agree_covid_approach
0,53001,48.0,9.0,2.0,5.0,13.0,1.0,1.0,46.0,1.0,...,2017,,,,,,,,,
1,71609,68.0,10.0,2.0,1.0,12.0,1.0,1.0,71.0,1.0,...,2017,,,,,,,,,
2,106983,39.0,11.0,1.0,1.0,15.0,1.0,2.0,49.0,1.0,...,2017,,,,,,,,,
3,121759,54.0,9.0,1.0,1.0,16.0,1.0,2.0,59.0,4.0,...,2017,,,,,,,,,
4,158083,48.0,10.0,1.0,1.0,14.0,1.0,2.0,34.0,1.0,...,2017,,,,,,,,,


We are just going to rename the 'caseid_new' to something more simple:

In [24]:
df_cleaned = df_cleaned.rename(columns={'caseid_new': 'id'})

### Selected variables

In [25]:
df_cleaned.columns.tolist()

['id',
 'subject_age',
 'subject_education',
 'subject_gender',
 'subject_ethnicity',
 'subject_income_category',
 'subject_employment_status',
 'partner_gender',
 'partner_age',
 'partner_ethnicity',
 'partner_education',
 'same_sex_couple',
 'married',
 'relationship_quality',
 'relationship_start_year',
 'marriage_year',
 'break_up_year',
 'relationship_duration',
 'kids_0_1',
 'kids_2_5',
 'kids_6_12',
 'kids_13_17',
 'kids_18_plus',
 'sex_frequency',
 'wave',
 'flirts_with_partner',
 'fights_with_partner',
 'rel_change_during_pandemic',
 'inc_change_during_pandemic',
 'subject_had_covid',
 'partner_had_covid',
 'subject_vaccinated',
 'partner_vaccinated',
 'agree_covid_approach']

Merging kids of all ages to a single feature:

In [26]:
df_cleaned['children'] = df_cleaned['kids_0_1'] + df_cleaned['kids_2_5'] + df_cleaned['kids_6_12'] + df_cleaned['kids_13_17'] + df_cleaned['kids_18_plus']

df_cleaned.drop(columns=['kids_0_1', 'kids_2_5', 'kids_6_12', 'kids_13_17', 'kids_18_plus'], inplace=True)

In [27]:
df_cleaned.head()

Unnamed: 0,id,subject_age,subject_education,subject_gender,subject_ethnicity,subject_income_category,subject_employment_status,partner_gender,partner_age,partner_ethnicity,...,flirts_with_partner,fights_with_partner,rel_change_during_pandemic,inc_change_during_pandemic,subject_had_covid,partner_had_covid,subject_vaccinated,partner_vaccinated,agree_covid_approach,children
0,53001,48.0,9.0,2.0,5.0,13.0,1.0,1.0,46.0,1.0,...,,,,,,,,,,1.0
1,71609,68.0,10.0,2.0,1.0,12.0,1.0,1.0,71.0,1.0,...,,,,,,,,,,2.0
2,106983,39.0,11.0,1.0,1.0,15.0,1.0,2.0,49.0,1.0,...,,,,,,,,,,5.0
3,121759,54.0,9.0,1.0,1.0,16.0,1.0,2.0,59.0,4.0,...,,,,,,,,,,4.0
4,158083,48.0,10.0,1.0,1.0,14.0,1.0,2.0,34.0,1.0,...,,,,,,,,,,1.0


## Data Type Exploration

We can see what data type each column has. We can note that all columns are encoded with numerical values.

In [28]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

print(df_cleaned.dtypes)

id                              int64
subject_age                   float64
subject_education             float64
subject_gender                float64
subject_ethnicity             float64
subject_income_category       float64
subject_employment_status     float64
partner_gender                float64
partner_age                   float64
partner_ethnicity             float64
partner_education             float64
same_sex_couple               float64
married                       float64
relationship_quality          float64
relationship_start_year       float64
marriage_year                 float64
break_up_year                 float64
relationship_duration         float64
sex_frequency                 float64
wave                           object
flirts_with_partner           float64
fights_with_partner           float64
rel_change_during_pandemic    float64
inc_change_during_pandemic    float64
subject_had_covid             float64
partner_had_covid             float64
subject_vacc

## Decoding Variables

We'll need to re-map column content to something more interpretable. 

Going column by column...

First, education level:

In [29]:
edu_mapping = {
    1: 'no_education',
    2: '1st_4th_grade',
    3: '5th_6th_grade',
    4: '7th_8th_grade',
    5: '9th',
    6: '10th',
    7: '11th',
    8: '12th_nodiploma',
    9: 'high_school_grad',
    10: 'some_college',
    11: 'associate_degree',
    12: 'bach_degree',
    13: 'masters_degree',
    14: 'prof_doct_degree'
}

df_cleaned['subject_education'] = df_cleaned['subject_education'].map(edu_mapping)
df_cleaned['partner_education'] = df_cleaned['partner_education'].map(edu_mapping)

ordered_levels = [
    'no_education',
    '1st_4th_grade',
    '5th_6th_grade',
    '7th_8th_grade',
    '9th',
    '10th',
    '11th',
    '12th_nodiploma',
    'high_school_grad',
    'some_college',
    'associate_degree',
    'bach_degree',
    'masters_degree',
    'prof_doct_degree'
]

# Convert to ordered categorical
df_cleaned['subject_education'] = pd.Categorical(
    df_cleaned['subject_education'],
    categories=ordered_levels,
    ordered=True
)

df_cleaned['partner_education'] = pd.Categorical(
    df_cleaned['partner_education'],
    categories=ordered_levels,
    ordered=True
)

Then gender:

In [30]:
gender_mapping = {
    1: 'male',
    2: 'female',
    3: 'other'
}

df_cleaned['subject_gender'] = df_cleaned['subject_gender'].map(gender_mapping)
df_cleaned['partner_gender'] = df_cleaned['partner_gender'].map(gender_mapping)

Then, ethniticy/race:

In [31]:
eth_sub_mapping = {
    1: 'white',
    2: 'black',
    3: 'other',
    4: 'hispanic',
    5: '2_plus_eth'
}

df_cleaned['subject_ethnicity'] = df_cleaned['subject_ethnicity'].map(eth_sub_mapping)

eth_part_mapping = {
    1: 'white',
    2: 'black',
    3: 'american_indian',
    4: 'asian',
    5: 'other'
}

df_cleaned['partner_ethnicity'] = df_cleaned['partner_ethnicity'].map(eth_part_mapping)

Then, income level:

In [32]:
income_mapping = {
    1: 'under_5k',
    2: '5k_7k',
    3: '7k_10k',
    4: '10k_12k',
    5: '12k_15k',
    6: '15k_20k',
    7: '20k_25k',
    8: '25k_30k',
    9: '30k_35k',
    10: '35k_40k',
    11: '40k_50k',
    12: '50k_60k',
    13: '60k_75k',
    14: '75k_85k',
    15: '85k_100k',
    16: '100k_125k',
    17: '125k_150k',
    18: '150k_175k',
    19: '175k_200k',
    20: '200k_250k',
    21: 'over_250k'
}

df_cleaned['subject_income_category'] = df_cleaned['subject_income_category'].map(income_mapping)

ordered_income_levels = [
    'under_5k', '5k_7k', '7k_10k', '10k_12k', '12k_15k', '15k_20k',
    '20k_25k', '25k_30k', '30k_35k', '35k_40k', '40k_50k', '50k_60k',
    '60k_75k', '75k_85k', '85k_100k', '100k_125k', '125k_150k',
    '150k_175k', '175k_200k', '200k_250k', 'over_250k'
]

df_cleaned['subject_income_category'] = pd.Categorical(
    df_cleaned['subject_income_category'],
    categories=ordered_income_levels,
    ordered=True
)

Then, employment status:

In [33]:
employment_mapping = {
    1: 'working_paid_employee',
    2: 'working_self_employed',
    3: 'not_working_temp_layoff',
    4: 'not_working_looking',
    5: 'not_working_retired',
    6: 'not_working_disabled',
    7: 'not_working_other'
}

df_cleaned['subject_employment_status'] = df_cleaned['subject_employment_status'].map(employment_mapping)

Then, same-sex couple encoding: 

In [34]:
same_sex_couple_map = {
    0: 'no',
    1: 'yes'
}

df_cleaned['same_sex_couple'] = df_cleaned['same_sex_couple'].map(same_sex_couple_map)

Then, marital status:

In [35]:
married_mapping = {
    0: 'not_married',
    1: 'married'
}

df_cleaned['married'] = df_cleaned['married'].map(married_mapping)

Then, relationship quality **(THIS COULD BE A TARGET VARIABLE FOR MANY STUDIES)**:

In [36]:
rel_qual_mapping = {
    1: 'excellent',
    2: 'good',
    3: 'fair',
    4: 'poor',
    5: 'very_poor'
}

df_cleaned['relationship_quality'] = df_cleaned['relationship_quality'].map(rel_qual_mapping)

Then, the effect of COVID on the relationship. Note that all COVID variables only appear in the 2022 survey.

In [37]:
covid_effect_mapping = {
    1: 'better_than_before',
    2: 'no_change',
    3: 'worse_than_before'
}

df_cleaned['rel_change_during_pandemic'] = df_cleaned['rel_change_during_pandemic'].map(covid_effect_mapping)

Then, the effect of COVID on income level.

In [38]:
covid_income_mapping = {
    1: 'much_worse',
    2: 'worse',
    3: 'no_change',
    4: 'better',
    5: 'much_better'
}

df_cleaned['inc_change_during_pandemic'] = df_cleaned['inc_change_during_pandemic'].map(covid_income_mapping)

Then, if subject or partner had COVID

In [39]:
had_covid_mapping = {
    0: 'no',
    1: 'yes'
}

df_cleaned['subject_had_covid'] = df_cleaned['subject_had_covid'].map(had_covid_mapping)
df_cleaned['partner_had_covid'] = df_cleaned['partner_had_covid'].map(had_covid_mapping)

Then, status of COVID vaccination:

In [40]:
corona_vaccine_mapping = {
    1: 'fully_vaccinated_and_booster',
    2: 'fully_vaccinated_no_booster',
    3: 'partially_vaccinated',
    4: 'not_vaccinated'
}

df_cleaned['subject_vaccinated'] = df_cleaned['subject_vaccinated'].map(corona_vaccine_mapping)
df_cleaned['partner_vaccinated'] = df_cleaned['partner_vaccinated'].map(corona_vaccine_mapping)

Then, if the couple agrees on COVID approach:

In [41]:
covid_agreement_mapping = {
    1: 'completely_agree',
    2: 'mostly_agree',
    3: 'mostly_disagree',
    4: 'completely_disagree'
}

df_cleaned['agree_covid_approach'] = df_cleaned['agree_covid_approach'].map(covid_agreement_mapping)

Then, sex frequency:

In [42]:
sex_frequency_mapping = {
    1: 'once_or_more_a_day',
    2: '3_to_6_times_a_week',
    3: 'once_or_twice_a_week',
    4: '2_to_3_times_a_month',
    5: 'once_a_month_or_less'
}

df_cleaned['sex_frequency'] = df_cleaned['sex_frequency'].map(sex_frequency_mapping)

Then, how often the subject flirts with partner:

In [43]:
flirt_mapping = {
    1: 'every_day',
    2: 'a_few_times_a_week',
    3: 'once_a_week',
    4: '1_to_3_times_a_month',
    5: 'less_than_once_a_month',
    6: 'never'
}

df_cleaned['flirts_with_partner'] = df_cleaned['flirts_with_partner'].map(flirt_mapping)

ordered_flirt_frequency_levels = [
    'every_day',
    'a_few_times_a_week',
    'once_a_week',
    '1_to_3_times_a_month',
    'less_than_once_a_month',
    'never'
]

df_cleaned['flirts_with_partner'] = pd.Categorical(
    df_cleaned['flirts_with_partner'],
    categories=ordered_flirt_frequency_levels,
    ordered=True
)

Then, how often the subject fights with partner:

In [44]:
fight_mapping = {
    1: '0_times',
    2: '1_time',
    3: '2_times',
    4: '3_times',
    5: '4_times',
    6: '5_times',
    7: '6_times',
    8: '7_or_more_times'
}

df_cleaned['fights_with_partner'] = df_cleaned['fights_with_partner'].map(fight_mapping)

ordered_fight_frequency_levels = [
    '0_times',
    '1_time',
    '2_times',
    '3_times',
    '4_times',
    '5_times',
    '6_times',
    '7_or_more_times'
]

df_cleaned['fights_with_partner'] = pd.Categorical(
    df_cleaned['fights_with_partner'],
    categories=ordered_fight_frequency_levels,
    ordered=True
)

Now, we can see the updated data types for each variable.

We can also see a sample of how the data looks after the recoding process.

In [45]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

print(df_cleaned.dtypes)

id                               int64
subject_age                    float64
subject_education             category
subject_gender                  object
subject_ethnicity               object
subject_income_category       category
subject_employment_status       object
partner_gender                  object
partner_age                    float64
partner_ethnicity               object
partner_education             category
same_sex_couple                 object
married                         object
relationship_quality            object
relationship_start_year        float64
marriage_year                  float64
break_up_year                  float64
relationship_duration          float64
sex_frequency                   object
wave                            object
flirts_with_partner           category
fights_with_partner           category
rel_change_during_pandemic      object
inc_change_during_pandemic      object
subject_had_covid               object
partner_had_covid        

In [46]:
df_cleaned.sample(20)

Unnamed: 0,id,subject_age,subject_education,subject_gender,subject_ethnicity,subject_income_category,subject_employment_status,partner_gender,partner_age,partner_ethnicity,partner_education,same_sex_couple,married,relationship_quality,relationship_start_year,marriage_year,break_up_year,relationship_duration,sex_frequency,wave,flirts_with_partner,fights_with_partner,rel_change_during_pandemic,inc_change_during_pandemic,subject_had_covid,partner_had_covid,subject_vaccinated,partner_vaccinated,agree_covid_approach,children
2214,2534569,47.0,bach_degree,female,black,75k_85k,working_paid_employee,male,55.0,black,12th_nodiploma,no,not_married,,,,2004.0,,,2017,,,,,,,,,,1.0
9529,2690979,51.0,high_school_grad,female,white,20k_25k,working_paid_employee,,,,,no,married,excellent,,,,19.5,once_a_month_or_less,2022,every_day,0_times,no_change,better,no,no,fully_vaccinated_and_booster,fully_vaccinated_and_booster,completely_agree,4.0
1313,2028203,40.0,high_school_grad,male,hispanic,85k_100k,working_paid_employee,female,35.0,white,some_college,no,married,fair,1999.0,2008.0,,18.0,2_to_3_times_a_month,2017,,,,,,,,,,3.0
8751,2267467,73.0,high_school_grad,female,hispanic,30k_35k,not_working_retired,,,,,,not_married,,,,,,,2022,,,,no_change,no,,fully_vaccinated_no_booster,,,2.0
8775,2282833,,,,,,,,,,,,,,,,,,,2022,,,,,,,,,,
6586,2887677,,,,,,,,,,,,,,,,,,,2020,,,,,,,,,,
9704,2776317,41.0,masters_degree,female,white,100k_125k,working_paid_employee,,,,,no,married,excellent,,,,11.916667,once_a_month_or_less,2022,every_day,1_time,no_change,worse,no,yes,fully_vaccinated_and_booster,fully_vaccinated_and_booster,completely_agree,5.0
3996,1385783,65.0,associate_degree,male,white,25k_30k,not_working_looking,,,,,no,married,good,,,,47.0,once_a_month_or_less,2020,a_few_times_a_week,2_times,,,,,,,,3.0
3321,2941515,37.0,7th_8th_grade,female,white,12k_15k,working_self_employed,male,37.0,white,high_school_grad,no,not_married,excellent,2003.0,,,14.333333,once_or_twice_a_week,2017,,,,,,,,,,7.0
5202,2245161,65.0,some_college,male,white,15k_20k,not_working_retired,,,,,,not_married,,,,,,,2020,never,,,,,,,,,1.0


## NaN Percentage

Below we can note the % of NaN values in each column. 

We can note that most columns still has a major % of NaN values.

These high percentages are mostly dependent on the year of the survey. 

For example:

- 2017 is missing primarily COVID related info.
- 2020 is missing primarily partner info, chronological info and COVID related info.
- 2022 is missing primarily partner info and chronological info.

This makes sense as the documentation notes that:

- Variables in survey changed from year to year slightly.
- "Response rate was 3510/6753=52% in 2017, 2107/2431= 87% in 2020, and 1722/2073=83% in 2022. The Denominators in 2020 and 2022 include only subjects who remained in the KnowledgePanel, as they were the only subjects eligible to be contacted."

In [47]:
df_cleaned.shape

(10530, 30)

In [48]:
nan_percentage_per_column = df_cleaned.isna().mean() * 100
print(nan_percentage_per_column)

id                             0.000000
subject_age                   30.303894
subject_education             30.303894
subject_gender                30.303894
subject_ethnicity             30.303894
subject_income_category       30.303894
subject_employment_status     30.303894
partner_gender                65.536562
partner_age                   65.707502
partner_ethnicity             65.660019
partner_education             65.622032
same_sex_couple               39.563153
married                       31.405508
relationship_quality          44.586895
relationship_start_year       66.666667
marriage_year                 79.116809
break_up_year                 96.077873
relationship_duration         46.087369
sex_frequency                 47.853751
wave                           0.000000
flirts_with_partner           70.940171
fights_with_partner           71.975309
rel_change_during_pandemic    87.388414
inc_change_during_pandemic    83.741690
subject_had_covid             83.732194


In [49]:
nan_percentage_per_column_by_wave = (
    df_cleaned
    .groupby('wave')
    .apply(lambda g: g.isna().mean() * 100)
    .transpose()
)

print(nan_percentage_per_column_by_wave)

wave                              2017        2020       2022
id                            0.000000    0.000000   0.000000
subject_age                   0.000000   39.971510  50.940171
subject_education             0.000000   39.971510  50.940171
subject_gender                0.000000   39.971510  50.940171
subject_ethnicity             0.000000   39.971510  50.940171
subject_income_category       0.000000   39.971510  50.940171
subject_employment_status     0.000000   39.971510  50.940171
partner_gender                3.390313   95.868946  97.350427
partner_age                   3.874644   95.868946  97.378917
partner_ethnicity             3.732194   95.897436  97.350427
partner_education             3.618234   95.897436  97.350427
same_sex_couple               3.304843   53.475783  61.908832
married                       3.304843   39.971510  50.940171
relationship_quality         18.888889   52.962963  61.908832
relationship_start_year       6.666667   95.925926  97.407407
marriage

  .apply(lambda g: g.isna().mean() * 100)


We can remove some of the rows that are fully empty (even if they have values in "id" and "wave")

In [50]:
cols_to_check = df_cleaned.columns.difference(['id', 'wave'])
df_cleaned = df_cleaned.dropna(subset=cols_to_check, how='all')

In [51]:
df_cleaned.shape

(7339, 30)

In [52]:
nan_percentage_per_column_by_wave = (
    df_cleaned
    .groupby('wave')
    .apply(lambda g: g.isna().mean() * 100)
    .transpose()
)

print(nan_percentage_per_column_by_wave)

wave                              2017        2020       2022
id                            0.000000    0.000000   0.000000
subject_age                   0.000000    0.000000   0.000000
subject_education             0.000000    0.000000   0.000000
subject_gender                0.000000    0.000000   0.000000
subject_ethnicity             0.000000    0.000000   0.000000
subject_income_category       0.000000    0.000000   0.000000
subject_employment_status     0.000000    0.000000   0.000000
partner_gender                3.390313   93.118178  94.599303
partner_age                   3.874644   93.118178  94.657375
partner_ethnicity             3.732194   93.165638  94.599303
partner_education             3.618234   93.165638  94.599303
same_sex_couple               3.304843   22.496440  22.357724
married                       3.304843    0.000000   0.000000
relationship_quality         18.888889   21.642145  22.357724
relationship_start_year       6.666667   93.213099  94.715447
marriage

  .apply(lambda g: g.isna().mean() * 100)


## Final Reorganization of Columns

Giving amount of missing data, "Partner variables" and "Chronological variables" that have significant %NaN rates in 2020 and 2022 could be removed for specific inquiries. However, I would propose to keep them in the overall dataset. 

Therefore, the final data set would be:

In [53]:
df_cleaned.columns

Index(['id', 'subject_age', 'subject_education', 'subject_gender',
       'subject_ethnicity', 'subject_income_category',
       'subject_employment_status', 'partner_gender', 'partner_age',
       'partner_ethnicity', 'partner_education', 'same_sex_couple', 'married',
       'relationship_quality', 'relationship_start_year', 'marriage_year',
       'break_up_year', 'relationship_duration', 'sex_frequency', 'wave',
       'flirts_with_partner', 'fights_with_partner',
       'rel_change_during_pandemic', 'inc_change_during_pandemic',
       'subject_had_covid', 'partner_had_covid', 'subject_vaccinated',
       'partner_vaccinated', 'agree_covid_approach', 'children'],
      dtype='object')

In [54]:
data = df_cleaned[['id', 'wave',  #Identifiers
                   
                   'subject_age', 'subject_education', 'subject_gender',  #Subject variables
                   'subject_ethnicity', 'subject_income_category', 'subject_employment_status',
                   
                   'partner_gender', 'partner_age', 'partner_ethnicity', 'partner_education',  #Partner variables
                   
                   'same_sex_couple', 'married', 'sex_frequency', 'flirts_with_partner', 'fights_with_partner', #Couple Habits
                   
                   'relationship_start_year', 'marriage_year', 'break_up_year', #Chronology
                   'relationship_duration', 
                   
                   'children',  #Kids Info
                   
                   'rel_change_during_pandemic', 'inc_change_during_pandemic', #Pandemic Vars
                   'subject_had_covid', 'partner_had_covid', 'subject_vaccinated',
                   'partner_vaccinated', 'agree_covid_approach', 
                   
                   'relationship_quality' #Outcome
       ]]

data.head()

Unnamed: 0,id,wave,subject_age,subject_education,subject_gender,subject_ethnicity,subject_income_category,subject_employment_status,partner_gender,partner_age,partner_ethnicity,partner_education,same_sex_couple,married,sex_frequency,flirts_with_partner,fights_with_partner,relationship_start_year,marriage_year,break_up_year,relationship_duration,children,rel_change_during_pandemic,inc_change_during_pandemic,subject_had_covid,partner_had_covid,subject_vaccinated,partner_vaccinated,agree_covid_approach,relationship_quality
0,53001,2017,48.0,high_school_grad,female,2_plus_eth,60k_75k,working_paid_employee,male,46.0,white,associate_degree,no,married,once_or_twice_a_week,,,2013.0,2014.0,,3.583333,1.0,,,,,,,,excellent
1,71609,2017,68.0,some_college,female,white,50k_60k,working_paid_employee,male,71.0,white,some_college,no,married,once_a_month_or_less,,,1964.0,1969.0,,52.75,2.0,,,,,,,,excellent
2,106983,2017,39.0,associate_degree,male,white,85k_100k,working_paid_employee,female,49.0,white,some_college,no,married,once_or_twice_a_week,,,1999.0,2002.0,,17.583334,5.0,,,,,,,,excellent
3,121759,2017,54.0,high_school_grad,male,white,100k_125k,working_paid_employee,female,59.0,asian,masters_degree,no,married,2_to_3_times_a_month,,,1990.0,1991.0,,27.416666,4.0,,,,,,,,excellent
4,158083,2017,48.0,some_college,male,white,75k_85k,working_paid_employee,female,34.0,white,associate_degree,no,not_married,,,,2011.0,2013.0,2014.0,,1.0,,,,,,,,


## Final Data Review

In [55]:
for column in df_cleaned.columns:
    print(f"Column: {column}")
    print(f"Data type: {df_cleaned[column].dtype}")
    print(f"Unique values: {df_cleaned[column].unique()}\n")

Column: id
Data type: int64
Unique values: [  53001   71609  106983 ... 2968971 2969933 2972135]

Column: subject_age
Data type: float64
Unique values: [48. 68. 39. 54. 59. 72. 55. 73. 46. 43. 57. 50. 61. 79. 58. 64. 81. 70.
 80. 53. 51. 74. 56. 40. 36. 22. 47. 78. 67. 25. 65. 38. 24. 66. 35. 26.
 60. 71. 27. 29. 34. 76. 21. 41. 28. 19. 49. 86. 20. 23. 44. 84. 62. 63.
 45. 52. 77. 75. 42. 82. 69. 92. 85. 32. 37. 33. 30. 31. 90. 83. 18. 87.
 93. 89. 91. 88. 95. 97. 98.]

Column: subject_education
Data type: category
Unique values: ['high_school_grad', 'some_college', 'associate_degree', 'bach_degree', 'masters_degree', ..., '11th', '7th_8th_grade', '1st_4th_grade', 'no_education', '5th_6th_grade']
Length: 14
Categories (14, object): ['no_education' < '1st_4th_grade' < '5th_6th_grade' < '7th_8th_grade' ... 'associate_degree' < 'bach_degree' < 'masters_degree' < 'prof_doct_degree']

Column: subject_gender
Data type: object
Unique values: ['female' 'male']

Column: subject_ethnicity
Data t

## Saving CSV

In [56]:
# data.to_csv("../data/clean/hcmst.csv", index=False)

## Testing a 2022 data set:

During the client meeting on May 21st, 2025. It was decided to only keep 2022 data, while removing mostly sparse columns, id and wave values.

In [57]:
clean = pd.read_csv("../data/clean/hcmst.csv")

clean.head()

Unnamed: 0,id,wave,subject_age,subject_education,subject_gender,subject_ethnicity,subject_income_category,subject_employment_status,same_sex_couple,married,sex_frequency,flirts_with_partner,fights_with_partner,relationship_duration,children,rel_change_during_pandemic,inc_change_during_pandemic,subject_had_covid,partner_had_covid,subject_vaccinated,partner_vaccinated,agree_covid_approach,relationship_quality
0,53001,2022,53.0,high_school_grad,female,white,35k_40k,working_paid_employee,no,not_married,once_or_twice_a_week,a_few_times_a_week,0_times,1.5,2.0,better_than_before,no_change,no,yes,not_vaccinated,not_vaccinated,completely_agree,excellent
1,71609,2022,72.0,some_college,female,white,75k_85k,working_paid_employee,no,married,once_a_month_or_less,never,7_or_more_times,57.416668,1.0,no_change,worse,no,no,fully_vaccinated_and_booster,fully_vaccinated_and_booster,mostly_agree,good
2,106983,2022,43.0,associate_degree,male,white,75k_85k,working_paid_employee,no,married,once_or_twice_a_week,a_few_times_a_week,2_times,22.333334,5.0,no_change,worse,no,no,fully_vaccinated_and_booster,fully_vaccinated_and_booster,completely_agree,excellent
3,164061,2022,64.0,some_college,male,white,75k_85k,working_paid_employee,no,married,once_or_twice_a_week,1_to_3_times_a_month,0_times,28.25,2.0,no_change,no_change,no,no,fully_vaccinated_and_booster,fully_vaccinated_and_booster,completely_agree,good
4,212249,2022,60.0,high_school_grad,female,black,75k_85k,working_paid_employee,no,married,once_or_twice_a_week,a_few_times_a_week,0_times,38.916668,3.0,better_than_before,no_change,no,no,not_vaccinated,partially_vaccinated,completely_agree,excellent


In [58]:
data_2022 = clean[clean['wave'] == 2022].dropna(subset=['rel_change_during_pandemic', 'relationship_quality'])

In [59]:
data_2022.columns.tolist()

['id',
 'wave',
 'subject_age',
 'subject_education',
 'subject_gender',
 'subject_ethnicity',
 'subject_income_category',
 'subject_employment_status',
 'same_sex_couple',
 'married',
 'sex_frequency',
 'flirts_with_partner',
 'fights_with_partner',
 'relationship_duration',
 'children',
 'rel_change_during_pandemic',
 'inc_change_during_pandemic',
 'subject_had_covid',
 'partner_had_covid',
 'subject_vaccinated',
 'partner_vaccinated',
 'agree_covid_approach',
 'relationship_quality']

In [60]:
data_2022 = data_2022[[
 'subject_age',
 'subject_education',
 'subject_gender',
 'subject_ethnicity',
 'subject_income_category',
 'subject_employment_status',
 'same_sex_couple',
 'married',
 'sex_frequency',
 'flirts_with_partner',
 'fights_with_partner',
 'relationship_duration',
 'children',
 'rel_change_during_pandemic',
 'inc_change_during_pandemic',
 'subject_had_covid',
 'partner_had_covid',
 'subject_vaccinated',
 'partner_vaccinated',
 'agree_covid_approach',
 'relationship_quality']]

In [61]:
data_2022.head()

Unnamed: 0,subject_age,subject_education,subject_gender,subject_ethnicity,subject_income_category,subject_employment_status,same_sex_couple,married,sex_frequency,flirts_with_partner,fights_with_partner,relationship_duration,children,rel_change_during_pandemic,inc_change_during_pandemic,subject_had_covid,partner_had_covid,subject_vaccinated,partner_vaccinated,agree_covid_approach,relationship_quality
0,53.0,high_school_grad,female,white,35k_40k,working_paid_employee,no,not_married,once_or_twice_a_week,a_few_times_a_week,0_times,1.5,2.0,better_than_before,no_change,no,yes,not_vaccinated,not_vaccinated,completely_agree,excellent
1,72.0,some_college,female,white,75k_85k,working_paid_employee,no,married,once_a_month_or_less,never,7_or_more_times,57.416668,1.0,no_change,worse,no,no,fully_vaccinated_and_booster,fully_vaccinated_and_booster,mostly_agree,good
2,43.0,associate_degree,male,white,75k_85k,working_paid_employee,no,married,once_or_twice_a_week,a_few_times_a_week,2_times,22.333334,5.0,no_change,worse,no,no,fully_vaccinated_and_booster,fully_vaccinated_and_booster,completely_agree,excellent
3,64.0,some_college,male,white,75k_85k,working_paid_employee,no,married,once_or_twice_a_week,1_to_3_times_a_month,0_times,28.25,2.0,no_change,no_change,no,no,fully_vaccinated_and_booster,fully_vaccinated_and_booster,completely_agree,good
4,60.0,high_school_grad,female,black,75k_85k,working_paid_employee,no,married,once_or_twice_a_week,a_few_times_a_week,0_times,38.916668,3.0,better_than_before,no_change,no,no,not_vaccinated,partially_vaccinated,completely_agree,excellent


In [62]:
data_2022.shape

(1328, 21)

In [63]:
data_2022.to_csv("../data/clean/hcmst.csv", index=False)