In [28]:
import pandas as pd
from variables_dict_builder import variables_dict_builder

variables_dict = variables_dict_builder()
variables_suffixes = [*variables_dict.keys()]

df = (pd.read_csv('../intermediate_data/stacked_filtered_data.csv')
      .apply(pd.to_numeric, errors='coerce')
      )

In [26]:
# variable finder
variable_suffix = "edgrm"
[col for col in df.columns if variable_suffix in col]

[]

In [29]:
## WIDE TO LONG OF SELF/SPOUSE VARIABLES
ADL_suffixes = ['batha','dressa','eata','beda','toilta','walkra','walk1a','walksa', #ADLs
                    'shopa','phonea','moneya','mealsa','medsa','mapa', #IADLs
                    'clim1a','climsa','chaira','stoopa','lifta','armsa','dimea','pusha','sita', # Other functional limitations
                    ]

melt_suffixes = ['nrshom', 'nhmliv', #outcomes
                 'mstat',
                 'urbrur',
                 'livsib', 'momage', 'dadage',
                 'arthre','cancre','diabe','hearte','hibpe','lunge','psyche','stroke', #Diagnoses
                 'bmi', 'smokev'
                 ] + ADL_suffixes

# Pre-creating a dataframe with household ID X wave to join all of our wide to long outputted variables to
waves = list(range(1, 16))
hhidpn_waves = pd.MultiIndex.from_product([df['hhidpn'].unique(), waves], names=['hhidpn', 'wave'])
final_df = pd.DataFrame(index=hhidpn_waves).reset_index()

for suffix in melt_suffixes:
    # Generate the self and spouse columns for the current suffix
    self_columns = [f'r{i}{suffix}' for i in range(1, 16)]
    spouse_columns = [f's{i}{suffix}' for i in range(1, 16)]
    
    if suffix in ADL_suffixes:
        self_columns = self_columns[1:]; spouse_columns = spouse_columns[1:]
    
    # Melt the self for this suffix
    self_df = df.melt(id_vars=['hhidpn'], value_vars=self_columns,
                          var_name='wave', value_name=f'{suffix}_self')
    self_df['wave'] = self_df['wave'].str.extract('(\d+)').astype(int)
    
    # Melt the spouse for this suffix
    spouse_df = df.melt(id_vars=['hhidpn'], value_vars=spouse_columns,
                        var_name='wave', value_name=f'{suffix}_spouse')
    spouse_df['wave'] = spouse_df['wave'].str.extract('(\d+)').astype(int)

    
    merged_df = pd.merge(self_df, spouse_df, on=['hhidpn', 'wave'])
    
    final_df = pd.merge(final_df, merged_df[['hhidpn', 'wave', f'{suffix}_self', f'{suffix}_spouse']], 
                        on=['hhidpn', 'wave'], how='left')
    
final_df = final_df.sort_values(by=['hhidpn', 'wave'])

In [30]:
## WIDE TO LONG OF HOUSEHOLD VARIABLES
household_suffixes = ['hhres','child','atotn', 'itot', 'inpov']

for suffix in household_suffixes:
    hh_columns = [f'h{i}{suffix}' for i in range(1, 16)]
    
    # Melt the self for this suffix
    hh_df = df.melt(id_vars=['hhidpn'], value_vars=hh_columns,
                          var_name='wave', value_name=f'{suffix}_household')
    hh_df['wave'] = hh_df['wave'].str.extract('(\d+)').astype(int)
    
    final_df = pd.merge(final_df, hh_df[['hhidpn', 'wave', f'{suffix}_household']], 
                        on=['hhidpn', 'wave'], how='left')

In [31]:
## JOINING OF WAVE PERSISTENT VARIABLES
wave_persistent_variables = ['radyear', 'ragender', 'rahispan', 'rarelig', 'ravetrn', 'raedyrs', 'raedegrm', 'raeduc', 'rameduc', 'rafeduc']
final_df = pd.merge(final_df, df[['hhidpn'] + wave_persistent_variables],
                    on=['hhidpn'], how='left')

In [42]:
# Removing observations once they're dead
import numpy as np

vars_to_check_na = final_df.columns.difference(["hhidpn", "wave"] + wave_persistent_variables)
final_df = final_df.loc[np.logical_not(final_df[vars_to_check_na].isna().all(axis=1))]

In [None]:
# Construction of Outcome Y:="next year nursing home entry" and dropping every observation after they enter nursing home or die
final_df["nhmliv_self_next_wave"] = final_df.groupby("hhidpn")["nhmliv_self"].shift(-1)

def mask_group(group):
    first_one_idx = group[group["nhmliv_self_next_wave"] == 1].index.min()  # Find the index of the first occurrence of nursing home entrance next wave
    if pd.isna(first_one_idx):  # If there are no 1s, keep all rows
        return group
    return group.loc[:first_one_idx]  # Keep rows up to and including the first occurrence of var_a = 1

# Apply the mask to each person_id group
final_df = final_df.groupby("hhidpn", group_keys=False).apply(mask_group)

final_df = final_df.dropna(subset=["nhmliv_self_next_wave"])

# 2963 of 41772 hhidpns enter a nursing home

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df["nhmliv_self_next_wave"] = final_df.groupby("hhidpn")["nhmliv_self"].shift(-1)


In [51]:
final_df['nhmliv_self_next_wave'].sum()

2963.0

In [None]:
# One hot encoding for categoricals

In [None]:
# Standardization of continuous variables