# Preprocessing

In [78]:
import pandas as pd
import numpy as np

In [79]:
easyshare = pd.read_stata('data/sharewX_rel8-0-0_easySHARE_stata/easySHARE_rel8-0-0.dta')

illness_before = pd.read_stata("data/SHARE-ENV - Exposure to Environmental Hazards/illness_before_module_v01.dta")
illness_during = pd.read_stata("data/SHARE-ENV - Exposure to Environmental Hazards/illness_during_module_v01.dta")
job = pd.read_stata("data/SHARE-ENV - Exposure to Environmental Hazards/job_module_v01.dta")
life = pd.read_stata("data/SHARE-ENV - Exposure to Environmental Hazards/life_module_v01.dta")
young_age = pd.read_stata("data/SHARE-ENV - Exposure to Environmental Hazards/young_age_module_v01.dta")
yearly = pd.read_stata("data/SHARE-ENV - Exposure to Environmental Hazards/yearly_module_v01.dta")


In [80]:
df = pd.merge(easyshare, life, on=['mergeid', 'wave'], how='left')
df = pd.merge(df, job, on=['mergeid'], how='left')

In [81]:
df_relevant = df[df.columns.drop(list(df.filter(regex='^euro')))]
df_relevant = df_relevant[df_relevant.columns.drop(list(df_relevant.filter(regex='^dn')))]
non_predictive_vars = [
    'mergeid',    # Used for merging records, no predictive power
    'hhid',       # Household identifier for tracking or grouping data
    'coupleid',   # Links records of individuals within a household
    'int_version',# Version of the questionnaire or interview format
    'int_year',   # Year the interview was conducted, structural rather than predictive
    'int_month',  # Month the interview was conducted, similar to int_year
    'country',    # Country code, used for stratification or adjustments
    'country_mod', # Modified country code, typically for data manipulation
    'wavepart'   # Wave part, used for stratification or adjustments
]
df_relevant = df_relevant[df_relevant.columns.drop(non_predictive_vars)]

In [82]:
def replace_dash_with_na(df):
    for column in df.columns:
        if df[column].dtype == 'category':
            # Replace entries containing '-' with NA
            df[column] = df[column].apply(lambda x: pd.NA if '-' in str(x) else x)
    return df

df_relevant = replace_dash_with_na(df_relevant)

In [83]:
na_counts = df_relevant.groupby('wave').apply(lambda x: x.isnull().sum())
# mean per number of abservation per wave
na_counts['mean']= na_counts.mean(axis=1)
na_counts['obs'] = df_relevant.groupby('wave').size()
na_counts['avg_mean'] = na_counts['mean']/ na_counts['obs']
na_counts['std'] = na_counts.std(axis=1)
na_counts = na_counts.sort_values(by='avg_mean', ascending=False)
df_sorted = df_relevant.sort_values(by=['wave'], ascending=[False])
# drop all except wave 7 
df_wave_7 = df_sorted[df_sorted['wave'] == 7]
df_wave_7 = df_wave_7.drop(columns=['wave'])
# df_most_recent_wave_per_mergeid = df_sorted.drop_duplicates(subset='mergeid', keep='first')

  na_counts = df_relevant.groupby('wave').apply(lambda x: x.isnull().sum())


In [84]:
na_counts = df_wave_7.isna().sum()

na_counts_sorted = na_counts.sort_values(ascending=False)

na_counts_sorted

emi_pm2p5_w      77202
income_pct_w8    77202
emi_pm10_w       77202
br010_mod        77202
bmi2             77202
                 ...  
thinc_m              0
hhsize               0
partnerinhh          0
female               0
language             0
Length: 415, dtype: int64

In [85]:
for column in df_wave_7.columns:
    if df_wave_7[column].dtype == object:  # Check if the column data type is object
        # Try converting the column to numeric
        converted_column = pd.to_numeric(df_wave_7[column], errors='coerce')
        # Check if the conversion did not introduce any new NaNs (i.e., all NaNs in the original are NaNs in the converted)
        if converted_column.notna().equals(df_wave_7[column].notna()):
            df_wave_7[column] = converted_column

df_wave_7.info()

<class 'pandas.core.frame.DataFrame'>
Index: 77202 entries, 22153 to 32326
Columns: 415 entries, language to yjob_tx_g30_w
dtypes: category(7), float32(309), float64(52), object(47)
memory usage: 150.4+ MB


In [86]:
columns_to_drop = na_counts[na_counts > 20000].index


df_dropped = df_wave_7.drop(columns=columns_to_drop)

shape_of_dataframe_full = df_wave_7.shape
shape_of_dataframe_dropped = df_dropped.shape

print(f"No. of samples: {shape_of_dataframe_full[0]}")
print(f"No. of columns (full): {shape_of_dataframe_full[1]}")
print(f"No. of columns (dropped): {shape_of_dataframe_dropped[1]}")

columns_to_drop

No. of samples: 77202
No. of columns (full): 415
No. of columns (dropped): 268


Index(['q34_re', 'isced1997_r', 'int_partner', 'age_partner', 'gender_partner',
       'ch001_', 'ch021_mod', 'ch007_hh', 'ch007_km', 'sp002_mod',
       ...
       'avgjob_conc_yearly_o3_mean', 'avgjob_conc_yearly_o3_median',
       'avgjob_conc_yearly_o3_w', 'avgjob_emissions_PM10_mean',
       'avgjob_emissions_PM10_median', 'avgjob_emissions_PM10_w',
       'avgjob_emissions_PM25_mean', 'avgjob_emissions_PM25_median',
       'avgjob_emissions_PM25_w', 'job_uncomfortable'],
      dtype='object', length=147)

In [ ]:
na_after_dr = df_dropped.isna().sum().sort_values(ascending=False)


In [88]:
def process_categorical_columns(df):
    for column in df.columns:
        if df[column].dtype == 'category' or df[column].dtype == 'object':
            # Convert category to string
            df[column] = df[column].astype(str)

            # Split the column on the first '.', and expand to new DataFrame
            split_data = df[column].str.split('.', expand=True, n=1)

            # If split_data has only one column, no '.' was found; skip processing
            if split_data.shape[1] < 2:
                continue

            # Clean up whitespace
            split_data[1] = split_data[1].str.strip()

            # Keep only rows where a split occurred (indicative of having a '.')
            valid_splits = split_data[1].notna()

            # Create dummy variables only for the valid name parts
            if valid_splits.any():
                dummies = pd.get_dummies(split_data.loc[valid_splits, 1], prefix=column)
                # Merge these dummy variables back to the original DataFrame
                df = pd.concat([df, dummies], axis=1)

            # Optionally, drop the original column
            df.drop(column, axis=1, inplace=True)

    return df

In [89]:
# to csv
df_dropped.to_csv('data/processed_data.csv', index=False)