In [None]:
import pandas as pd

# Merge function
def merge_data(df_main, path, merge_info):
    """Left-merge selected columns from several Stata files onto ``df_main``.

    Parameters
    ----------
    df_main : pandas.DataFrame
        Frame that every file is merged onto. It is not modified; a new
        frame is returned.
    path : str
        Directory containing the ``.dta`` files.
    merge_info : dict
        Maps a file name to a dict with two keys:
        ``'keep_and_rename_columns'`` (original column name -> new name) and
        ``'merge_on'`` (list of additional merge key names).

    Returns
    -------
    pandas.DataFrame
        ``df_main`` with the renamed columns of every file appended. Files
        that share no key column with the running result are skipped and a
        message is printed.

    Notes
    -----
    The identifier columns ``pid``, ``hid`` and ``syear`` are always read
    when the file has them, and every identifier present in both frames is
    used as a merge key in addition to the names listed under ``'merge_on'``.
    """
    id_columns = ['pid', 'hid', 'syear']

    for dta_file, info in merge_info.items():
        file_path = f"{path}/{dta_file}"
        rename_map = info['keep_and_rename_columns']

        # Inspect the header only. The context manager closes the file handle
        # again, and variable_labels() is the documented way to list a file's
        # variables (its keys are the variable names).
        with pd.read_stata(file_path, iterator=True) as reader:
            available_columns = set(reader.variable_labels())

        # dict.fromkeys removes duplicates while keeping a stable order, so
        # the column order of the result is the same on every run.
        requested_columns = dict.fromkeys(id_columns + list(rename_map))
        columns_to_read = [col for col in requested_columns if col in available_columns]

        df_temp = pd.read_stata(
            file_path, columns=columns_to_read, convert_categoricals=False
        ).rename(columns=rename_map)

        # Each key may appear only once, even when 'merge_on' repeats an
        # identifier column.
        key_candidates = dict.fromkeys(id_columns + list(info['merge_on']))
        merge_columns = [
            col for col in key_candidates
            if col in df_main.columns and col in df_temp.columns
        ]
        if not merge_columns:
            print(f"Skipping {dta_file} because no common merge columns are present.")
            continue

        df_main = pd.merge(df_main, df_temp, how='left', on=merge_columns)

    return df_main

# Merge specification: one entry per Stata file.
#   "merge_on"                 -> key columns named for that file
#   "keep_and_rename_columns"  -> original variable name -> newly assigned name
merge_info_dict = {
    "ppathl_coriginstr.dta": {
        "merge_on": ["hid", "pid", "syear"],
        "keep_and_rename_columns": {
            "hid": "hid",
            "pid": "pid",
            "corigin_str": "corigin_str",
            "syear": "syear",
            "sex": "sex",
            "sexor": "sexor",
            "gebjahr": "gebjahr",
            "gebmonat": "gebmonat",
            "migback": "migback",
            "corigin": "corigin",
            "germborn": "germborn",
            "immiyear": "immiyear",
            "sampreg": "eastwest",
            "phrf": "hochrechnungsfaktor",
        },
    },
    "pl.dta": {
        "merge_on": ["pid", "syear"],
        "keep_and_rename_columns": {
            "plh0182": "satisfaction",
            "plj0014_v1": "german1",
            "plj0014_v2": "german2",
            "plj0014_v3": "german3",
            "plj0047": "xenophobia",
            "plj0048_v1": "disadv_origin1",
            "plj0048_v2": "disadv_origin2",
            "plj0085_v1": "stay_ger",
            "plj0085_v2": "stay_ger13",
            "pli0094_h": "time_wfriends",
            "pli0181": "cntct_abroad",
            "pli0183": "soz_aktiv",
            "ple0008": "curr_health",
            "plh0258_h": "religion",
            "pld0047": "cl_friends",
            "plj0071": "lang_oral",
            "plj0072": "lang_writ",
            "plj0073": "lang_read",
            "plj0077": "lang_usl",
            "plj0066": "lang_oral2",
            "plm0136_v2": "lang_fam",
            "plm0137_v2": "lang_friends",
            "plj0070": "ger_newspaper",
            "plj0068": "lang_origin",
            "plh0189": "lonely",
            "plh0190": "work_dislike",
            "plj0078": "feel_german",
            "plh0173": "sat_work",
            "plh0175": "sat_hhinc",
            "plh0177": "sat_dwell",
            "plh0178": "sat_leisure",
            "plh0180": "sat_family",
            "plh0172": "sat_sleep",
            "plj0060": "visit_germ",
        },
    },
    "biol.dta": {
        "merge_on": ["pid", "syear"],
        "keep_and_rename_columns": {
            "lm0128i01": "ger_premig",
        },
    },
    "bioimmig.dta": {
        "merge_on": ["pid", "syear"],
        "keep_and_rename_columns": {
            "biimgrp": "ig_grp",
            "biresper": "ig_resstatus",
            "biwfam": "ig_fam_in_ger",
            "birbetr": "ig_job_in_ger",
            "birmoney": "ig_money_in_ger",
            "birfree": "ig_freedom_in_ger",
            "birfam": "ig_fam_relations_in_ger",
            "birpoor": "ig_poverty_in_ger",
            "birwar": "ig_war_experience",
            "bifamc": "ig_cont_fam",
        },
    },
    "pgen.dta": {
        "merge_on": ["pid", "syear"],
        "keep_and_rename_columns": {
            "pgbilzeit": "edu_years",
            "pgpbbil01": "edu_vocat",
            "pgpbbil02": "edu_collg",
            "pgpbbil03": "edu_novocat",
            "pgpartz": "gen_partner",
            "pgmps08": "gen_mps08",
            "pgmps92": "gen_mps92",
            "pgisced97": "gen_edu97",
            "pgisced11": "gen_edu11",
            "pgcasmin": "gen_casmin",
            "pgisei88": "gen_isei88",
            "pgisei08": "gen_isei08",
            "pglabnet": "gen_income",
            "pgfamstd": "gen_family",
            "pgemplst": "gen_emplmnt",
            "pglfs": "gen_typeunempl",
            "pgnation": "gen_nation",
            "pgmonth": "smonth",
        },
    },
    "hl.dta": {
        "merge_on": ["hid", "syear"],
        "keep_and_rename_columns": {
            "hlc0005_h": "net_income",
            "hlc0006_h": "income_group",
            "hld0001": "nbh_relation",
            "hld0002": "nbh_visit",
            "hld0003": "nbh_freq",
            "hlf0135": "dist_city",
            "hlf0152": "nbh_interact",
            "hlc0043": "num_children",
        },
    },
    "hbrutto.dta": {
        "merge_on": ["hid", "syear"],
        "keep_and_rename_columns": {
            "bula_h": "state",
        },
    },
}

# Performing the merge

# The main file seeds df_main, so it is left out of the files merged onto it.
main_file = 'ppathl_coriginstr.dta'
merge_info_dict_for_merge = {k: v for k, v in merge_info_dict.items() if k != main_file}

path = "."
main_rename_map = merge_info_dict[main_file]['keep_and_rename_columns']
columns_to_read_main = list(main_rename_map.keys())

# Read the main file from `path` as well, so that changing `path` relocates
# every input file instead of only the ones merged later.
df_main = pd.read_stata(f"{path}/{main_file}", columns=columns_to_read_main, convert_categoricals=False)
df_main = df_main.rename(columns=main_rename_map)

df_main = merge_data(df_main, path, merge_info_dict_for_merge)

In [None]:
# Save combined dataset as a Parquet file.
# to_parquet compresses with snappy unless told otherwise; request gzip
# explicitly so the content matches the ".gzip" file name.
df_main.to_parquet("soep2.gzip", compression="gzip")
