In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import shutil
import re

In [4]:
# Locations used by this notebook:
#   src_dir - folder holding the full yearly RAIS .dta files (read only here)
#   out_dir - folder that receives the filtered per-year extracts
src_dir = Path("/kellogg/proj/lgg3230/RAIS/output/data/full")
out_dir = Path("/kellogg/proj/lgg3230/SteppingStones/data/interim/rj_sample")

# Ensure the destination (and any missing parent folders) exists; an already
# existing directory is not an error.
out_dir.mkdir(exist_ok=True, parents=True)


In [5]:
# Gather every RAIS_*.dta file found directly inside src_dir and order the
# paths so the files are always processed in the same sequence.
dta_files = list(src_dir.glob("RAIS_*.dta"))
dta_files.sort()
print(f"Found {len(dta_files)} files")

Found 34 files


In [6]:
# Filter parameters for the extraction:
#   mun_code             - value of the `municipio` column to keep
#                          (presumably Rio de Janeiro, given the "rj" output naming - confirm)
#   start_year, end_year - first and last year to process (both inclusive)
mun_code = 330455
start_year, end_year = 2007, 2017

In [None]:
# For every yearly RAIS file inside the [start_year, end_year] window, keep only
# the rows whose `municipio` equals `mun_code` and write them to
# out_dir / "RAIS_<year>_rj.dta".
for f in dta_files:
    # Pull the four-digit year out of the file name; files whose name does not
    # contain the RAIS_<year>.dta pattern are skipped.
    year_match = re.search(r"RAIS_(\d{4})\.dta", f.name)
    if not year_match:
        continue

    year = int(year_match.group(1))

    # Skip years outside the inclusive [start_year, end_year] range.
    if not (start_year <= year <= end_year):
        continue

    print(f"Processing year {year}...")
    # convert_categoricals=False: value labels are not turned into pandas
    # Categoricals, so the stored codes are kept as they are.
    df_year = pd.read_stata(f, convert_categoricals=False)

    # Fail with a message that names the offending file instead of a bare
    # KeyError: 'municipio'.
    if "municipio" not in df_year.columns:
        raise KeyError(f"Column 'municipio' not found in {f}")

    # Boolean-mask selection returns a new frame, independent of df_year.
    df_year_sample = df_year[df_year["municipio"] == mun_code]

    # Drop the full-year frame before the next file is read; otherwise the
    # previous year stays referenced while the next one is loaded and two full
    # years are held in memory at once.
    del df_year

    out_path = out_dir / f"RAIS_{year}_rj.dta"
    # version=None lets pandas choose dta format 118, switching to 119 only when
    # the number of columns requires it. Forcing 119 on a small dataset is
    # discouraged by the pandas documentation and such files cannot be opened
    # by Stata SE.
    df_year_sample.to_stata(out_path, write_index=False, version=None)
    print(f"Saved {len(df_year_sample):,} records to {out_path}")
