In [1]:
from dataset_utils import map_variable

def validate_variable(df_var, values=None, min=None, max=None):
    """Check every distinct non-missing value of ``df_var`` against constraints.

    Args:
        df_var: Object exposing ``dropna().unique()`` (e.g. a pandas Series).
        values: Optional container of allowed values (membership test).
        min: Optional inclusive lower bound.
        max: Optional inclusive upper bound.

    Returns:
        True when every distinct value satisfies all supplied constraints.

    Raises:
        ValueError: If there are no non-missing values at all, or on the first
            distinct value that breaks a constraint. For each value the checks
            run in the order allowed-set, minimum, maximum.
    """
    observed = df_var.dropna().unique()
    if not len(observed):
        raise ValueError("No non-missing values to validate in variable!")

    for val in observed:
        if values is not None and val not in values:
            raise ValueError(f"Value {val} not in allowed set of {values}")
        if min is not None and val < min:
            raise ValueError(f"Value {val} less than minimum of {min}")
        if max is not None and val > max:
            raise ValueError(f"Value {val} greater than maximum of {max}")

    return True


In [2]:
import json
# JSON is UTF-8 by specification; pass the encoding explicitly so the read does
# not depend on the platform's locale default.
with open("raw/dataset_builds.json", encoding="utf-8") as f:
    configs = json.load(f)

# Per-variable build definitions for the 2024 dataset.
year_2024 = configs["2024"]

year_2024


{'weight': {'source': 'V240107b', 'type': 'float', 'valid': {'min': 0}},
 'age': {'source': 'V241458x', 'nulls': [-2], 'valid': {'min': 0, 'max': 120}},
 'female': {'source': 'V241550',
  'nulls': [3, 0, -9],
  'mapping': {'1': 0, '2': 1},
  'valid': [0, 1]},
 'race': {'source': 'V241501x',
  'nulls': [-4, -8, -9],
  'mapping': {'1': 'White',
   '2': 'Black',
   '3': 'Hispanic',
   '4': 'Other',
   '5': 'Other',
   '6': 'Other'},
  'type': 'str',
  'valid': ['White', 'Black', 'Hispanic', 'Other']},
 'strong_republican': {'source': 'V241227x',
  'mapping': {'1': 0, '2': 0, '3': 0, '4': 0, '5': 0, '6': 0, '7': 1},
  'valid': [0, 1]},
 'republican': {'source': 'V241227x',
  'mapping': {'1': 0, '2': 0, '3': 0, '4': 0, '5': 0, '6': 1, '7': 1},
  'valid': [0, 1]},
 'lean_republican': {'source': 'V241227x',
  'mapping': {'1': 0, '2': 0, '3': 0, '4': 0, '5': 1, '6': 1, '7': 1},
  'valid': [0, 1]},
 'strong_democrat': {'source': 'V241227x',
  'mapping': {'1': 1, '2': 0, '3': 0, '4': 0, '5': 0, 

In [3]:
import pandas as pd
# Maps the "type" string used in a config entry to the Python type that is
# handed to map_variable as ``type_check``.
_TYPE_CHECKS = {"int": int, "float": float, "str": str}

_NONWHITE_RACES = ("Black", "Hispanic", "Other")

_RESENTMENT_ITEMS = (
    "special_favors_plus",
    "tried_harder_plus",
    "past_slavery_minus",
    "less_deserve_minus",
)


def _coerce_mapping_key(key):
    """Return ``int(key)`` when the key parses as an integer, else the key unchanged."""
    try:
        return int(key)
    except ValueError:
        return key


def _therm_delta(row):
    """Difference between the ``therm_whites`` and ``therm_blacks`` columns."""
    return row["therm_whites"] - row["therm_blacks"]


def _age_group(row):
    """Bucket ``age`` into 18-29 / 30-44 / 45-64 / 65+; NA when missing or under 18."""
    if pd.isna(row["age"]):
        return pd.NA
    age = int(row["age"])
    if 18 <= age <= 29:
        return "18-29"
    if 30 <= age <= 44:
        return "30-44"
    if 45 <= age <= 64:
        return "45-64"
    if age >= 65:
        return "65+"
    return pd.NA


def _resentment(row):
    """Sum of the two ``*_plus`` items minus the two ``*_minus`` items; NA if any is missing."""
    if any(pd.isna(row[item]) for item in _RESENTMENT_ITEMS):
        return pd.NA
    return (
        row["special_favors_plus"]
        + row["tried_harder_plus"]
        - row["past_slavery_minus"]
        - row["less_deserve_minus"]
    )


def _race_edu_block(row):
    """Label a row NonWhite / WhiteCollege / WhiteNonCollege from ``race`` and ``college``."""
    if pd.isna(row["college"]) or pd.isna(row["race"]):
        return pd.NA
    if row["race"] in _NONWHITE_RACES:
        return "NonWhite"
    if row["race"] == "White":
        if row["college"] == 1:
            return "WhiteCollege"
        if row["college"] == 0:
            return "WhiteNonCollege"
    return pd.NA


def _race_party_block(row):
    """Label a row WhiteRep / WhiteDem / WhiteInd / NonWhite from race and party flags."""
    if pd.isna(row["race"]) or pd.isna(row["republican"]) or pd.isna(row["democrat"]):
        return pd.NA
    if row["race"] == "White":
        # Republican is tested first, matching the original precedence.
        if row["republican"] == 1:
            return "WhiteRep"
        if row["democrat"] == 1:
            return "WhiteDem"
        return "WhiteInd"
    if row["race"] in _NONWHITE_RACES:
        return "NonWhite"
    return pd.NA


def _vote_prev_vote(row):
    """Combine ``prev_rep_pres`` and ``vote_rep_pres`` into a '<prev>-<current>' label."""
    if pd.isna(row["prev_rep_pres"]) or pd.isna(row["vote_rep_pres"]):
        return pd.NA
    labels = {1: "Rep", 0: "Dem"}
    for prev_flag, prev_label in labels.items():
        for vote_flag, vote_label in labels.items():
            if row["prev_rep_pres"] == prev_flag and row["vote_rep_pres"] == vote_flag:
                return f"{prev_label}-{vote_label}"
    return pd.NA


def build_dataset(filepath, year, config):
    """Build one year's analysis dataset from a raw file and a variable config.

    Each ``config`` entry maps an output column name to a dict with:

    * ``source`` (required): column of the raw file to read.
    * ``type`` (optional, default ``"int"``): one of ``"int"``, ``"float"``,
      ``"str"``; passed to ``map_variable`` as ``type_check``.
    * ``mapping`` (optional): raw-value -> output-value dict. Keys that parse
      as integers are converted to ``int`` (JSON object keys are strings).
    * ``valid`` (optional): a list of allowed values, or a dict of keyword
      arguments for ``validate_variable`` (e.g. ``{"min": 0}``).
    * Any remaining keys are forwarded unchanged to ``map_variable``.

    After the configured columns are built, the derived columns
    ``therm_delta``, ``age_group``, ``resentment``, ``race_edu_block``,
    ``race_party_block`` and ``vote_prev_vote`` are added; these require the
    config to define the columns they read (``therm_whites``, ``therm_blacks``,
    ``age``, the four resentment items, ``college``, ``race``, ``republican``,
    ``democrat``, ``prev_rep_pres``, ``vote_rep_pres``) as well as ``weight``.

    Args:
        filepath: Path (``str`` or path-like) to a ``.dta`` or ``.csv`` file.
        year: Value stored in the ``year`` column of every row.
        config: Mapping of output column name -> variable definition. It is
            not modified, so the same config can be reused across calls.

    Returns:
        DataFrame with the configured and derived columns, restricted to rows
        whose ``weight`` is not missing.

    Raises:
        ValueError: Unsupported file extension or ``type``; also raised by
            ``validate_variable`` when a column fails validation.
        KeyError: A config entry lacks ``source``, or a column needed by a
            derived variable is absent.
    """
    path = str(filepath)
    if path.endswith(".dta"):
        raw = pd.read_stata(filepath)
    elif path.endswith(".csv"):
        raw = pd.read_csv(filepath, low_memory=False)
    else:
        raise ValueError("Unsupported file format")

    # Start from an empty frame that shares the raw index.
    df = raw[[]].copy()
    df["year"] = year
    for var, spec in config.items():
        print(var)
        print(spec)
        # Work on a shallow copy: popping keys from the caller's dict would
        # strip "source"/"type"/"valid" from the config and make a second call
        # with the same config fail with KeyError.
        kwargs = dict(spec)
        source = kwargs.pop("source")
        type_name = kwargs.pop("type", "int")
        type_check = _TYPE_CHECKS.get(type_name) if isinstance(type_name, str) else None
        if type_check is None:
            raise ValueError(f"Unsupported type {type_name} for variable {var}")
        kwargs["type_check"] = type_check
        if "mapping" in kwargs:
            kwargs["mapping"] = {
                _coerce_mapping_key(k): v for k, v in kwargs["mapping"].items()
            }
        valid = kwargs.pop("valid", None)
        df[var] = map_variable(raw[source], **kwargs)
        if valid is not None:
            if isinstance(valid, list):
                validate_variable(df[var], values=valid)
            else:
                validate_variable(df[var], **valid)

    # Derived variables, computed row by row from the columns built above.
    df["therm_delta"] = df.apply(_therm_delta, axis=1)
    df["age_group"] = df.apply(_age_group, axis=1)
    df["resentment"] = df.apply(_resentment, axis=1)
    validate_variable(df["resentment"], min=-8, max=8)
    df["race_edu_block"] = df.apply(_race_edu_block, axis=1)
    df["race_party_block"] = df.apply(_race_party_block, axis=1)
    df["vote_prev_vote"] = df.apply(_vote_prev_vote, axis=1)

    # Rows without a weight are dropped from the returned frame.
    df = df[~df["weight"].isna()]
    return df

# Build the 2024 dataset from the raw CSV using the 2024 variable config.
df = build_dataset("raw/anes_timeseries_2024_csv_20250808.csv", 2024, year_2024)

weight
{'source': 'V240107b', 'type': 'float', 'valid': {'min': 0}}
age
{'source': 'V241458x', 'nulls': [-2], 'valid': {'min': 0, 'max': 120}}
female
{'source': 'V241550', 'nulls': [3, 0, -9], 'mapping': {'1': 0, '2': 1}, 'valid': [0, 1]}
race
{'source': 'V241501x', 'nulls': [-4, -8, -9], 'mapping': {'1': 'White', '2': 'Black', '3': 'Hispanic', '4': 'Other', '5': 'Other', '6': 'Other'}, 'type': 'str', 'valid': ['White', 'Black', 'Hispanic', 'Other']}
strong_republican
{'source': 'V241227x', 'mapping': {'1': 0, '2': 0, '3': 0, '4': 0, '5': 0, '6': 0, '7': 1}, 'valid': [0, 1]}
republican
{'source': 'V241227x', 'mapping': {'1': 0, '2': 0, '3': 0, '4': 0, '5': 0, '6': 1, '7': 1}, 'valid': [0, 1]}
lean_republican
{'source': 'V241227x', 'mapping': {'1': 0, '2': 0, '3': 0, '4': 0, '5': 1, '6': 1, '7': 1}, 'valid': [0, 1]}
strong_democrat
{'source': 'V241227x', 'mapping': {'1': 1, '2': 0, '3': 0, '4': 0, '5': 0, '6': 0, '7': 0}, 'valid': [0, 1]}
democrat
{'source': 'V241227x', 'mapping': {'1':

In [4]:
# Display the assembled dataset as the cell's output.
df

Unnamed: 0,year,weight,age,female,race,strong_republican,republican,lean_republican,strong_democrat,democrat,...,special_favors_plus,past_slavery_minus,less_deserve_minus,tried_harder_plus,therm_delta,age_group,resentment,race_edu_block,race_party_block,vote_prev_vote
0,2024,0.710865,50,0,Hispanic,1,1,1,0,0,...,1,5,5,1,10,45-64,-8,NonWhite,NonWhite,Dem-Dem
1,2024,2.383786,41,1,Other,0,0,0,0,0,...,2,4,5,3,0,30-44,-4,NonWhite,NonWhite,
2,2024,0.808992,44,1,White,0,0,0,1,1,...,5,1,1,5,-50,30-44,8,WhiteNonCollege,WhiteDem,Rep-Rep
3,2024,0.29174,45,0,Other,0,1,1,0,0,...,1,3,3,3,0,45-64,-2,NonWhite,NonWhite,
4,2024,0.222012,80,0,White,0,0,1,0,0,...,3,4,5,2,0,65+,-4,WhiteNonCollege,WhiteInd,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5514,2024,0.554677,42,1,White,0,0,1,0,0,...,1,5,5,2,0,30-44,-7,WhiteNonCollege,WhiteInd,
5515,2024,0.075758,80,1,Hispanic,0,0,0,0,0,...,3,1,3,3,0,65+,2,NonWhite,NonWhite,Rep-Rep
5517,2024,1.797863,69,1,White,0,0,0,0,0,...,2,4,4,3,0,65+,-3,WhiteCollege,WhiteInd,Rep-Rep
5519,2024,2.255936,28,0,Black,0,0,0,1,1,...,5,1,1,5,-60,18-29,8,NonWhite,NonWhite,Rep-Rep
