In [None]:
from folktables import ACSDataSource, ACSEmployment, ACSIncome
import folktables

In [None]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
# Keep a handle on folktables' `state_list`. It is not referenced again in the
# visible cells; the experiments iterate over `big_states` instead.
states = folktables.state_list

In [None]:
import sys
# Append the `code/` directory (resolved relative to the current working
# directory) to the module search path. Presumably this is where the `train`
# and `evaluate` modules imported in later cells live -- TODO confirm.
sys.path.append('code/')

In [None]:
# Load IPython's autoreload extension and select mode 2, so edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import train

In [None]:
# Two-letter codes of the states the experiment cells below iterate over.
big_states = [
    'CA', 'FL', 'GA',
    'IL', 'NY', 'NC',
    'OH', 'PA', 'TX',
]

# Feature codes handed to folktables.generate_categories in a later cell.
all_categorical_features = [
    'COW',
    'MAR',
    'OCCP',
    'SEX',
]

In [None]:
# ACS data sources for the two survey years compared in this notebook; both
# use the 1-Year horizon and the person survey.
data_source2019 = ACSDataSource(
    survey_year='2019',
    horizon='1-Year',
    survey='person',
)
data_source2021 = ACSDataSource(
    survey_year='2021',
    horizon='1-Year',
    survey='person',
)

def get_data(state, random_state1=30291243, random_state2=5027):
    """Load one state's 2019 and 2021 data and split each year three ways.

    Each year's frame is first split 70/30 into a remainder and a test part
    (seeded by ``random_state1``); the remainder is then split 50/50 into
    train and validation parts (seeded by ``random_state2``). The shape of
    each year's full frame is printed before it is split.

    Args:
        state: State code passed to both data sources as ``states=[state]``.
        random_state1: ``random_state`` for the remainder/test split.
        random_state2: ``random_state`` for the train/validation split.

    Returns:
        dict: ``{"train": {...}, "test": {...}, "validation": {...}}`` where
        each inner dict maps the year (2019, 2021) to that year's part.
    """
    # Both years are loaded up front (with download=False) before any split.
    frames = {
        2019: data_source2019.get_data(states=[state], download=False),
        2021: data_source2021.get_data(states=[state], download=False),
    }

    splits = {"train": {}, "test": {}, "validation": {}}
    for survey_year, frame in frames.items():
        print(frame.shape)
        remainder, held_out = train_test_split(
            frame, test_size=0.3, random_state=random_state1
        )
        fit_part, validation_part = train_test_split(
            remainder, test_size=0.5, random_state=random_state2
        )
        splits["test"][survey_year] = held_out
        splits["train"][survey_year] = fit_part
        splits["validation"][survey_year] = validation_part
    return splits

In [None]:
# Fetch the definitions table from the 2019 data source (download=True), then
# build `categories` from it for the features in `all_categorical_features`.
definition_df = data_source2019.get_definitions(
    download=True,
)
categories = folktables.generate_categories(
    features=all_categorical_features,
    definition_df=definition_df,
)

In [None]:
# --- Experiment configuration -------------------------------------------------
# Defined once: none of these values change between repetitions or states.
N_REPETITIONS = 10

# Base seeds for the two splits performed by get_data. The repetition index is
# added to each, so every repetition draws a different split while a full
# re-run of the notebook reproduces exactly the same splits. (Previously both
# seeds were None, which made every run unrepeatable.)
SPLIT_SEED_BASE = 30291243
VALIDATION_SEED_BASE = 5027

# Commented-out alternatives in the original cell: AGEP, COW, MAR, OCCP, WKHP,
# SEX, JWMNP (travel time).
stable_features = ['SCHL']
unstable_features = [
    'HINS4',  # medicaid
    'JWMNP',
]
unstable_feature_targets = [('SCHL', LinearRegression)]

metric = "accuracy"

# all_repetitions[repetition][state] holds the object returned by train.train.
all_repetitions = {}
for repetition in range(N_REPETITIONS):
    print(f"REPEAT {repetition}")

    all_states = {}
    for state in big_states:
        print("-------------- STATE")
        print(state)
        # NOTE: by_year_data_split stays a notebook global; later cells read
        # the value left by the final iteration of this loop.
        by_year_data_split = get_data(
            state,
            random_state1=SPLIT_SEED_BASE + repetition,
            random_state2=VALIDATION_SEED_BASE + repetition,
        )

        # train.train is opaque from here, so it receives fresh copies of the
        # shared configuration lists (the original rebuilt them every iteration).
        res = train.train(
            by_year_data_split,
            categories,
            2019,
            list(stable_features),
            list(unstable_features),
            list(unstable_feature_targets),
        )
        all_states[state] = res

        # Progress read-out: validation accuracy for both years, per method.
        for method in ['all', 'aux', 'without']:
            r2019 = res[2019]['validation'][method][metric]
            r2021 = res[2021]['validation'][method][metric]
            print(f"2019 {r2019:.3f} 2021 {r2021:.3f} for {method}")
    all_repetitions[repetition] = all_states

In [None]:
# numpy is imported here because this cell runs before the later cell that
# imports it; without this a fresh-kernel "Run All" fails with NameError.
import numpy as np

metric = "accuracy"

# across_repetitions[state][year][split][method] ->
#     {metric: mean over repetitions, metric + "_std": std over repetitions}
# np.std is called with its default ddof (0).
across_repetitions = {}

for state in big_states:
    across_repetitions[state] = {}
    for year in [2019, 2021]:
        across_repetitions[state][year] = {}
        for split in ["train", "validation", "test"]:
            across_repetitions[state][year][split] = {}
            print(split)
            for method in ['all', 'aux', 'without']:
                # One value per repetition, in all_repetitions' insertion order.
                results = [
                    per_state[state][year][split][method][metric]
                    for per_state in all_repetitions.values()
                ]
                print(results)
                across_repetitions[state][year][split][method] = {
                    metric: np.mean(results),
                    metric + "_std": np.std(results),
                }
across_repetitions

In [None]:
def report_accuracy(all_states, big_states, years=(2019, 2021), split='validation'):
    """Print per-state accuracy (with std) and summary statistics across states.

    Three sections are printed, in order:

    1. For every state and method, one line with the 2019 and 2021 accuracy
       and std. This section always reports 2019 and 2021, regardless of
       ``years``.
    2. For every state, one LaTeX table row (terminated by ``\\\\``) holding
       ``accuracy \\plusminus std`` for each year in ``years`` and each method.
       ``\\plusminus`` is presumably a macro defined in the target document.
    3. For every year in ``years`` and each method, the mean, minimum and
       25/50/75% quantiles of the accuracy across states.

    Args:
        all_states: Nested mapping
            ``all_states[state][year][split][method]`` ->
            ``{"accuracy": float, "accuracy_std": float}``.
        big_states: Iterable of state keys to report, in output order.
        years: Years used for sections 2 and 3. Any year present in
            ``all_states`` is accepted.
        split: Which split's results to read.

    Returns:
        None. Everything is written to stdout.
    """
    metric = 'accuracy'
    std_key = metric + '_std'
    methods = ['all', 'aux', 'without']

    for state in big_states:
        print(state)
        for method in methods:
            r2019 = all_states[state][2019][split][method][metric]
            r2019_l = all_states[state][2019][split][method][std_key]
            r2021 = all_states[state][2021][split][method][metric]
            r2021_l = all_states[state][2021][split][method][std_key]

            print(f"2019 {r2019:.3f} +/- {r2019_l:.4f}, 2021 {r2021:.3f}  +/-  {r2021_l:.4f}  for {method}")

    # Per-year, per-method accuracies across states. Built from `years` so the
    # function works for any requested year, not only 2019 and 2021.
    m = {year: {method: [] for method in methods} for year in years}

    for state in big_states:
        results = f"{state} "
        for year in years:
            for method in methods:
                entry = all_states[state][year][split][method]
                r = entry[metric]
                r_std = entry[std_key]
                # The backslash is escaped explicitly: "\p" is not a valid
                # escape sequence and triggers a warning on recent Pythons.
                results = results + f" & {round(r,3)} \\plusminus {round(r_std,4)}"

                m[year][method].append(r)

        print(f"{results} \\\\")

    for year in years:
        print(year)
        for method in methods:
            print(method)
            values = m[year][method]
            print(
             f"mean {round(np.mean(values),3)}  min {np.min(values):.3f} q25 {np.quantile(values, 0.25):.3f} q50 {np.quantile(values, 0.5):.3f} q75 {np.quantile(values, 0.75):.3f}")

In [None]:
import evaluate
# Hand the aggregated results to the project's reporting helper:
# years 2019 and 2021, test split.
evaluate.report_accuracy_with_std_dev(across_repetitions, big_states, years=[2019, 2021], split='test')

In [None]:
# Same report restricted to 2021, test split.
evaluate.report_accuracy_with_std_dev(across_repetitions, big_states, years=[2021], split='test')

In [None]:
# Same report restricted to 2021, validation split.
evaluate.report_accuracy_with_std_dev(across_repetitions, big_states, years=[2021], split='validation')

In [None]:
# Same report restricted to 2019, test split.
evaluate.report_accuracy_with_std_dev(across_repetitions, big_states, years=[2019], split='test')

In [None]:
# Same report restricted to 2019, validation split.
evaluate.report_accuracy_with_std_dev(across_repetitions, big_states, years=[2019], split='validation')

# Explore single features

In [None]:
import numpy as np

# NOTE(review): `by_year_data_split` is not built in this cell. It is whatever
# the repetition loop in an earlier cell assigned last (its final state and
# final repetition), so the numbers below describe that one split only.

# single_feature_eval[feature][train_year] -> list of validation accuracies,
# one per evaluation year in [2019, 2021] (in that order).
single_feature_eval = {}
features=[
        'AGEP',
        'COW',
        'SCHL',
        'MAR',
        'OCCP',
        'POBP',
#         'RELP',
        'WKHP',
        'SEX',
        'RAC1P',
        'JWMNP',  # travel time
        'HINS4',
]
for f in features:
    print(f)
    single_feature_eval[f] = {}
    # Income > 50000 prediction problem restricted to the single feature `f`.
    ACSIncomeOneFeature = folktables.BasicProblem(
        features=[
            f
        ],
        target='PINCP',
        target_transform=lambda x: x > 50000,
        preprocess=folktables.acs.adult_filter,
        # `nan` must be passed by keyword: the second positional parameter of
        # np.nan_to_num is `copy`, so the former `np.nan_to_num(x, -1)` replaced
        # NaNs with 0.0 instead of the intended -1.
        # NOTE(review): if code/train.py builds its problems with the positional
        # form, update it as well so both pipelines encode missing values alike.
        postprocess=lambda x: np.nan_to_num(x, nan=-1),
    )
    for year in [2019]:
        x, y, _ = ACSIncomeOneFeature.df_to_pandas(by_year_data_split["train"][year], categories=categories, dummies=True)
        feature_names = x.columns
        model = LogisticRegression()
        model.fit(x, y.values)
        evaluations = []
        for other_year in [2019, 2021]:
            eval_x, eval_y, _ = ACSIncomeOneFeature.df_to_pandas(by_year_data_split["validation"][other_year], categories=categories, dummies=True)
            # Align the evaluation frame with the training columns: columns
            # missing from eval_x are added as 0, extra columns are dropped,
            # and the training column order is used.
            eval_x = eval_x.reindex(columns=x.columns, fill_value=0)
            evaluations.append(model.score(eval_x, eval_y.values))
            # Fraction of positive labels in this evaluation set.
            print(np.mean(eval_y.values))

        single_feature_eval[f][year] = evaluations

In [None]:
# Income > 50000 prediction problem over three features at once (SCHL, JWMNP,
# HINS4). The variable keeps the name `ACSIncomeOneFeature` used by the
# previous cell even though this problem has three features.
ACSIncomeOneFeature = folktables.BasicProblem(
        features=[
            'SCHL',
            'JWMNP',  # travel time
            'HINS4',
        ],
        target='PINCP',
        target_transform=lambda x: x > 50000,
        preprocess=folktables.acs.adult_filter,
        # `nan` must be passed by keyword: the second positional parameter of
        # np.nan_to_num is `copy`, so `np.nan_to_num(x, -1)` replaced NaNs with
        # 0.0 instead of the intended -1.
        postprocess=lambda x: np.nan_to_num(x, nan=-1),
    )


# The year is spelled out as 2019. The cell previously indexed with `year`, a
# loop variable leaked from an earlier cell (which left it at 2019), so it
# only worked after that cell had been run.
x, y, _ = ACSIncomeOneFeature.df_to_pandas(by_year_data_split["train"][2019], categories=categories, dummies=True)

In [None]:
# For every feature, print its stored results, then the gap between the first
# and second accuracy recorded for the model trained on 2019.
for feature_name, per_train_year in single_feature_eval.items():
    print(feature_name, per_train_year)
    scores_2019_model = per_train_year[2019]
    accuracy_gap = scores_2019_model[0] - scores_2019_model[1]
    print(feature_name, accuracy_gap)