# libraries

In [None]:
# Mount Google Drive at '/content/drive' so the notebook can read files
# stored there. NOTE: `google.colab` is presumably only importable inside a
# Colab runtime — confirm before running this cell elsewhere.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
#import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Load the dataset from the CSV file on the mounted drive into a DataFrame.
data = pd.read_csv('/content/drive/My Drive/data.csv')

In [None]:
# Preview the first rows of the loaded dataset (shown as the cell output).
data.head()

In [None]:
def evaluate_models(datasets, target_feature='INCOME',
                    weight_feature='SAMPLE WEIGHT'):
    """Fit three regressors on each dataset and tabulate their test-set MAE.

    Every dataset is split 80/20 with a fixed seed (42). A linear regression,
    a decision tree and a random forest are then fitted on the training part
    using the per-row sample weights, and scored on the held-out part with a
    sample-weighted mean absolute error.

    The weight column is used only as ``sample_weight``; it is excluded from
    the feature matrix so the models cannot use it as a predictor.

    Args:
        datasets: Mapping of dataset name -> DataFrame. Each DataFrame must
            contain ``target_feature`` and ``weight_feature``; every other
            column is used as a model input.
        target_feature: Name of the column to predict.
        weight_feature: Name of the column holding per-row sample weights.

    Returns:
        DataFrame with one row per dataset name and one column per model
        name, holding the weighted MAE on the test split.

    Raises:
        KeyError: If a dataset lacks the target or the weight column.
    """
    model_results = {}

    for dataset_name, dataset in datasets.items():
        missing = [col for col in (target_feature, weight_feature)
                   if col not in dataset.columns]
        if missing:
            raise KeyError(
                f"dataset {dataset_name!r} is missing required column(s): "
                f"{missing}"
            )

        # The weight column is passed to fit()/the metric as sample_weight,
        # so it must not also appear among the features.
        X = dataset.drop(columns=[target_feature, weight_feature])
        y = dataset[target_feature]
        sample_weights = dataset[weight_feature]

        # Splitting the three objects in one call keeps rows aligned.
        X_train, X_test, y_train, y_test, sw_train, sw_test = train_test_split(
            X, y, sample_weights, test_size=0.2, random_state=42
        )

        # Fresh estimators per dataset so no fitted state carries over.
        models = {
            'Linear Regression': LinearRegression(),
            'Decision Tree': DecisionTreeRegressor(random_state=42),
            'Random Forest': RandomForestRegressor(random_state=42),
        }

        dataset_results = {}
        for model_name, model in models.items():
            model.fit(X_train, y_train, sample_weight=sw_train)
            y_pred = model.predict(X_test)
            # Weight the error the same way the training loss was weighted.
            dataset_results[model_name] = mean_absolute_error(
                y_test, y_pred, sample_weight=sw_test
            )

        model_results[dataset_name] = dataset_results

    # Outer keys become columns by default; transpose so each dataset is a row.
    return pd.DataFrame(model_results).transpose()

In [None]:
# Columns removed from the baseline dataset; each variant below puts back
# only the column(s) it is named after.
drop = [
    "RACE (OTHER)",
    "RACE",
    "ETHNICITY (OTHER)",
    "ETHNICITY",
    "ETHNICITY (CENSUS)",
    "HISPANIC",
    "ID",
]


def _without_droppable(*kept):
    """Return `data` minus every column in `drop` that is not listed in `kept`."""
    return data.drop([name for name in drop if name not in kept], axis=1)


none = _without_droppable()
race_other = _without_droppable("RACE (OTHER)")
race = _without_droppable("RACE")
ethnicity_other = _without_droppable("ETHNICITY (OTHER)")
ethnicity_census = _without_droppable("ETHNICITY (CENSUS)", "HISPANIC")
ethnicity = _without_droppable("ETHNICITY", "HISPANIC")

# Label -> dataset variant; the labels become the row index of the results.
datasets = dict([
    ("no ethnoracial data", none),
    ("just race (other)", race_other),
    ("just race", race),
    ("just ethnicity (other)", ethnicity_other),
    ("just ethnicity (census) and hispanic", ethnicity_census),
    ("just ethnicity and hispanic", ethnicity),
])

results = evaluate_models(datasets)