In [None]:
import pandas as pd

In [None]:
# Data provenance (COMPAS recidivism risk scores):
#   http://www.fairness-measures.org/Pages/Datasets/Compas.html
#   https://www.propublica.org/datastore/dataset/compas-recidivism-risk-score-data-and-analysis
#   https://github.com/propublica/compas-analysis/blob/master/Compas%20Analysis.ipynb

# Read the raw score export from the user's Downloads folder, then show the
# available column names as the cell output.
data_raw = pd.read_csv(filepath_or_buffer="~/Downloads/compas-scores-raw.csv")
data_raw.columns

In [None]:
# Keep only the rows that belong to the "Risk of Recidivism" scale and that
# come from a "New" assessment; all other rows of the raw export are dropped.
is_recidivism_scale = data_raw["DisplayText"].eq("Risk of Recidivism")
is_new_assessment = data_raw["AssessmentType"].eq("New")
data_rec = data_raw.loc[is_recidivism_scale & is_new_assessment]

In [None]:
# Columns carried forward from the filtered frame.
cols = [
    "Agency_Text",
    "Sex_Code_Text",
    "Ethnic_Code_Text",
    "DateOfBirth",
    "ScaleSet_ID",
    "Language",
    "LegalStatus",
    "CustodyStatus",
    "MaritalStatus",
]

# The text columns are deliberately NOT converted to pandas Categorical here
# (an earlier attempt at that was disabled); pd.get_dummies is applied to them
# in a later cell instead. Only DateOfBirth is converted, to datetime.
# NOTE(review): if the CSV stores birth dates with two-digit years, to_datetime
# may place older birth years in the future -- confirm against the raw file.
data = data_rec.loc[:, cols].assign(
    DateOfBirth=lambda frame: pd.to_datetime(frame["DateOfBirth"]),
)

In [None]:
# Display the selected columns (DateOfBirth has been parsed to datetime above).
data

In [None]:
# One-hot encode each categorical column. The prefix tags every resulting
# indicator column with the field it was derived from.
dummy_specs = [
    ("Agency_Text", "agency"),
    ("Sex_Code_Text", "sex"),
    ("Ethnic_Code_Text", "ethnic"),
    ("Language", "lang"),
    ("LegalStatus", "legal"),
    ("CustodyStatus", "custody"),
    ("MaritalStatus", "marital"),
]
agency, sex, ethnic, lang, legal, custody, marital = [
    pd.get_dummies(data[column], prefix=tag) for column, tag in dummy_specs
]

# Split the birth date into separate numeric day / month / year columns.
birth = data["DateOfBirth"].dt
dob = pd.DataFrame({
    "dob_day": birth.day,
    "dob_month": birth.month,
    "dob_year": birth.year,
})

# Feature matrix: all encoded pieces placed side by side (column-wise).
# Target: the RawScore column of the filtered frame.
encoded_parts = [agency, sex, ethnic, lang, legal, custody, marital, dob]
X = pd.concat(encoded_parts, axis=1)
y = data_rec["RawScore"]

# Model

In [None]:
import xgboost as xgb
import numpy as np

In [None]:
# Reproducible 80/20 random train/test split over row *positions*.
np.random.seed(1)
Is = np.random.permutation(X.shape[0])
Im = int(len(Is) * 0.8)
train_pos, test_pos = Is[:Im], Is[Im:]

# `Is` contains positions 0..n-1, not index labels. X and y still carry the
# index of the filtered frame, so rows must be selected with .iloc (label-based
# .loc would look up the wrong rows or raise KeyError). The targets are taken
# with the same positions so every feature row is paired with its own score.
dtrain = xgb.DMatrix(X.iloc[train_pos], label=y.iloc[train_pos])
dtest = xgb.DMatrix(X.iloc[test_pos], label=y.iloc[test_pos])

params = {
    "learning_rate": 0.25,
    "max_depth": 6,
    "objective": "reg:squarederror"
}

# Train for at most 200 rounds. Both sets are evaluated each round; xgboost
# uses the last entry of `evals` (the test set) as the early-stopping metric
# and stops after 10 rounds without improvement.
bst = xgb.train(
    params,
    dtrain,
    num_boost_round=200,
    evals=[(dtrain, "train"), (dtest, "test")],
    early_stopping_rounds=10,
)

# Predictions for every row of X (train and test rows alike), in X's row order.
# NOTE(review): depending on the xgboost version, Booster.predict may use all
# trained trees rather than only those up to the best iteration -- confirm.
yhat = bst.predict(xgb.DMatrix(X))

In [None]:
# Show the first 100 predictions produced by the trained booster.
yhat[:100]

In [None]:
# Show the first 10 target values (RawScore) as a NumPy array.
y[:10].to_numpy()

In [None]:
# Plot the feature importances of the trained booster.
xgb.plot_importance(bst)