# Imports

In [1]:
import pandas as pd
pd.set_option('display.max_columns', 65)
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_validate, RepeatedKFold, cross_val_score
from sklearn.metrics import accuracy_score, log_loss
from joblib import dump, load
import warnings
warnings.filterwarnings('ignore')
%load_ext autotime

# Read in data and prepare target variable

In [2]:
df = pd.read_csv("usa_small.csv")
df = df.reindex(columns=(['MARST'] + list([a for a in df.columns if a != 'MARST']) ))

time: 21.2 s


In [3]:
df = df[df['MARST'].isin([1,3,4])]

df['MARST'] = df['MARST'].replace(1,0)
df['MARST'] = df['MARST'].replace([3,4],1)

time: 2.02 s


In [4]:
counts = df["MARST"].value_counts()
multiplier = counts[0]/counts[1]

time: 37.1 ms


# Filter out entries with revealing information

In [5]:
df = df[(df["FERTYR"]!=8)]
df = df[(df["NCOUPLES"]!=0)]
df = df[(df["FAMSIZE"]!=1)]
df = df[~df["MARRNO_SP"].isna()]
df = df[df["MARRNO_SP"] != 0]
df = df[df["EMPSTAT"] != 0]

time: 4.2 s


# Resample to keep same distribution

In [6]:
num_married_sample = int(df["MARST"].value_counts()[1]*multiplier)

married = df[df["MARST"]==0].sample(n=num_married_sample, random_state=13)

df = df[df["MARST"]==1].append(married)

time: 884 ms


# Create training data

In [None]:
pd.DataFrame(columns=["AGE", "AGE_SP", "MARRNO", "MARRNO_SP", "EMPSTAT", "EMPSTAT_SP", "INCTOT", "INCTOT_SP", "YEARS_MARRIED", "NCHILD", 
                      "FERTYR", "FAMSIZE", "RACE", "STATEFIP"]).to_csv("model_input.csv", index=False)

In [56]:
df["YEARS_MARRIED"] = df["YEAR"] - df["YRMARR"]
df["Age_Difference"] = abs(df["AGE"] - df["AGE_SP"])
df["Income_Difference"] = abs(df["INCTOT"] - df["INCTOT_SP"])
X = df[["AGE", "AGE_SP", "MARRNO", "MARRNO_SP", "EMPSTAT", "EMPSTAT_SP", "INCTOT", "INCTOT_SP", "YEARS_MARRIED", "NCHILD", "FERTYR", "FAMSIZE", "RACE", "STATEFIP", 
        "Age_Difference", "Income_Difference"]]

y = df[["MARST"]]

time: 299 ms


# Train and evaluate model

In [57]:
model = GradientBoostingClassifier(learning_rate=.1, n_estimators=200, max_depth=7, subsample=1.0, min_samples_split=10, random_state=33)
# model = RandomForestClassifier(n_estimators=50, random_state=33, max_depth=3, min_samples_split=2, max_features="sqrt")
# model = LogisticRegression()
# model = DecisionTreeClassifier()
# model = KNeighborsClassifier(n_neighbors=100, n_jobs=-1)
cv_results = cross_validate(model, X, y, cv=RepeatedKFold(n_splits=3, n_repeats=1, random_state=7), n_jobs=-1, return_train_score=True, 
                            scoring=['accuracy', 'neg_log_loss'])

time: 3min 10s


In [58]:
# model = GradientBoostingClassifier(learning_rate=.1, n_estimators=200, max_depth=7, subsample=1.0, min_samples_split=10, random_state=33)
# with RACE, STATEFIP, Age_Difference, and Income_Difference
print("Accuracy: {}".format(cv_results['test_accuracy'].mean()))
print("Log Loss: {}".format(-cv_results['test_neg_log_loss'].mean()))

Accuracy: 0.8318993975406453
Log Loss: 0.38256152960943157
time: 11.2 ms


In [51]:
# model = GradientBoostingClassifier(learning_rate=.1, n_estimators=200, max_depth=7, subsample=1.0, min_samples_split=10, random_state=33)
# with RACE, STATEFIP, Age_Difference, and Income_Difference
print("Accuracy: {}".format(cv_results['test_accuracy'].mean()))
print("Log Loss: {}".format(-cv_results['test_neg_log_loss'].mean()))

Accuracy: 0.832167615746472
Log Loss: 0.3837319852908896
time: 6.37 ms


In [185]:
# model = GradientBoostingClassifier(learning_rate=.1, n_estimators=200, max_depth=7, subsample=1.0, min_samples_split=10, random_state=33)
print("Accuracy: {}".format(cv_results['test_accuracy'].mean()))
print("Log Loss: {}".format(-cv_results['test_neg_log_loss'].mean()))

Accuracy: 0.8295266980275646
Log Loss: 0.3943773794122348
time: 3.41 ms


In [91]:
### Naive Model
print("Naive Accuracy: {}".format(1-y.mean()))
print("Naive Log Loss: {}".format(log_loss(y, [y.mean()]*len(y))))

Naive Accuracy: MARST    0.798046
dtype: float64
Naive Log Loss: 0.5030986820238625
time: 6.32 s
