In [1]:
import functools as fn
import itertools as itr
import warnings
from importlib import reload

import catboost as cb
import numba as nb
import numpy as np
import pandas as pd
import scipy.spatial.distance as dist
import scipy.stats as st
import sklearn.cluster as cls
import sklearn.metrics as met
import umap

import entropy as ent
import parametersearch as ps
import unsupervisedcv as ucv
import utility as utl

# Silence every warning category so library warnings do not clutter the demo output.
# NOTE(review): this blanket filter also hides deprecation and convergence
# warnings for the whole session — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

<IPython.core.display.Javascript object>

In [2]:
# Number of profile rows drawn at random for the demos below (kept small for speed).
SUBSAMPLE_SIZE = 1000

# Fixed seed so that values drawn from the shared generator repeat after
# Restart Kernel -> Run All. Change it (or pass None) to get a fresh stream.
SEED = 0

# Shared NumPy random generator used as the default source of random parameters.
randy = np.random.default_rng(SEED)

<IPython.core.display.Javascript object>

In [3]:
# load user profile data
# The path is relative, so it is resolved against the kernel's working directory.
profiles_path = '../source_data/okcupid_2015/user_data_public.csv'
# low_memory=False has pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns.
profiles = pd.read_csv(profiles_path, delimiter=',', low_memory=False)

<IPython.core.display.Javascript object>

In [4]:
# Keep only the question columns (names starting with 'q') and draw a random
# subsample of rows so the demos run quickly.
features = [name for name in profiles.columns if name.startswith('q')]
df = profiles[features].sample(SUBSAMPLE_SIZE)

# Replace each column's answers with integer codes, remembering the original
# answer labels per column in `codes`.
codes = {}
for col in features:
    int_codes, labels = df[col].factorize()
    df[col] = int_codes
    codes[col] = labels

# factorize() marks missing answers with -1; turn those back into np.nan
df = df.replace(to_replace=-1, value=np.nan)

<IPython.core.display.Javascript object>

In [5]:
# Build the row-to-row distance callable from the coded answers; it is passed to
# scipy's pdist as the metric in later cells.
# NOTE(review): the exact definition lives in entropy.hamming_ind_entropy_func —
# presumably an entropy-weighted Hamming distance fitted on df; confirm there.
hamming = ent.hamming_ind_entropy_func(df)

<IPython.core.display.Javascript object>

In [6]:
# demo agreement-based cross validation
# this compares agreement between estimates made on different combinations of folds,
# without lining-up outputs by folds that both estimates are based on


def top_n_columns(df, n):
    """Return the names of the n columns of a DataFrame with the highest entropy.

    Entropy comes from ``ent.entropy(df)``; the result is sorted in descending
    order and the first ``n`` index labels are returned as a list.
    """
    # presumably ent.entropy returns a Series indexed by column name — verify in entropy module
    ranked = ent.entropy(df).sort_values(ascending=False)
    column_names = ranked.index.tolist()
    return column_names[:n]


def top_n_score(a, b):
    """Agreement score for two top_n_columns results.

    Counts how many entries of ``a`` are also contained in ``b``. An entry that
    occurs more than once in ``a`` is counted each time it occurs.
    """
    shared = 0
    for name in a:
        if name in b:
            shared += 1
    return shared


# Estimator: the 10 highest-entropy columns (top_n_columns with n fixed to 10).
# Score: how many column names two such estimates have in common (top_n_score).
ucv.agreement_cross_val(top_n_score, fn.partial(top_n_columns, n=10), df)

array([9, 8, 9, 9, 8, 9, 9, 8, 9, 9])

<IPython.core.display.Javascript object>

In [7]:
# demo overlap-based cross validation
# this compares agreement between estimates made on different combinations of folds,
# giving the score_func the opportunity to line up outputs row-by-row corresponding
# to the same input rows within the folds shared by each pair of estimates


def ham_umap_embedding(X):
    """Embed the rows of X into one dimension with UMAP.

    Pairwise row distances are computed with the module-level ``hamming``
    callable, expanded into a square matrix, and handed to UMAP as precomputed
    distances. Returns the output of ``fit_transform`` for a 1-component UMAP.
    """
    pairwise = dist.pdist(X, hamming)
    square = dist.squareform(pairwise)
    reducer = umap.UMAP(metric='precomputed', n_components=1)
    return reducer.fit_transform(square)


def ham_umap_cluster(X, n_clusters=8):
    """Cluster the rows of X by running KMeans on their 1D UMAP embedding.

    The embedding comes from ham_umap_embedding(X); returns the cluster label
    assigned to each row by ``KMeans.fit_predict``.
    """
    kmeans = cls.KMeans(n_clusters=n_clusters)
    embedding = ham_umap_embedding(X)
    return kmeans.fit_predict(embedding)


# Estimator: 5-cluster KMeans labels on the 1D Hamming/UMAP embedding.
# Score: adjusted mutual information between two label assignments, which does
# not depend on how the clusters happen to be numbered.
ucv.overlap_cross_val(
    met.adjusted_mutual_info_score, fn.partial(
        ham_umap_cluster, n_clusters=5), df,
)

array([0.356088  , 0.31595105, 0.33475683, 0.36104202, 0.3330996 ,
       0.27246886, 0.34324092, 0.28542174, 0.31286132, 0.32109321])

<IPython.core.display.Javascript object>

In [8]:
# demo non-overlap-based cross validation
# this compares agreement between models trained on different combinations of folds
# by comparing their predictions on a fold that neither was trained on, and giving
# the score_func the opportunity to line up outputs row-by-row corresponding to the
# same input rows within each scoring fold


def catboost_predictor(X, y):
    """Fit a small CatBoost regressor on (X, y) and return its predict method.

    The model uses 100 iterations, depth-3 trees and RMSE loss, with CatBoost's
    logging silenced. The returned bound method can be called on new rows.
    """
    settings = dict(
        iterations=100, depth=3, loss_function='RMSE', logging_level='Silent'
    )
    regressor = cb.CatBoostRegressor(**settings)
    regressor.fit(X, y)
    return regressor.predict


# First argument: scores a pair of prediction vectors by their Pearson
# correlation ([0] keeps r and drops the p-value).
# Second argument: trains a CatBoost model to reproduce the 1D UMAP embedding of
# the rows it is given and returns that model's predict function.
ucv.nonoverlap_cross_val(
    lambda a, b: st.pearsonr(a, b)[0],
    lambda x: catboost_predictor(x, ham_umap_embedding(x)),
    df,
)

array([-0.52588344,  0.9038821 ,  0.92516013,  0.9161221 , -0.21561074])

<IPython.core.display.Javascript object>

In [9]:
# Precompute the square matrix of pairwise row distances (using the `hamming`
# callable as the metric) once, so later cells can reuse it.
distance_matrix = dist.squareform(dist.pdist(df, hamming))

<IPython.core.display.Javascript object>

In [10]:
# demo cross validation with distance matrix data
# sklearn cross validation doesn't properly split a distance matrix into folds
#
# Score: Pearson r between the pairwise Euclidean distances of the embedding (a)
# and the condensed form of the distances in b.
# NOTE(review): b is presumably the square block of distance_matrix for the same
# rows as a — confirm in unsupervisedcv.embedding_cross_val.
# Estimator: a 4-component UMAP fitted on precomputed distances.
ucv.embedding_cross_val(
    lambda a, b: st.pearsonr(dist.pdist(
        a, 'euclidean'), dist.squareform(b))[0],
    lambda x: umap.UMAP(metric='precomputed', n_components=4).fit_transform(x),
    distance_matrix,
    dist_matrix_data=True,
)

array([0.5053673 , 0.54217419, 0.53671507, 0.53725389, 0.52910807])

<IPython.core.display.Javascript object>

In [12]:
# demo random parameter search
def emb_umap(X, **kwargs):
    """Create a UMAP embedding from a precomputed distance matrix.

    X is passed to UMAP with ``metric='precomputed'``, so it must already hold
    pairwise distances (in this notebook, the Hamming-based distance matrix).
    Any extra keyword arguments are forwarded to the ``umap.UMAP`` constructor.
    """
    reducer = umap.UMAP(metric='precomputed', **kwargs)
    return reducer.fit_transform(X)


def emb_score(a, b):
    """Score how well an embedding preserves a set of reference distances.

    ``a`` holds the embedded points (one row per point); their pairwise
    Euclidean distances are compared against ``b``, a square distance matrix
    that is converted to condensed form. Returns the Pearson correlation
    coefficient between the two distance vectors.
    """
    embedded_distances = dist.pdist(a, 'euclidean')
    reference_distances = dist.squareform(b)
    # pearsonr yields (r, p-value); only the coefficient is wanted
    return st.pearsonr(embedded_distances, reference_distances)[0]


def param_rng(rng=randy):
    """Draw one random set of UMAP parameters.

    Returns a dict with 'n_components' (a random integer) and 'min_dist'
    (uniform between 0.01 and 0.99), both drawn from ``rng``.
    """
    # Generator.integers excludes the upper bound by default, so n_components
    # ranges from 2 up to ceil(log2(len(df))) - 1.
    upper = int(np.ceil(np.log2(len(df))))
    n_components = rng.integers(2, upper)
    min_dist = rng.uniform(0.01, 0.99)
    return {'n_components': n_components, 'min_dist': min_dist}


def demo_cross_val(**kwargs):
    """Mean embedding cross-validation score for one set of UMAP parameters.

    The keyword arguments are forwarded to emb_umap; the embedding is
    cross-validated on the module-level ``distance_matrix`` with emb_score, and
    the scores are averaged into a single number.
    """

    def embed(x):
        return emb_umap(x, **kwargs)

    fold_scores = ucv.embedding_cross_val(
        emb_score, embed, data=distance_matrix, dist_matrix_data=True
    )
    return np.mean(fold_scores)


# Random search: param_rng supplies candidate parameter dicts and demo_cross_val
# scores each one.
# NOTE(review): n_iter=32 is presumably the number of candidates and n_jobs=-1
# presumably means "use all cores" — confirm in parametersearch.random_search.
best_params, best_score = ps.random_search(
    param_rng, demo_cross_val, n_iter=32, n_jobs=-1
)

print(f'best score: {best_score}')
print(f'best parameters: {best_params}')

best score: 0.5406593934960883
best parameters: {'n_components': 6, 'min_dist': 0.3601666422825574}


<IPython.core.display.Javascript object>

In [14]:
# demo condensing random parameter search
def demo_func(x, y, wtf, ignore):
    """Toy objective for the condensing random search demo.

    Evaluates ``-a * x ** 2 - b * y + c`` where the coefficients depend on the
    categorical parameter ``wtf``:

        'a' -> a=1, b=3, c=3
        'b' -> a=2, b=2, c=1
        'c' -> a=3, b=1, c=2

    Parameters
    ----------
    x, y : numbers
        Numeric search parameters.
    wtf : str
        One of 'a', 'b' or 'c'.
    ignore : any
        Accepted so a constant parameter can be passed through; never used.

    Raises
    ------
    ValueError
        If ``wtf`` is not one of the known categories. Previously this case
        fell through every branch and silently returned None.
    """
    coefficients = {
        'a': (1, 3, 3),
        'b': (2, 2, 1),
        'c': (3, 1, 2),
    }
    try:
        quad, lin, const = coefficients[wtf]
    except (KeyError, TypeError):
        # TypeError covers unhashable values, which can never be a valid category
        raise ValueError(
            f"unknown category for 'wtf': {wtf!r} (expected 'a', 'b' or 'c')"
        ) from None
    return -quad * x ** 2 - lin * y + const


# One spec per demo_func argument, keyed by parameter name.
# NOTE(review): for 'float' and 'int' the two 'values' are presumably the lower
# and upper search bounds; 'categorical' lists the allowed choices and
# 'constant' fixes a single value — confirm in parametersearch.
param_specs = {
    'x': {'type': 'float', 'values': [-2, 2]},
    'y': {'type': 'int', 'values': [-10, 10]},
    'wtf': {'type': 'categorical', 'values': ['a', 'b', 'c']},
    'ignore': {'type': 'constant', 'value': 0},
}

# NOTE(review): over these ranges demo_func peaks at 33 (x=0, y=-10, wtf='a').
# The output recorded with this notebook is 29.0 at x=2.0, so check whether the
# search is expected to get closer within max_time=10.
ps.condensing_random_search(param_specs, demo_func, max_time=10, n_jobs=-1)

({'x': 2.0, 'y': -10, 'wtf': 'a', 'ignore': 0}, 29.0)

<IPython.core.display.Javascript object>

In [16]:
# more condensing random parameter search
# Same UMAP parameters as the random search demo, expressed as parameter specs.
# This rebinds param_specs from the previous cell.
# NOTE(review): the recorded best n_components of 10 equals
# ceil(log2(len(df))) for 1000 rows, so param_spec_int appears to treat the
# upper bound as inclusive, unlike rng.integers in param_rng — confirm.
param_specs = {
    'n_components': ps.param_spec_int([2, int(np.ceil(np.log2(len(df))))]),
    'min_dist': ps.param_spec_float([0.01, 0.99]),
}

best_params, best_score = ps.condensing_random_search(
    param_specs, demo_cross_val, n_iter=32, n_jobs=-1
)

print(f'best score: {best_score}')
print(f'best parameters: {best_params}')

best score: 0.542242481613807
best parameters: {'n_components': 10, 'min_dist': 0.24662001654634388}


<IPython.core.display.Javascript object>