In this notebook I try to produce a prior for Metaculus questions based on historical data.

In [24]:
import json
import pandas as pd

In [6]:
def safe_json_load(fpath, mode='r'):
    """Parse the JSON document stored at ``fpath`` and return the result.

    Args:
        fpath: Path of the file to read.
        mode: Mode string handed to ``open``; defaults to ``'r'``.

    Returns:
        Whatever ``json.load`` produces for the file's content.
    """
    with open(fpath, mode) as handle:
        # Returning from inside the ``with`` still closes the file on exit.
        return json.load(handle)

def filter_by_type(qs, t=None):
    """Keep only the questions whose ``possibilities.type`` equals ``t``.

    Args:
        qs: Mapping of question id to question dict; each question is read
            at ``question['possibilities']['type']``.
        t: Type value to keep. ``None`` disables the filter.

    Returns:
        A new dict with the matching questions, or ``qs`` itself (not a
        copy) when ``t`` is ``None``.
    """
    if t is None:
        return qs

    matching = {}
    for qid, question in qs.items():
        if question['possibilities']['type'] == t:
            matching[qid] = question
    return matching

def filter_by_resolution(qs, r=None):
    """Keep only the questions whose ``resolution`` is a member of ``r``.

    Args:
        qs: Mapping of question id to question dict; each question is read
            at ``question['resolution']``.
        r: Container of accepted resolution values (tested with ``in``).
            ``None`` disables the filter.

    Returns:
        A new dict with the matching questions, or ``qs`` itself (not a
        copy) when ``r`` is ``None``.
    """
    if r is None:
        return qs
    return dict(
        (qid, question)
        for qid, question in qs.items()
        if question['resolution'] in r
    )

def filter_by_date(qs, date_field, date_low=None, date_high=None):
    """Keep only the questions whose ``date_field`` lies strictly inside a window.

    Each question's ``date_field`` value is converted with ``str_to_datetime``
    (defined outside this block) before it is compared.

    Args:
        qs: Mapping of question id to question dict.
        date_field: Key of the date entry to test in every question.
        date_low: Exclusive lower bound; ``None`` leaves the window open below.
        date_high: Exclusive upper bound; ``None`` leaves the window open above.

    Returns:
        A new dict holding the questions that fall inside the window.
    """
    def in_window(moment):
        # Bounds are exclusive; a bound of None is simply not checked.
        if date_low is not None and not date_low < moment:
            return False
        if date_high is not None and not moment < date_high:
            return False
        return True

    selected = {}
    for qid, question in qs.items():
        # The date is always converted, even when no bound is given.
        if in_window(str_to_datetime(question[date_field])):
            selected[qid] = question
    return selected

def filter_qs(qs, **fields):
    """Apply several question filters in one call.

    Filters run in the order the keyword arguments are given.

    Args:
        qs: Mapping of question id to question dict.
        **fields: Any of the following:
            type: value forwarded to ``filter_by_type``.
            resolution: container forwarded to ``filter_by_resolution``.
            created_time, publish_time, close_time, resolve_time:
                a ``(date_low, date_high)`` pair forwarded to
                ``filter_by_date`` for that field; either bound may be
                ``None``, and ``None`` instead of a pair means no bounds.

    Returns:
        A new dict with the questions that pass every requested filter
        (a shallow copy of ``qs`` when no filter is requested).

    Raises:
        KeyError: If a keyword is not one of the supported filter fields.
    """
    # TODO this is unacceptably slow bc some of the fields in the questions dict have tons of nested data
    # easiest way to fix it is probably to save only pointers to e.g. whole prediction history of a question
    # and have a separate json for that
    date_fields = ('created_time', 'publish_time', 'close_time', 'resolve_time')
    known_fields = ('type', 'resolution') + date_fields

    unknown = [name for name in fields if name not in known_fields]
    if unknown:
        raise KeyError(f'unknown filter field(s) {unknown}; expected one of {known_fields}')

    out = qs.copy()

    for name, value in fields.items():
        if name == 'type':
            out = filter_by_type(out, value)
        elif name == 'resolution':
            out = filter_by_resolution(out, value)
        else:
            # The bounds arrive as one (low, high) pair and must be unpacked:
            # passing the pair itself would make it the lower bound alone.
            date_low, date_high = value if value is not None else (None, None)
            out = filter_by_date(out, name, date_low, date_high)

    return out

In [20]:
def comp(n):
    """Build a benchmark dict mapping every int in ``range(n)`` to its string form."""
    indices = range(n)
    return dict(zip(indices, map(str, indices)))
def filter_(d):
    """Return the entries of ``d`` whose value parses (via ``int``) to an even number."""
    return {key: text for key, text in d.items() if int(text) % 2 == 0}

# Time filter_ on dicts of increasing size (10 up to 100000 entries) to see
# how a comprehension-based dict filter scales with the number of items.
for k in [10, 100, 1000, 10000, 100000]:
    d = comp(k)
    # %timeit is an IPython line magic, so this cell only runs inside IPython/Jupyter.
    %timeit filter_(d)

2.54 µs ± 93.2 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
23.1 µs ± 373 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
248 µs ± 12 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
2.41 ms ± 66 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
28.7 ms ± 2.17 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [35]:
# Base rates for how resolved binary questions turned out.
# NOTE(review): `qs` (question id -> question dict) is not defined in the cells
# visible here; it has to be loaded before this cell runs.
binqs = {k: qs[k]['resolution'] for k in qs if qs[k]['possibilities']['type'] == 'binary'}
# Drop questions whose resolution is None (presumably not resolved yet).
resolved_binqs = {k: binqs[k] for k in binqs if binqs[k] is not None}

# NOTE(review): assumes the resolution encoding 0 = negative, 1 = positive,
# -1 = ambiguous -- TODO confirm against the data dump.
RESOLUTION_LABELS = {0: 'negative', 1: 'positive', -1: 'ambiguous'}

resolution_shares = (pd.Series(resolved_binqs).value_counts() / len(resolved_binqs)).to_dict()

# Fail loudly if the data contains a resolution the mapping above does not cover,
# instead of silently reporting shares that do not add up.
unexpected = set(resolution_shares) - set(RESOLUTION_LABELS)
assert not unexpected, f'unexpected resolution values: {unexpected}'

# Look each share up by resolution value. value_counts() orders by frequency,
# so unpacking it positionally would label outcomes by rank rather than by
# meaning, and would break unless exactly three distinct values occur.
shares_by_label = {
    label: resolution_shares.get(code, 0.0)
    for code, label in RESOLUTION_LABELS.items()
}
neg, pos, amb = (shares_by_label[label] for label in ('negative', 'positive', 'ambiguous'))

for k, p in shares_by_label.items():
    print(f'Binary questions have a {100*p}% probability of resolving {k}')

Binary questions have a 63.85737439222042% probability of resolving negative
Binary questions have a 30.956239870340358% probability of resolving positive
Binary questions have a 5.186385737439222% probability of resolving ambiguous
