In [1]:
from datetime import datetime
from pprint import pprint

from iterable_orm import QuerySet

from data_analysis.pattern_finders.frequency import FrequencyPatternFinder
from data_vis.models import Series, DataPoint

In [2]:
# In-memory sample data for the pattern search below. Each row is
# (month, day, dy) for a point in 2020; note the two points that share
# 2020-01-01.
data_points = QuerySet([
    DataPoint(x=datetime(2020, month, day), dy=dy)
    for month, day, dy in [
        (1, 1, 100),
        (1, 1, -100),
        (1, 4, 2),
        (2, 1, 95),
        (2, 5, 200),
        # (3, 1, 105),  # disabled; re-enable to add a March point
        (4, 4, -4),
    ]
])

In [3]:
def f():
    """Pretty-print the weekly frequency patterns of the first stored series.

    Looks up the first ``Series``, restricts its data points to those with
    ``x`` on or before 2020-04-01 and prints whatever
    ``FrequencyPatternFinder.find`` returns for a one-week interval.

    NOTE(review): if ``Series.objects.first()`` yields ``None`` (presumably
    when no series is stored) the attribute access below fails -- confirm
    whether that case needs handling.
    """
    first_series = Series.objects.first()
    weekly_finder = FrequencyPatternFinder(intervals=[{'weeks': 1}])
    points_until_april = first_series.data_points.filter(x__lte='2020-04-01')
    pprint(weekly_finder.find(points_until_april))

In [5]:
import os


# Opt out of Django's async-safety check so that synchronous ORM queries can
# be issued from the notebook kernel (only relevant when the database-backed
# queryset mentioned below is used).
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"


# Monthly frequency with a y-tolerance of 10; the interval list and the
# precision are read back from the finder's options for the manual
# walk-through in the following cell.
finder = FrequencyPatternFinder(intervals=[{"months": 1}], tolerance_y=10)
intervals, precision = finder.options.intervals, finder.options.precision

# The walk-through runs on the in-memory sample points. To try it against
# stored data instead, use something like:
#   Series.objects.first().data_points.filter(x__lte='2020-03-01')
data_points_queryset = data_points

In [6]:
import itertools
from datetime import timedelta

from dateutil.relativedelta import relativedelta


# Iterate over the starting points that give us (potentially) different
# subsets. Starting points are the instants that lie within the interval
# [ point0.x, point0.x + cycle_length )
ordered_points = data_points_queryset.order_by('x')
first_point = ordered_points.first()
if first_point is None:
    # Without at least one point there is no starting time to walk from.
    raise ValueError('data_points_queryset is empty: nothing to search')
starting_time = first_point.x
# Latest x of any point. Computed once so the walk below knows when to stop
# without re-querying the points on every step.
last_time = max(point.x for point in data_points_queryset)

deltas = [relativedelta(**interval) for interval in intervals]
abstract_cycle_length = sum(deltas, relativedelta())
# Note: Without applying the delta to a datetime the number of e.g.
# days can't be determined because months have different numbers of
# days, i.e. relativedelta(months=1).days == 0.
# But when the delta is added to a concrete datetime, the number of
# days becomes clear, e.g.
# (
#   date(2020,2,1) + relativedelta(months=2) - date(2020,2,1)
# ).days == 60
# because February 2020 has 29 days.
applied_cycle_length = (
    (starting_time + abstract_cycle_length)
    - starting_time
)
# Number of whole `precision` units in one cycle. Dividing by a one-unit
# timedelta (instead of reading an attribute such as `.days`) also gives the
# right total for units like 'hours' or 'seconds', which timedelta either
# does not expose or only exposes as a remainder.
num_atomic_periods = applied_cycle_length // timedelta(**{precision: 1})
atomic_delta = relativedelta(**{precision: 1})
subsets = []

for i in range(num_atomic_periods):
    delta_iterator = itertools.cycle(deltas)
    # Maps each visited datetime that has data to the points found there.
    subset = {}
    phase_start = starting_time + (i * atomic_delta)
    # Accumulate the offset and always apply it to the phase start. Adding
    # the deltas to the cursor one by one would drift after a month-end
    # clamp: Jan 31 + 1 month = Feb 29, and one more month from *there* is
    # Mar 29 instead of Mar 31.
    elapsed = relativedelta()
    time_cursor = phase_start

    while time_cursor <= last_time:
        points_at_x = data_points_queryset.filter(x=time_cursor)
        if points_at_x:
            subset[time_cursor] = list(points_at_x)
        elapsed += next(delta_iterator)
        time_cursor = phase_start + elapsed

    if subset:
        subsets.append(subset)

pprint(subsets)

# At this point, a subset can contain multiple points with the same
# datetime. But we don't want to have patterns like that.
# For example let's say we have a weekly frequency:
# date ||     01-01   ||  01-08  ||  01-15
# i    || 0   | 1 | 2 || 3   | 4 || 5   | 6
# dy   || 100 | ? | ? || 100 | ? || 100 | 100
# Right now, the points at i=[0, 3, 5, 6] would form a subset.
# We want exactly 1 point per datetime, which means we will get
# multiple subsets instead:
# [0, 3, 5] and [0, 3, 6]
# Therefore, we create the cartesian product of points for all
# datetimes, which gives us all the lines we could draw that only hit
# one point per datetime:
#   list(itertools.product([1,2,3], [4,5], [6,7]))
#   [
#       (1, 4, 6), (1, 4, 7), (1, 5, 6), (1, 5, 7),
#       (2, 4, 6), (2, 4, 7), (2, 5, 6), (2, 5, 7),
#       (3, 4, 6), (3, 4, 7), (3, 5, 6), (3, 5, 7)
#   ]
# `result` is a lazy, single-use iterator; it is consumed by the list
# comprehension right below.
result = itertools.chain.from_iterable(
    itertools.product(*subset.values()) for subset in subsets
)
# A single point cannot form a pattern, so keep only candidates that span
# more than one datetime.
patterns = [
    candidate
    for candidate in result
    if len(candidate) > 1
]
pprint(patterns)

[{datetime.datetime(2020, 1, 1, 0, 0): [<DataPoint: <DataPoint 2020-01-01 00:00:00 100 >>,
                                        <DataPoint: <DataPoint 2020-01-01 00:00:00 -100 >>],
  datetime.datetime(2020, 2, 1, 0, 0): [<DataPoint: <DataPoint 2020-02-01 00:00:00 95 >>]},
 {datetime.datetime(2020, 1, 4, 0, 0): [<DataPoint: <DataPoint 2020-01-04 00:00:00 2 >>],
  datetime.datetime(2020, 4, 4, 0, 0): [<DataPoint: <DataPoint 2020-04-04 00:00:00 -4 >>]},
 {datetime.datetime(2020, 2, 5, 0, 0): [<DataPoint: <DataPoint 2020-02-05 00:00:00 200 >>]}]
[(<DataPoint: <DataPoint 2020-01-01 00:00:00 100 >>,
  <DataPoint: <DataPoint 2020-02-01 00:00:00 95 >>),
 (<DataPoint: <DataPoint 2020-01-01 00:00:00 -100 >>,
  <DataPoint: <DataPoint 2020-02-01 00:00:00 95 >>),
 (<DataPoint: <DataPoint 2020-01-04 00:00:00 2 >>,
  <DataPoint: <DataPoint 2020-04-04 00:00:00 -4 >>)]


In [7]:
def should_use_subset(subset, tolerance=6):
    """Decide whether the points of a candidate pattern are close enough in dy.

    Args:
        subset: Sequence of objects exposing a numeric ``dy`` attribute.
        tolerance: Largest allowed absolute difference between the ``dy``
            values of any two points. Defaults to 6, the value that used to
            be hard-coded here.

    Returns:
        ``False`` for an empty subset; otherwise ``True`` exactly when every
        pair of points differs in ``dy`` by at most ``tolerance`` (so a
        single-point subset is accepted, since it has no pairs).
    """
    if not subset:
        return False

    return all(
        abs(p.dy - q.dy) <= tolerance
        for p, q in itertools.combinations(subset, 2)
    )
    

# Show only the candidate patterns that pass the dy-tolerance check.
pprint(list(filter(should_use_subset, patterns)))

[(<DataPoint: <DataPoint 2020-01-01 00:00:00 100 >>,
  <DataPoint: <DataPoint 2020-02-01 00:00:00 95 >>),
 (<DataPoint: <DataPoint 2020-01-04 00:00:00 2 >>,
  <DataPoint: <DataPoint 2020-04-04 00:00:00 -4 >>)]


In [8]:
# Cross-check: run the library implementation on the same points so its
# result can be compared with the manual walk-through above.
# (FrequencyPatternFinder is already imported in the first cell.)
print(data_points_queryset)
finder = FrequencyPatternFinder(intervals=[{"months": 1}], tolerance_y=10)
finder.find(data_points_queryset)

<iterable_orm.query.QuerySet object at 0x1071104c0>
[{datetime.datetime(2020, 1, 1, 0, 0): [<DataPoint: <DataPoint 2020-01-01 00:00:00 100 >>,
                                        <DataPoint: <DataPoint 2020-01-01 00:00:00 -100 >>],
  datetime.datetime(2020, 2, 1, 0, 0): [<DataPoint: <DataPoint 2020-02-01 00:00:00 95 >>]},
 {datetime.datetime(2020, 1, 4, 0, 0): [<DataPoint: <DataPoint 2020-01-04 00:00:00 2 >>],
  datetime.datetime(2020, 4, 4, 0, 0): [<DataPoint: <DataPoint 2020-04-04 00:00:00 -4 >>]},
 {datetime.datetime(2020, 2, 5, 0, 0): [<DataPoint: <DataPoint 2020-02-05 00:00:00 200 >>]}]
<itertools.chain object at 0x107110be0>


[(<DataPoint: <DataPoint 2020-01-01 00:00:00 100 >>,
  <DataPoint: <DataPoint 2020-02-01 00:00:00 95 >>),
 (<DataPoint: <DataPoint 2020-01-04 00:00:00 2 >>,
  <DataPoint: <DataPoint 2020-04-04 00:00:00 -4 >>)]