In [38]:

import pandas as pd
import matplotlib.pyplot as plt
from empiricaldist import Pmf

# Lift pandas' column-width limit so long cell values are not truncated
# when a frame is rendered.
pd.set_option('display.max_colwidth', None)


## Setup dataframe for analysis

In [39]:
import itertools
import sys



def even_odd(nums):
    """Count the even and odd values in a '|'-separated string of integers.

    Args:
        nums: String of integers joined by '|', e.g. '6|28|31|52|53'.

    Returns:
        Tuple ``(even, odd)`` with the number of even and odd values.

    Raises:
        ValueError: If any token cannot be parsed by ``int``.
    """
    values = [int(x) for x in nums.split('|')]
    even = sum(1 for value in values if value % 2 == 0)
    # Derive the odd count from the number of values actually parsed instead
    # of assuming exactly five, so other input lengths are counted correctly.
    odd = len(values) - even
    return (even, odd)

# Values less than or equal to MID are counted as "low" by lo_hi();
# 70 is the largest number that pattern() accepts.
MID = 70 // 2
def lo_hi(nums):
    """Count the low and high values in a '|'-separated string of integers.

    A value is "low" when it is less than or equal to ``MID`` and "high"
    otherwise.

    Args:
        nums: String of integers joined by '|', e.g. '6|28|31|52|53'.

    Returns:
        Tuple ``(lo, hi)`` with the number of low and high values.

    Raises:
        ValueError: If any token cannot be parsed by ``int``.
    """
    values = [int(x) for x in nums.split('|')]
    lo = sum(1 for value in values if value <= MID)
    # Derive the high count from the number of values actually parsed instead
    # of assuming exactly five, so other input lengths are counted correctly.
    hi = len(values) - lo
    return (lo, hi)

def consecutive(nums):
    """Summarise runs of consecutive integers in a '|'-separated string.

    The values are sorted, then every maximal stretch in which each value is
    exactly one more than the value before it contributes one entry: the
    number of such +1 steps in that stretch (a run of k values gives k - 1).

    Args:
        nums: String of integers joined by '|', e.g. '6|28|31|52|53'.

    Returns:
        Tuple of step counts in ascending order of the runs, or ``(0,)`` when
        no two values are consecutive.

    Raises:
        ValueError: If any token cannot be parsed by ``int``.
    """
    ordered = [int(token) for token in nums.split('|')]
    ordered.sort()

    # Lead with the same sentinel the running "previous value" starts from,
    # so the first real value is compared against it like any other neighbour.
    chain = [-sys.maxsize] + ordered
    is_step = [left + 1 == right for left, right in zip(chain, chain[1:])]

    # Each group of adjacent True flags is one run; its length is the count.
    runs = tuple(
        sum(1 for _ in flags)
        for stepped, flags in itertools.groupby(is_step)
        if stepped
    )
    return runs or (0,)

def yellow_in_white(nums):
    """Tell whether the yellow ball's value also occurs among the white balls.

    Args:
        nums: Pair ``(white_balls, yellow_ball)``. ``white_balls`` is a string
            of integers joined by '|'; ``yellow_ball`` is the value looked up
            among the parsed integers.

    Returns:
        ``True`` if ``yellow_ball`` equals one of the white-ball values,
        ``False`` otherwise.
    """
    whites, yellow = nums
    white_values = {int(token) for token in whites.split('|')}
    return yellow in white_values

def pattern(nums):
    """Count how many values fall into each of seven number groups.

    Groups, in output order: {1-9 plus 70}, 10-19, 20-29, 30-39, 40-49,
    50-59, 60-69.

    Args:
        nums: String of integers joined by '|', e.g. '6|28|31|52|53'.

    Returns:
        7-tuple of counts, one per group.

    Raises:
        ValueError: If any token cannot be parsed by ``int``.
        Exception: ``'Unexpected number'`` if a value belongs to no group.
    """
    # First group is 1-9 together with 70; the rest are the decades 10-19 .. 60-69.
    groups = [set(range(1, 10)) | {70}]
    groups.extend(set(range(start, start + 10)) for start in range(10, 70, 10))

    # Parse everything up front so a malformed token fails before any counting.
    values = [int(token) for token in nums.split('|')]

    tally = [0] * len(groups)
    for value in values:
        slot = next(
            (index for index, members in enumerate(groups) if value in members),
            None,
        )
        if slot is None:
            raise Exception('Unexpected number')
        tally[slot] += 1

    return tuple(tally)

def xup_pattern(num):
    """Map a number to the group it belongs to.

    Args:
        num: Number to classify.

    Returns:
        ``1`` for 1-9, ``2`` for 10-19, ``3`` for 20-25.

    Raises:
        Exception: ``'Unexpected number'`` if ``num`` is in none of the groups.
    """
    groups = (
        (1, set(range(1, 10))),
        (2, set(range(10, 20))),
        (3, set(range(20, 26))),
    )
    for label, members in groups:
        if num in members:
            return label
    raise Exception('Unexpected number')

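# TODO: The results from the helpers below looked surprising -- check them for a mistake.
# NOTE(review): `.iloc[idx.name - x:idx.name - 1]` has an exclusive end, so the
# window covers only x - 1 draws and skips the immediately preceding one; confirm
# whether that is intended.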
# def previous_10(idx):
#     return previous_x(idx, 10)

# def previous_100(idx):
#     return previous_x(idx, 100)

# def previous_x(idx, x):
#     if idx.name >= x:
#         cur = set([int(y) for y in df.iloc[idx.name]['white_balls'].split('|')])
#         vals = df.iloc[idx.name - x:idx.name - 1]['white_balls'].apply(lambda idx: [int(y) for y in idx.split('|')])
#         vals_set = set(list(itertools.chain.from_iterable(vals)))
#         return len(cur & vals_set)
#     else:
#         return 0

# Load the draw history, then add one derived column per feature.
df = pd.read_csv('../numbers/megamillions.csv')

# Features computed from the '|'-separated white_balls string alone.
df = df.assign(
    even_odd=df['white_balls'].apply(even_odd),
    lo_hi=df['white_balls'].apply(lo_hi),
    consecutive=df['white_balls'].apply(consecutive),
)

# Combined features: each row holds a tuple of the listed per-feature values.
df = df.assign(
    even_odd_lo_hi=df[['even_odd', 'lo_hi']].apply(tuple, axis=1),
    even_odd_consecutive=df[['even_odd', 'consecutive']].apply(tuple, axis=1),
    lo_hi_consecutive=df[['lo_hi', 'consecutive']].apply(tuple, axis=1),
    even_odd_lo_hi_consecutive=df[['even_odd', 'lo_hi', 'consecutive']].apply(tuple, axis=1),
)

# Features that involve the yellow ball, plus the white-ball group pattern.
df = df.assign(
    yellow_in_white=df[['white_balls', 'yellow_ball']].apply(tuple, axis=1).apply(yellow_in_white),
    pattern=df['white_balls'].apply(pattern),
    xup_pattern=df['yellow_ball'].apply(xup_pattern),
)

# TODO: Surprising? Something wrong done?
# df['white_in_previous_10'] = df.apply(previous_10, axis=1)
# df['white_in_previous_100'] = df.apply(previous_100, axis=1)

print(df.to_markdown(tablefmt='github'))

|     | date       | white_balls    |   yellow_ball | megaplier   | even_odd   | lo_hi   | consecutive   | even_odd_lo_hi   | even_odd_consecutive   | lo_hi_consecutive   | even_odd_lo_hi_consecutive   | yellow_in_white   | pattern               |   xup_pattern |
|-----|------------|----------------|---------------|-------------|------------|---------|---------------|------------------|------------------------|---------------------|------------------------------|-------------------|-----------------------|---------------|
|   0 | 10/31/2017 | 6|28|31|52|53  |            12 | 4X          | (3, 2)     | (3, 2)  | (1,)          | ((3, 2), (3, 2)) | ((3, 2), (1,))         | ((3, 2), (1,))      | ((3, 2), (3, 2), (1,))       | False             | (1, 0, 1, 1, 0, 2, 0) |             2 |
|   1 | 11/3/2017  | 10|22|42|61|69 |             3 | 2X          | (3, 2)     | (2, 3)  | (0,)          | ((3, 2), (2, 3)) | ((3, 2), (0,))         | ((2, 3), (0,))      | ((3, 2), (2, 3), (0,))       | Fals

## Frequencies

In [40]:
def _show_pmf(values):
    """Build a PMF from ``values``, print it as a GitHub-style table, and return it.

    Args:
        values: Sequence of hashable observations (here, one dataframe column).

    Returns:
        The ``Pmf`` of ``values`` sorted by descending probability.
    """
    pmf = Pmf.from_seq(values).sort_values(ascending=False)
    print(pmf.to_markdown(tablefmt='github'))
    print()
    return pmf


# One frequency table per derived column. Every result keeps its own
# module-level name so later cells can refer to it.
even_odd_prob = _show_pmf(df['even_odd'])
lo_hi_prob = _show_pmf(df['lo_hi'])
consecutive_prob = _show_pmf(df['consecutive'])
even_odd_lo_hi_prob = _show_pmf(df['even_odd_lo_hi'])
even_odd_consecutive_prob = _show_pmf(df['even_odd_consecutive'])
lo_hi_consecutive_prob = _show_pmf(df['lo_hi_consecutive'])
even_odd_lo_hi_consecutive_prob = _show_pmf(df['even_odd_lo_hi_consecutive'])
yellow_in_white_prob = _show_pmf(df['yellow_in_white'])
pattern_prob = _show_pmf(df['pattern'])
xup_pattern_prob = _show_pmf(df['xup_pattern'])

# TODO: Surprising? Something wrong done?
# white_in_previous_10_prob = Pmf.from_seq(df['white_in_previous_10']).sort_values(ascending=False)
# print(white_in_previous_10_prob.to_markdown(tablefmt='github'))
# print()

# white_in_previous_100_prob = Pmf.from_seq(df['white_in_previous_100']).sort_values(ascending=False)
# print(white_in_previous_100_prob.to_markdown(tablefmt='github'))
# print()

| even_odd   |           |
|------------|-----------|
| (3, 2)     | 0.329301  |
| (2, 3)     | 0.311828  |
| (4, 1)     | 0.165323  |
| (1, 4)     | 0.137097  |
| (5, 0)     | 0.0336022 |
| (0, 5)     | 0.0228495 |

| lo_hi   |           |
|---------|-----------|
| (3, 2)  | 0.333333  |
| (2, 3)  | 0.307796  |
| (4, 1)  | 0.165323  |
| (1, 4)  | 0.142473  |
| (5, 0)  | 0.0349462 |
| (0, 5)  | 0.016129  |

| consecutive   |           |
|---------------|-----------|
| (0,)          | 0.737903  |
| (1,)          | 0.236559  |
| (2,)          | 0.016129  |
| (1, 1)        | 0.0094086 |

| even_odd_lo_hi   |            |
|------------------|------------|
| ((3, 2), (3, 2)) | 0.125      |
| ((3, 2), (2, 3)) | 0.106183   |
| ((2, 3), (3, 2)) | 0.102151   |
| ((2, 3), (2, 3)) | 0.0887097  |
| ((2, 3), (4, 1)) | 0.0591398  |
| ((4, 1), (3, 2)) | 0.0497312  |
| ((3, 2), (4, 1)) | 0.0497312  |
| ((4, 1), (2, 3)) | 0.0483871  |
| ((1, 4), (2, 3)) | 0.0443548  |
| ((2, 3), (1, 4)) | 0.0430108  |
|