In [1]:
import os, json, random, time

from datetime import datetime
from datetime import timedelta
import subprocess as sp
from pathlib import Path

from decimal import Decimal

import requests
import pandas as pd
import numpy as np
from IPython.display import HTML, display

# cell usage data stream

<pre>

# -------------------------------------------------------------------------------------------------- src stream

The messages (Kafka terminology) or events (Flink terminology) arriving at the source look like these documents:
{"ts": "2021-11-24T00:00:00.762", "account": "324", "bytesUsed": 104857 }
{"ts": "2021-11-24T00:03:00.410", "account": "781", "bytesUsed": 819200 }


# --------------------------------------------------------------------------------------------------
*** There is another CreateTime field outside the document; not sure what to do with it at the moment.




# -------------------------------------------------------------------------------------------------- dateutil
*** datetime.fromisoformat() cannot parse every ISO 8601 string; the basic forms are supported,
but corner cases are not. The best solution is to use dateutil if you absolutely must handle them.
https://github.com/dateutil/dateutil

*** To avoid needing the extra package, let's get rid of the +0000 suffix:
"2021-11-24T00:03:00.410+0000"  --->>> becomes:
"2021-11-24T00:03:00.410"

no dateutil needed.

# --------------------------------------------------------------------------------------------------




# --------------------------------------------------------------------------------------------------



</pre>


# datetime magic

In [2]:
# A naive datetime (no tzinfo) is interpreted in the machine's local timezone by .timestamp().
datetime.fromisoformat("2022-01-01T00:00:00").timestamp()

1641013200.0

In [3]:
# A date-only ISO string parses to midnight, so this gives the same epoch value as the full form.
datetime.fromisoformat("2022-01-01").timestamp()

1641013200.0

In [4]:
# Epoch-second bounds (floats, local time) spanning calendar year 2021.
start_ts = datetime.fromisoformat("2021-01-01").timestamp()
end_ts   = datetime.fromisoformat("2022-01-01T00:00:00").timestamp()

In [5]:
# Inspect the lower bound; .timestamp() returns a float.
start_ts

1609477200.0

In [6]:
# Sanity check: the distance between the two bounds expressed as a timedelta.
timedelta(seconds=end_ts - start_ts)

datetime.timedelta(days=365)

In [7]:
# random.randint() needs integer bounds (float arguments raise TypeError on
# Python 3.12+), while .timestamp() returns floats, so cast before drawing.
datetime.fromtimestamp(random.randint(int(start_ts), int(end_ts)))

datetime.datetime(2021, 4, 20, 10, 55, 36)

In [8]:
# str() of a datetime separates date and time with a space, not the ISO 'T'.
str(datetime.fromtimestamp(start_ts))

'2021-01-01 00:00:00'

----
----
----
----
----
----
----
----
----
----
----
# generate random data

In [9]:
# {"ts": "2021-11-24T00:03:00.410", "account": "781", "bytesUsed": 819200 }

In [16]:
def get_random_events_df(sample_size = 10, start = "2021-08-01", end = "2021-08-28", seed = None):
    """Build a DataFrame of random cell-usage events.

    Each row mimics one source-stream document, e.g.
    {"ts": "2021-08-17T19:50:28", "account": "2177", "bytesUsed": 40191}

    Parameters
    ----------
    sample_size : int
        Number of events (rows) to generate. 0 yields an empty frame that
        still carries the three columns.
    start, end : str
        Inclusive bounds for 'ts', in any form datetime.fromisoformat()
        accepts. Naive values are interpreted in the local timezone.
    seed : int or None
        When given, events are drawn from a private random.Random(seed) so the
        result is reproducible. When None, the module-level `random` generator
        is used, exactly as before.

    Returns
    -------
    pandas.DataFrame
        Columns 'ts' (str, 'YYYY-MM-DDTHH:MM:SS'), 'account' (str, 4 digits)
        and 'bytesUsed' (int in [1024, 1024*1024]).

    Raises
    ------
    ValueError
        If `start`/`end` are not valid ISO strings or `end` precedes `start`.
    """
    # .timestamp() returns a float, but random.randint() requires real ints
    # (float arguments raise TypeError on Python 3.12+), so truncate to whole
    # seconds once, up front.
    start_ts = int(datetime.fromisoformat(start).timestamp())
    end_ts   = int(datetime.fromisoformat(end).timestamp())

    rng = random if seed is None else random.Random(seed)

    # Draw order per event (ts, account, bytesUsed) is kept stable so that a
    # given seed always produces the same rows.
    sample_events = [
        {
            # whole-second timestamps have no microseconds, so isoformat()
            # yields exactly 'YYYY-MM-DDTHH:MM:SS'
            'ts': datetime.fromtimestamp(rng.randint(start_ts, end_ts)).isoformat(),
            'account': str(rng.randint(1000, 9999)),
            'bytesUsed': rng.randint(1024, 1024*1024),
        }
        for _ in range(sample_size)
    ]

    # `columns` only names/orders the columns; it does not take dtypes, so pass
    # a plain list. dtypes are inferred from the values (str -> object, int -> int64).
    pddf = pd.DataFrame(data=sample_events, columns=['ts', 'account', 'bytesUsed'])

    return pddf

In [25]:
# Smoke-test the generator with a small sample.
pddf = get_random_events_df(15)

# Render only the first five rows, without the index column.
HTML(pddf.head().to_html(index=False))

ts,account,bytesUsed
2021-08-17T19:50:28,2177,40191
2021-08-22T22:28:41,6348,713007
2021-08-23T16:35:38,9192,194496
2021-08-05T00:18:14,3947,94805
2021-08-14T18:21:23,9056,568474


In [26]:
# Raw rows as a NumPy array; dtype is object because the columns mix str and int.
pddf.values

array([['2021-08-17T19:50:28', '2177', 40191],
       ['2021-08-22T22:28:41', '6348', 713007],
       ['2021-08-23T16:35:38', '9192', 194496],
       ['2021-08-05T00:18:14', '3947', 94805],
       ['2021-08-14T18:21:23', '9056', 568474],
       ['2021-08-26T06:15:09', '7673', 314399],
       ['2021-08-13T16:46:56', '4213', 746012],
       ['2021-08-19T08:10:01', '7306', 464389],
       ['2021-08-27T15:48:15', '3647', 531434],
       ['2021-08-03T12:29:48', '4091', 168406],
       ['2021-08-14T00:58:31', '2501', 989436],
       ['2021-08-01T18:08:38', '6624', 100400],
       ['2021-08-24T00:16:18', '7690', 5906],
       ['2021-08-24T00:55:25', '4483', 600987],
       ['2021-08-09T13:04:36', '7774', 719816]], dtype=object)

----
----
----
----
----
----
----
----
----

# choose path

In [108]:
# Note we might as well leave the generated files under 
# ~/tmp_mkmkdt/xxx.json

# After generation, just rename them to include the row count and leave them there, e.g.:
# 805      >>>>>     cell_users_20.json
# 3.3M     >>>>>     cell_users_90k.json

# commit the 20 row sample to the flink repo if you want, but best to leave them out here like this
# as we might need very large ones 100 MB +

In [109]:
# Current user's home directory; the scratch directory is built under it.
Path.home()

PosixPath('/home/lu')

In [110]:
# Scratch directory under the user's home, kept outside the repo because
# generated files can get large.
temp_dir_path = Path.home() / 'tmp_mkmkdt'
# One output path per format. NOTE(review): output_csv_path is not written by
# any cell visible in this notebook - confirm whether it is still needed.
output_csv_path = temp_dir_path / 'cell_users.csv'
output_psv_path = temp_dir_path / 'cell_users.psv'
output_json_path = temp_dir_path / 'cell_users.json'


In [111]:
# Check whether the scratch directory is already present.
temp_dir_path.exists()

True

In [112]:
# exist_ok=True makes this a no-op when the directory already exists;
# parents=False means the parent directory must already be there.
temp_dir_path.mkdir(parents=False, exist_ok=True)

In [113]:
# List the scratch directory. Passing an argv list (no shell) keeps a path
# containing spaces or shell metacharacters from being split or interpreted.
exit_code = sp.call(['ls', '-lh', str(temp_dir_path)])

total 6.1M
-rw-rw-r-- 1 lu lu  805 Dec 17 17:55 cell_users_20.json
-rw-rw-r-- 1 lu lu  657 Dec 17 17:55 cell_users_20.psv
-rw-rw-r-- 1 lu lu 3.3M Dec 17 17:56 cell_users_90k.json
-rw-rw-r-- 1 lu lu 2.8M Dec 17 17:56 cell_users_90k.psv


----
----
----
----
----
----
----
----
----

# generate and save

In [130]:
# Generate the full-size sample (900,000 rows); this rebinds pddf.
pddf = get_random_events_df(900_000)

# Preview the first five rows without the index column.
HTML(pddf.head().to_html(index=False))

ts,account,bytesUsed
2021-08-14T16:34:55,2196,518343
2021-08-22T09:44:55,5648,575325
2021-08-01T06:49:45,1954,995393
2021-08-10T15:02:02,5430,813726
2021-08-03T21:56:28,9786,54150


In [131]:
# Write pipe-separated values with no header row and no index column.
pddf.to_csv(output_psv_path, sep='|', header=False, index=False)

In [132]:
# List the scratch directory (including dotfiles). An argv list avoids the
# shell, so the path is passed verbatim even if it contains spaces or
# metacharacters. Left as a bare expression so the exit code is displayed.
sp.call(['ls', '-lah', str(temp_dir_path)])

total 33M
drwxrwxr-x  2 lu lu 4.0K Dec 17 21:15 .
drwxr-x--- 38 lu lu 4.0K Dec 17 17:22 ..
-rw-rw-r--  1 lu lu  805 Dec 17 17:55 cell_users_20.json
-rw-rw-r--  1 lu lu  636 Dec 17 21:13 cell_users_20.psv
-rw-rw-r--  1 lu lu 2.9M Dec 17 21:14 cell_users_80k.json
-rw-rw-r--  1 lu lu 2.5M Dec 17 21:13 cell_users_80k.psv
-rw-rw-r--  1 lu lu  28M Dec 17 21:15 cell_users.psv


0

In [133]:
# Show the first ten lines of the PSV file. `head` reads the file directly,
# so neither `cat` nor a shell pipeline is needed, and the path is passed
# verbatim even if it contains spaces or metacharacters.
exit_code = sp.call(['head', str(output_psv_path)])

2021-08-14T16:34:55|2196|518343
2021-08-22T09:44:55|5648|575325
2021-08-01T06:49:45|1954|995393
2021-08-10T15:02:02|5430|813726
2021-08-03T21:56:28|9786|54150
2021-08-15T03:34:25|6811|19006
2021-08-16T07:05:28|1909|948347
2021-08-15T23:22:08|5820|80534
2021-08-13T14:09:34|3642|1041807
2021-08-16T01:31:11|6858|763630


In [134]:
# terrible
# pddf.to_json(output_json_path, index=True)

In [135]:
# pddf.to_json(output_json_path, orient='table', index=False) # too much info
# orient='split' writes the column names once, followed by a "data" list of row arrays.
# NOTE(review): the source-stream samples at the top of this notebook are one JSON
# object per event - confirm that 'split' is the layout the consumer expects.
pddf.to_json(output_json_path, orient='split', index=False)

In [136]:
# Pretty-print the JSON with jq (4-space indent) and show only the first 30 lines.
# The pipeline requires shell=True; the path is interpolated unquoted, so this
# assumes output_json_path contains no spaces or shell metacharacters.
exit_code = sp.call(f'cat {output_json_path} | jq --indent 4 | head -n 30', shell=True)

{
    "columns": [
        "ts",
        "account",
        "bytesUsed"
    ],
    "data": [
        [
            "2021-08-14T16:34:55",
            "2196",
            518343
        ],
        [
            "2021-08-22T09:44:55",
            "5648",
            575325
        ],
        [
            "2021-08-01T06:49:45",
            "1954",
            995393
        ],
        [
            "2021-08-10T15:02:02",
            "5430",
            813726
        ],
        [
            "2021-08-03T21:56:28",
            "9786",


In [137]:
# List the scratch directory again to confirm both output files were written.
# An argv list avoids the shell, so the path is passed verbatim even if it
# contains spaces or metacharacters.
exit_code = sp.call(['ls', '-lah', str(temp_dir_path)])

total 66M
drwxrwxr-x  2 lu lu 4.0K Dec 17 21:15 .
drwxr-x--- 38 lu lu 4.0K Dec 17 17:22 ..
-rw-rw-r--  1 lu lu  805 Dec 17 17:55 cell_users_20.json
-rw-rw-r--  1 lu lu  636 Dec 17 21:13 cell_users_20.psv
-rw-rw-r--  1 lu lu 2.9M Dec 17 21:14 cell_users_80k.json
-rw-rw-r--  1 lu lu 2.5M Dec 17 21:13 cell_users_80k.psv
-rw-rw-r--  1 lu lu  33M Dec 17 21:15 cell_users.json
-rw-rw-r--  1 lu lu  28M Dec 17 21:15 cell_users.psv
