In [1]:
import os, json, random, time

from datetime import datetime
from datetime import timedelta
import subprocess as sp
from pathlib import Path

from decimal import Decimal

import requests
import pandas as pd
import numpy as np
from IPython.display import HTML, display

# cell usage data stream

<pre>

# -------------------------------------------------------------------------------------------------- src stream

The messages (in Kafka terms) or events (in Flink terms) arriving at the source look like these documents:
{"ts": "2021-11-24T00:00:00.762", "account": "324", "bytesUsed": 104857 }
{"ts": "2021-11-24T00:03:00.410", "account": "781", "bytesUsed": 819200 }


# --------------------------------------------------------------------------------------------------
*** there is another CreateTime field outside the document; not sure what to do with it at the moment.




# -------------------------------------------------------------------------------------------------- dateutil
*** datetime.fromisoformat() cannot parse every ISO 8601 string; only some basic forms are supported,
corner cases are not. The best solution is to use dateutil if you absolutely must handle them.
https://github.com/dateutil/dateutil

*** to avoid needing the extra package, let's get rid of the +0000
"2021-11-24T00:03:00.410+0000"  --->>> becomes:
"2021-11-24T00:03:00.410"

no dateutil needed.

# --------------------------------------------------------------------------------------------------




# --------------------------------------------------------------------------------------------------



</pre>


# datetime magic

In [2]:
datetime.fromisoformat("2022-01-01T00:00:00").timestamp()

1641013200.0

In [3]:
datetime.fromisoformat("2022-01-01").timestamp()

1641013200.0

In [4]:
# Epoch-second bounds spanning calendar year 2021.
# The ISO strings carry no UTC offset, so .timestamp() interprets them in the
# machine's local time zone (a date-only string means midnight of that day).
start_ts = datetime.fromisoformat("2021-01-01").timestamp()
end_ts   = datetime.fromisoformat("2022-01-01T00:00:00").timestamp()

In [5]:
start_ts

1609477200.0

In [6]:
timedelta(seconds=end_ts - start_ts)

datetime.timedelta(days=365)

In [7]:
# Pick a random instant (whole-second resolution) between the two bounds.
# .timestamp() returns floats, but random.randint() requires integers:
# float bounds are deprecated since Python 3.10 and raise TypeError on 3.12+.
datetime.fromtimestamp(random.randint(int(start_ts), int(end_ts)))

datetime.datetime(2021, 4, 20, 10, 55, 36)

In [8]:
str(datetime.fromtimestamp(start_ts))

'2021-01-01 00:00:00'

----
----
----
----
----
----
----
----
----
----
----
# generate random data

In [9]:
# {"ts": "2021-11-24T00:03:00.410", "account": "781", "bytesUsed": 819200 }

In [16]:
def get_random_events_df(sample_size = 10, start = "2021-08-01", end = "2021-08-28", seed = None):
    """Build a DataFrame of random cell-usage events.

    Each row mimics one document of the source stream, e.g.
    {"ts": "2021-11-24T00:03:00", "account": "781", "bytesUsed": 819200}

    Parameters
    ----------
    sample_size : int
        Number of events (rows) to generate; 0 yields an empty frame.
    start, end : str
        Strings accepted by datetime.fromisoformat(). Event timestamps are
        drawn at whole-second resolution from [start, end], both inclusive.
        Strings without a UTC offset are interpreted as local time.
    seed : int or None
        When given, a private random.Random(seed) is used so the result is
        reproducible. When None, the module-level `random` generator is used
        (so a prior random.seed(...) call is still honoured).

    Returns
    -------
    pandas.DataFrame
        Columns 'ts' (ISO string, 'YYYY-MM-DDTHH:MM:SS'), 'account'
        (4-digit string) and 'bytesUsed' (int64, 1024..1024*1024).

    Raises
    ------
    ValueError
        If start/end cannot be parsed, or if end is earlier than start and
        at least one event is requested.
    """
    rng = random if seed is None else random.Random(seed)

    # .timestamp() returns floats, but random.randint() needs real integers:
    # float bounds are deprecated since Python 3.10 and raise TypeError on 3.12+.
    start_ts = int(datetime.fromisoformat(start).timestamp())
    end_ts   = int(datetime.fromisoformat(end).timestamp())

    # Field order matters: it fixes the order of the random draws per event.
    sample_events = [
        {
            'ts': datetime.fromtimestamp(rng.randint(start_ts, end_ts)).isoformat(timespec='seconds'),
            'account': str(rng.randint(1000, 9999)),
            'bytesUsed': rng.randint(1024, 1024*1024),
        }
        for _ in range(sample_size)
    ]

    # make df
    # `columns` only takes labels (a dict's values would be silently ignored),
    # so pass a plain list and set the dtype explicitly; the cast keeps
    # bytesUsed integer-typed even when the frame is empty.
    pddf = pd.DataFrame(data=sample_events, columns=['ts', 'account', 'bytesUsed'])

    return pddf.astype({'bytesUsed': 'int64'})

In [25]:
# -- call it
pddf = get_random_events_df(15)

HTML(pddf.head().to_html(index=False))

ts,account,bytesUsed
2021-08-17T19:50:28,2177,40191
2021-08-22T22:28:41,6348,713007
2021-08-23T16:35:38,9192,194496
2021-08-05T00:18:14,3947,94805
2021-08-14T18:21:23,9056,568474


In [26]:
pddf.values

array([['2021-08-17T19:50:28', '2177', 40191],
       ['2021-08-22T22:28:41', '6348', 713007],
       ['2021-08-23T16:35:38', '9192', 194496],
       ['2021-08-05T00:18:14', '3947', 94805],
       ['2021-08-14T18:21:23', '9056', 568474],
       ['2021-08-26T06:15:09', '7673', 314399],
       ['2021-08-13T16:46:56', '4213', 746012],
       ['2021-08-19T08:10:01', '7306', 464389],
       ['2021-08-27T15:48:15', '3647', 531434],
       ['2021-08-03T12:29:48', '4091', 168406],
       ['2021-08-14T00:58:31', '2501', 989436],
       ['2021-08-01T18:08:38', '6624', 100400],
       ['2021-08-24T00:16:18', '7690', 5906],
       ['2021-08-24T00:55:25', '4483', 600987],
       ['2021-08-09T13:04:36', '7774', 719816]], dtype=object)

----
----
----
----
----
----
----
----
----

# choose path

In [28]:
Path.home()

PosixPath('/home/lu')

In [29]:
# Scratch directory under the user's home; every generated file lives in it.
temp_dir_path = Path.home().joinpath('tmp_mkmkdt')

# One target file per serialization format.
output_csv_path = temp_dir_path.joinpath('cell_users.csv')
output_psv_path = temp_dir_path.joinpath('cell_users.psv')
output_json_path = temp_dir_path.joinpath('cell_users.json')


In [30]:
temp_dir_path.exists()

True

In [31]:
# Create the scratch directory. exist_ok=True makes re-running this cell a
# no-op; parents=False means the parent directory must already exist.
temp_dir_path.mkdir(parents=False, exist_ok=True)

In [54]:
# List the scratch directory. An argument list (no shell) keeps the call
# correct even if the path contains spaces or shell metacharacters.
sp.call(['ls', '-lah', str(temp_dir_path)])

total 16K
drwxrwxr-x  2 lu lu 4.0K Dec 17 17:44 .
drwxr-x--- 38 lu lu 4.0K Dec 17 17:22 ..
-rw-rw-r--  1 lu lu 1.2K Dec 17 17:52 cell_users.json
-rw-rw-r--  1 lu lu  977 Dec 17 17:51 cell_users.psv


0

----
----
----
----
----
----
----
----
----

# generate and save

In [73]:
pddf = get_random_events_df(80)

HTML(pddf.head().to_html(index=False))

ts,account,bytesUsed
2021-08-23T20:21:15,1058,380287
2021-08-14T21:41:58,5827,898843
2021-08-06T13:01:23,4235,47067
2021-08-25T15:04:31,6929,393749
2021-08-24T11:29:06,9743,382509


In [74]:
# Save as pipe-separated values; index=False leaves the DataFrame index out of the file.
pddf.to_csv(output_psv_path, sep='|', index=False)

In [75]:
# List the scratch directory again to confirm the .psv file was written.
# An argument list (no shell) is safe for paths with spaces or metacharacters.
sp.call(['ls', '-lah', str(temp_dir_path)])

total 16K
drwxrwxr-x  2 lu lu 4.0K Dec 17 17:44 .
drwxr-x--- 38 lu lu 4.0K Dec 17 17:22 ..
-rw-rw-r--  1 lu lu 1.2K Dec 17 17:54 cell_users.json
-rw-rw-r--  1 lu lu 2.6K Dec 17 17:54 cell_users.psv


0

In [76]:
# Show the first 10 lines of the .psv file. `head` reads the file itself, so
# neither `cat` nor a shell is needed, and the path cannot be misparsed.
exit_code = sp.call(['head', str(output_psv_path)])

ts|account|bytesUsed
2021-08-23T20:21:15|1058|380287
2021-08-14T21:41:58|5827|898843
2021-08-06T13:01:23|4235|47067
2021-08-25T15:04:31|6929|393749
2021-08-24T11:29:06|9743|382509
2021-08-01T03:51:55|9208|571644
2021-08-04T18:41:05|8091|45026
2021-08-27T01:44:44|6331|984680
2021-08-13T01:33:25|8260|550740


In [77]:
# terrible
# pddf.to_json(output_json_path, index=True)

In [78]:
# Rejected alternative (too much info in the output):
# pddf.to_json(output_json_path, orient='table', index=False) # too much info
# orient='split' writes the column names once under "columns" and the rows as
# plain lists under "data"; index=False leaves the index out.
pddf.to_json(output_json_path, orient='split', index=False)

In [79]:
# Pretty-print the saved JSON with jq (4-space indent) and show only the first 30 lines.
# NOTE(review): requires `jq` on PATH; the path is interpolated unquoted into a
# shell command, so this breaks if the path ever contains spaces.
exit_code = sp.call(f'cat {output_json_path} | jq --indent 4 | head -n 30', shell=True)

{
    "columns": [
        "ts",
        "account",
        "bytesUsed"
    ],
    "data": [
        [
            "2021-08-23T20:21:15",
            "1058",
            380287
        ],
        [
            "2021-08-14T21:41:58",
            "5827",
            898843
        ],
        [
            "2021-08-06T13:01:23",
            "4235",
            47067
        ],
        [
            "2021-08-25T15:04:31",
            "6929",
            393749
        ],
        [
            "2021-08-24T11:29:06",
            "9743",


In [80]:
# Final listing of the scratch directory to check both output files.
# An argument list (no shell) is safe for paths with spaces or metacharacters.
exit_code = sp.call(['ls', '-lah', str(temp_dir_path)])

total 16K
drwxrwxr-x  2 lu lu 4.0K Dec 17 17:44 .
drwxr-x--- 38 lu lu 4.0K Dec 17 17:22 ..
-rw-rw-r--  1 lu lu 3.1K Dec 17 17:54 cell_users.json
-rw-rw-r--  1 lu lu 2.6K Dec 17 17:54 cell_users.psv
