# Inserting Data Without Duplicates

Here we test that data can be inserted without creating duplicates. Afterwards, we'll test for missing data inside the databases.

In [2]:
from jamboree import Jamboree
import pandas as pd
import datetime
import pandas_datareader.data as web
from pandas import Series, DataFrame

In [3]:
from maya import MayaDT
import maya
import copy

In [33]:
import random
import orjson

In [5]:
from typing import List, Dict, Any

In [6]:
# Shared Jamboree handle used by every storage call later in the notebook
# (bulk upserts and the direct `jam_session.redis` set operations).
jam_session = Jamboree()

In [7]:
# Date window for the price download: 14 March 1986 through 6 January 2020.
start = datetime.datetime(year=1986, month=3, day=14)
end = datetime.datetime(year=2020, month=1, day=6)

In [8]:
# Fetch price history for AAPL and MSFT from the 'yahoo' source over the
# [start, end] window defined above. Only apple_df is used in the cells below.
apple_df = web.DataReader("AAPL", 'yahoo', start, end)
msft_df = web.DataReader("MSFT", 'yahoo', start, end)

In [9]:
# Display the downloaded AAPL frame (columns: High, Low, Open, Close, Volume, Adj Close).
apple_df

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1986-03-14,0.468750,0.441964,0.441964,0.466518,96213600.0,0.370518
1986-03-17,0.464286,0.453125,0.464286,0.464286,29680000.0,0.368745
1986-03-18,0.486607,0.462054,0.464286,0.479911,62339200.0,0.381155
1986-03-19,0.486607,0.470982,0.479911,0.473214,47471200.0,0.375836
1986-03-20,0.529018,0.500000,0.500000,0.504464,226032800.0,0.400656
...,...,...,...,...,...,...
2019-12-30,292.690002,285.220001,289.459991,291.519989,36028600.0,291.519989
2019-12-31,293.679993,289.519989,289.929993,293.649994,25201400.0,293.649994
2020-01-02,300.600006,295.190002,296.239990,300.350006,33870100.0,300.350006
2020-01-03,300.579987,296.500000,297.149994,297.429993,36580700.0,297.429993


In [10]:
def get_year_month_day(time:MayaDT):
    """Print the date of ``time`` as ``day-month-year`` without zero padding.

    Despite the ``get_`` prefix, nothing is returned; the text is only
    written to standard output.
    """
    day, month, year = time.day, time.month, time.year
    print(f"{day}-{month}-{year}")

In [11]:
def get_time_dt(df):
    """Convert every entry of ``df.index`` into a ``maya.MayaDT``.

    Each index value is first turned into a plain ``datetime`` through its
    ``to_pydatetime()`` method. The result is a list in index order.
    """
    converted = []
    for stamp in df.index:
        converted.append(maya.MayaDT.from_datetime(stamp.to_pydatetime()))
    return converted

In [12]:
def df_records(df):
    """Return the rows of ``df`` as a list of dicts, one dict per row."""
    rows = df.to_dict(orient="records")
    return rows

In [13]:
def standardize_record(record):
    """Build a lower-cased price record from ``record``.

    Copies the "Close", "Open", "Low", "High" and "Volume" entries (whichever
    are present) to the keys "close", "open", "low", "high" and "volume".
    Any other key in ``record`` is dropped; ``record`` itself is not modified.
    """
    # (source key, output key) pairs, listed in the order the output is filled.
    renames = (
        ("Close", "close"),
        ("Open", "open"),
        ("Low", "low"),
        ("High", "high"),
        ("Volume", "volume"),
    )
    return {
        target: record[source]
        for source, target in renames
        if source in record
    }

In [14]:
def standardize_outputs(records:List[Dict[str, Any]]):
    """Apply ``standardize_record`` to every record and return the new list.

    An empty input yields an empty list.
    """
    if len(records) > 0:
        return list(map(standardize_record, records))
    return []

In [15]:
def add_time(records, times):
    """Attach a ``'time'`` entry to each record from the matching time object.

    ``records[i]['time']`` is set to ``times[i]._epoch``. The record dicts are
    modified in place and the returned list holds those same dict objects.
    Returns an empty list (and modifies nothing) when ``records`` is empty or
    the two inputs differ in length.
    """
    lengths_ok = len(records) > 0 and len(records) == len(times)
    if not lengths_ok:
        return []

    stamped = []
    for rec, moment in zip(records, times):
        rec['time'] = moment._epoch
        stamped.append(rec)
    return stamped

In [16]:
def teardown(df):
    """Break the dataframe into a list of per-row dictionaries.

    The index is converted to time objects, the rows are turned into
    standardized records, and each record then receives its row's time.
    """
    row_times = get_time_dt(df)
    standardized = standardize_outputs(df_records(df))
    return add_time(standardized, row_times)

In [17]:
# Flatten the AAPL frame into a list of time-stamped record dicts.
dt_time = teardown(apple_df)

In [18]:
def flip(n=0.05):
    """Return True when a uniform draw from [0, 1] falls below ``n``.

    With ``n`` between 0 and 1 this acts as a biased coin that comes up True
    roughly a fraction ``n`` of the time.
    """
    draw = random.uniform(0, 1)
    return draw < n

In [19]:
def create_duplicates(frame_dict_list:List[Dict]):
    """Copy the input list, randomly repeating some entries right after themselves.

    Every item is kept in order; after each one, ``flip(0.1)`` decides whether
    the same object is appended a second time. An empty input yields ``[]``.
    """
    with_repeats = []
    if len(frame_dict_list) == 0:
        return with_repeats

    for entry in frame_dict_list:
        # Exactly one coin flip per entry, made after the entry itself is queued.
        copies = 2 if flip(0.1) else 1
        with_repeats.extend([entry] * copies)
    return with_repeats

In [155]:
# len(dups)

In [152]:
# Two overlapping tails of the record list: the 200-record tail is contained
# in the 300-record tail, so the second upsert re-sends the first one's data.
last_200, last_300 = dt_time[-200:], dt_time[-300:]
# Variants of both tails with some records randomly repeated.
last_200_dups = create_duplicates(last_200)
last_300_dups = create_duplicates(last_300)

In [157]:
# All four upserts address the same record identity; each call receives its
# own fresh copy of the query dict.
duplication_query = {"type": "sample_save", "asset": "AAPL", "label": "duplication"}
upsert_data_one = jam_session.bulk_upsert_redis(dict(duplication_query), last_200)
upsert_data_two = jam_session.bulk_upsert_redis(dict(duplication_query), last_300)
upsert_data_one_dups = jam_session.bulk_upsert_redis(dict(duplication_query), last_200_dups)
upsert_data_two_dups = jam_session.bulk_upsert_redis(dict(duplication_query), last_300_dups)

2020-01-07 15:42:01.437 | INFO     | jamboree.base.main:_bulk_upsert_redis:261 - Default retcon redis
2020-01-07 15:42:01.439 | INFO     | jamboree.base.main:_bulk_upsert_redis:261 - Default retcon redis
2020-01-07 15:42:01.442 | INFO     | jamboree.base.main:_bulk_upsert_redis:261 - Default retcon redis
2020-01-07 15:42:01.444 | INFO     | jamboree.base.main:_bulk_upsert_redis:261 - Default retcon redis


In [158]:
# Read the "hash" entry from the first upsert's result (None when absent).
# NOTE(review): presumably the key the upsert stored under — confirm against Jamboree.
main_hash = upsert_data_one.get("hash")

In [159]:
# Pull the 'updated' list out of each upsert result, defaulting to an empty list.
up1, up2, up3, up4 = (
    result.get('updated', [])
    for result in (
        upsert_data_one,
        upsert_data_two,
        upsert_data_one_dups,
        upsert_data_two_dups,
    )
)

In [165]:
# Serialize every updated record so the records become hashable and can be
# collected into sets.
cr1, cr2, cr3, cr4 = (
    list(map(orjson.dumps, updated))
    for updated in (up1, up2, up3, up4)
)

In [167]:
# Collapse each serialized list into a set, which drops repeated records.
set1, set2, set3, set4 = map(set, (cr1, cr2, cr3, cr4))

In [170]:
# Unique-record counts, one per line; the duplicated inputs should match the
# clean ones (200 and 300).
print(len(set1), len(set2), len(set3), len(set4), sep="\n")

200
300
200
300


In [164]:
# Add the unique serialized records from the third upsert to the Redis set.
# NOTE(review): `set_key` is not assigned in any cell shown here — presumably
# it was defined in a cell that was run earlier or removed; confirm before
# re-running the notebook from a clean kernel.
jam_session.redis.sadd(set_key, *set(cr3))

0

In [188]:
def deserialize_list(serialized_list:list):
    """Decode every entry of ``serialized_list`` with ``orjson.loads``.

    Returns the decoded objects in input order; an empty input yields ``[]``.
    """
    if len(serialized_list) > 0:
        return list(map(orjson.loads, serialized_list))
    return []

In [187]:
def add_timestamp(item):
    """Set ``item['timestamp']`` to the current ``maya.now()`` epoch value.

    ``item`` is modified in place and also returned.
    """
    now_epoch = maya.now()._epoch
    item['timestamp'] = now_epoch
    return item

In [189]:
def get_addable_items(set_key, added_set):
    """Return the records in ``added_set`` that are not yet in the Redis set.

    Args:
        set_key: Key of the Redis set holding the serialized records that
            are already stored.
        added_set: Serialized records (as produced by ``orjson.dumps``) that
            are candidates for insertion. Any iterable of hashable items is
            accepted.

    Returns:
        A list of deserialized records that are in ``added_set`` but not in
        the Redis set, each given a fresh ``'timestamp'`` entry. Returns an
        empty list when there is nothing new to add.
    """
    # NOTE(review): assumes the Redis client returns members in the same
    # type (bytes) that orjson.dumps produces — confirm the client is not
    # configured to decode responses to str.
    existing = set(jam_session.redis.smembers(set_key))

    # Use the caller's set. The previous version read the notebook-global
    # ``set2`` here and silently ignored the ``added_set`` argument.
    addable_items = set(added_set) - existing
    if not addable_items:
        return []

    deserialized = deserialize_list(list(addable_items))
    return [add_timestamp(record) for record in deserialized]

In [190]:
# updated_set = set(serialized_updated)

In [193]:
# List the records from the second upsert that are not yet stored in the
# Redis set, each with a fresh 'timestamp' added.
get_addable_items(set_key, set2)

[{'close': 222.22000122070312,
  'open': 219.0500030517578,
  'low': 216.80999755859375,
  'high': 222.36000061035156,
  'volume': 58323200.0,
  'time': 1541030400.0,
  'type': 'sample_save',
  'asset': 'AAPL',
  'label': 'duplication',
  'timestamp': 1578435846.985621},
 {'close': 208.49000549316406,
  'open': 209.97999572753906,
  'low': 206.75,
  'high': 210.1199951171875,
  'volume': 25362600.0,
  'time': 1541635200.0,
  'type': 'sample_save',
  'asset': 'AAPL',
  'label': 'duplication',
  'timestamp': 1578435846.9856315},
 {'close': 171.25,
  'open': 167.41000366210938,
  'low': 167.27999877929688,
  'high': 171.66000366210938,
  'volume': 31495500.0,
  'time': 1549238400.0,
  'type': 'sample_save',
  'asset': 'AAPL',
  'label': 'duplication',
  'timestamp': 1578435846.9856355},
 {'close': 181.7100067138672,
  'open': 182.25,
  'low': 180.9199981689453,
  'high': 183.3000030517578,
  'volume': 31032500.0,
  'time': 1552435200.0,
  'type': 'sample_save',
  'asset': 'AAPL',
  'label

In [136]:
# jam_session.redis.smembers(set_key, 0, -1)

In [137]:
# Members of `retrieved` that are missing from `updated_set`; an empty
# result means everything retrieved is also in `updated_set`.
# NOTE(review): neither `retrieved` nor `updated_set` is assigned in the cells
# shown here (the `updated_set` assignment above is commented out) — they
# appear to come from earlier notebook state; confirm before re-running.
set(retrieved - updated_set)

set()

In [138]:
# Size of `retrieved`, for comparison with the size of `updated_set` below.
len(retrieved)

200

In [139]:
# Size of `updated_set`, for comparison with the size of `retrieved` above.
len(updated_set)

200