In [1]:
import pandas as pd
from tqdm import tqdm
import random

In [2]:
head_amount = 8000000
tail_amount = 4000000

In [3]:
from heapq import heappush, heappop

In [4]:
head_heap = []
tail_reservoir = []
tail_reservoir_counter = 0

In [5]:
class Record(object):
    def __init__(self, cnt, rec):
        self.cnt = cnt 
        self.rec = rec

    # override "less than"
    def __lt__(self, other):
        return self.cnt < other.cnt
    
    def __le__(self, other):
        return self.cnt <= other.cnt

In [6]:
def build_head_heap(item):
    heappush(head_heap, item)
    if len(head_heap) > head_amount:
        heappop(head_heap)

In [7]:
def build_tail_reservoir(item):
    global tail_reservoir_counter
    if tail_reservoir_counter < tail_amount:
        tail_reservoir.append(item)
    else:
        m = random.randint(0, tail_reservoir_counter)
        if m < tail_amount:
            tail_reservoir[m] = item
    tail_reservoir_counter += 1

In [8]:
for df in tqdm(pd.read_json('wish_queries_with_timestamp.json', lines=True, chunksize=10000)):
    for i in df.to_dict('records'):
        build_head_heap(Record(int(i['cnt']), i))
        build_tail_reservoir(i)

52910it [5:56:50,  2.47it/s]


In [9]:
len(tail_reservoir), len(head_heap)

(4000000, 8000000)

In [10]:
df_tail = pd.DataFrame(tail_reservoir)
df_head = pd.DataFrame([i.rec for i in head_heap])

In [11]:
head_queries = set(df_head['query'])

In [12]:
df_tail_nooverlap = df_tail[df_tail['query'].apply(lambda x: x not in head_queries)]

In [13]:
len(df_tail_nooverlap)

3939646

In [14]:
df_tail_nooverlap['sample_method'] = 'uniform'
df_head['sample_method'] = 'head'
df_head_tail = pd.concat([df_head, df_tail_nooverlap])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tail_nooverlap['sample_method'] = 'uniform'


In [15]:
df_head.gmv.mean(), df_tail_nooverlap.gmv.mean()

(191.0631545879474, 0.5279499341506274)

In [16]:
df_head_tail = df_head_tail.sample(frac=1.0, random_state=42)

In [17]:
assert len(set(df_head_tail['query'])) == len(df_head_tail)

In [18]:
df_head_tail['label_ordering'] = list(range(len(df_head_tail)))

In [19]:
df_head_tail.to_json('wish_queries_with_timestamp_3yr_all_sample_12M_headtail.json', lines=True, orient='records')

In [20]:
len(df_head_tail)

11939646