In [66]:
import pandas as pd
from tqdm import tqdm
import random

In [67]:
head_amount = 8000000
tail_amount = 2000000

In [68]:
from heapq import heappush, heappop

In [69]:
head_heap = []
tail_reservoir = []
tail_reservoir_counter = 0

In [70]:
class Record(object):
    def __init__(self, cnt, rec):
        self.cnt = cnt 
        self.rec = rec

    # override "less than"
    def __lt__(self, other):
        return self.cnt < other.cnt
    
    def __le__(self, other):
        return self.cnt <= other.cnt

In [71]:
def build_head_heap(item):
    heappush(head_heap, item)
    if len(head_heap) > head_amount:
        heappop(head_heap)

In [72]:
def build_tail_reservoir(item):
    global tail_reservoir_counter
    if tail_reservoir_counter < tail_amount:
        tail_reservoir.append(item)
    else:
        m = random.randint(0, tail_reservoir_counter)
        if m < tail_amount:
            tail_reservoir[m] = item
    tail_reservoir_counter += 1

In [73]:
for df in tqdm(pd.read_json('wish_queries_with_timestamp_3wordsormore.json', lines=True, chunksize=10000)):
    for i in df.to_dict('records'):
        build_head_heap(Record(int(i['cnt']), i))
        build_tail_reservoir(i)

38652it [4:23:09,  2.45it/s]


In [75]:
len(tail_reservoir), len(head_heap)

(2000000, 8000000)

In [76]:
df_tail = pd.DataFrame(tail_reservoir)
df_head = pd.DataFrame([i.rec for i in head_heap])

In [78]:
head_queries = set(df_head['query'])

In [79]:
df_tail_nooverlap = df_tail[df_tail['query'].apply(lambda x: x not in head_queries)]

In [80]:
len(df_tail_nooverlap)

1958440

In [81]:
df_tail_nooverlap['sample_method'] = 'uniform'
df_head['sample_method'] = 'head'
df_head_tail = pd.concat([df_head, df_tail_nooverlap])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tail_nooverlap['sample_method'] = 'uniform'


In [86]:
df_head.gmv.mean(), df_tail_nooverlap.gmv.mean()

(75.69147587114027, 0.49721621273490296)

In [90]:
df_head_tail = df_head_tail.sample(frac=1.0, random_state=42)

In [91]:
assert len(set(df_head_tail['query'])) == len(df_head_tail)

In [93]:
df_head_tail['label_ordering'] = list(range(len(df_head_tail)))

In [94]:
df_head_tail.to_json('wish_queries_with_timestamp_3yr_3wordsormore_sample_10M_headtail.json', line=True, orient='records')

Unnamed: 0,query,min_timestamp,max_timestamp,min_dt,max_dt,cnt,gmv,sample_method,label_ordering
1759013,dog cat eater,1585400611,1586565651,2020-03-29,2020-04-11,20,0.000000,uniform,0
1021172,cargador co.putasora en vehicilo,1585532079,1585532128,2020-03-30,2020-03-30,5,0.000000,uniform,1
3584011,pendant gold chain,1579040066,1658453818,2020-01-15,2022-07-22,86,0.000000,head,2
841328,eartg colored shoes,1642706820,1642706960,2022-01-21,2022-01-21,8,0.000000,uniform,3
5362805,turquoise and gold necklace,1579522328,1667614443,2020-01-21,2022-11-05,290,9.790000,head,4
...,...,...,...,...,...,...,...,...,...
1557094,gopro gimbal karma,1587897227,1590849556,2020-04-27,2020-05-31,4,0.000000,uniform,9958435
2234489,ruger lcp 380 magazine,1579020577,1669640608,2020-01-15,2022-11-29,200,0.000000,head,9958436
4304572,thuốc kich dục nữ bán ở đâu,1588407671,1669605086,2020-05-03,2022-11-28,218,0.000000,head,9958437
6550634,tactical bag pack,1578802068,1662227324,2020-01-12,2022-09-04,222,16.965158,head,9958438
