In [None]:
import pandas as pd
import numpy as np

df = pd.read_parquet("../data/train.parquet")

In [None]:
df["length"] = df["session_mask"].map(lambda x: len(x))

In [None]:
import multiprocessing
df = [elem for elem in df.groupby('id')]

In [None]:
from tqdm import tqdm

cores = multiprocessing.cpu_count()

def concatenate(vectors):
    """Join a collection of array-likes into one flat 1-D NumPy array.

    Parameters
    ----------
    vectors : iterable of array-like
        The pieces to join. The iterable is materialised into a list first,
        so generators and object-dtype arrays of arrays are accepted.

    Returns
    -------
    numpy.ndarray
        All elements of every piece, in order. ``axis=None`` makes NumPy
        flatten each piece before joining, so inputs of differing shapes
        are fine.
    """
    return np.concatenate(list(vectors), axis=None)

def append_dataframe(df):
    """Collapse one group's rows into a single-row DataFrame.

    Parameters
    ----------
    df : sequence
        A ``(key, frame)`` pair: index 0 is the group key and index 1 is the
        DataFrame holding that group's rows (the shape produced by iterating
        ``DataFrame.groupby``).

    Returns
    -------
    pandas.DataFrame
        One row with ``id`` set to the key, each of ``history``,
        ``timestamp``, ``session``, ``session_mask`` and ``user_mask`` set to
        the group's per-row values flattened and joined in ascending
        ``timestamp`` row order, and ``length`` set to the sum of the group's
        ``length`` column.
    """
    group_key = df[0]
    ordered = df[1].sort_values("timestamp")

    merged_columns = ("history", "timestamp", "session", "session_mask", "user_mask")

    row = {"id": [group_key]}
    for column in merged_columns:
        # Wrap in a list so the whole flattened array becomes one cell.
        row[column] = [concatenate(ordered[column].values)]
    # Scalar on purpose: pandas broadcasts it over the single row.
    row["length"] = np.sum(ordered["length"].values)

    return pd.DataFrame(row)

# Collapse every (id, frame) group in a pool of `cores` worker processes.
# Pool.imap yields results lazily in input order, which lets tqdm report
# progress while the collected list stays aligned with the input groups.
with multiprocessing.Pool(cores) as p:
    df = list(tqdm(p.imap(append_dataframe, df), total=len(df)))

In [None]:
df = pd.concat(df)

In [None]:
df

In [None]:
df.to_parquet("../data/masked.parquet")

In [None]:
import pandas as pd
import numpy as np

df = pd.read_parquet("../data/brunch/session.parquet")

In [None]:
import json
from tqdm import tqdm

with open("../data/dictionary.json") as fp:
    dictionary = json.load(fp)
    
def make_label(x):
    """Attach a ``session`` entry built from dictionary lookups of ``history``.

    Parameters
    ----------
    x : mapping-like row
        Must provide ``x["history"]`` (an iterable of keys) and support item
        assignment.

    Returns
    -------
    The same ``x``, with ``x["session"]`` set to the list of
    ``dictionary[key]`` for every history key present in the module-level
    ``dictionary``. Keys missing from the dictionary are dropped, so the
    result may be shorter than the history.
    """
    encoded = []
    for token in x["history"]:
        if token in dictionary:
            encoded.append(dictionary[token])
    x["session"] = encoded
    return x

tqdm.pandas()
df = df.progress_apply(make_label, axis=1)

In [None]:
# Load the dev / test user ids, one id per line.
# Only the line terminator is stripped: slicing off the last character
# unconditionally would truncate the final id whenever the file does not end
# with a newline.
with open("../data/brunch/predict/dev.users") as fp:
    dev = [line.rstrip("\n") for line in fp]
with open("../data/brunch/predict/test.users") as fp:
    test = [line.rstrip("\n") for line in fp]

In [None]:
df_dev = df[df.id.isin(dev)]

In [None]:
df_dev_grouped = df_dev.groupby("id")

In [None]:
from tqdm import tqdm

printed = False

def get_length(x):
    """Summarise one group as a single-row frame with its total session length.

    Parameters
    ----------
    x : sequence
        A ``(key, frame)`` pair; ``frame.session`` must be iterable and each
        of its elements must support ``len``.

    Returns
    -------
    pandas.DataFrame
        One row with ``id`` (the key) and ``length`` (the sum of ``len`` over
        every element of ``frame.session``).
    """
    key = x[0]
    total = np.sum([len(entry) for entry in x[1].session])
    return pd.DataFrame({"id": [key], "length": [total]})

df_dev_grouped = [elem for elem in df_dev_grouped]
sampled = df_dev_grouped[0:100]
df_list = []
for elem in tqdm(sampled):
    df_list.append(get_length(elem))


In [None]:
df_dev = pd.concat(df_list)

In [None]:
df_dev.sort_values("length")

In [None]:
df[df.id == "#009bca89575df8ed68a302c1ceaf7da4"]

In [None]:
df

In [None]:
df = [row for index, row in df.iterrows()]

In [None]:
df[0]

In [None]:
def user_parallel_process(frame):
    """Derive next-item inputs/targets and the two masks for one row.

    Parameters
    ----------
    frame : row-like
        Must expose ``frame.session`` (a sliceable sequence) and support item
        assignment.

    Returns
    -------
    ``-1`` when the session has fewer than two items. Otherwise the same
    ``frame`` with four entries added, each of length ``len(session) - 1``:

    * ``session_input``  - the session without its last item
    * ``session_output`` - the session without its first item
    * ``session_mask``   - ``0.0`` at the first position, ``1.0`` elsewhere
    * ``user_mask``      - ``1.0`` at the last position, ``0.0`` elsewhere
    """
    items = frame.session
    if len(items) < 2:
        return -1

    inputs = items[0:-1]
    frame["session_input"] = inputs
    frame["session_output"] = items[1:]

    steps = len(inputs)
    frame["session_mask"] = [0.0] + [1.0] * (steps - 1)
    frame["user_mask"] = [0.0] * (steps - 1) + [1.0]

    return frame

# Run user_parallel_process over every row in a pool with one worker per CPU.
# Rows whose session has fewer than two items come back as -1 instead of a
# row, so the resulting `df` list mixes -1 sentinels with processed rows.
cores = multiprocessing.cpu_count()
with multiprocessing.Pool(cores) as p:
    df = list(tqdm(p.imap(user_parallel_process, df), total=len(df)))

In [None]:
import json

with open("../data/dictionary.json") as fp:
    dictionary = json.load(fp)

In [None]:
import pandas as pd

# Split the loaded mapping into two parallel columns. keys() and values()
# iterate in the same order, so row i pairs each key with its own value.
keys = list(dictionary.keys())
values = list(dictionary.values())

df = pd.DataFrame({
    "id" : keys,
    "pos" : values
})

In [None]:
df.to_parquet("../data/brunch/dataframe_dictionary.parquet")

In [1]:
import pyarrow.parquet as pq
import json
import pandas as pd

with open("../data/dictionary.json") as fp:
    dictionary = json.load(fp)
dataset = pq.ParquetDataset("../data/brunch/train")
table = dataset.read()
df = table.to_pandas()

In [2]:
from tqdm import tqdm
tqdm.pandas()

In [3]:
import numpy as np

def numpy_fill(arr):
    """Forward-fill NaNs along each row of a 2-D array.

    Solution provided by Divakar.

    Parameters
    ----------
    arr : numpy.ndarray
        2-D float array; it is not modified.

    Returns
    -------
    numpy.ndarray
        A new array in which every NaN is replaced by the nearest non-NaN
        value to its left in the same row. NaNs with no valid value to their
        left (leading NaNs) stay NaN.
    """
    is_missing = np.isnan(arr)
    column_ids = np.arange(is_missing.shape[1])
    # For each cell keep its own column index if valid, else 0 ...
    last_valid = np.where(is_missing, 0, column_ids)
    # ... then a running maximum turns that into "column of the most recent
    # valid cell so far" for every position in the row.
    last_valid = np.maximum.accumulate(last_valid, axis=1)
    row_ids = np.arange(last_valid.shape[0])[:, None]
    return arr[row_ids, last_valid]

def forward_fill(record):
    """Forward-fill NaNs in a row's session sequences and flag trainability.

    Reads ``pos``, ``session_input`` and ``session_output`` from ``record``,
    always sets ``record["trainable"]``, and returns the same ``record``.
    Depends on the module-level ``dictionary`` and on ``numpy_fill``.

    ``trainable`` is set to ``False`` (and the function returns early) when:

    * ``pos`` is empty;
    * every ``session_input`` value is NaN (this also covers an empty
      ``session_input``, because ``all`` of an empty sequence is ``True``);
    * every ``session_output`` value is NaN - note that in this case
      ``record["session_input"]`` has already been overwritten with its
      filled version.

    Otherwise both sequences are written back forward-filled and
    ``trainable`` is ``True``.
    """
    pos = record["pos"]
    session_input = record["session_input"]
    session_output = record["session_output"]
    
    if len(pos) == 0:
        record["trainable"] = False
        return record
    
    # Both sequences empty: fall back to a one-step sequence built from the
    # first `pos` value.
    if len(session_input) == 0 and len(session_output) == 0:
        session_input = [float(pos[0])]
        session_output = [float(pos[0])]
    
    # NaN masks are taken up front, before the in-place edits below, so the
    # "all NaN" checks reflect the data as it arrived.
    input_nans = np.isnan(session_input)
    output_nans = np.isnan(session_output)
    
    if all(input_nans):
        record["trainable"] = False
        return record
    # A leading NaN has nothing to its left to copy from, so it is replaced
    # with len(dictionary) as a placeholder. This assigns into the object read
    # from `record`, i.e. the original sequence is modified in place.
    if np.isnan(session_input[0]):
        session_input[0] = float(len(dictionary))
    # numpy_fill indexes axis 1, so wrap the sequence as a one-row 2-D array
    # and take row 0 back out.
    session_input = numpy_fill(np.array([session_input]))[0]
    record["session_input"] = session_input
    
    if all(output_nans):
        record["trainable"] = False
        return record
    # A leading NaN target takes the second (already filled) input value.
    # NOTE(review): index 1 is only guaranteed to exist if session_input and
    # session_output have the same length - TODO confirm against the producer.
    if np.isnan(session_output[0]):
        session_output[0] = session_input[1]
    session_output = numpy_fill(np.array([session_output]))[0]
    record["session_output"] = session_output
    record["trainable"] = True
    
    return record

In [None]:
df = df.progress_apply(forward_fill, axis=1)

100%|██████████| 3493934/3493934 [1:02:47<00:00, 941.08it/s] 

In [None]:
df.to_parquet("../data/brunch/train.parquet")

In [10]:
df[df.id == "#00104b6ef7bea05a3264ea0ab197fba9"]

Unnamed: 0,id,session,history,idx,timestamp,pos,session_input,session_output,session_mask,user_mask,trainable
561827,#5bb0b8360946111632456c07ccce48c4,3198,"[@dysonkim_44, @lunamin28_135]","[1, 0]","[2019-02-11T07:00:00.000000000, 2019-02-11T07:...","[254774, 475795]",[475795.0],[254774.0],[0.0],[1.0],True
561828,#eca9f6f057110a92d0aebaf8fdf972d4,1456,"[@prestigegorilla_250, @prestigegorilla_250]","[0, 1]","[2018-11-30T16:00:00.000000000, 2018-11-30T16:...","[604026, 604026]",[604026.0],[604026.0],[0.0],[1.0],True
561829,#7c20c30c5e6885073f82058372cb8535,3429,"[@heewoo7_39, @istandby4u2_29]","[1, 0]","[2019-02-20T22:00:00.000000000, 2019-02-20T22:...","[220589, 234]",[234.0],[220589.0],[0.0],[1.0],True
561830,#61eb6e260570ee3f78e654d862098048,2714,[@nareun_205],[0],[2019-01-22T03:00:00.000000000],[255438],[255438.0],[255438.0],[0.0],[1.0],True
561831,#43e81a19f9ae6a2ecfd385d165ee230c,34,"[@binkond_1015, @binkond_1015]","[0, 1]","[2018-10-02T10:00:00.000000000, 2018-10-02T10:...","[638942, 638942]",[638942.0],[638942.0],[0.0],[1.0],True
561832,#572e14672c366c0f07acc11ee21b6fd4,2908,"[@sunsutu_349, @sunsutu_349, @sunsutu_349, @da...","[0, 2, 3, 1]","[2019-01-30T05:00:00.000000000, 2019-01-30T05:...","[431918, 431918, 431918, 547717]","[431918.0, 547717.0, 431918.0]","[547717.0, 431918.0, 431918.0]","[0.0, 1.0, 1.0]","[0.0, 0.0, 1.0]",True
561833,#bde0bfc2604345aa49963bc3bb863330,787,"[@arista-seo_83, @arista-seo_83]","[0, 1]","[2018-11-02T19:00:00.000000000, 2018-11-02T19:...","[447708, 447708]",[447708.0],[447708.0],[0.0],[1.0],True
561834,#6f8c75c9a2157dc0e7eb4166c785d45d,3572,"[@brunch_151, @brunch_4, @yueunkimmm_7, @yueun...","[0, 3, 1, 2]","[2019-02-26T21:00:00.000000000, 2019-02-26T21:...","[419020, 500592, 320203, 320203]","[419020.0, 320203.0, 320203.0]","[320203.0, 320203.0, 500592.0]","[0.0, 1.0, 1.0]","[0.0, 0.0, 1.0]",True
561835,#f3c399e0b8d645fb741ccac1bc5ff9b2,1583,"[@sunghun_29, @lyricalp_4, @lyricalp_15, @lyri...","[0, 6, 12, 13, 1, 7, 9, 11, 17, 3, 15, 4, 8, 5...","[2018-12-05T23:00:00.000000000, 2018-12-05T23:...","[491320, 320691, 289628, 289628, 638691, 99089...","[491320.0, 638691.0, 67106.0, 522668.0, 456082...","[638691.0, 67106.0, 522668.0, 456082.0, 534479...","[0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",True
561836,#763dfb1062c7d217d7e86b7587ae6872,550,[@ohmygod_22],[0],[2018-10-23T22:00:00.000000000],[],[],[],[0.0],[1.0],False


In [11]:
# NOTE(review): `sample_df` is not assigned in any cell visible in this
# notebook; on a fresh kernel this line would raise NameError unless it is
# defined elsewhere - confirm where it comes from.
sample_df.to_parquet("../data/brunch/sample_train.parquet")

In [18]:
import pandas as pd
import numpy as np
import tensorflow as tf

from tqdm import tqdm

df = pd.read_parquet("../data/brunch/train.parquet")
df_trainable = df[df.trainable == True]
df_trainable = df_trainable.groupby("id")

In [13]:
import json
with open("../data/dictionary.json") as fp:
    dictionary = json.load(fp)

In [17]:
# Build fixed-width training tensors, one example per user.
input_list = []
label_list = []
mask_list = []

idx = 0
max_length = 30
dictionary_length = len(dictionary)

for idx, (key, frame) in tqdm(enumerate(df_trainable), total=len(df_trainable)):
    # Put the user's rows in ascending "session" order before joining them.
    frame = frame.sort_values("session")

    # Flatten each per-row sequence column into one 1-D array for the user.
    session_input = np.concatenate(frame.session_input.values, axis=None)
    session_output = np.concatenate(frame.session_output.values, axis=None)
    session_mask = np.concatenate(frame.session_mask.values, axis=None)
    user_mask = np.concatenate(frame.user_mask.values, axis=None)

    message = "At least one of the dimension doesn't match in the input."
    assert len(session_input) == len(session_output), message
    assert len(session_output) == len(session_mask), message
    assert len(session_mask) == len(user_mask), message

    # Users longer than the padding width are skipped rather than truncated.
    # Compare against max_length (not a second literal) so the filter and the
    # padding width below cannot drift apart.
    if len(session_input) > max_length:
        continue

    # pad_sequences is used with its default dtype, so the float values are
    # stored as int32 in the padded arrays.
    # Rows of `inputs`: 0 = item ids, 1 = session mask, 2 = user mask;
    # padded at the end with 0.
    inputs = [session_input, session_mask, user_mask]
    inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs,
                                                           maxlen=max_length,
                                                           padding="post")
    # Labels are padded with len(dictionary) instead of 0.
    label = tf.keras.preprocessing.sequence.pad_sequences([session_output],
                                                          maxlen=max_length,
                                                          value=np.float64(dictionary_length),
                                                          padding="post")

    # 1 for real timesteps, 0 for padding.
    mask = [1.0] * len(session_input)
    mask = tf.keras.preprocessing.sequence.pad_sequences([mask],
                                                         maxlen=max_length,
                                                         value=0.0,
                                                         padding="post")

    input_list.append(inputs)
    label_list.append(label)
    mask_list.append(mask)

    # Debug aid: show the padded label for one specific user.
    if key == "#00104b6ef7bea05a3264ea0ab197fba9":
        print(label)

inputs = np.array(input_list)
label = np.array(label_list)
mask = np.array(mask_list)



  0%|          | 0/96 [00:00<?, ?it/s][A[A

 90%|████████▉ | 86/96 [00:00<00:00, 852.22it/s][A[A

100%|██████████| 96/96 [00:00<00:00, 833.99it/s][A[A

[[ 38319  38319 355501 568640  60128 642190 642190 642190 642190 642190
  642190 642190 642190 642190 642190 642190 642190 642190 642190 642190
  642190 642190 642190 642190 642190 642190 642190 642190 642190 642190]]


In [None]:
print(float(dictionary_length))

In [None]:
df

In [None]:
df[df.id == "#00104b6ef7bea05a3264ea0ab197fba9"]

In [15]:
label

array([[[ 38319,  38319, 355501, ..., 642190, 642190, 642190]],

       [[419020, 114019, 114019, ..., 642190, 642190, 642190]],

       [[380165, 642190, 642190, ..., 642190, 642190, 642190]],

       ...,

       [[591255, 307775, 307775, ..., 642190, 642190, 642190]],

       [[638691,  67106, 522668, ..., 642190, 642190, 642190]],

       [[155987, 642190, 642190, ..., 642190, 642190, 642190]]],
      dtype=int32)

In [19]:
df[df.id == "#00104b6ef7bea05a3264ea0ab197fba9"]

Unnamed: 0,id,session,history,idx,timestamp,pos,session_input,session_output,session_mask,user_mask,trainable
561926,#00104b6ef7bea05a3264ea0ab197fba9,1342,"[@wootaiyoung_19, @snobberys_109, @tenbody_902...","[5, 3, 0, 4, 2, 1]","[2018-11-25T22:00:00.000, 2018-11-25T22:00:00....","[60128, 355501, 38319, 568640]","[38319.0, 38319.0, 38319.0, 355501.0, 568640.0]","[nan, nan, 355501.0, 568640.0, 60128.0]","[0.0, 1.0, 1.0, 1.0, 1.0]","[0.0, 0.0, 0.0, 0.0, 1.0]",True
