In [None]:
import pandas as pd
import numpy as np

df = pd.read_parquet("../data/train.parquet")

In [None]:
# Store, per row, the number of entries in that row's session mask.
df["length"] = df["session_mask"].map(len)

In [None]:
import multiprocessing
# Materialise every (id, sub-frame) pair produced by the groupby into a list.
df = list(df.groupby('id'))

In [None]:
from tqdm import tqdm

cores = multiprocessing.cpu_count()

def concatenate(vectors):
    """Join every array in ``vectors`` into one flat 1-D array.

    ``vectors`` may be any iterable of array-likes; ``axis=None`` makes
    ``np.concatenate`` flatten each element before joining them.
    """
    return np.concatenate(list(vectors), axis=None)

def append_dataframe(df):
    """Merge all rows of one group into a single-row DataFrame.

    Parameters
    ----------
    df : sequence
        A pair whose item 0 is the group id and whose item 1 is the group's
        DataFrame (the shape yielded by iterating a pandas groupby).

    Returns
    -------
    pandas.DataFrame
        One row holding the id, the flattened concatenation (in ascending
        ``timestamp`` order) of each sequence column, and the sum of the
        group's ``length`` column.
    """
    group_id = df[0]
    ordered = df[1].sort_values("timestamp")

    row = {"id": [group_id]}
    # Each sequence column is flattened into one array and wrapped in a list
    # so it occupies a single cell of the resulting row.
    for column in ("history", "timestamp", "session", "session_mask", "user_mask"):
        row[column] = [concatenate(ordered[column].values)]
    row["length"] = np.sum(ordered["length"].values)

    return pd.DataFrame(row)

# Apply append_dataframe to every group using a pool of `cores` worker
# processes, showing progress with tqdm, and collect the per-group frames.
with multiprocessing.Pool(cores) as pool:
    df = list(tqdm(pool.imap(append_dataframe, df), total=len(df)))

In [None]:
df = pd.concat(df)

In [None]:
df

In [None]:
df.to_parquet("../data/masked.parquet")

In [None]:
import pandas as pd
import numpy as np

df = pd.read_parquet("../data/brunch/session.parquet")

In [None]:
import json
from tqdm import tqdm

with open("../data/dictionary.json") as fp:
    dictionary = json.load(fp)
    
def make_label(x):
    """Add a ``session`` entry holding the dictionary values of ``x["history"]``.

    Items of ``x["history"]`` that are not keys of the module-level
    ``dictionary`` are dropped; the remaining ones are replaced by their
    mapped value, preserving order. ``x`` is modified and returned.
    """
    encoded = []
    for item in x["history"]:
        if item in dictionary:
            encoded.append(dictionary[item])
    x["session"] = encoded
    return x

tqdm.pandas()
df = df.progress_apply(make_label, axis=1)

In [None]:
# Read the dev / test user ids, one id per line.
# Only the line terminator is stripped: slicing off the last character
# unconditionally would truncate the final id when the file does not end
# with a newline.
with open("../data/brunch/predict/dev.users") as fp:
    dev = [line.rstrip("\n") for line in fp]
with open("../data/brunch/predict/test.users") as fp:
    test = [line.rstrip("\n") for line in fp]

In [None]:
df_dev = df[df.id.isin(dev)]

In [None]:
df_dev_grouped = df_dev.groupby("id")

In [None]:
from tqdm import tqdm

printed = False

def get_length(x):
    """Return a one-row DataFrame with a group's id and its total session length.

    ``x`` is a pair: item 0 is the group id and item 1 is the group's frame.
    The ``length`` value is the sum of ``len()`` over every entry of
    ``frame.session``.
    """
    group_id, frame = x[0], x[1]
    total = np.sum([len(items) for items in frame.session])
    return pd.DataFrame({"id": [group_id], "length": [total]})

# Materialise the groups, keep the first 100 of them, and compute one
# single-row length frame per sampled group.
df_dev_grouped = list(df_dev_grouped)
sampled = df_dev_grouped[:100]
df_list = [get_length(group) for group in tqdm(sampled)]


In [None]:
df_dev = pd.concat(df_list)

In [None]:
df_dev.sort_values("length")

In [None]:
df[df.id == "#009bca89575df8ed68a302c1ceaf7da4"]

In [None]:
df

In [None]:
df = [row for index, row in df.iterrows()]

In [None]:
df[0]

In [None]:
def user_parallel_process(frame):
    """Derive shifted input/target sequences and masks from one row's session.

    Returns ``-1`` when ``frame.session`` has fewer than two items.
    Otherwise the following entries are written to ``frame`` and ``frame``
    is returned:

    * ``session_input``  - the session without its last item
    * ``session_output`` - the session without its first item
    * ``session_mask``   - ``0.0`` followed by ``1.0`` values
    * ``user_mask``      - ``0.0`` values followed by a final ``1.0``

    Both masks have the same length as ``session_input``.
    """
    items = frame.session
    if len(items) < 2:
        return -1

    inputs = items[0:-1]
    frame["session_input"] = inputs
    frame["session_output"] = items[1:]

    # Every position except one is "inside" the session: the session mask
    # zeroes the first position, the user mask flags only the last one.
    remaining = len(inputs) - 1
    frame["session_mask"] = [0.0] + [1.0] * remaining
    frame["user_mask"] = [0.0] * remaining + [1.0]

    return frame

# Run user_parallel_process over every row with one worker process per CPU
# reported by multiprocessing.cpu_count(), showing progress with tqdm.
# NOTE(review): user_parallel_process returns -1 for sessions with fewer than
# two items, so the resulting list can mix processed rows with -1
# placeholders - filter them out before building a frame from it.
# NOTE(review): `multiprocessing` is only imported in an earlier cell of this
# notebook; confirm it is in scope when this cell runs on a fresh kernel.
cores = multiprocessing.cpu_count()
with multiprocessing.Pool(cores) as p:
    df = list(tqdm(p.imap(user_parallel_process, df), total=len(df)))

In [None]:
import json

with open("../data/dictionary.json") as fp:
    dictionary = json.load(fp)

In [None]:
# Split the dictionary into two aligned lists (keys and their values) and
# expose them as the "id" and "pos" columns of a DataFrame.
keys = list(dictionary.keys())
values = list(dictionary.values())

import pandas as pd

df = pd.DataFrame({"id": keys, "pos": values})

In [None]:
df.to_parquet("../data/brunch/dataframe_dictionary.parquet")

In [None]:
import pyarrow.parquet as pq

dataset = pq.ParquetDataset("../data/brunch/train")
table = dataset.read()

In [None]:
import pandas as pd

df = table.to_pandas()

In [None]:
df

In [None]:
df = df.sort_values(["id", "session"])

In [None]:
df

In [None]:
dictionary["@seochogirl_18"]

In [None]:
from tqdm import tqdm
tqdm.pandas()

In [None]:
import numpy as np

def numpy_fill(arr):
    '''Forward-fill NaNs along axis 1 of a 2-D array (solution provided by Divakar).

    Every NaN is replaced by the closest non-NaN value to its left in the
    same row. NaNs located before the first valid value of a row are left
    as they are, because there is nothing to carry forward.
    '''
    mask = np.isnan(arr)
    # Column index of each valid cell, 0 for NaN cells.
    idx = np.where(~mask, np.arange(mask.shape[1]), 0)
    # A running maximum turns that into "index of the last valid cell so far".
    np.maximum.accumulate(idx, axis=1, out=idx)
    out = arr[np.arange(idx.shape[0])[:, None], idx]
    return out


def forward_fill(record, unknown_id=None):
    """Forward-fill NaNs in a record's input/output sequences and flag trainability.

    Parameters
    ----------
    record : mapping
        Must provide ``pos``, ``session_input`` and ``session_output``.
        It is updated in place and returned.
    unknown_id : number, optional
        Value written to the first input position when that position is NaN.
        Defaults to ``len(dictionary)`` (the module-level ``dictionary``),
        which is what the function always used before this parameter existed.

    Returns
    -------
    The same ``record`` with ``trainable`` set. ``trainable`` is ``False``
    when ``pos`` is empty or when the input or the output sequence holds no
    valid value at all; otherwise it is ``True`` and ``session_input`` /
    ``session_output`` hold the forward-filled arrays.
    """
    pos = record["pos"]
    session_input = record["session_input"]
    session_output = record["session_output"]

    if len(pos) == 0:
        record["trainable"] = False
        return record

    if len(session_input) == 0 and len(session_output) == 0:
        session_input = [float(pos[0])]
        session_output = [float(pos[0])]

    # Work on copies: the sequences stored in the caller's frame must not be
    # modified as a side effect of patching the first element below.
    session_input = np.array(session_input)
    session_output = np.array(session_output)

    input_nans = np.isnan(session_input)
    output_nans = np.isnan(session_output)

    if input_nans.all():
        record["trainable"] = False
        return record
    if input_nans[0]:
        # No earlier value exists to carry forward, so seed with the sentinel.
        if unknown_id is None:
            unknown_id = len(dictionary)
        session_input[0] = unknown_id
    filled_input = numpy_fill(np.array([session_input]))[0]
    record["session_input"] = filled_input

    if output_nans.all():
        record["trainable"] = False
        return record
    if output_nans[0]:
        # Seed the first target from the *filled* input so the seed itself can
        # never be NaN, and clamp the index so a one-element input does not
        # raise IndexError.
        session_output[0] = filled_input[min(1, len(filled_input) - 1)]
    record["session_output"] = numpy_fill(np.array([session_output]))[0]
    record["trainable"] = True

    return record

df = df.progress_apply(forward_fill, axis=1)

In [None]:
df

In [None]:
len(dictionary)

In [None]:
df.to_parquet("../data/brunch/train/train.parquet")

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

from tqdm import tqdm

df = pd.read_parquet("../data/brunch/train/train.parquet")
df_trainable = df[df.trainable == True]
df_trainable = df_trainable.groupby("id")

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [5]:
# Build fixed-length, post-padded arrays with one entry per user group.
# NOTE: `dictionary` must already be loaded (the cell reading
# ../data/dictionary.json) before this cell runs.
input_list = []
label_list = []
mask_list = []

max_length = 30
dictionary_length = len(dictionary)
pad_sequences = tf.keras.preprocessing.sequence.pad_sequences

for key, frame in tqdm(df_trainable, total=len(df_trainable)):
    frame = frame.sort_values("session")

    # Flatten all of the group's per-row sequences into one sequence each.
    session_input = np.concatenate(frame.session_input.values, axis=None)
    session_output = np.concatenate(frame.session_output.values, axis=None)
    session_mask = np.concatenate(frame.session_mask.values, axis=None)
    user_mask = np.concatenate(frame.user_mask.values, axis=None)

    message = "At least one of the dimension doesn't match in the input."
    assert len(session_input) == len(session_output), message
    assert len(session_output) == len(session_mask), message
    assert len(session_mask) == len(user_mask), message

    # Groups longer than the padded width are skipped rather than truncated.
    if len(session_input) > max_length:
        continue

    # NOTE(review): pad_sequences is called without `dtype`, so its documented
    # default (int32) applies and float values are cast to integers - confirm
    # that is intended for the masks.
    inputs = pad_sequences([session_input, session_mask, user_mask],
                           maxlen=max_length,
                           padding="post")
    # Label padding uses dictionary_length as the fill value.
    label = pad_sequences([session_output],
                          maxlen=max_length,
                          value=float(dictionary_length),
                          padding="post")
    # 1 for real positions, 0 for padding.
    mask = pad_sequences([[1.0] * len(session_input)],
                         maxlen=max_length,
                         value=0.0,
                         padding="post")

    input_list.append(inputs)
    label_list.append(label)
    mask_list.append(mask)

inputs = np.array(input_list)
label = np.array(label_list)
mask = np.array(mask_list)

100%|██████████| 301930/301930 [05:03<00:00, 993.84it/s] 


In [3]:
import json
with open("../data/dictionary.json") as fp:
    dictionary = json.load(fp)

In [6]:
mask.shape

(222408, 1, 30)

In [7]:
inputs.shape

(222408, 3, 30)

In [8]:
label.shape

(222408, 1, 30)

In [9]:
max = np.max(label)

In [10]:
max

642190

In [11]:
res = np.where(label == (dictionary_length - 1))

In [12]:
res

(array([165548]), array([0]), array([4]))