In [None]:
import pandas as pd
import numpy as np

df = pd.read_parquet("../data/train.parquet")

In [None]:
# Store the number of elements in each row's `session_mask` as a new `length` column.
df["length"] = df["session_mask"].map(len)

In [None]:
import multiprocessing
# Materialise the groupby as a list of (id, sub-frame) tuples; `df` is a list from here on.
df = list(df.groupby('id'))

In [None]:
from tqdm import tqdm

cores = multiprocessing.cpu_count()

def concatenate(vectors):
    """Join every array in ``vectors`` into one flat 1-D array.

    ``axis=None`` makes NumPy flatten each input before joining, so the
    inputs do not need to share a shape. ``vectors`` may be any iterable.
    """
    return np.concatenate(list(vectors), axis=None)

def append_dataframe(df):
    """Collapse one ``(id, frame)`` group into a single-row DataFrame.

    The frame is sorted by ``timestamp``; each of the sequence columns is then
    flattened across all rows with ``concatenate`` and stored as one cell.
    ``length`` is the sum of the group's ``length`` column.

    Parameters
    ----------
    df : sequence
        ``df[0]`` is the group key and ``df[1]`` the group's DataFrame.

    Returns
    -------
    pandas.DataFrame
        One row with columns ``id``, ``history``, ``timestamp``, ``session``,
        ``session_mask``, ``user_mask`` and ``length``.
    """
    user_id, frame = df[0], df[1]
    frame = frame.sort_values("timestamp")

    sequence_columns = ("history", "timestamp", "session", "session_mask", "user_mask")
    row = {"id": [user_id]}
    for column in sequence_columns:
        # Wrap in a list so the whole flattened array lands in a single cell.
        row[column] = [concatenate(frame[column].values)]
    row["length"] = np.sum(frame["length"].values)

    return pd.DataFrame(row)

# Collapse every (id, frame) group in parallel using `cores` worker processes.
# `imap` yields results in input order; wrapping it in `tqdm` shows progress.
# `df` is rebound from the list of groups to the list of single-row DataFrames.
with multiprocessing.Pool(cores) as p:
    df = list(tqdm(p.imap(append_dataframe, df), total=len(df)))

In [None]:
df = pd.concat(df)

In [None]:
df

In [None]:
df.to_parquet("../data/masked.parquet")

In [None]:
import pandas as pd
import numpy as np

df = pd.read_parquet("../data/brunch/session.parquet")

In [None]:
import json
from tqdm import tqdm

with open("../data/dictionary.json") as fp:
    dictionary = json.load(fp)
    
def make_label(x):
    """Set ``x["session"]`` to the dictionary values of the row's history.

    Entries of ``x["history"]`` that are not keys of the module-level
    ``dictionary`` are dropped. ``x`` is modified in place and returned.
    """
    x["session"] = [dictionary[item] for item in x["history"] if item in dictionary]
    return x

tqdm.pandas()
df = df.progress_apply(make_label, axis=1)

In [None]:
# Read the dev / test user lists, one entry per line.
# Only the line terminator is stripped: unconditionally slicing off the last
# character would truncate the final entry when the file has no trailing newline.
with open("../data/brunch/predict/dev.users") as fp:
    dev = [line.rstrip("\n") for line in fp]
with open("../data/brunch/predict/test.users") as fp:
    test = [line.rstrip("\n") for line in fp]

In [None]:
df_dev = df[df.id.isin(dev)]

In [None]:
df_dev_grouped = df_dev.groupby("id")

In [None]:
from tqdm import tqdm

printed = False

def get_length(x):
    """Summarise one ``(id, frame)`` group as a single-row DataFrame.

    Parameters
    ----------
    x : sequence
        ``x[0]`` is the group key, ``x[1]`` a frame with a ``session`` column.

    Returns
    -------
    pandas.DataFrame
        One row with ``id`` and ``length``, where ``length`` is the total
        number of elements over all entries of the ``session`` column.
    """
    user_id = x[0]
    sessions = x[1].session
    total = np.sum([len(entry) for entry in sessions])
    return pd.DataFrame({"id": [user_id], "length": [total]})

# Turn the groupby into a list of (id, frame) tuples and measure only the first 100 groups.
df_dev_grouped = list(df_dev_grouped)
sampled = df_dev_grouped[:100]
df_list = [get_length(group) for group in tqdm(sampled)]


In [None]:
df_dev = pd.concat(df_list)

In [None]:
df_dev.sort_values("length")

In [None]:
df[df.id == "#009bca89575df8ed68a302c1ceaf7da4"]

In [None]:
df

In [None]:
df = [row for index, row in df.iterrows()]

In [None]:
df[0]

In [None]:
def user_parallel_process(frame):
    """Add shifted input/output sequences and their masks to one record.

    ``session_input`` is the session without its last element and
    ``session_output`` the session without its first, so the two are aligned
    one step apart. Both masks have the same length as ``session_input``.

    Returns
    -------
    The same ``frame`` with ``session_input``, ``session_output``,
    ``session_mask`` and ``user_mask`` set, or ``-1`` when the session has
    fewer than two elements.
    """
    items = frame.session
    if len(items) < 2:
        return -1

    inputs = items[0:-1]
    frame["session_input"] = inputs
    frame["session_output"] = items[1:]

    steps = len(inputs)
    # Session mask: 0.0 on the first step, 1.0 on every later step.
    frame["session_mask"] = [0.0] + [1.0] * (steps - 1)
    # User mask: 1.0 only on the last step.
    frame["user_mask"] = [0.0] * (steps - 1) + [1.0]

    return frame

# The only other `import multiprocessing` in this notebook sits in a much earlier
# cell; importing it here keeps this cell runnable without that cell having run.
import multiprocessing

# Apply `user_parallel_process` to every row in parallel, one worker per CPU core.
# `imap` preserves input order. Rows whose session has fewer than two elements
# come back as the sentinel -1, so the resulting list can mix rows and -1 values.
cores = multiprocessing.cpu_count()
with multiprocessing.Pool(cores) as p:
    df = list(tqdm(p.imap(user_parallel_process, df), total=len(df)))

In [None]:
import json

with open("../data/dictionary.json") as fp:
    dictionary = json.load(fp)

In [None]:
# Split the dictionary into two parallel lists; both follow the dict's iteration order.
keys = list(dictionary.keys())
values = list(dictionary.values())
    
import pandas as pd

df = pd.DataFrame({
    "id" : keys,
    "pos" : values
})

In [None]:
df.to_parquet("../data/brunch/dataframe_dictionary.parquet")

In [None]:
import pyarrow.parquet as pq
import json
import pandas as pd

with open("../data/dictionary.json") as fp:
    dictionary = json.load(fp)
dataset = pq.ParquetDataset("../data/brunch/train")
table = dataset.read()
df = table.to_pandas()

In [None]:
from tqdm import tqdm
tqdm.pandas()

In [None]:
import numpy as np

def numpy_fill(arr):
    """Forward-fill NaNs along each row of a 2-D array (solution provided by Divakar).

    Each NaN is replaced by the nearest non-NaN value to its left in the same
    row. NaNs that come before the first valid value of a row stay NaN.
    A new array is returned; ``arr`` itself is not modified.
    """
    is_nan = np.isnan(arr)
    # Column index of every valid cell, 0 wherever the cell is NaN.
    last_valid = np.where(is_nan, 0, np.arange(is_nan.shape[1]))
    # A running maximum turns that into "column of the most recent valid cell".
    np.maximum.accumulate(last_valid, axis=1, out=last_valid)
    row_index = np.arange(last_valid.shape[0])[:, None]
    return arr[row_index, last_valid]

def forward_fill(record):
    """Forward-fill NaNs in a record's input/output sequences and flag it as trainable.

    Reads ``pos``, ``session_input`` and ``session_output`` from ``record`` and
    relies on the module-level ``dictionary`` and ``numpy_fill``.

    ``record["trainable"]`` is set to False (and the record returned early) when
    ``pos`` is empty, when ``session_input`` is entirely NaN, or when
    ``session_output`` is entirely NaN. In the last case ``session_input`` has
    already been replaced by its filled version. Otherwise both sequences are
    replaced by their forward-filled versions and ``trainable`` is set to True.

    The record is modified in place and returned.
    """
    positions = record["pos"]
    inputs = record["session_input"]
    outputs = record["session_output"]

    if len(positions) == 0:
        record["trainable"] = False
        return record

    # Both sequences empty: seed each of them with the first position.
    if len(inputs) == 0 and len(outputs) == 0:
        inputs = [float(positions[0])]
        outputs = [float(positions[0])]

    # NaN flags for both sequences are taken before anything is filled.
    inputs_missing = np.isnan(inputs)
    outputs_missing = np.isnan(outputs)

    if all(inputs_missing):
        record["trainable"] = False
        return record
    # A leading NaN has nothing to its left to copy from, so it is set to
    # len(dictionary) before filling.
    if np.isnan(inputs[0]):
        inputs[0] = float(len(dictionary))
    inputs = numpy_fill(np.array([inputs]))[0]
    record["session_input"] = inputs

    if all(outputs_missing):
        record["trainable"] = False
        return record
    # A leading NaN in the output takes the second element of the filled input.
    if np.isnan(outputs[0]):
        outputs[0] = inputs[1]
    outputs = numpy_fill(np.array([outputs]))[0]
    record["session_output"] = outputs
    record["trainable"] = True

    return record

In [None]:
df = df.progress_apply(forward_fill, axis=1)

In [None]:
df.to_parquet("../data/brunch/train.parquet")

In [None]:
df[df.id == "#00104b6ef7bea05a3264ea0ab197fba9"]

In [None]:
# NOTE(review): `sample_df` is never assigned in any visible cell of this notebook, so this
# line raises NameError on a fresh kernel — presumably it came from a since-deleted cell; confirm.
sample_df.to_parquet("../data/brunch/sample_train.parquet")

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import json

from tqdm import tqdm

with open("../data/dictionary.json") as fp:
    dictionary = json.load(fp)

df = pd.read_parquet("../data/brunch/train.parquet")

In [None]:
df[df.id == "#00104b6ef7bea05a3264ea0ab197fba9"]

In [None]:
df_trainable = df[df.trainable == True]
df_trainable = df_trainable.groupby("id")

In [None]:
# Build fixed-width, post-padded arrays per user: stacked inputs, labels and a validity mask.
input_list = []
label_list = []
mask_list = []

max_length = 30
dictionary_length = len(dictionary)

for key, frame in tqdm(df_trainable, total=len(df_trainable)):
    frame = frame.sort_values("session")

    # Flatten every per-row sequence of this user into one 1-D array each.
    session_input = np.concatenate(frame.session_input.values, axis=None)
    session_output = np.concatenate(frame.session_output.values, axis=None)
    session_mask = np.concatenate(frame.session_mask.values, axis=None)
    user_mask = np.concatenate(frame.user_mask.values, axis=None)

    message = "At least one of the dimension doesn't match in the input."
    assert len(session_input) == len(session_output), message
    assert len(session_output) == len(session_mask), message
    assert len(session_mask) == len(user_mask), message

    # Users longer than the padded width are skipped rather than truncated.
    # The limit is `max_length` (previously a separate literal 30) so the filter
    # and the padding width cannot drift apart.
    if len(session_input) > max_length:
        continue

    # NOTE(review): pad_sequences is called without `dtype`, so its default applies;
    # the Keras default is an integer dtype, which would store the float masks as
    # integers — confirm that is intended.
    inputs = [session_input, session_mask, user_mask]
    inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs,
                                                           maxlen=max_length,
                                                           padding="post")
    # Labels are padded with len(dictionary) instead of the default 0.
    label = tf.keras.preprocessing.sequence.pad_sequences([session_output],
                                                          maxlen=max_length,
                                                          value=np.float64(dictionary_length),
                                                          padding="post")

    # 1.0 for every real position, 0.0 for padding.
    mask = [1.0] * len(session_input)
    mask = tf.keras.preprocessing.sequence.pad_sequences([mask],
                                                         maxlen=max_length,
                                                         value=0.0,
                                                         padding="post")

    input_list.append(inputs)
    label_list.append(label)
    mask_list.append(mask)

    # Debug output for one specific user.
    if key == "#00104b6ef7bea05a3264ea0ab197fba9":
        print(label)

inputs = np.array(input_list)
label = np.array(label_list)
mask = np.array(mask_list)

In [None]:
np.save("../data/brunch/train", inputs)
np.save("../data/brunch/label", label)
np.save("../data/brunch/mask", mask)

In [1]:
import pyarrow.parquet as pq
import json
import pandas as pd

with open("../data/dictionary.json") as fp:
    dictionary = json.load(fp)
dataset = pq.ParquetDataset("../data/brunch/train")
table = dataset.read()
df = table.to_pandas()

In [2]:
# Keep only rows whose `session_input` has at least 3 elements.
df = df[df["session_input"].map(len) >= 3]

In [3]:
df = df.groupby('id').filter(lambda x: x['session'].count() >= 5)

In [4]:
import numpy as np

def filter_nan(x):
    """Return True when every element of ``x`` is NaN (also True for an empty ``x``)."""
    return all(np.isnan(x))

# Drop rows whose `session_input` is entirely NaN, then apply the same rule to
# `session_output` on the rows that remain.
df = df[df["session_input"].map(filter_nan).eq(False)]
df = df[df["session_output"].map(filter_nan).eq(False)]

In [5]:
import numpy as np

def forward_fill(arr):
    """Forward-fill NaNs along each row of a 2-D array (solution provided by Divakar).

    Each NaN takes the nearest non-NaN value to its left in the same row; NaNs
    before the first valid value of a row are left as NaN. Returns a new array.

    Note: this rebinds the name ``forward_fill``, which an earlier cell of this
    notebook defines with a different, record-based signature.
    """
    nan_mask = np.isnan(arr)
    # Own column index for valid cells, 0 for NaN cells.
    source_col = np.where(nan_mask, 0, np.arange(nan_mask.shape[1]))
    # Running maximum gives the column of the latest valid cell seen so far.
    np.maximum.accumulate(source_col, axis=1, out=source_col)
    rows = np.arange(source_col.shape[0])[:, None]
    return arr[rows, source_col]

def bidirectional_fill(x):
    """Fill NaNs in each row of a 2-D array from the left, then from the right.

    The first pass forward-fills, which leaves any leading NaNs in place. The
    array is then mirrored along axis 1, forward-filled again so those NaNs
    take the first valid value of the row, and mirrored back.
    """
    left_filled = forward_fill(x)
    mirrored = forward_fill(np.flip(left_filled, axis=1))
    return np.flip(mirrored, axis=1)

def fill_nan(x):
    """Merge all rows of one group into a single row with NaN-filled sequences.

    The group is sorted by ``session``. Each row's ``session_input`` and
    ``session_output`` is filled with ``bidirectional_fill`` on its own, and the
    per-row sequences (including both masks) are then concatenated across rows.
    ``timestamp`` becomes the last element of the last row's timestamp value.

    Returns the first row of the sorted group carrying the merged values; all
    other columns keep that first row's values.
    """
    x = x.sort_values("session")
    n_rows = len(x)

    filled_inputs = [bidirectional_fill(np.array([seq])) for seq in x["session_input"].values]
    filled_outputs = [bidirectional_fill(np.array([seq])) for seq in x["session_output"].values]
    session_masks = list(x["session_mask"].values)
    user_masks = list(x["user_mask"].values)
    last_timestamps = x["timestamp"].values[-1]

    # Every row receives the same merged value; only the first row is returned.
    x["session_input"] = [np.hstack(filled_inputs)[0]] * n_rows
    x["session_output"] = [np.hstack(filled_outputs)[0]] * n_rows
    x["session_mask"] = [np.hstack(session_masks)] * n_rows
    x["user_mask"] = [np.hstack(user_masks)] * n_rows
    x["timestamp"] = [last_timestamps[-1]] * n_rows

    return x.iloc[0]

In [6]:
def func(x):
    """Identity function: return ``x`` unchanged."""
    return x

from tqdm import tqdm
tqdm.pandas()

In [7]:
df = df.groupby("id").progress_apply(fill_nan)

100%|██████████| 55557/55557 [03:29<00:00, 265.71it/s]


In [None]:
df.to_csv("../data/brunch/train.csv")

In [None]:
df

In [None]:
df.dtypes

In [8]:
# Store the number of elements in each row's `session_input` as the `length` column.
df["length"] = df["session_input"].map(len)

In [9]:
df

Unnamed: 0_level_0,id,session,history,idx,timestamp,pos,session_input,session_output,session_mask,user_mask,length
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
#000474bba0c00c70e12ac7cfc3d04553,#000474bba0c00c70e12ac7cfc3d04553,1503,"[@rickeygo_71, @reading15m_604, @analee_147, @...","[4, 2, 0, 1, 3]",2019-02-27 16:00:00,"[181356, 20695, 389952, 19418, 396636]","[389952.0, 19418.0, 20695.0, 396636.0, 219967....","[19418.0, 20695.0, 396636.0, 181356.0, 58730.0...","[0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, ...","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",144
#000b499c90650997ef29cd7e87f176a4,#000b499c90650997ef29cd7e87f176a4,833,"[@brunch_1, @ystweety07_40, @ystweety07_89, @j...","[1, 0, 8, 2, 7, 5, 3, 4, 6]",2018-11-20 15:00:00,"[568640, 552047, 71408, 621996, 236369, 77794,...","[552047.0, 568640.0, 621996.0, 304231.0, 97482...","[568640.0, 621996.0, 304231.0, 97482.0, 77794....","[0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...",40
#000b96b2ac278425d84d3842c9fa288b,#000b96b2ac278425d84d3842c9fa288b,1628,"[@brunch_78, @brunch_79, @writersumin_4, @youn...","[4, 3, 2, 1, 0]",2018-12-23 15:00:00,"[298496, 150148, 576351, 283063, 490783]","[490783.0, 283063.0, 576351.0, 150148.0, 16424...","[283063.0, 576351.0, 150148.0, 298496.0, 47991...","[0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, ...","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...",27
#000c82417339e173dbed1e413e9c15e0,#000c82417339e173dbed1e413e9c15e0,2061,"[@brunch_78, @brunch_110, @brunch_59, @gogksk_...","[3, 2, 4, 1, 0]",2019-02-25 22:00:00,"[298496, 439225, 399382, 585223, 523814]","[523814.0, 585223.0, 439225.0, 298496.0, 52355...","[585223.0, 439225.0, 298496.0, 399382.0, 52355...","[0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, ...","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",468
#000e146eb048ca65bb32e712ea0b0bd6,#000e146eb048ca65bb32e712ea0b0bd6,1544,"[@noma1030_55, @noma1030_55, @noma1030_55, @an...","[9, 11, 13, 0, 4, 14, 1, 3, 6, 7, 8, 10, 12, 1...",2018-12-28 08:00:00,"[367628, 367628, 367628, 5730, 562777, 208462,...","[5730.0, 429813.0, 226691.0, 429813.0, 562777....","[429813.0, 226691.0, 429813.0, 562777.0, 22669...","[0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",185
#000fe61478d384d09f3bcdd0c2f5227d,#000fe61478d384d09f3bcdd0c2f5227d,1381,"[@mongul-mongul_76, @mongul-mongul_76, @danpyu...","[2, 3, 0, 1]",2019-02-28 10:00:00,"[172349, 172349, 177064, 177064]","[177064.0, 177064.0, 172349.0, 460940.0, 46094...","[177064.0, 172349.0, 172349.0, 460940.0, 29123...","[0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, ...","[0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, ...",211
#0013c35c789f0936654ba06389ea75cb,#0013c35c789f0936654ba06389ea75cb,340,"[@bliee_55, @seungmom_217, @jaihie_3, @repd_5,...","[7, 3, 4, 1, 6, 5, 0, 2]",2019-02-28 04:00:00,"[177153, 425786, 530064, 197081, 402932, 56710...","[266757.0, 197081.0, 564698.0, 425786.0, 53006...","[197081.0, 564698.0, 425786.0, 530064.0, 56710...","[0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...",72
#0014ca12ae4af9e9ed68ca3a62084f11,#0014ca12ae4af9e9ed68ca3a62084f11,465,"[@bookguru_32, @writermonet_47, @writermonet_4...","[3, 1, 4, 0, 2]",2019-01-05 10:00:00,"[287501, 565053, 565053, 297249, 91098]","[297249.0, 565053.0, 91098.0, 287501.0, 565053...","[565053.0, 91098.0, 287501.0, 565053.0, 91098....","[0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, ...","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",90
#001587f73993c3cf510a97d9b129ae2a,#001587f73993c3cf510a97d9b129ae2a,402,"[@woonee_45, @alkony_49, @imadorable_79, @imad...","[2, 0, 3, 5, 6, 1, 7, 4]",2019-02-27 00:00:00,"[486226, 115787, 568928, 488237, 488237, 25978...","[115787.0, 259781.0, 486226.0, 568928.0, 38556...","[259781.0, 486226.0, 568928.0, 385563.0, 48823...","[0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...",70
#00160ec29677bde7de10cffe7c24ee52,#00160ec29677bde7de10cffe7c24ee52,2696,"[@boot0715_94, @boot0715_100, @boot0715_90, @b...","[26, 9, 30, 33, 22, 25, 37, 34, 7, 8, 23, 24, ...",2019-01-28 12:00:00,"[483754, 494510, 640447, 640447, 239328, 23932...","[398970.0, 398970.0, 519224.0, 412283.0, 41228...","[398970.0, 519224.0, 412283.0, 412283.0, 41228...","[0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",134


In [17]:
df["length"].quantile(0.9)

558.0

In [13]:
ranged_df = df[df['timestamp'] >= '2019-02-01']

In [14]:
len(ranged_df)

37076

In [15]:
ranged_df

Unnamed: 0_level_0,id,session,history,idx,timestamp,pos,session_input,session_output,session_mask,user_mask,length
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
#000474bba0c00c70e12ac7cfc3d04553,#000474bba0c00c70e12ac7cfc3d04553,1503,"[@rickeygo_71, @reading15m_604, @analee_147, @...","[4, 2, 0, 1, 3]",2019-02-27 16:00:00,"[181356, 20695, 389952, 19418, 396636]","[389952.0, 19418.0, 20695.0, 396636.0, 219967....","[19418.0, 20695.0, 396636.0, 181356.0, 58730.0...","[0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, ...","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",144
#000c82417339e173dbed1e413e9c15e0,#000c82417339e173dbed1e413e9c15e0,2061,"[@brunch_78, @brunch_110, @brunch_59, @gogksk_...","[3, 2, 4, 1, 0]",2019-02-25 22:00:00,"[298496, 439225, 399382, 585223, 523814]","[523814.0, 585223.0, 439225.0, 298496.0, 52355...","[585223.0, 439225.0, 298496.0, 399382.0, 52355...","[0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, ...","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",468
#000fe61478d384d09f3bcdd0c2f5227d,#000fe61478d384d09f3bcdd0c2f5227d,1381,"[@mongul-mongul_76, @mongul-mongul_76, @danpyu...","[2, 3, 0, 1]",2019-02-28 10:00:00,"[172349, 172349, 177064, 177064]","[177064.0, 177064.0, 172349.0, 460940.0, 46094...","[177064.0, 172349.0, 172349.0, 460940.0, 29123...","[0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, ...","[0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, ...",211
#0013c35c789f0936654ba06389ea75cb,#0013c35c789f0936654ba06389ea75cb,340,"[@bliee_55, @seungmom_217, @jaihie_3, @repd_5,...","[7, 3, 4, 1, 6, 5, 0, 2]",2019-02-28 04:00:00,"[177153, 425786, 530064, 197081, 402932, 56710...","[266757.0, 197081.0, 564698.0, 425786.0, 53006...","[197081.0, 564698.0, 425786.0, 530064.0, 56710...","[0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...",72
#001587f73993c3cf510a97d9b129ae2a,#001587f73993c3cf510a97d9b129ae2a,402,"[@woonee_45, @alkony_49, @imadorable_79, @imad...","[2, 0, 3, 5, 6, 1, 7, 4]",2019-02-27 00:00:00,"[486226, 115787, 568928, 488237, 488237, 25978...","[115787.0, 259781.0, 486226.0, 568928.0, 38556...","[259781.0, 486226.0, 568928.0, 385563.0, 48823...","[0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...",70
#00169d555cb64b6fd923c4966cbfb98a,#00169d555cb64b6fd923c4966cbfb98a,109,"[@choihs0228_28, @officen_19, @thinkaboutlove_...","[1, 3, 0, 2]",2019-02-18 17:00:00,"[466073, 574231, 79687, 79687]","[79687.0, 466073.0, 79687.0, 222089.0, 52237.0...","[466073.0, 79687.0, 574231.0, 52237.0, 411846....","[0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",39
#0019b5172e3f4860be70bf3055f90cd4,#0019b5172e3f4860be70bf3055f90cd4,196,"[@studiocroissant_43, @tenbody_1384, @tenbody_...","[8, 3, 6, 2, 7, 4, 5, 0, 1]",2019-02-09 23:00:00,"[121405, 536579, 375880, 421238, 637384, 57704...","[62845.0, 62845.0, 421238.0, 536579.0, 577042....","[62845.0, 421238.0, 536579.0, 577042.0, 105981...","[0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...",62
#001b05975c5fbf6232058979294965f2,#001b05975c5fbf6232058979294965f2,336,"[@miamiyoung_35, @miamiyoung_35, @miamiyoung_3...","[2, 3, 0, 1]",2019-02-19 18:00:00,"[154312, 154312, 168423, 168423]","[168423.0, 168423.0, 154312.0, 330031.0, 24451...","[168423.0, 154312.0, 154312.0, 244517.0, 75729...","[0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",204
#00222a855547061a7fd00e404cb96148,#00222a855547061a7fd00e404cb96148,33,"[@windyroad2_123, @bobasul_35, @bobasul_35, @r...","[1, 0, 3, 2]",2019-02-28 06:00:00,"[554397, 524402, 524402, 259295]","[524402.0, 554397.0, 259295.0, 141009.0, 14100...","[554397.0, 259295.0, 524402.0, 141009.0, 14100...","[0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, ...","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...",599
#002299f5e10111c19ac32d432ba03371,#002299f5e10111c19ac32d432ba03371,14,"[@miamiyoung_29, @miamiyoung_27, @miamiyoung_2...","[4, 1, 3, 2, 0]",2019-02-20 12:00:00,"[329675, 157687, 157687, 554894, 619806]","[619806.0, 157687.0, 554894.0, 157687.0, 40598...","[157687.0, 554894.0, 157687.0, 329675.0, 36113...","[0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, ...","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...",25


In [19]:
ranged_df["length"].quantile(0.05)

29.0

In [20]:
ranged_df = ranged_df[ranged_df["length"] <= 128]

In [21]:
len(ranged_df)

18600

In [22]:
ranged_df["length"].max()

128

In [23]:
# Hold out the leading share of rows (current row order, no shuffling) for evaluation;
# the remaining rows form the training set.
evaluation_ratio = 0.1
n_eval = int(len(ranged_df) * evaluation_ratio)
eval_df = ranged_df.iloc[:n_eval]
train_df = ranged_df.iloc[n_eval:]

In [24]:
len(eval_df)

1860

In [25]:
len(train_df)

16740

In [27]:
# Write both splits as CSV.
# NOTE(review): several columns hold array-like cells (see the table output above); CSV has
# no array type, so those cells are presumably written as their string representation and
# will not read back as arrays — confirm the consumer expects that.
train_df.to_csv("../data/brunch/train.csv")
eval_df.to_csv("../data/brunch/eval.csv")

In [31]:
row = train_df.iloc[0:100]
row = row.sort_values("length")
row.iloc[0]["length"]

17

In [32]:
train_df.to_parquet("../data/brunch/train.parquet")
eval_df.to_parquet("../data/brunch/eval.parquet")