In [1]:
%load_ext autoreload
%autoreload 2

In [2]:

import numpy as np
import pandas as pd
import datetime
from ast import literal_eval

from db_connectors.load import bigquery_reader
from db_connectors.postprocessing import bigquery_writer

# from notebooks.env import *
from notebooks.env_prod import *


In [3]:
bq_project_id, json_key_path

('mikvpc-prod-service-com',
 '/Users/LINGYU1/work/repo/recommender/sa-rec-dataproc-2c-prd.json')

In [4]:
import gc
gc.collect()

0

# Data

In [5]:
# NOTE(review): `top_user_num` is only interpolated into the `cte1` CTE of the query
# string that follows. Both SQL filters that reference `cte1` are commented out inside
# that query, so this value does not currently restrict which users are returned.
top_user_num = 2000
query = f"""


-- combine mik_sales with view data
-- just pick top 2000 user first

WITH cte1 AS (
  SELECT
    user_id,
    COUNT(trans_date) AS num_trans
  FROM `Data_Infra_Eng.mik_sales`
  WHERE data_source = "MIK"
    AND trans_date BETWEEN '2023-01-24' AND '2023-02-27'
  GROUP BY user_id
  ORDER BY num_trans DESC
  LIMIT {top_user_num}
), cte2 AS (

  SELECT 
    t1.user_id, 
    t1.sku_number, 
    t1.qty, 
    t1.trans_date,
    t1.created_time,
    t1.data_source,
    t2.full_taxonomy_path as category_path,
    ARRAY_AGG(t1.sku_number) --IFNULL(t2.sku_number, "na")
      OVER (
        PARTITION BY t1.user_id 
        ORDER BY t1.trans_date, t1.created_time ASC
        ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
      ) AS sku_purchase_seq,
    ARRAY_AGG(IFNULL(t2.full_taxonomy_path, "na")) 
      OVER (
        PARTITION BY t1.user_id 
        ORDER BY t1.trans_date, t1.created_time ASC
        ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
      ) AS category_path_purchase_seq
  FROM `Data_Infra_Eng.mik_sales` t1
  LEFT JOIN `Data_Infra_Eng.mik_item` t2
    ON t1.sku_number = t2.sku_number
  WHERE data_source = "MIK"
    AND trans_date BETWEEN '2023-01-24' AND '2023-02-27'
    -- AND t1.user_id IN (SELECT user_id FROM cte1)
  ORDER BY user_id, trans_date, created_time ASC
), cte3 AS (
SELECT
  user_id,
  APPROX_TOP_COUNT(geo_country, 1) AS geo_country,
  APPROX_TOP_COUNT(geo_region, 1) AS geo_region,
  APPROX_TOP_COUNT(geo_city, 1) AS geo_city,
  APPROX_TOP_COUNT(geo_zipcode, 1) AS geo_zipcode,
  APPROX_TOP_COUNT(platform, 1) AS platform,
FROM`atomic.events`
WHERE derived_tstamp BETWEEN '2023-01-24' AND '2023-02-27'
    -- AND user_id IN (SELECT CAST(user_id AS STRING) FROM cte1)
GROUP BY user_id  

)

SELECT
  cte2.*,
  cte3.geo_country,
  cte3.geo_region,
  cte3.geo_city,
  cte3.geo_zipcode,
  cte3.platform,
  t.*
FROM cte2
LEFT JOIN cte3
  ON CAST(cte2.user_id AS STRING) = cte3.user_id
LEFT JOIN `Data_Infra_Eng.user_behavior` t
  ON CAST(cte2.user_id AS STRING)= t.user_id




"""

In [6]:
# Run the `query` string built in the previous cell through `bigquery_reader` and keep
# the result as `df`.
# `bq_project_id` / `json_key_path` are presumably supplied by the star-import of
# `notebooks.env_prod` — TODO confirm.
df = bigquery_reader(
        project_id=bq_project_id, json_credentials_path=json_key_path,
        query_string=query
    )

In [7]:
def _top_value(x):
    """Return the `value` field of the first entry of `x`, or None when `x` has no entries.

    `x` is one cell of an APPROX_TOP_COUNT(..., 1) column — presumably a NumPy array (or
    list) of {'value', 'count'} records; TODO confirm the exact container type.
    """
    # Explicit emptiness test: `if x` on a NumPy array is ambiguous (deprecated for empty
    # arrays), and missing cells (None / NaN) have no length at all.
    if x is None or not hasattr(x, '__len__') or len(x) == 0:
        return None
    return x[0]['value']


# Keep only sales rows that have a value in `user_id_1` (the user_behavior side of the join).
df = df[df['user_id_1'].notna()].reset_index(drop=True)
# Flatten each list of behavior records to an array of the viewed item ids.
df['sku_view_sequence'] = df['user_behavior'].apply(lambda x: np.array([y['item'] for y in x]))
# Collapse every APPROX_TOP_COUNT column to its single most frequent value.
# NOTE: this step is not idempotent — re-running the cell on already-collapsed columns fails.
for col in ['geo_country', 'geo_region', 'geo_city', 'geo_zipcode', 'platform']:
    df[col] = df[col].apply(_top_value)
print(f"""
        df shape: {df.shape}, user number: {df['user_id'].nunique()}, 
        avg trans per user: {df.groupby('user_id').size().mean()}, 
        item number: {df['sku_number'].nunique()}
        """)

  df['geo_country'] = df['geo_country'].apply(lambda x: x[0]['value'] if x else None)
  df['geo_region'] = df['geo_region'].apply(lambda x: x[0]['value'] if x else None)
  df['geo_city'] = df['geo_city'].apply(lambda x: x[0]['value'] if x else None)
  df['geo_zipcode'] = df['geo_zipcode'].apply(lambda x: x[0]['value'] if x else None)
  df['platform'] = df['platform'].apply(lambda x: x[0]['value'] if x else None)



        df shape: (2039014, 17), user number: 235883, 
        avg trans per user: 8.644175290292221, 
        item number: 68644
        


In [8]:
df['trans_date'].min(),df['trans_date'].max()

(Timestamp('2023-01-24 00:00:00'), Timestamp('2023-02-27 00:00:00'))

In [9]:
df['sku_view_sequence'].apply(lambda x: len(x)).describe()

count    2.039014e+06
mean     1.383778e+01
std      3.578328e+01
min      1.000000e+00
25%      2.000000e+00
50%      5.000000e+00
75%      1.300000e+01
max      1.886000e+03
Name: sku_view_sequence, dtype: float64

In [10]:
df['sku_purchase_seq'].apply(lambda x: len(x)).describe(), df['category_path_purchase_seq'].apply(lambda x: len(x)).describe()

(count    2.039014e+06
 mean     7.519160e+00
 std      1.590276e+01
 min      0.000000e+00
 25%      1.000000e+00
 50%      3.000000e+00
 75%      8.000000e+00
 max      4.030000e+02
 Name: sku_purchase_seq, dtype: float64,
 count    2.039014e+06
 mean     7.519160e+00
 std      1.590276e+01
 min      0.000000e+00
 25%      1.000000e+00
 50%      3.000000e+00
 75%      8.000000e+00
 max      4.030000e+02
 Name: category_path_purchase_seq, dtype: float64)

In [11]:
df.head(2)

Unnamed: 0,user_id,sku_number,qty,trans_date,created_time,data_source,category_path,sku_purchase_seq,category_path_purchase_seq,geo_country,geo_region,geo_city,geo_zipcode,platform,user_id_1,user_behavior,sku_view_sequence
0,36979975,10532558,5,2023-01-30,2023-01-30 01:17:00,MIK,root//Shop Categories//Craft Machines//Diecutt...,[],[],US,NH,Hudson,3051,mob,36979975,"[{'behavior': 'item_view', 'item': '10547287',...","[10547287, M10498465, M20001973]"
1,36979975,10519524,2,2023-01-30,2023-01-30 01:17:00,MIK,root//Shop Categories//Craft Machines//Siser//...,[10532558],[root//Shop Categories//Craft Machines//Diecut...,US,NH,Hudson,3051,mob,36979975,"[{'behavior': 'item_view', 'item': '10547287',...","[10547287, M10498465, M20001973]"


In [12]:
df.columns

Index(['user_id', 'sku_number', 'qty', 'trans_date', 'created_time',
       'data_source', 'category_path', 'sku_purchase_seq',
       'category_path_purchase_seq', 'geo_country', 'geo_region', 'geo_city',
       'geo_zipcode', 'platform', 'user_id_1', 'user_behavior',
       'sku_view_sequence'],
      dtype='object')

In [13]:
df.to_pickle('/Users/LINGYU1/work/localspace/data/mik_dnn_model_02222023/df_raw_full_20230227.sav')


### Reuse data

In [None]:
df = pd.read_pickle('/Users/LINGYU1/work/localspace/data/mik_dnn_model_02222023/df_raw_full_20230227.sav')

### Load data func

In [35]:
def get_sale_view_data(days, top_user_num):
    """Load recent MIK sales rows of the most active users together with their view behavior.

    The query built here:
      1. ranks users of `Data_Infra_Eng.mik_sales` (data_source = "MIK", trans_date within
         the last `days` days) by number of transactions and keeps `top_user_num` of them;
      2. for each sales row of those users, collects the SKUs and taxonomy paths of that
         user's preceding rows (ordered by trans_date, created_time) into array columns;
      3. left-joins `Data_Infra_Eng.user_behavior` on the user id.

    After loading, rows with a null `user_id_1` are dropped and a `sku_view_sequence`
    column is derived from the `item` field of each `user_behavior` entry. A short
    summary (shape, user count, average rows per user, item count) is printed.

    Args:
        days: look-back window in days, interpolated into the SQL text.
        top_user_num: number of users kept by the `LIMIT` of the ranking CTE, interpolated
            into the SQL text.

    Returns:
        The post-processed DataFrame returned by `bigquery_reader`.

    Notes:
        Both arguments are formatted directly into the SQL string, so only trusted numeric
        values should be passed. The function also reads the module-level names
        `bq_project_id` and `json_key_path`.
    """
    query = f"""

    -- combine mik_sales with view data
    -- just pick top 200 user first
    WITH cte1 AS (
      SELECT
        user_id,
        COUNT(trans_date) AS num_trans
      FROM `Data_Infra_Eng.mik_sales`
      WHERE data_source = "MIK"
        AND DATE_DIFF(CURRENT_DATE(), trans_date, DAY) < {days}
      GROUP BY user_id
      ORDER BY num_trans DESC
      LIMIT {top_user_num}
    ), cte2 AS (

      SELECT 
        t1.user_id, 
        t1.sku_number, 
        t1.qty, 
        t1.trans_date,
        t1.created_time,
        t1.data_source,
        t2.full_taxonomy_path as category_path,
        ARRAY_AGG(t1.sku_number) --IFNULL(t2.sku_number, "na")
          OVER (
            PARTITION BY t1.user_id 
            ORDER BY t1.trans_date, t1.created_time ASC
            ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
          ) AS sku_purchase_seq,
        ARRAY_AGG(IFNULL(t2.full_taxonomy_path, "na")) 
          OVER (
            PARTITION BY t1.user_id 
            ORDER BY t1.trans_date, t1.created_time ASC
            ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING
          ) AS category_path_purchase_seq
      FROM `Data_Infra_Eng.mik_sales` t1
      LEFT JOIN `Data_Infra_Eng.mik_item` t2
        ON t1.sku_number = t2.sku_number
      WHERE data_source = "MIK"
        AND DATE_DIFF(CURRENT_DATE(), trans_date, DAY) < {days} 
        AND t1.user_id IN (SELECT user_id FROM cte1)
      ORDER BY user_id, trans_date, created_time ASC
    )
    SELECT
      *
    FROM cte2 t1
    LEFT JOIN `Data_Infra_Eng.user_behavior` t2
      ON CAST(t1.user_id AS STRING) = t2.user_id
    ;
    """
    # NOTE(review): the SQL comment says "top 200" but the actual limit is `top_user_num`.
    df = bigquery_reader(
            project_id=bq_project_id, json_credentials_path=json_key_path,
            query_string=query
        )
    # `user_id_1` is presumably the duplicate user_id column contributed by the
    # user_behavior side of the `SELECT *` join — TODO confirm. Rows where it is null
    # are dropped.
    df = df[df['user_id_1'].notna()].reset_index(drop=True)
    # Flatten each list of behavior records to an array of their `item` values.
    df['sku_view_sequence'] = df['user_behavior'].apply(lambda x: np.array([y['item'] for y in x]))
    print(f"""
            df shape: {df.shape}, user number: {df['user_id'].nunique()}, 
            avg trans per user: {df.groupby('user_id').size().mean()}, 
            item number: {df['sku_number'].nunique()}
            """)
    return df

In [None]:
# Ad-hoc variant of the load step: runs the module-level `query` string directly (it does
# not call `get_sale_view_data`), drops rows with a null `user_id_1`, and derives
# `sku_view_sequence` from the `item` field of each `user_behavior` entry.
df = bigquery_reader(
        project_id=bq_project_id, json_credentials_path=json_key_path,
        query_string=query
    )
df = df[df['user_id_1'].notna()].reset_index(drop=True)
df['sku_view_sequence'] = df['user_behavior'].apply(lambda x: np.array([y['item'] for y in x]))

In [9]:
df = get_sale_view_data(35, 200)

In [10]:
df.shape, df['user_id'].nunique(), df.groupby('user_id').size().mean(), df['sku_number'].nunique()

((67624, 12), 179, 377.7877094972067, 8183)

In [73]:
import pickle
with open("/Users/LINGYU1/work/localspace/data/mik_dnn_model_02222023/df_sales_raw_20230310.pickle", "wb") as f: 
    pickle.dump(df, f) 

In [46]:
df[['user_id', 'sku_number', 'trans_date', 'created_time',
    'category_path', 
    'sku_purchase_seq', 'category_path_purchase_seq', 'sku_view_sequence']]\
    .to_csv("/Users/LINGYU1/work/localspace/data/mik_dnn_model_02222023/mik_sales_views_20230227.csv", index=False)

In [26]:
df_sp = pd.read_csv("/Users/LINGYU1/work/localspace/data/mik_dnn_model_02222023/mik_sales_views_20230227.csv")
df_sp.shape, df_sp.user_id.nunique()

((41486, 8), 181)

In [6]:
df_sp['trans_date'].min(), df_sp['trans_date'].max()

('2023-01-24', '2023-02-27')

# Negative Sampling

In [31]:
import gc
gc.collect()

0

In [15]:
# Every distinct (sku_number, category_path) pair present in `df`.
candidate_sku = df[['sku_number','category_path']].drop_duplicates()
# Presumably the number of negative rows to generate per positive row — TODO confirm.
negsample_ratio = 1
# Presumably the maximum length kept for each behavior sequence — TODO confirm.
SEQ_LEN = 50
# neg_list = np.random.choice(candidate_set, size=df.shape[0] * negsample_ratio, replace=True)


In [16]:
def create_negative_sample(data, candidate_item, negsample_ratio, random_state=None):
    """Label `data` as positives and append randomly sampled negative rows.

    Every row of `data` becomes a positive (label = 1). For each positive row,
    `negsample_ratio` negative rows (label = 0) are created by copying the row and
    replacing its `sku_number` / `category_path` with a pair drawn from the rows of
    `candidate_item` whose `sku_number` does not occur in `data`.

    Args:
        data: DataFrame of positive rows; must contain `sku_number` (and `category_path`
            is overwritten on the negative copies). It is not modified.
        candidate_item: DataFrame with `sku_number` and `category_path` columns to draw
            negatives from.
        negsample_ratio: number of negatives per positive row. Values <= 0 return the
            positives only.
        random_state: optional seed / RandomState forwarded to `DataFrame.sample` so the
            draw can be reproduced. Defaults to None (unseeded, as before).

    Returns:
        DataFrame with the positive rows first, then the negative rows, with a fresh
        0..n-1 index and a `label` column.

    Raises:
        ValueError: if negatives are requested but no candidate item is left after
            excluding the SKUs that appear in `data`.
    """
    # positives: work on a copy so the caller's frame is left untouched
    df_pos = data.copy()
    df_pos['label'] = 1

    if negsample_ratio <= 0:
        return df_pos.reset_index(drop=True)

    n_neg = df_pos.shape[0] * negsample_ratio
    # only items this data never contains may serve as negatives
    pool = candidate_item[~candidate_item['sku_number'].isin(df_pos['sku_number'])]
    if n_neg > 0 and pool.empty:
        raise ValueError(
            "create_negative_sample: no candidate items left after excluding the "
            "SKUs present in `data`"
        )
    # Sample without replacement when the pool is large enough (original behavior);
    # fall back to sampling with replacement instead of failing when it is too small.
    neg_sku = pool.sample(n_neg, replace=bool(n_neg > len(pool)), random_state=random_state)

    # negatives: the positive rows repeated `negsample_ratio` times with swapped items
    df_neg = pd.concat([df_pos] * negsample_ratio, ignore_index=True)
    df_neg['label'] = 0
    df_neg['sku_number'] = neg_sku['sku_number'].values
    df_neg['category_path'] = neg_sku['category_path'].values

    return pd.concat([df_pos, df_neg], axis=0).reset_index(drop=True)

In [17]:
def gen_input_data(df, negsample_ratio, seq_len):
    """Build the labelled model-input frame: truncated sequences plus negative samples.

    Steps:
      1. collect the distinct (sku_number, category_path) pairs of `df` as the candidate
         pool for negative sampling;
      2. cut the three sequence columns to at most `seq_len` elements;
      3. per `user_id`, call `create_negative_sample` to label the rows as positives and
         append `negsample_ratio` negatives per positive;
      4. concatenate all users, add a constant `seq_len` column and return a fixed
         column selection.

    Args:
        df: sales/view frame containing the identifier, sequence and geo/platform columns
            listed in the return selection below. It is not modified.
        negsample_ratio: negatives generated per positive row.
        seq_len: maximum number of elements kept in each sequence column.

    Returns:
        DataFrame with the columns selected at the end of this function, including `label`.

    Raises:
        ValueError: if `df` yields no user groups (empty frame or all `user_id` null).
    """
    seq_cols = ['sku_purchase_seq', 'category_path_purchase_seq', 'sku_view_sequence']

    # get candidate items and categories
    candidate_sku = df[['sku_number', 'category_path']].drop_duplicates()

    # Truncation does not depend on the user, so do it once on a new frame instead of
    # re-assigning columns on every groupby slice (which is slow and may trigger
    # SettingWithCopyWarning).
    # NOTE(review): `x[0:seq_len]` keeps the head of each sequence; if the arrays are
    # ordered oldest-first, the most recent events are the ones dropped — confirm intent.
    df_trunc = df.assign(
        **{col: df[col].apply(lambda x: x[0:seq_len]) for col in seq_cols}
    )

    # negative sampling for each user group, combined with that user's positives
    df_res = [
        create_negative_sample(
            data=data, candidate_item=candidate_sku, negsample_ratio=negsample_ratio
        )
        for _, data in df_trunc.groupby('user_id')
    ]
    if not df_res:
        raise ValueError("gen_input_data: `df` contains no rows with a user_id to process")

    df_res = pd.concat(df_res).reset_index(drop=True)
    df_res['seq_len'] = seq_len
    return df_res[[
        'user_id', 'sku_number','category_path', 'trans_date', 'created_time',
        'sku_purchase_seq','category_path_purchase_seq','sku_view_sequence', 'seq_len',
        'geo_country', 'geo_region', 'geo_city','geo_zipcode', 'platform', 
        'label'
    ]]

In [18]:
# Use the constants configured in the negative-sampling setup cell (`negsample_ratio`,
# `SEQ_LEN`) instead of repeating their literal values here.
df_input = gen_input_data(df, negsample_ratio, SEQ_LEN)
df_input.shape

(4078028, 15)

In [23]:
df_input.label.describe()

count    4078028.0
mean           0.5
std            0.5
min            0.0
25%            0.0
50%            0.5
75%            1.0
max            1.0
Name: label, dtype: float64

In [21]:
df_input.head(2)

Unnamed: 0,user_id,sku_number,category_path,trans_date,created_time,sku_purchase_seq,category_path_purchase_seq,sku_view_sequence,seq_len,geo_country,geo_region,geo_city,geo_zipcode,platform,label
0,1280791,D516375S,root//Shop Categories//Crafts & Hobbies//Wood ...,2023-01-26,2023-01-26 13:23:23,[],[],[MP221283],50,US,MD,Leonardtown,20650,web,1
1,1280791,D516375S,root//Shop Categories//Crafts & Hobbies//Wood ...,2023-01-26,2023-01-26 13:23:23,[],[],"[10709269, MP221283, 10709269, 809168938375905...",50,US,MD,Leonardtown,20650,web,1


In [24]:
df_input.to_pickle('/Users/LINGYU1/work/localspace/data/mik_dnn_model_02222023/df_input_notencode_full_20230227.sav')


In [17]:
# import multiprocessing as mp

# def process_chunk(chunk, candidate_sku, negsample_ratio, seq_len):
#     # group by user_id
#     user_group = chunk.groupby('user_id')
#     # negative sampling for each user group
#     df_res = []
#     for user_id, data in user_group:
#         # first refine sequence len to < seq_len
#         for col in ['sku_purchase_seq','category_path_purchase_seq','sku_view_sequence']:
#             data[col] =  data[col].apply(lambda x: x[0:seq_len])
#         # create negative sample and combine with positive sample        
#         df_sp = create_negative_sample(data = data, candidate_item = candidate_sku, negsample_ratio = negsample_ratio)
#         df_res.append(df_sp)
#     df_res = pd.concat(df_res)
#     df_res['seq_len'] = seq_len
#     return df_res

# def gen_input_data_parallel(df, negsample_ratio, seq_len, num_processes=mp.cpu_count()):
#     # get candidate items and categories
#     candidate_sku = df[['sku_number','category_path']].drop_duplicates()
    
#     # split data into chunks
#     chunks = np.array_split(df, num_processes)
    
#     # create pool of processes
#     with mp.Pool(processes=num_processes) as pool:
#         # process each chunk using a separate process
#         results = [pool.apply_async(process_chunk, args=(chunk, candidate_sku, negsample_ratio, seq_len)) for chunk in chunks]
#         # collect the results from all processes
#         df_res = pd.concat([result.get() for result in results]).reset_index(drop=True)
    
#     return df_res[[
#         'user_id', 'sku_number','category_path', 'trans_date', 'created_time',
#         'sku_purchase_seq','category_path_purchase_seq','sku_view_sequence', 'seq_len',
#         'label'
#     ]]


In [None]:
# df_input = gen_input_data_parallel(df, 1, 50)
# df_input.shape

In [25]:
df_input_copy = df_input.copy()

In [76]:
df_input = df_input_copy.copy()

In [26]:
df_input.head(2)

Unnamed: 0,user_id,sku_number,category_path,trans_date,created_time,sku_purchase_seq,category_path_purchase_seq,sku_view_sequence,seq_len,geo_country,geo_region,geo_city,geo_zipcode,platform,label
0,1280791,D516375S,root//Shop Categories//Crafts & Hobbies//Wood ...,2023-01-26,2023-01-26 13:23:23,[],[],[MP221283],50,US,MD,Leonardtown,20650,web,1
1,1280791,D516375S,root//Shop Categories//Crafts & Hobbies//Wood ...,2023-01-26,2023-01-26 13:23:23,[],[],"[10709269, MP221283, 10709269, 809168938375905...",50,US,MD,Leonardtown,20650,web,1


# Label encoding

### Dumb encoding

In [78]:
df_input.columns

Index(['user_id', 'sku_number', 'category_path', 'trans_date', 'created_time',
       'sku_purchase_seq', 'category_path_purchase_seq', 'sku_view_sequence',
       'seq_len', 'geo_country', 'geo_region', 'geo_city', 'geo_zipcode',
       'platform', 'label'],
      dtype='object')

In [79]:
from sklearn.preprocessing import LabelEncoder

In [80]:
def label_transform(lbe, x):
    """Encode the sequence `x` with the fitted encoder `lbe`, shifting every code by +1.

    Args:
        lbe: a fitted encoder exposing `transform` (e.g. a LabelEncoder).
        x: sequence of raw labels.

    Returns:
        Array of encoded labels plus one. If `transform` fails (for example because `x`
        contains a label the encoder was not fitted on) an empty array is returned, i.e.
        the whole sequence is dropped on a best-effort basis.
    """
    try:
        return lbe.transform(x) + 1 # add one to all the encoded categories labels
    except Exception:
        # Only ordinary errors are swallowed; KeyboardInterrupt / SystemExit propagate so
        # a long-running `.apply` can still be interrupted.
        return np.array([])
    
def encode_features_old(df_input):
    """Integer-encode the sparse and sequence features of `df_input` in place.

    The raw `sku_number` / `user_id` values are preserved in `sku_number_org` /
    `user_id_org`. SKU and category encoders are fitted on the full vocabulary (sold and
    viewed SKUs, respectively all category paths, each extended by 'na'); every other
    sparse feature gets its own encoder. All codes are shifted by +1. Sequence columns
    are encoded element-wise through `label_transform`.

    Returns:
        (df_input, feature_max_idx) where `feature_max_idx` maps each sparse feature to
        its largest encoded value plus one.
    """
    # keep the raw identifiers next to their encoded versions
    df_input['sku_number_org'] = df_input['sku_number']
    df_input['user_id_org'] = df_input['user_id']

    sparse_features = [
        'sku_number', 'category_path',
        'user_id','geo_country', 'geo_region', 
        'geo_city', 'geo_zipcode','platform'
    ]

    # full vocabularies: every sold or viewed SKU / every category path, plus 'na'
    sold_skus = df_input['sku_number'].values
    viewed_skus = df_input['sku_view_sequence'].explode().values
    full_item_set = np.append(np.unique(np.concatenate((sold_skus, viewed_skus))), 'na')
    full_cat_set = np.append(df_input['category_path'].unique(), 'na')

    # the SKU and category encoders are shared with the sequence columns
    lbe_sku = LabelEncoder()
    lbe_sku.fit(full_item_set)
    lbe_cat = LabelEncoder()
    lbe_cat.fit(full_cat_set)
    shared_encoders = {'sku_number': lbe_sku, 'category_path': lbe_cat}

    # scalar sparse features (codes shifted by one)
    for feature in sparse_features:
        if feature in shared_encoders:
            codes = shared_encoders[feature].transform(df_input[feature])
        else:
            codes = LabelEncoder().fit_transform(df_input[feature])
        df_input[feature] = codes + 1

    # sequence features, each mapped with the encoder of its element type
    seq_encoders = {
        'sku_purchase_seq': lbe_sku,
        'category_path_purchase_seq': lbe_cat,
        'sku_view_sequence': lbe_sku,
    }
    for feature, encoder in seq_encoders.items():
        df_input[feature] = df_input[feature].apply(
            lambda seq, enc=encoder: label_transform(enc, seq)
        )

    # feature index table: largest code per sparse feature, plus one
    feature_max_idx = {feature: df_input[feature].max() + 1 for feature in sparse_features}

    return df_input, feature_max_idx

In [None]:
df_input, feature_max_idx = encode_features_old(df_input)
df_input.shape, len(feature_max_idx)

### Regular encoding

In [33]:
from sklearn.preprocessing import LabelEncoder
# Fit one encoder on the SKU vocabulary and one on the category vocabulary.
# NOTE(review): `full_item_set` / `full_cat_set` are not assigned at top level in any
# cell above this one (only as locals inside functions), so on a fresh kernel run from
# top to bottom this cell presumably fails with NameError — TODO confirm and define
# them before this point.
lbe_sku = LabelEncoder()
lbe_sku.fit(full_item_set)
lbe_cat = LabelEncoder()
lbe_cat.fit(full_cat_set)

LabelEncoder()

In [52]:
encoded_item_set = lbe_sku.transform(full_item_set) + 1 # +1 to remove 0, 0 leave it for missing value
encoded_cat_set = lbe_cat.transform(full_cat_set) + 1 # +1 to remove 0, 0 leave it for missing value
encoded_item_set.shape, encoded_cat_set.shape

((12590,), (817,))

In [57]:
sku_dict = {full_item_set[i]: encoded_item_set[i] for i in range(len(full_item_set))}
cat_dict = {full_cat_set[i]: encoded_cat_set[i] for i in range(len(full_cat_set))}
len(sku_dict), len(cat_dict)

(12590, 817)

In [59]:
sparse_features = [
        'sku_number', 'category_path',
        'user_id','geo_country', 'geo_region', 
        'geo_city', 'geo_zipcode','platform'
]
seq_sparse_feature = ['sku_purchase_seq','category_path_purchase_seq','sku_view_sequence']

In [60]:
 # fit and transform features
for feature in sparse_features:
    # need to store sku encoder
    if feature == 'sku_number':
        df_input[feature] = lbe_sku.transform(df_input[feature]) + 1 # add one to all the encoded categories labels
    # need to store
    elif feature == 'category_path':
        df_input[feature] = lbe_cat.transform(df_input[feature]) + 1 # add one to all the encoded categories labels
    else:
        lbe = LabelEncoder()
        df_input[feature] = lbe.fit_transform(df_input[feature]) + 1 # add one to all the encoded categories labels 


In [64]:
# encode sequence features
for feature in seq_sparse_feature:
    if feature == 'sku_purchase_seq' or feature == 'sku_view_sequence':
        df_input[feature] = df_input[feature].apply(lambda x: np.array([sku_dict[c] for c in x]))

    elif feature == 'category_path_purchase_seq':
        df_input[feature] = df_input[feature].apply(lambda x: np.array([cat_dict[c] for c in x]))

In [66]:
encoded_item_set.max()

12590

In [67]:
# get feature index table
feature_max_idx = {}
for feature in sparse_features:
    if feature == 'sku_number':
        feature_max_idx[feature] = encoded_item_set.max() + 1
    elif feature == 'category_path':
        feature_max_idx[feature] = encoded_cat_set.max() + 1
    else:
        feature_max_idx[feature] = df_input[feature].max() + 1 # plus one to the max
    

In [27]:
def encode_features(df_input):
    """Integer-encode the sparse and sequence features of `df_input` in place.

    The raw `sku_number` / `user_id` values are preserved in `sku_number_org` /
    `user_id_org`. SKU and category encoders are fitted on the full vocabulary (sold and
    viewed SKUs, respectively all category paths, each extended by 'na'); every other
    sparse feature gets its own encoder. All codes are shifted by +1 so that 0 stays free
    for missing values. Sequence columns are encoded element-wise through lookup
    dictionaries built from the fitted encoders.

    Returns:
        (df_input, feature_max_idx) where `feature_max_idx` maps each sparse feature to
        its largest code plus one; for `sku_number` / `category_path` the maximum is taken
        over the whole encoded vocabulary rather than over the column.
    """
    from sklearn.preprocessing import LabelEncoder

    # keep the raw identifiers next to their encoded versions
    df_input['sku_number_org'] = df_input['sku_number']
    df_input['user_id_org'] = df_input['user_id']

    sparse_features = [
        'sku_number', 'category_path',
        'user_id','geo_country', 'geo_region', 
        'geo_city', 'geo_zipcode','platform'
    ]

    # full vocabularies: every sold or viewed SKU / every category path, plus 'na'
    sold_and_viewed = np.concatenate(
        (
            df_input['sku_number'].values,
            df_input['sku_view_sequence'].explode().values,
        )
    )
    full_item_set = np.append(np.unique(sold_and_viewed), 'na')
    full_cat_set = np.append(df_input['category_path'].unique(), 'na')

    # fit the shared encoders and encode the vocabularies themselves (+1 keeps 0 free)
    lbe_sku = LabelEncoder()
    lbe_sku.fit(full_item_set)
    lbe_cat = LabelEncoder()
    lbe_cat.fit(full_cat_set)
    encoded_item_set = lbe_sku.transform(full_item_set) + 1
    encoded_cat_set = lbe_cat.transform(full_cat_set) + 1

    # raw value -> code lookup tables used for the sequence columns
    sku_dict = dict(zip(full_item_set, encoded_item_set))
    cat_dict = dict(zip(full_cat_set, encoded_cat_set))

    # scalar sparse features
    shared_encoders = {'sku_number': lbe_sku, 'category_path': lbe_cat}
    for feature in sparse_features:
        if feature in shared_encoders:
            codes = shared_encoders[feature].transform(df_input[feature])
        else:
            codes = LabelEncoder().fit_transform(df_input[feature])
        df_input[feature] = codes + 1

    # sequence features, each mapped through the table of its element type
    seq_lookups = {
        'sku_purchase_seq': sku_dict,
        'category_path_purchase_seq': cat_dict,
        'sku_view_sequence': sku_dict,
    }
    for feature, lookup in seq_lookups.items():
        df_input[feature] = df_input[feature].apply(
            lambda seq, table=lookup: np.array([table[c] for c in seq])
        )

    # feature index table: vocabulary-wide maximum for SKU/category, column maximum otherwise
    vocab_codes = {'sku_number': encoded_item_set, 'category_path': encoded_cat_set}
    feature_max_idx = {}
    for feature in sparse_features:
        codes = vocab_codes[feature] if feature in vocab_codes else df_input[feature]
        feature_max_idx[feature] = codes.max() + 1

    return df_input, feature_max_idx

In [28]:
df_input, feature_max_idx = encode_features(df_input)
df_input.shape, len(feature_max_idx)

((4078028, 17), 8)

In [29]:
df_input.head()

Unnamed: 0,user_id,sku_number,category_path,trans_date,created_time,sku_purchase_seq,category_path_purchase_seq,sku_view_sequence,seq_len,geo_country,geo_region,geo_city,geo_zipcode,platform,label,sku_number_org,user_id_org
0,1,124265,710,2023-01-26,2023-01-26 13:23:23.000,[],[],[157341],50,86,97,3360,3101,2,1,D516375S,1280791
1,1,124265,710,2023-01-26,2023-01-26 13:23:23.000,[],[],"[38158, 157341, 38158, 79216, 157341]",50,86,97,3360,3101,2,1,D516375S,1280791
2,1,30578,1744,2023-01-26,2023-01-26 13:23:23.000,[124265],[710],"[38158, 157341, 38158, 79216, 157341]",50,86,97,3360,3101,2,1,10684625,1280791
3,1,157341,2087,2023-02-21,2023-02-21 17:51:17.774,"[124265, 30578]","[710, 1744]",[157341],50,86,97,3360,3101,2,1,MP221283,1280791
4,1,157341,2087,2023-02-21,2023-02-21 17:51:17.774,"[124265, 30578]","[710, 1744]","[21594, 157341]",50,86,97,3360,3101,2,1,MP221283,1280791


In [30]:
df_input.to_pickle('/Users/LINGYU1/work/localspace/data/mik_dnn_model_02222023/df_input_full_20230227.sav')

In [27]:
# import concurrent.futures

# def encode_features(df_input):
#     # store original item id and user id
#     df_input['sku_number_org'] = df_input['sku_number']
#     df_input['user_id_org'] = df_input['user_id']

#     # specify features and sequence features
#     sparse_features = ['sku_number', 'category_path', 'user_id']
#     seq_sparse_feature = ['sku_purchase_seq','category_path_purchase_seq','sku_view_sequence']

#     # get full set of item and category
#     full_item_set = np.append(
#         np.unique(
#             np.concatenate(
#                 (
#                     df_input['sku_number'].values,  # all sku in sales
#                     df_input['sku_view_sequence'].explode().values # all sku in views
#                 )
#             )
#         )
#         , 'na')
#     full_cat_set = np.append(df_input['category_path'].unique(), 'na')

#     # fit and transform features
#     with concurrent.futures.ThreadPoolExecutor() as executor:
#         futures = []
#         encoders = {}
#         for feature in sparse_features:
#             # need to store sku encoder
#             if feature == 'sku_number':
#                 encoders[feature] = LabelEncoder()
#                 encoders[feature].fit(full_item_set)
#             # need to store
#             elif feature == 'category_path':
#                 encoders[feature] = LabelEncoder()
#                 encoders[feature].fit(full_cat_set)
#             else:
#                 encoders[feature] = LabelEncoder()
#             futures.append(executor.submit(lambda feature: (feature, encoders[feature].fit_transform(df_input[feature]) + 1), feature))
#         for future in concurrent.futures.as_completed(futures):
#             feature, transformed = future.result()
#             df_input[feature] = transformed

#     # transform sequence features
#     for feature in seq_sparse_feature:
#         if feature == 'sku_purchase_seq' or feature == 'sku_view_sequence':
#             df_input[feature] = df_input[feature].apply(lambda x: label_transform(encoders['sku_number'], x))

#         elif feature == 'category_path_purchase_seq':
#             df_input[feature] = df_input[feature].apply(lambda x: label_transform(encoders['category_path'], x))

#     # get feature index table
#     feature_max_idx = {}
#     for feature in sparse_features:
#         feature_max_idx[feature] = df_input[feature].max() + 1

#     return df_input, feature_max_idx


In [28]:
df_input, feature_max_idx = encode_features(df_input)
df_input.shape

(135248, 12)

In [None]:
feature_max_idx

In [134]:
df_input.to_pickle('/Users/LINGYU1/work/localspace/data/mik_dnn_model_02222023/df_input_full_20230227.sav')


In [137]:
df_input.columns

Index(['user_id', 'sku_number', 'category_path', 'trans_date', 'created_time',
       'sku_purchase_seq', 'category_path_purchase_seq', 'sku_view_sequence',
       'seq_len', 'geo_country', 'geo_region', 'geo_city', 'geo_zipcode',
       'platform', 'label', 'sku_number_org', 'user_id_org'],
      dtype='object')

In [138]:
df_input[['user_id', 'sku_number', 'category_path', 'trans_date',
       'sku_purchase_seq', 'category_path_purchase_seq', 'sku_view_sequence',
       'seq_len',  'geo_zipcode',
       'platform', 'label']]

Unnamed: 0,user_id,sku_number,category_path,trans_date,sku_purchase_seq,category_path_purchase_seq,sku_view_sequence,seq_len,geo_zipcode,platform,label
0,1,4233,1109,2023-02-14,"[8798, 12990, 13000, 13017, 13019, 3545]","[866, 251, 251, 251, 251, 1109]","[18479, 21115]",50,259,2,1
1,1,11222,1109,2023-02-14,"[8798, 12990, 13000, 13017, 13019, 3545, 4233,...","[866, 251, 251, 251, 251, 1109, 1109, 1109]","[18479, 21115]",50,259,2,1
2,1,20938,769,2023-02-14,"[8798, 12990, 13000, 13017, 13019, 3545, 4233,...","[866, 251, 251, 251, 251, 1109, 1109, 1109, 11...","[32380, 32379, 15900, 4233, 7093, 1534, 6041, ...",50,259,2,1
3,1,1282,694,2023-02-14,"[8798, 12990, 13000, 13017, 13019, 3545, 4233,...","[866, 251, 251, 251, 251, 1109, 1109, 1109, 11...","[18479, 21115]",50,259,2,1
4,1,21226,748,2023-02-26,"[8798, 12990, 13000, 13017, 13019, 3545, 4233,...","[866, 251, 251, 251, 251, 1109, 1109, 1109, 11...","[32380, 32379, 15900, 4233, 7093, 1534, 6041, ...",50,259,2,1
...,...,...,...,...,...,...,...,...,...,...,...
507891,1717,19196,987,2023-02-17,"[10533, 3161, 1114, 11074, 8479, 6971, 684, 13...","[35, 444, 620, 467, 170, 574, 454, 1009, 672, ...","[30210, 20458, 29148, 31856, 29796, 5079, 3367...",50,1102,2,0
507892,1717,18194,1114,2023-02-17,"[10533, 3161, 1114, 11074, 8479, 6971, 684, 13...","[35, 444, 620, 467, 170, 574, 454, 1009, 672, ...","[28296, 2189, 20215, 8945, 23912, 30656, 19600...",50,1102,2,0
507893,1717,16572,1147,2023-02-17,"[10533, 3161, 1114, 11074, 8479, 6971, 684, 13...","[35, 444, 620, 467, 170, 574, 454, 1009, 672, ...","[28296, 2189, 20215, 8945, 23912, 30656, 19600...",50,1102,2,0
507894,1717,3616,240,2023-02-17,"[10533, 3161, 1114, 11074, 8479, 6971, 684, 13...","[35, 444, 620, 467, 170, 574, 454, 1009, 672, ...","[30210, 20458, 29148, 31856, 29796, 5079, 3367...",50,1102,2,0


In [73]:
# import pickle
# with open("/Users/LINGYU1/work/localspace/data/mik_dnn_model_02222023/df_input_20230227.pickle", "wb") as f: 
#     pickle.dump(df_input, f) 

In [71]:
# df_input.to_json('/Users/LINGYU1/work/localspace/data/mik_dnn_model_02222023/df_input_20230227.json')

In [75]:
full_item_set = np.append(
    np.unique(
        np.concatenate(
            (
                df_input['sku_number'].values,  # all sku in sales
                df_input['sku_view_sequence'].explode().values # all sku in views
            )
        )
    )
    , 'na')

In [76]:
full_item_set

array([1, 2, 3, ..., 11035, 11036, 'na'], dtype=object)

In [93]:
a = df['sku_purchase_seq'][15]
a.shape

(5,)

In [92]:
df['sku_purchase_seq'][15]

array(['10358097', '10683360', '10403125', '10196947', '10228172'],
      dtype=object)

In [83]:
# NOTE: this cell raises IndexError ("tuple index out of range"): `a` is 1-D (its shape is
# `(5,)`), so `a.shape[1]` does not exist.
np.concatenate([a, np.zeros((13, a.shape[1]))], axis=0)


IndexError: tuple index out of range

In [102]:
# Left-pad `a` with 'na' up to a total length of 50 (no padding is added on the right).
# `np.pad` is the documented public entry point; `np.lib.pad` is only an alias of it and
# is not guaranteed to stay importable from `np.lib`.
np.pad(a, (50 - a.shape[0], 0), 'constant', constant_values='na')


array(['na', 'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na',
       'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na',
       'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na',
       'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na', 'na',
       'na', '10358097', '10683360', '10403125', '10196947', '10228172'],
      dtype=object)

In [168]:
full_item_set = np.append(
    np.unique(
        np.concatenate(
            (
                df_input['sku_number'].values,  # all sku in sales
                df_input['sku_view_sequence'].explode().values # all sku in views
            )
        )
    )
    , 'na')
full_cat_set = np.append(df_input['category_path'].unique(), 'na')
# fit and transform features
for feature in sparse_features:
    # need to store sku encoder
    if feature == 'sku_number':
        lbe_sku = LabelEncoder()
        lbe_sku.fit(full_item_set)
        df_input[feature] = lbe_sku.transform(df_input[feature]) + 1
    # need to store
    elif feature == 'category_path':
        lbe_cat = LabelEncoder()
        lbe_cat.fit(full_cat_set)
        df_input[feature] = lbe_cat.transform(df_input[feature]) + 1
    else:
        lbe = LabelEncoder()
        df_input[feature] = lbe.fit_transform(df_input[feature]) + 1 # add one to all the encoded categories labels 

In [171]:
# df_input_copy['category_path_purchase_seq'][0:10].apply(lambda x: label_transform(lbe_cat, x))

In [172]:
# df_input[feature].apply(lambda x: lbe_cat.transform(x) + 1)

In [174]:
# trasnform sequence features
for feature in seq_sparse_feature:
    if feature == 'sku_purchase_seq' or feature == 'sku_view_sequence':
        df_input[feature] = df_input[feature].apply(lambda x: label_transform(lbe_sku, x))
#         df_input[feature] = df_input[feature].apply(lambda x: lbe_sku.transform(x) + 1)

    elif feature == 'category_path_purchase_seq':
        df_input[feature] = df_input[feature].apply(lambda x: label_transform(lbe_cat, x))


In [177]:
# get feature index table
feature_max_idx = {}
for feature in sparse_features:
    feature_max_idx[feature] = df_input[feature].max() + 1


In [181]:
df_input.to_csv("/Users/LINGYU1/work/localspace/data/mik_dnn_model_02222023/df_input_20230224.csv", index=False)

# Encoding Test

In [34]:
import string
import random
# Character -> integer code table: 'A'..'Z' map to 0..25, 'a'..'z' map to 26..51.
char_dict = {
    letter: code
    for code, letter in enumerate(string.ascii_uppercase + string.ascii_lowercase)
}

def random_string(length):
    """Return a random string of ``length`` ASCII letters (upper and lower case).

    One ``random.choice`` draw is made per character, so the result is
    reproducible when the ``random`` module has been seeded.

    Args:
        length: Number of characters to generate.

    Returns:
        A ``str`` of exactly ``length`` characters drawn from
        ``string.ascii_letters``; the empty string when ``length`` is 0.
    """
    alphabet = string.ascii_letters
    picks = [random.choice(alphabet) for _ in range(length)]
    return ''.join(picks)

# Benchmark fixture: 1,600,000 rows, each cell a 50-element NumPy array of
# single random letters produced by random_string(1).
df = pd.DataFrame(
    {
        'random_char': [
            np.array([random_string(1) for _ in range(50)])
            for _ in range(1600000)
        ]
    }
)

# Show the frame's shape together with its first two rows.
print(df.shape, df.head(2))

(1600000, 1)                                          random_char
0  [d, O, d, v, z, i, w, D, U, k, Y, K, T, k, f, ...
1  [J, p, U, Z, N, E, T, y, Z, Z, h, c, F, Q, W, ...


### dict encode

In [35]:
import time

def encode_characters(df, char_dict):
    """Add an ``encode_char`` column holding the codes of ``random_char``.

    Each cell of ``df['random_char']`` is iterated character by character;
    every character is looked up in ``char_dict`` and the resulting codes are
    collected into a NumPy array stored in the new column.

    Args:
        df: DataFrame with a ``random_char`` column. It is modified in place.
        char_dict: Mapping from character to its code.

    Returns:
        The same ``df`` object, now carrying the ``encode_char`` column.

    Raises:
        KeyError: If a character is not a key of ``char_dict``.
    """
    def _to_codes(chars):
        # Plain dict lookup per character, gathered into one array per row.
        return np.array([char_dict[ch] for ch in chars])

    df["encode_char"] = df["random_char"].apply(_to_codes)
    return df

# Time the dictionary-lookup encoding over the whole frame.
start_time = time.time()
df = encode_characters(df, char_dict)
print("--- %s seconds ---" % (time.time() - start_time))

# Inspect the result: shape on the first line, leading rows after it.
print(df.shape, df.head(), sep="\n")

--- 26.60670518875122 seconds ---
(1600000, 2)
                                         random_char  \
0  [d, O, d, v, z, i, w, D, U, k, Y, K, T, k, f, ...   
1  [J, p, U, Z, N, E, T, y, Z, Z, h, c, F, Q, W, ...   
2  [X, S, M, h, d, K, l, y, f, K, x, O, H, S, j, ...   
3  [o, G, T, z, g, s, h, I, D, F, l, k, k, b, P, ...   
4  [W, J, S, x, X, Q, o, M, h, b, t, W, b, B, T, ...   

                                         encode_char  
0  [29, 14, 29, 47, 51, 34, 48, 3, 20, 36, 24, 10...  
1  [9, 41, 20, 25, 13, 4, 19, 50, 25, 25, 33, 28,...  
2  [23, 18, 12, 33, 29, 10, 37, 50, 31, 10, 49, 1...  
3  [40, 6, 19, 51, 32, 44, 33, 8, 3, 5, 37, 36, 3...  
4  [22, 9, 18, 49, 23, 16, 40, 12, 33, 27, 45, 22...  


### label encoder

In [36]:
from sklearn.preprocessing import LabelEncoder

In [37]:
# Fresh encoder for the LabelEncoder side of the benchmark. This rebinds the
# notebook-level name `lbe`, which the sparse-feature encoding loop earlier in
# the notebook also assigns.
lbe = LabelEncoder()

In [38]:
# Fit the encoder on the same set of characters that char_dict has as keys.
lbe.fit(list(char_dict))

LabelEncoder()

In [39]:
# Time the LabelEncoder-based encoding: one transform call per row's character array.
start_time = time.time()
df['lbe_char'] = df['random_char'].apply(lbe.transform)
print("--- %s seconds ---" % (time.time() - start_time))

--- 96.34215307235718 seconds ---


In [40]:
# Check the frame's dimensions after the `lbe_char` column has been added.
df.shape

(1600000, 3)