In [1]:
# ! pip install category_encoders

In [2]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import sklearn.metrics
# Raise pandas display limits so long/wide frames show more rows and columns
# in cell output before being elided.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 150)

from pathlib import Path
import sys
# Put the directory two levels above the current working directory on the
# import path; this must happen before the `utils` import below.
# NOTE(review): the variable is named `pathlib`, the same as the stdlib module.
# Only `Path` is imported from it here, so nothing breaks, but it is confusing.
pathlib = str(Path().resolve()) + "/../../"
sys.path.append(pathlib)

# NOTE(review): star import; presumably the source of `load_csv_to_pandasdf`
# and of the encoder classes used elsewhere in this notebook. Confirm against
# utils and consider importing the names explicitly.
from utils import *
# Root data directory, relative to the current working directory.
data_path = "../../data"

# Load the training data. Presumably this reads the files found under the
# given directory into a single DataFrame; confirm against utils.
train_df_origin = load_csv_to_pandasdf(os.path.join(data_path, "sharechat_recsys2023_data",  "train"))
# Order rows by f_1 and replace the index with a fresh 0..n-1 range.
train_df_origin = train_df_origin.sort_values(by=['f_1']).reset_index(drop=True)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:36<00:00,  1.27s/it]


In [3]:
def fit_count_encoded_feature(fg_list_1, df):
    """Fit one ``CountEncoder`` per feature and add its output as ``<feature>_CE``.

    Args:
        fg_list_1: Iterable of column names to encode.
        df: Frame holding those columns. It is modified in place: each
            encoder's ``fit_transform`` result is stored under
            ``f"{feature}_CE"``.

    Returns:
        A ``(df, encoders)`` tuple, where ``df`` is the same object that was
        passed in and ``encoders`` maps each feature name to its fitted
        encoder.
    """
    fitted = {}
    progress = tqdm(fg_list_1, desc='get_count_encoded_feature')
    for column in progress:
        # Each encoder is constructed with the "return_nan" option.
        count_encoder = CountEncoder(column, "return_nan")
        df[f"{column}_CE"] = count_encoder.fit_transform(df[column])
        fitted[column] = count_encoder
    return df, fitted

def fit_indexing_feature(fg_list_1, partition_key, df):
    """Fit one ``Indexer`` per feature and add its output as ``<feature>_idx``.

    Args:
        fg_list_1: Iterable of column names to index.
        partition_key: Passed as the second constructor argument of every
            ``Indexer``.
        df: Frame handed whole to each indexer's ``fit_transform``. It is
            modified in place: the result is stored under
            ``f"{feature}_idx"``, so later indexers also see the columns
            added by earlier ones.

    Returns:
        A ``(df, encoders)`` tuple, where ``df`` is the same object that was
        passed in and ``encoders`` maps each feature name to its fitted
        indexer.
    """
    fitted = {}
    progress = tqdm(fg_list_1, desc='get_indexing_feature')
    for column in progress:
        indexer = Indexer(column, partition_key)
        df[f"{column}_idx"] = indexer.fit_transform(df)
        fitted[column] = indexer
    return df, fitted

def fit_newvalue(categorical_list, df):
    """Run one ``NewValueEncoder`` per feature over the frame, chaining the results.

    Args:
        categorical_list: Iterable of column names to process.
        df: Input frame. Each encoder's ``fit_transform`` receives the frame
            returned by the previous encoder, and its return value replaces
            the working frame.

    Returns:
        A ``(df, encoders)`` tuple, where ``df`` is whatever the last
        encoder's ``fit_transform`` returned (the input itself when
        ``categorical_list`` is empty) and ``encoders`` maps each feature
        name to its fitted encoder.
    """
    fitted = {}
    current = df
    for column in tqdm(categorical_list, desc='get_newvalue_flag_feature'):
        flagger = NewValueEncoder(column)
        current = flagger.fit_transform(current)
        fitted[column] = flagger
    return current, fitted

In [4]:
import warnings

# Suppress every warning category from here on.
warnings.filterwarnings("ignore")

# Work on a copy so the frame loaded earlier is left untouched by the
# in-place column additions below.
train_df = train_df_origin.copy()

# Column groups, one per encoder family.
fdflag_features = [f"f_{i}" for i in [*range(2, 23), 78, 75, 50]]
count_features = [f"f_{i}" for i in [2, 4, 6, 13, 15, 18, 78, 75, 50, 20, 24]]
index_features = [f"f_{i}" for i in range(2, 23)]

target_label = 'is_installed'
partition_key = 'f_35'

# f_1 modulo 7. NOTE(review): the name suggests a day-of-week bucket, which
# holds only if f_1 is a day counter; confirm against the dataset description.
train_df['dow'] = train_df['f_1'] % 7

# Apply the three encoder families in a fixed order. Each step receives the
# frame produced by the previous one.
train_df, new_value_encoder_list = fit_newvalue(fdflag_features, train_df)
train_df, count_encoder_list = fit_count_encoded_feature(count_features, train_df)
train_df, index_encoder_list = fit_indexing_feature(index_features, partition_key, train_df)

get_newvalue_flag_feature: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24/24 [00:30<00:00,  1.29s/it]
get_count_encoded_feature: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11/11 [00:42<00:00,  3.84s/it]
get_indexing_feature: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 21/21 [00:49<00:00,  2.37s/it]


In [5]:
# Persist every fitted encoder and the processed training frame under
# <data_path>/1_LearningFE.
# The output directory is derived from `data_path` and created with
# os.makedirs, so it follows `data_path` if that changes and does not depend
# on a POSIX shell being available.
learning_fe_dir = os.path.join(data_path, "1_LearningFE")
os.makedirs(learning_fe_dir, exist_ok=True)

# File-name prefix -> {feature name: fitted encoder}. The order matches the
# order in which the encoder families were fitted.
encoders_by_prefix = {
    "newvalue": new_value_encoder_list,
    "count": count_encoder_list,
    "index": index_encoder_list,
}
for prefix, encoders in encoders_by_prefix.items():
    for feature_name, encoder in encoders.items():
        # One file per encoder: <prefix>_<feature>.pkl
        encoder.save(os.path.join(learning_fe_dir, f"{prefix}_{feature_name}.pkl"))

train_df.to_parquet(os.path.join(learning_fe_dir, "train_processed.parquet"))

In [6]:
# Show the processed frame as a visual check of the added columns; pandas
# elides rows beyond the configured display limits.
train_df

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_27,f_28,f_29,f_30,f_31,f_32,f_33,f_34,f_35,f_36,f_37,f_38,f_39,f_40,f_41,f_42,f_43,f_44,f_45,f_46,f_47,f_48,f_49,f_50,f_51,f_52,f_53,f_54,f_55,f_56,f_57,f_58,f_59,f_60,f_61,f_62,f_63,f_64,f_65,f_66,f_67,f_68,f_69,f_70,f_71,f_72,f_73,f_74,f_75,f_76,f_77,f_78,f_79,is_clicked,is_installed,dow,f_2_first_day,f_2_fdflag,f_3_first_day,f_3_fdflag,f_4_first_day,f_4_fdflag,f_5_first_day,f_5_fdflag,f_6_first_day,f_6_fdflag,f_7_first_day,f_7_fdflag,f_8_first_day,f_8_fdflag,f_9_first_day,f_9_fdflag,f_10_first_day,f_10_fdflag,f_11_first_day,f_11_fdflag,f_12_first_day,f_12_fdflag,f_13_first_day,f_13_fdflag,f_14_first_day,f_14_fdflag,f_15_first_day,f_15_fdflag,f_16_first_day,f_16_fdflag,f_17_first_day,f_17_fdflag,f_18_first_day,f_18_fdflag,f_19_first_day,f_19_fdflag,f_20_first_day,f_20_fdflag,f_21_first_day,f_21_fdflag,f_22_first_day,f_22_fdflag,f_78_first_day,f_78_fdflag,f_75_first_day,f_75_fdflag,f_50_first_day,f_50_fdflag,f_2_CE,f_4_CE,f_6_CE,f_13_CE,f_15_CE,f_18_CE,f_78_CE,f_75_CE,f_50_CE,f_20_CE,f_24_CE,f_2_idx,f_3_idx,f_4_idx,f_5_idx,f_6_idx,f_7_idx,f_8_idx,f_9_idx,f_10_idx,f_11_idx,f_12_idx,f_13_idx,f_14_idx,f_15_idx,f_16_idx,f_17_idx,f_18_idx,f_19_idx,f_20_idx,f_21_idx,f_22_idx
0,2522321,45,3346,22294,6767,21545,1159,27941,19203,6675,21574,27833,4473,18614,27291,2038,18162,12554,21865,29982,27961,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,1,0,0,0,0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.308513,0.308513,0.308513,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.539897,0.000000,0.0,0.0,0.0,0,0,3,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,201888,94474,4065,427984,20006,446231,3394351,14554,3319591,323920,3260294,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
1,2377898,45,20095,563,31686,15908,590,27941,19203,6675,19343,27833,30670,11359,20899,25365,14709,3249,20452,26758,4222,4625,0,0,0,0,0,0,0,0,0,0,3,0,1,0,1,1,1,0,0,0,26.570643,0.000060,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.398702,0.000000,0.000000,0.192820,0.000000,0.000000,2.159588,1.605064,2.994627,0.0,0.591404,0.000000,1.065663,90779409,1677.072375,0.000050,8.146457,1.058418,1.712723,0.000000,0.0,0.571121,0.000000,0.115692,1.156922,0.269948,0.0,0.0,0.0,0,0,3,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,1007519,397980,397980,4526,842,36666,3394351,2810343,3319591,1008800,3260294,1,1,1,1,1,2,2,2,1,2,1,1,1,1,1,1,1,1,1,1,2
2,3173460,45,5156,22294,18971,25604,30192,27941,19203,6675,19343,32266,4473,17705,26082,29887,14709,11918,12020,29982,27961,4740,0,0,0,0,0,0,0,0,0,0,3,0,1,1,1,1,1,0,0,0,15.657012,1.620103,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.909948,0.000000,0.000000,0.038564,0.038564,0.038564,2.198152,0.830089,0.302038,0.0,1.626360,0.000000,0.000000,13087224,23.439561,1.495782,7.993264,0.000000,0.000000,1.148976,0.0,2.855607,0.571121,0.115692,1.156922,0.269948,0.0,0.0,0.0,0,0,3,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,95835,82898,7705,708988,5860,1340010,3394351,2810343,3319591,323920,3260294,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
3,1930117,45,3346,22294,6767,21545,10208,27941,21621,6675,19343,32266,4473,18614,20496,11481,14709,6395,21865,29982,27961,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,1,1,0,0,0,1.272614,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.192820,0.192820,0.0,0.0,0.0,0,0,3,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,201888,94474,9149,427984,8169,446231,3394351,15787,3319591,323920,3260294,2,2,2,2,1,3,1,3,2,1,2,2,1,1,2,1,2,2,2,2,3
4,2318339,45,3346,22294,5579,15908,11774,27941,21218,6675,22970,27957,4473,20634,20899,908,14709,13234,12020,29982,27961,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,1,1,1,0,0,0,21.673006,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.115692,0.038564,0.115692,0.347077,0.154256,0.694153,0.000000,0.000000,0.0,1.478509,0.258599,1.420884,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.115692,1.156922,0.269948,0.0,0.0,0.0,0,0,3,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,45,True,201888,37540,8565,332611,3791,1340010,3394351,2810343,3319591,323920,3260294,3,3,1,2,1,4,1,4,1,1,3,1,2,1,3,1,1,3,3,3,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3485847,3012038,66,26325,7152,21563,21545,28565,27941,21621,6675,22970,11004,4255,4230,26485,1590,25546,11918,12020,16851,30153,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,1,1,0,0,0,29.308692,2.225864,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.762581,0.000000,0.000000,0.000000,0.347077,0.000000,0.347077,1.229138,1.756662,0.0,4.287676,0.129300,0.000000,51532956,556.191364,2.157136,8.087705,4.223591,11.651090,2.177379,0.0,0.000000,0.000000,0.000000,1.156922,0.269948,0.0,0.0,0.0,0,0,3,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,174464,35939,1345,819496,1150,1340010,3394351,2810343,3319591,222250,3260294,8580,12196,5091,28260,222,69862,4666,68162,22586,7573,5160,15965,14587,29,19546,7988,26330,8656,8580,34659,59129
3485848,3012328,66,20095,563,31686,19475,590,27941,19203,6675,21574,11810,11402,18614,27291,14827,25546,28694,21865,26758,4222,4625,0,0,0,0,0,0,0,0,0,0,2,0,0,0,1,1,1,0,0,0,28.537411,0.000801,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.270802,0.038564,0.000000,0.038564,4.511996,0.000000,5.591790,1.707634,1.979011,0.0,0.147851,0.000000,0.000000,255183433,2255.701154,0.000692,8.190964,0.199623,0.488804,0.000204,0.0,0.000000,0.000000,0.115692,1.156922,0.269948,0.0,0.0,0.0,0,0,3,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,1007519,397980,397980,427984,6378,446231,3394351,2810343,3319591,1008800,3260294,19184,19203,6806,6139,6806,69863,25945,68163,6331,8234,5052,9700,6954,142,19547,4398,10064,19429,19202,19217,59130
3485849,862173,66,20095,563,22861,25604,21280,27941,21218,6675,19343,11407,30670,30214,20899,24120,25546,3249,12020,26758,4222,4625,0,0,0,0,0,0,0,0,0,0,3,0,0,0,1,1,1,0,0,0,22.675672,0.000801,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.270802,0.000000,0.000000,0.269948,0.000000,0.000000,0.192820,1.707634,1.979011,0.0,0.000000,0.129300,0.355221,255183433,2255.701154,0.000692,8.190964,0.199623,0.488804,0.000204,0.0,0.000000,0.000000,0.115692,1.156922,0.269948,0.0,0.0,0.0,0,0,3,45,False,45,False,46,False,45,False,54,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,1007519,495055,306828,505713,1535,1340010,3394351,2810343,3319591,1008800,3260294,19185,19204,12379,24656,12379,69864,22564,68164,40947,8510,11688,10152,21603,34,19548,11055,26331,19430,19203,19218,59131
3485850,2888400,66,26325,22294,4896,21545,26484,27941,19203,6675,22970,11810,8659,17705,26082,24692,18162,8881,12020,16851,30153,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,1,0,0,0,14.924294,2.225864,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.762581,0.000000,0.000000,0.000000,0.000000,0.000000,0.462769,1.229138,1.756662,0.0,0.443553,2.068795,0.000000,51532956,556.191364,2.157136,8.087705,4.223591,11.651090,2.177379,0.0,3.426729,0.000000,0.000000,1.156922,0.269948,0.0,0.0,0.0,0,0,3,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,45,False,174464,50239,18760,708988,13234,1340010,3394351,2810343,3319591,222250,3260294,8581,32547,1393,28261,325,69865,25946,68165,22587,8235,8113,13593,7974,300,12154,3880,26332,8657,8581,34660,59132
