In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
import os
import random

from sklearn.model_selection import train_test_split
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split

import pyarrow as pa
import pyarrow.parquet as pq

import math
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore, spearmanr , shapiro


In [3]:
CFG = {
    'BATCH_SIZE': 4096,
    'EPOCHS': 10,
    'LEARNING_RATE': 1e-3,
    'SEED' : 42
}
device = "cuda" if torch.cuda.is_available() else "cpu"

In [4]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(CFG['SEED']) # Seed 고정

In [5]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

/bin/bash: line 1: nvidia-smi: command not found


In [6]:
train= pd.read_parquet('/content/drive/MyDrive/Colab Notebooks/CRT/train_part1.parquet' , engine= 'pyarrow')

In [8]:
import plotly.express as px

train.describe().T\
    .style.bar(subset=['mean'], color=px.colors.qualitative.G10[2])\
    .background_gradient(subset=['std'], cmap='Blues')\
    .background_gradient(subset=['50%'], cmap='BuGn')

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
gender,3203885.0,1.374313,0.477692,1.0,1.0,1.0,2.0,2.0
age_group,3203885.0,6.379988,1.587037,1.0,6.0,7.0,7.0,8.0
inventory_id,3203885.0,31.559231,21.964794,2.0,11.0,36.0,42.0,95.0
day_of_week,3203885.0,4.15019,1.795642,1.0,3.0,4.0,6.0,7.0
hour,3203885.0,12.589877,6.327867,0.0,8.0,12.0,18.0,23.0
l_feat_1,3203885.0,1.841406,0.370378,1.0,2.0,2.0,2.0,2.0
l_feat_2,3203885.0,1.828081,0.379389,1.0,2.0,2.0,2.0,2.0
l_feat_3,3203885.0,2.333113,0.741038,1.0,2.0,2.0,3.0,3.0
l_feat_4,3203885.0,10.027688,5.634881,1.0,7.0,7.0,14.0,26.0
l_feat_5,3203885.0,415.17218,295.095001,1.0,132.0,383.0,675.0,1079.0


In [9]:
def summary(df):
    sum = pd.DataFrame(train.dtypes, columns=['dtypes'])
    sum['결측치 개수'] = df.isna().sum()
    sum['결측치 비율'] = (df.isna().sum())/len(df)
    sum['고유값 개수'] = df.nunique().values
    sum['데이터 개수'] = df.count().values


    return sum

summary(train).style.background_gradient(cmap='Blues')

Unnamed: 0,dtypes,결측치 개수,결측치 비율,고유값 개수,데이터 개수
gender,float32,0,0.0,2,3203885
age_group,float32,0,0.0,8,3203885
inventory_id,float32,0,0.0,18,3203885
day_of_week,float32,0,0.0,7,3203885
hour,float32,0,0.0,24,3203885
seq,object,0,0.0,2716557,3203885
l_feat_1,float32,0,0.0,2,3203885
l_feat_2,float32,0,0.0,2,3203885
l_feat_3,float32,0,0.0,3,3203885
l_feat_4,float32,0,0.0,26,3203885


In [10]:
corr = train.corr(numeric_only= True)

corr = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
corr.style.background_gradient(cmap='Blues')

  smin = np.nanmin(gmap) if vmin is None else vmin
  smax = np.nanmax(gmap) if vmax is None else vmax


Unnamed: 0,gender,age_group,inventory_id,day_of_week,hour,l_feat_1,l_feat_2,l_feat_3,l_feat_4,l_feat_5,l_feat_6,l_feat_7,l_feat_8,l_feat_9,l_feat_10,l_feat_11,l_feat_12,l_feat_13,l_feat_14,l_feat_15,l_feat_16,l_feat_17,l_feat_18,l_feat_19,l_feat_20,l_feat_21,l_feat_22,l_feat_23,l_feat_24,l_feat_25,l_feat_26,l_feat_27,feat_e_1,feat_e_2,feat_e_3,feat_e_4,feat_e_5,feat_e_6,feat_e_7,feat_e_8,feat_e_9,feat_e_10,feat_d_1,feat_d_2,feat_d_3,feat_d_4,feat_d_5,feat_d_6,feat_c_1,feat_c_2,feat_c_3,feat_c_4,feat_c_5,feat_c_6,feat_c_7,feat_c_8,feat_b_1,feat_b_2,feat_b_3,feat_b_4,feat_b_5,feat_b_6,feat_a_1,feat_a_2,feat_a_3,feat_a_4,feat_a_5,feat_a_6,feat_a_7,feat_a_8,feat_a_9,feat_a_10,feat_a_11,feat_a_12,feat_a_13,feat_a_14,feat_a_15,feat_a_16,feat_a_17,feat_a_18,history_a_1,history_a_2,history_a_3,history_a_4,history_a_5,history_a_6,history_a_7,history_b_1,history_b_2,history_b_3,history_b_4,history_b_5,history_b_6,history_b_7,history_b_8,history_b_9,history_b_10,history_b_11,history_b_12,history_b_13,history_b_14,history_b_15,history_b_16,history_b_17,history_b_18,history_b_19,history_b_20,history_b_21,history_b_22,history_b_23,history_b_24,history_b_25,history_b_26,history_b_27,history_b_28,history_b_29,history_b_30,clicked
gender,,-0.065852,0.062862,-0.005381,-0.026124,0.023673,0.005628,0.060413,-0.032965,-0.085852,-0.009206,-0.02201,0.016672,-0.115479,0.013002,-0.009694,-0.038296,-0.12557,0.018602,-0.070613,-0.023673,-0.115479,0.032142,0.006345,,0.036139,-0.00619,,0.003198,-0.003669,0.007375,0.046872,-0.030235,0.045891,0.024862,0.017678,-0.033217,-0.178381,0.054947,0.042849,-0.064337,0.013007,-0.043434,-0.095086,-0.108797,-0.029102,0.11846,0.132422,-0.177568,0.161616,-0.025724,-0.169963,-0.193833,-0.102018,-0.143427,-0.019138,-0.000676,0.0328,-0.049487,-0.109258,0.065395,-0.029482,0.109986,0.022899,0.125721,0.105336,0.401125,0.089768,-0.029247,0.015046,0.017059,-0.032227,-0.015459,-0.002942,0.042242,0.132652,-0.048313,0.098492,0.179649,-0.083321,0.040783,0.041842,-0.04455,0.021534,0.041515,0.028156,0.034726,0.06344,0.05162,0.054727,0.054381,0.055001,0.055445,0.055058,0.055293,0.055443,0.055516,0.055316,0.055178,0.055979,0.055432,0.053742,0.055435,0.055057,0.055392,0.055104,0.055718,0.055439,0.055597,0.055377,0.055339,0.055397,0.054939,0.055613,0.05635,0.055397,0.052337,-0.005543
age_group,,,-0.198242,0.0159,-0.043241,-0.083062,0.485062,-0.130134,-0.029916,0.113873,-0.034971,-0.060814,0.031985,0.208469,-0.08149,0.03021,0.049656,0.029819,-0.010249,0.042381,0.083062,0.208469,-0.063356,0.047124,,-0.081014,0.152729,,-0.002363,0.00126,0.008352,-0.116451,0.161521,-0.215842,-0.059833,-0.068298,-0.126156,0.079758,0.012201,-0.27774,-0.016008,-0.226413,0.131178,0.390409,0.391605,0.158108,-0.3747,-0.443452,0.415089,0.014527,-0.345303,0.286106,0.399453,0.417054,0.367471,0.179588,0.014863,-0.27022,0.184016,0.384098,-0.329728,0.249041,0.010944,0.001279,0.103321,0.084136,0.128415,0.062428,-0.028446,-0.082727,-0.042852,-0.008462,0.020935,-0.056629,-0.104964,0.290913,-0.382106,-0.044737,-0.108993,-0.268722,-0.155707,-0.143878,0.15802,-0.260202,-0.135594,-0.270195,-0.279837,-0.252945,-0.208114,-0.222575,-0.223529,-0.223383,-0.223124,-0.223488,-0.22349,-0.221556,-0.217935,-0.223263,-0.222504,-0.223442,-0.222833,-0.218346,-0.22318,-0.220668,-0.223219,-0.224026,-0.223424,-0.22319,-0.223333,-0.223147,-0.223276,-0.223191,-0.224946,-0.221787,-0.222192,-0.223227,-0.208811,0.001188
inventory_id,,,,-0.005975,0.005994,-0.087344,-0.137925,-0.03932,-0.005533,-0.052385,-0.044582,0.020157,-0.022421,-0.039825,0.025628,0.008534,-0.004447,-0.033087,-0.009686,-0.008891,0.087344,-0.039825,0.020211,-0.004569,,0.019368,-0.062756,,0.000466,-0.003292,0.000796,0.070608,-0.113068,0.124465,0.074916,0.044496,0.029656,-0.027911,-0.062123,0.154449,0.063585,0.136025,-0.063272,-0.28119,-0.297997,-0.080857,0.20454,0.218327,-0.331829,-0.007525,0.06625,-0.276656,-0.223972,-0.258033,-0.315736,-0.139539,-0.016983,0.140238,-0.138122,-0.29378,0.192991,-0.143497,-0.008086,0.001821,-0.023291,-0.018572,-0.019729,-0.024032,-0.009433,0.012674,-0.001632,-0.013169,-0.01157,0.018382,0.02969,-0.066892,0.080146,0.014123,0.033122,0.050736,0.14262,0.048225,-0.052542,0.137769,0.045944,0.137698,0.136667,0.176429,0.156788,0.156238,0.156704,0.156601,0.156706,0.156625,0.156715,0.156935,0.157893,0.15672,0.156579,0.156663,0.156723,0.156165,0.156718,0.15678,0.156717,0.15665,0.157784,0.156717,0.156946,0.156706,0.156709,0.156729,0.156795,0.156807,0.156697,0.156714,0.15741,0.050827
day_of_week,,,,,-0.02864,-0.010124,0.010084,-0.006967,-0.02497,0.043196,0.01635,0.032358,0.000966,0.027145,-0.017888,0.063258,0.069635,0.034809,0.033207,0.012871,0.010124,0.027145,-0.001639,-0.000515,,-0.007911,0.002977,,-0.000437,-0.011889,0.00096,-0.000823,0.001042,0.001414,0.004866,-0.015808,0.00759,0.009054,-0.007088,0.003884,0.007038,0.003508,0.003379,0.004915,0.00555,-0.006473,-0.005397,-0.00973,0.005632,-0.006479,-0.012645,0.01205,0.006386,0.049508,0.003905,-0.000568,0.000343,-0.000463,0.001456,0.004065,-0.001613,0.000471,-0.061837,-0.001825,-0.003002,-0.002284,-0.006793,-0.013418,-0.012919,-0.004651,-0.004138,-0.004408,-0.003402,-0.00406,-0.005549,0.00058,-0.010629,-0.001743,-0.006025,-0.008215,-0.003027,-0.005801,0.003915,-0.00218,-0.005429,-0.005816,0.001486,-0.002138,-0.002288,-0.002533,-0.002632,-0.002409,-0.002666,-0.002662,-0.002686,-0.002751,-0.00255,-0.002608,-0.002574,-0.002687,-0.002686,-0.002693,-0.002666,-0.002546,-0.002667,-0.002676,-0.002722,-0.002665,-0.002665,-0.002666,-0.002685,-0.002663,-0.002591,-0.002762,-0.002693,-0.002678,-0.002406,-0.000581
hour,,,,,,-0.008542,-0.027932,0.028285,-0.029255,0.009758,-0.01976,0.029745,-0.004339,0.061565,0.061602,0.067697,0.072039,-0.039082,0.006374,0.083081,0.008542,0.061565,0.002784,-0.003322,,-0.001084,-0.010232,,-0.00044,0.003358,0.003412,0.00725,-0.05368,0.088932,0.028194,0.017014,0.010773,-0.009639,-0.054584,0.081327,0.049361,0.067147,-0.067046,-0.053072,-0.091699,-0.018036,0.108386,0.095916,-0.051162,-0.023371,-0.010276,-0.064623,-0.069966,-0.088146,-0.027906,-0.093713,-0.006802,0.086385,-0.090834,-0.090758,0.115145,-0.081774,-0.00385,0.000284,-0.007307,-0.008789,-0.019049,0.004939,0.01576,0.011412,0.01179,0.009731,0.005574,0.006584,0.010841,-0.01035,0.019284,0.004657,0.003423,0.020989,0.032192,0.042163,-0.047348,0.130712,0.040434,0.128689,0.129327,0.046161,0.037416,0.037132,0.036927,0.03674,0.036852,0.03683,0.036877,0.036913,0.037424,0.03687,0.036827,0.036825,0.036852,0.036689,0.036847,0.03707,0.03685,0.036859,0.036946,0.036849,0.036912,0.036864,0.03685,0.036854,0.036954,0.036894,0.03682,0.036852,0.037185,0.002263
l_feat_1,,,,,,,-0.04646,0.788094,-0.016755,0.106764,0.24598,-0.043055,-0.003785,-0.114929,-0.000825,-0.086443,-0.0912,-0.075599,-0.043747,-0.070654,-1.0,-0.114929,0.009599,-0.002607,,0.023505,-0.018144,,0.00021,-0.000181,-0.0017,0.008036,-0.006122,0.019053,-0.809153,0.007172,0.000885,-0.029174,-0.00046,0.026368,-0.001134,0.016735,-0.020863,-0.062096,-0.069355,-0.021083,0.040645,0.060991,-0.084752,0.001869,0.041313,-0.065677,-0.049445,-0.058514,-0.080435,-0.023619,-0.003158,0.027179,-0.023433,-0.066293,0.042365,-0.03077,0.006961,0.002401,-0.001113,-0.001948,0.00876,0.006171,0.008595,0.012239,0.007691,8.2e-05,-1.8e-05,0.011248,0.018974,-0.017647,0.036737,0.012678,0.020168,0.029015,0.016821,0.017384,-0.019153,0.035546,0.0165,0.037506,0.039446,0.034982,0.025784,0.026738,0.027629,0.027484,0.027591,0.027511,0.027599,0.027551,0.027265,0.027606,0.027581,0.027619,0.02761,0.027067,0.027605,0.027263,0.027607,0.027532,0.027809,0.027606,0.027603,0.027581,0.027614,0.027607,0.027589,0.027532,0.02756,0.027617,0.026636,0.012695
l_feat_2,,,,,,,,-0.073227,-0.002715,0.078953,0.009213,-0.0487,0.050378,0.12208,-0.023246,0.028504,0.023377,0.01522,0.030366,0.030803,0.04646,0.12208,-0.047174,0.03761,,-0.044697,0.281429,,-0.006867,0.004287,0.031863,-0.018119,0.094732,-0.127676,-0.026441,-0.088943,-0.090765,0.052044,-0.017303,-0.184276,0.026429,-0.14823,-0.01689,0.213821,0.229593,0.126958,-0.204642,-0.230737,0.226658,0.018826,-0.172513,0.182669,0.244043,0.24534,0.214301,0.111943,0.008858,-0.156139,0.134754,0.224858,-0.200573,0.1458,-0.00316,-0.003364,0.054154,0.042765,0.072162,0.067656,0.026669,-0.033904,-0.000163,0.030124,0.035307,-0.038593,-0.109104,0.12819,-0.292499,-0.037046,-0.063315,-0.212686,-0.091868,-0.073432,0.08116,-0.153667,-0.069065,-0.158668,-0.163905,-0.144107,-0.118807,-0.128776,-0.12713,-0.126868,-0.126872,-0.126703,-0.127012,-0.125973,-0.123996,-0.126875,-0.126486,-0.126505,-0.126579,-0.124507,-0.12683,-0.125664,-0.126844,-0.127568,-0.126861,-0.126837,-0.12678,-0.126808,-0.126863,-0.126838,-0.128032,-0.125981,-0.126464,-0.126867,-0.117005,-0.000479
l_feat_3,,,,,,,,,-0.094138,0.015853,0.116071,-0.041689,-0.036026,-0.18697,-0.021686,-0.03616,-0.082818,-0.11889,-0.069587,-0.086767,-0.788094,-0.18697,0.013255,-0.009458,,0.024296,-0.031273,,0.000777,-0.000798,0.003615,0.006388,-0.036836,0.052502,-0.661678,0.016685,0.012828,-0.034493,-0.016215,0.069888,0.015572,0.0547,-0.043786,-0.109366,-0.11658,-0.039087,0.089488,0.106808,-0.134834,-0.027192,0.042664,-0.113829,-0.09996,-0.108217,-0.125917,-0.059552,-0.006699,0.061671,-0.051774,-0.11455,0.078614,-0.056819,0.006617,0.002073,-0.010833,-0.008323,0.002457,0.000107,0.005495,0.013043,0.007826,-0.003992,-0.002032,0.007647,0.020697,-0.0299,0.05322,0.014861,0.022182,0.035934,0.040292,0.02569,-0.02877,0.061952,0.024004,0.063579,0.065273,0.06672,0.05283,0.054479,0.055244,0.055084,0.055238,0.055108,0.055248,0.055203,0.055537,0.055268,0.054959,0.055341,0.055321,0.054472,0.05525,0.054759,0.055253,0.055194,0.055106,0.055253,0.055271,0.055227,0.055256,0.055252,0.055209,0.055188,0.055181,0.055264,0.053616,0.012623
l_feat_4,,,,,,,,,,-0.07707,0.093151,0.027788,0.030133,0.057209,-0.010953,-0.030848,-0.081098,-0.004192,-0.05095,0.027486,0.016755,0.057209,-0.014397,0.013654,,0.005983,0.005,,0.000654,-0.000567,0.004922,0.076966,0.021776,-0.018529,0.072259,-0.003962,-0.003763,-0.013283,0.025552,-0.029883,-0.029654,-0.027569,0.01545,0.017157,0.019944,0.01392,-0.014748,-0.001049,0.023684,0.075351,0.051497,0.038772,0.027256,0.021501,0.026567,0.042954,0.00444,-0.013891,7.5e-05,0.021701,0.003581,-0.000519,-0.00884,0.001501,-0.006496,-0.002971,-0.003134,0.026164,0.034069,0.015032,0.019957,0.026881,0.011178,0.016298,0.008341,-0.040327,0.026235,0.015076,0.018212,0.026111,-0.003324,0.000413,0.001249,0.004339,0.000268,0.00549,0.006264,-0.008376,-0.004443,-0.00569,-0.005816,-0.005569,-0.005865,-0.005679,-0.005812,-0.005965,-0.00821,-0.005797,-0.005286,-0.005857,-0.005637,-0.005724,-0.005871,-0.006249,-0.00587,-0.005703,-0.004021,-0.00587,-0.005856,-0.005882,-0.005858,-0.005869,-0.005517,-0.005669,-0.005871,-0.00586,-0.004627,0.003154
l_feat_5,,,,,,,,,,,0.431453,0.358269,-0.00072,0.548982,0.392346,0.575309,0.549181,-0.110438,0.379174,0.518691,-0.106764,0.548982,-0.019954,0.00186,,-0.042311,0.024766,,-0.001442,0.001731,0.009973,0.042958,-0.007057,-0.018879,-0.111437,-0.013903,0.0044,0.057613,-0.016437,-0.018359,0.021418,-0.000257,0.001244,0.051334,0.051997,0.029294,-0.049974,-0.05899,0.068039,0.018616,-0.063225,0.048237,0.048928,0.065389,0.062344,0.022355,0.001352,-0.029185,0.028667,0.048149,-0.041607,0.02614,-0.01079,-0.004474,-0.013742,-0.016051,-0.047649,-0.040738,-0.029732,-0.022025,-0.024588,-0.011475,-0.011256,-0.012637,-0.031869,-0.000237,-0.054813,-0.041619,-0.032218,-0.031261,-0.011212,-0.025294,0.026335,-0.031179,-0.024515,-0.03689,-0.037609,-0.02668,-0.018594,-0.022374,-0.022702,-0.022642,-0.022787,-0.022682,-0.022787,-0.022654,-0.02175,-0.02279,-0.022865,-0.022885,-0.022671,-0.021949,-0.02278,-0.022048,-0.022778,-0.022752,-0.021915,-0.022781,-0.022784,-0.022765,-0.02279,-0.022779,-0.02277,-0.022641,-0.022668,-0.022787,-0.018999,0.017126


feature 분석

(A) 범주형(임베딩용)

- user/context: gender, age_group, day_of_week, hour

- item: inventory_id

- sequence: seq (쉼표로 구분된 과거 inventory_id 시퀀스)

<br>


(B) 연속형(스케일링/버킷화)

- l_feat_1 ~ l_feat_27 : 로그 기반/랭크류처럼 보이는 long-term 통계 → 표준화(μ,σ) 또는 분위수 변환 권장

- feat_e_1 ~ feat_e_10, feat_d_1 ~ feat_d_6, feat_c_1 ~ feat_c_8, feat_b_1 ~ feat_b_6,
feat_a_1 ~ feat_a_18 : 아이템/유저/컨텍스트 파생 수치 → 표준화 또는 버킷화(DeepFM 필드로 쓸 거면 버킷)

- history_a_1 ~ history_a_7, history_b_1 ~ history_b_30 : 시퀀스 통계 요약(정규화된 비율/가중치처럼 보임) → 그대로 연속형으로 사용(스케일만 맞춤)

