In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import seaborn as sns
import tqdm
import re
import torch
import torch.nn as nn
from tab_transformer_pytorch import TabTransformer

from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import os
from sklearn.model_selection import train_test_split

import buy_data_function as bf
import model_function as md

In [23]:
# Show every column when a DataFrame is displayed (None removes the column limit).
# The option name is written out in full: pandas resolves abbreviated names by
# pattern matching, which can break if a similarly named option is added later.
pd.set_option("display.max_columns", None)

In [24]:
# Directory the input CSV files are read from (prefix for the read_csv calls below).
# This is an absolute path on a local drive, so it must be edited before the
# notebook can run on another machine.
path = "G:/내 드라이브/대학원/프로젝트/LG/"

In [25]:
# Load the purchase table and the CRM table; both files are decoded as EUC-KR.
buy_data, crm_data = (
    pd.read_csv(path + csv_name, encoding="euc-kr")
    for csv_name in ("master_buy.csv", "master_crm.csv")
)

In [26]:
# Preprocess the purchase data, then build the time-series table from the result.
# NOTE(review): the captured output of this cell shows pandas "A value is trying
# to be set on a copy of a slice from a DataFrame" warnings for assignments to
# buy_data columns — presumably raised inside bf.preprocess; confirm and fix there.
# Because buy_data is rebound to the processed frame, running this cell a second
# time passes already-processed data to bf.preprocess.
buy_data = bf.preprocess(buy_data)
timeseries = bf.make_timeseries(buy_data)
#predict_2023 = bf.make_predict(timeseries)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  buy_data["금액(중간값)"] = buy_data['금액'].apply(replace_amount)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  buy_data['매출일자(배송완료일자)'] = pd.to_datetime(buy_data['매출일자(배송완료일자)'], format='%Y%m%d')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  buy_data["년도"] = buy_data['매출일자(배송완료일자)'].apply(lambda x : x.

In [27]:
# Work on a copy of the CRM frame; crm_data itself is still read by later cells
# (column list and per-column unique counts).
data = crm_data.copy()

In [28]:
# Candidate feature columns: every CRM column except the first and the last one.
total_columns = crm_data.columns[1:-1]

# Columns handled as ordinal; the value order of each is listed in `ordinal`.
ordinal_column = [
    '케어솔루션품목수', '케어십품목수', '최근3년구매금액', 'Only&Best품목수', '건강관리가전품목수',
    "연령대", "아파트시세", "아파트평형대", "포인트사용빈도", "보유멤버십포인트",
]

# Every remaining column is treated as categorical.
# Built with an order-preserving comprehension instead of a set difference:
# iterating a set of strings yields a different order on each kernel start
# (string hashes are randomised per process), which would make the column order
# handed to md.preprocess and the plotting helpers change between runs.
categorical_column = [column for column in total_columns if column not in ordinal_column]

In [29]:
# Value order for each ordinal column, written from lowest to highest; np.nan is
# listed first in the lists that contain it. There are ten lists, the same count
# as the names in ordinal_column — presumably matched by position inside
# md.preprocess (TODO confirm).
# NOTE(review): the second list has no '7품목' entry, unlike the first list; check
# whether that is intentional (value absent from the data) or a typo.
ordinal = [['0품목', '1품목', '2품목', '3품목', '4품목', '5품목', '6품목', '7품목', '8품목', '9품목', '10품목↑'],
['0품목', '1품목', '2품목', '3품목', '4품목', '5품목', '6품목', '8품목', '9품목', '10품목↑'],
['0원','1KK미만','1KK이상3KK미만','3KK이상5KK미만','5KK이상10KK미만','10KK이상15KK미만','15KK이상20KK미만','20KK이상25KK미만','25KK이상30KK미만','30KK↑'],
['0품목','1품목','2품목','3품목','4품목','5품목','6품목','7품목'],
['0품목','1품목','2품목','3품목','4품목'],
[np.nan,'10대', '20대','30대','40대','50대','60대(60~64)','60대(65~69)','70대','80대↑'],
[np.nan,'1억↓','1~2억','3~4억','5~6억','7~8억','9~10억','11~15억','16~29억', '30억↑'],
[np.nan,'10평대↓','10평대','20평대','30평대', '40평대','50평대','60평대','70평대↑'],
['미사용','1~2회','3~4회','5~6회','7~8회','9~10회','11~12회','13~14회','15~16회','17~18회','19~20회','20회↑'],
[np.nan,'1만↓','3만↓','5만↓','10만↓','20만↓','30만↓','30만↑']]

In [30]:
# Preprocess the CRM frame. md.preprocess hands back five values, bound below:
# the processed frame, the categorical encoders and mappings, and the ordinal
# encoder and mappings.
(
    data,
    categorical_encoder_list,
    categorical_mapping_list,
    ordinal_encoder,
    ordinal_mapping_list,
) = md.preprocess(data, categorical_column, ordinal_column, ordinal)

In [31]:
# Run this cell to add the time-series columns to the feature frame.
# Rows are matched on 고객ID using pandas' default join type.
data = pd.merge(data, timeseries, on="고객ID")

In [32]:
# Categorical column names: what remains of `data` once the time-series columns,
# the label and 고객ID are dropped.
# Their order must match the categorical column order of the frame fed to the model.
cat_columns = data.drop(columns=[*timeseries.columns, "label", "고객ID"]).columns

In [33]:
# Number of distinct values in each categorical column (NaN counts as a value),
# taken from the raw crm_data frame in cat_columns order.
nunique_list = [crm_data[column].nunique(dropna=False) for column in cat_columns]

In [34]:
# Features: everything except the label, 고객ID and the 2023.0 column.
X = data.drop(columns=["label", "고객ID", 2023.0])
# Target: the label column.
Y = data["label"]

# 80/20 train/validation split, shuffled and stratified on the label, fixed seed.
x_train, x_valid, y_train, y_valid = train_test_split(
    X,
    Y,
    test_size=0.2,
    shuffle=True,
    stratify=Y,
    random_state=34,
)

In [35]:
# Build the TabTransformer model.
# NOTE(review): num_continuous is hard-coded to 14 — presumably the number of
# non-categorical columns left in X; confirm it still matches after data changes.
# NOTE(review): the Sigmoid output activation is paired with nn.BCELoss in the
# model_use configuration further down.
model = TabTransformer(
    # --- input description ---
    categories = tuple(nunique_list),   # number of unique values of each categorical column
    num_continuous = 14,                # number of continuous input values
    # --- transformer body ---
    dim = 32,                           # embedding dimension (the paper uses 32)
    depth = 6,                          # number of layers (the paper recommends 6)
    heads = 8,                          # attention heads (the paper recommends 8)
    attn_dropout = 0.1,                 # dropout applied after attention
    ff_dropout = 0.1,                   # dropout in the feed-forward part
    # --- output MLP ---
    mlp_hidden_mults = (4, 2),          # relative sizes of the hidden layers of the final MLP
    mlp_act = nn.ReLU(),                # activation inside the final MLP
    last_act = nn.Sigmoid(),            # activation applied to the output
    dim_out = 1,                        # single output for binary prediction
    #continuous_mean_std = cont_mean_std # (optional) normalize the continuous values before layer norm
)

# Usage sketch (kept for reference):
# x_categ = batch_X    # category values, from 0 - max number of categories, in the order as passed into the constructor above
# x_cont = torch.tensor([[]])    # assume continuous values are already normalized individually

# pred = model(x_categ, x_cont,return_attn = True) # (1, 1)

In [36]:
# Pick the compute device: CUDA when torch reports it as available, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Location the model is saved to.
# NOTE(review): this rebinds `path`, which earlier held the CSV input directory;
# re-running the CSV-loading cell after this one would read from the wrong place.
path = "G:/내 드라이브/대학원/수업/lg/lg_model/model_test"

In [37]:
# Create the controller object through which the model is used in the cells below.
model_controller = md.model_use(
    # what to run and where
    model = model,
    device = device,
    # the number of categorical variables has to be passed in
    num_cat = len(nunique_list),
    # training settings
    batch_size = 64,
    epochs = 10,
    want_criterion=nn.BCELoss(),
    # where the model is saved
    save_path = path,
)

In [None]:
# Train the model; it is saved automatically to the save path configured above.
# The arguments are the names produced by the train_test_split cell
# (x_train, y_train, x_valid, y_valid); the previous train_x / train_y /
# valid_x / valid_y names were never defined and raised NameError.
model_controller.train(x_train, y_train, x_valid, y_valid)

In [None]:
# Evaluate the model on the validation split.
# Four values are returned for the test data, unpacked below.
# Uses x_valid / y_valid from the train_test_split cell; the previous
# valid_x / valid_y names were never defined and raised NameError.
embedding, accuracy, recall, f1 = model_controller.test(x_valid, y_valid)

In [None]:
# Uncertainty estimation.
# Returns the mean sigmoid value over `iterations` repetitions together with the
# model's uncertainty.
iterations = 100
mean_array,uncertainty = model_controller.uncertainty(iterations,x_valid, y_valid)

In [None]:
# Draw the embedding graph using the trained model.
# save_name is passed as "/test.png" — presumably appended to the controller's
# save path (TODO confirm).
model_controller.embedding_graph(X,Y,save_name="/test.png")

In [None]:
# Draw one embedding graph per column using the trained model.
model_controller.column_embedding_graph(
    X,
    Y,
    cat_columns,
    categorical_column,
    categorical_mapping_list,
    ordinal_column,
    ordinal_mapping_list,
)