In [1]:
import pandas as pd
from tqdm import tqdm
import json
import os
import umap
import numpy as np
import scipy.sparse as sp
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import f1_score, roc_auc_score, average_precision_score, confusion_matrix


import stellargraph as sg
from stellargraph.mapper import FullBatchNodeGenerator

from stellargraph.layer import GCN

import warnings
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras import activations, initializers, constraints, regularizers
from tensorflow.keras.layers import Input, Layer, Lambda, Dropout, Reshape, Dense
from tensorflow.keras.callbacks import EarlyStopping

from tensorflow.keras import layers, optimizers, losses, metrics, Model
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, silhouette_samples


from tqdm import tqdm 

# Data

## Read in edges, features, and targets

In [64]:
# Existing-user data
old_df = pd.read_csv('MLRD_UserData_20220131.csv')
# New-user data (this file is read with the cp949 encoding)
new_df = pd.read_csv('MLRD_UserData_20220227.csv', encoding='cp949')


## max_kyc_level of new_df is used as a feature, so it is converted from object to a
## numeric dtype here (otherwise the values 0 and '0' would count as different categories).
## The placeholder 'unknown' is mapped to 0 before the conversion.
kyc_without_unknown = new_df['max_kyc_level'].replace('unknown', 0)
new_df['max_kyc_level'] = pd.to_numeric(kyc_without_unknown)

In [1]:
# Columns kept from both snapshots: the user id, the three RFM inputs, the edge key
# (signup_channel_type) and the categorical columns that are one-hot encoded later.
USER_COLUMNS = [
    'user_id', 'no_of_days_since_last_trade', 'total_no_of_trade_times',
    'accumulative_trading_amount_in_usdt', 'signup_channel_type', 'signup_device_type',
    'signup_country_en', 'max_kyc_level', 'first_deposit_method', 'signup_channel_id',
]

# The two files use different date formats, so each one is sorted by signup time on its
# own before they are merged further down.
old_sort = old_df.sort_values(by='signup_time')
old_sort2 = old_sort.reset_index(drop=True)

new_sort = new_df.sort_values(by='signup_time')   # still carries new_df's original index labels
new_sort2 = new_sort.reset_index(drop=True)

old1 = old_sort2[USER_COLUMNS]
new1 = new_sort2[USER_COLUMNS]


# Drop every existing-user row that has a missing value in ANY of the selected columns.
# NOTE(review): the original comment mentioned "three columns", but dropna() is applied
# to all selected columns - confirm which behaviour is intended.
old5 = old1.dropna(axis=0).reset_index(drop=True)

orfm_df = pd.DataFrame({
    'CustomerID': old5['user_id'],
    'Recency': old5['no_of_days_since_last_trade'],            # days since the last trade
    'Frequency': old5['total_no_of_trade_times'],              # total number of trades
    'Monetary': old5['accumulative_trading_amount_in_usdt'],   # total traded amount (USDT)
})

### Transform the data with a log transformation

# Apply np.log1p() to the Recency, Frequency and Monetary columns
orfm_df['Recency_log'] = np.log1p(orfm_df['Recency'])
orfm_df['Frequency_log'] = np.log1p(orfm_df['Frequency'])
orfm_df['Monetary_log'] = np.log1p(orfm_df['Monetary'])

# Standardise the log-transformed columns
X_features = orfm_df[['Recency_log', 'Frequency_log', 'Monetary_log']].values
X_features_scaled = StandardScaler().fit_transform(X_features)

# n_init is pinned to the historical default (10) so that the clustering does not change
# with scikit-learn versions whose default became 'auto'.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0)
labels = kmeans.fit_predict(X_features_scaled)
orfm_df['cluster_label'] = labels

old5['cluster'] = orfm_df['cluster_label']

# Existing users first, new users after them. Only the existing users get a real target;
# new users have no cluster yet (NaN after the concat) and are all set to 0.
df = pd.concat([old5, new1])
# Plain assignment instead of a chained inplace replace, which is not guaranteed to
# write back into df.
df['cluster'] = df['cluster'].fillna(0)

# Keep clusters 0 and 2 only and relabel 2 -> 1 to obtain a binary target.
# .copy() makes df6 an independent frame so the assignment below is well defined.
df6 = df[(df['cluster'] == 0) | (df['cluster'] == 2)].copy()
df6['cluster'] = df6['cluster'].replace(2, 1)

# Row positions of df8 are used as node ids from here on.
df8 = df6.reset_index(drop=True)



# Build the edge data from the signup channel type: every pair of users that share
# the same channel type is connected by one edge.
CHANNEL_TYPES = ['Organic traffic', 'Referral', 'SEO', 'Affiliate', 'Campaign promotion', 'Others']

# Row labels of df8 per channel type, in the order listed above.
arrs = [np.array(df8[df8['signup_channel_type'] == channel].index) for channel in CHANNEL_TYPES]
arr_org, arr_ref, arr_seo, arr_aff, arr_cam, arr_oth = arrs

start_parts = []
end_parts = []
for arr in tqdm(arrs):
    # Positions (i, j) with i < j, enumerated row by row - the same pairs and the same
    # order as a nested "for i / for j in range(i+1, n)" loop, but without per-edge
    # Python overhead. Groups with fewer than two users yield no pairs.
    i_idx, j_idx = np.triu_indices(len(arr), k=1)
    start_parts.append(arr[i_idx])
    end_parts.append(arr[j_idx])

start = np.concatenate(start_parts)
end = np.concatenate(end_parts)

edges_df = pd.DataFrame({'source': start, 'target': end})



# Build the feature data: one-hot encode each categorical column of df8.
f1 = pd.get_dummies(df8['signup_device_type'])
f2 = pd.get_dummies(df8['signup_country_en'])
f3 = pd.get_dummies(df8['max_kyc_level'])
f4 = pd.get_dummies(df8['first_deposit_method'])

fs = [f1, f2, f3, f4]
# Concatenate all indicator columns in one step; ignore_index=True relabels the columns
# 0..k-1 in order. Inserting the columns one at a time gives the same frame but
# fragments it and gets slow when there are many categories.
features_df = pd.concat(fs, axis=1, ignore_index=True)
        
        
# Target data: one row per user with its id and its binary cluster label,
# re-indexed from 0 so that the row position identifies the node.
targets_df = pd.DataFrame(
    {
        'id': df8['user_id'],
        'target': df8['cluster'],
    }
).reset_index(drop=True)

## Train/Test/Val split & Preprocessing & GCN Model

In [556]:
import gc
gc.collect()

G = sg.StellarGraph(features_df, edges_df)

# targets_df holds the existing (labeled) users first and the new users after them, so
# an unshuffled split gives: first 60% of existing users -> train, remaining existing
# users -> validation, all new users -> test.
n_labeled = len(targets_df) - len(new_df)
t1 = round(n_labeled * (6/10))
t2 = n_labeled - t1   # the original line had unbalanced parentheses (SyntaxError)
# random_state is omitted: it has no effect when shuffle=False.
train_pages1, test_pages1 = train_test_split(targets_df, train_size=t1, shuffle=False)
val_pages1, test_pages1 = train_test_split(test_pages1, train_size=t2, shuffle=False)

target_encoding = LabelBinarizer()  # with two classes this returns a single 0/1 column
train_targets1 = target_encoding.fit_transform(train_pages1['target'])
val_targets1 = target_encoding.transform(val_pages1['target'])
test_targets1 = target_encoding.transform(test_pages1['target'])


# Initialize the generator
generator = FullBatchNodeGenerator(G, method="gcn")

# Use the .flow method to prepare it for use with GCN
train_gen1 = generator.flow(train_pages1.index, train_targets1)
val_gen1 = generator.flow(val_pages1.index, val_targets1)
test_gen1 = generator.flow(test_pages1.index, test_targets1)

In [566]:
# Build necessary layers: two GCN layers of 32 units each with ReLU and 50% dropout
gcn = GCN(
    layer_sizes=[32, 32], activations=["relu", "relu"], generator=generator, dropout=0.5
)

# Access the input and output tensors
x_inp, x_out = gcn.in_out_tensors()

# Pass the output tensor through the dense layer with sigmoid
# (one output unit per column of the binarized targets)
predictions = layers.Dense(units=train_targets1.shape[1], activation="sigmoid")(x_out)

model = Model(inputs=x_inp, outputs=predictions)
model.compile(
    # `learning_rate` is the supported argument name; `lr` is a deprecated alias.
    optimizer=optimizers.Adam(learning_rate=0.01),
    loss=losses.binary_crossentropy,
    metrics=["acc"],
)

history = model.fit(
    train_gen1,
    epochs=200,
    validation_data=val_gen1,
    verbose=1,
    shuffle=False,
)


def evaluate_preds(true, pred, threshold=0.5):
    """Print and return binary-classification metrics for predicted scores.

    Parameters
    ----------
    true : array-like
        Ground-truth 0/1 labels. Any shape is accepted; it is flattened to 1-D.
    pred : array-like
        Predicted scores, with the same number of elements as ``true``. Flattened to 1-D.
    threshold : float, default 0.5
        A score strictly greater than this value is counted as class 1 when computing
        the F1 score and the confusion matrix.

    Returns
    -------
    tuple
        ``(auc, pr, f_score)``: ROC AUC, average precision and F1 score.
    """
    # Flatten so that column vectors, e.g. shape (n, 1), are handled like 1-D input.
    true = np.asarray(true).ravel()
    pred = np.asarray(pred).ravel()

    auc = roc_auc_score(true, pred)
    pr = average_precision_score(true, pred)
    bin_pred = (pred > threshold).astype(int)
    f_score = f1_score(true, bin_pred)
    print('ROC AUC:', auc)
    print('PR AUC:', pr)
    print('F1 score:', f_score)
    # normalize='true' reports each row as a fraction of the true class
    print(confusion_matrix(true, bin_pred, normalize='true'))

    return auc, pr, f_score


# Score the nodes of the test generator (test_gen1) with the trained model.
# NOTE(review): a later cell reads new_preds1[0], so the output presumably has a
# leading axis of size 1 - confirm.
new_preds1 = model.predict(test_gen1)
new_preds1

#  신규고객 세그먼테이션

In [2]:
result = new_df.copy()
bin_pred = [1 if p > 0.5 else 0 for p in new_preds1[0].ravel()]
# The predictions follow the signup-time-sorted order of the new users (new_sort), not
# the row order of new_df. Label them with new_sort's index so that the assignment below
# aligns each prediction with the right user instead of assigning by position.
result['pred_cluster'] = pd.Series(bin_pred, index=new_sort.index)

result0 = result[result['pred_cluster'] == 0]
result1 = result[result['pred_cluster'] == 1]


# Export the user ids predicted as cluster 0
result0['user_id'].to_csv('group0.csv')

# Export the user ids predicted as cluster 1
result1['user_id'].to_csv('group1.csv')