In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import xgboost as xgb
%matplotlib inline

In [3]:
# dir = 'C:/Users/SAMSUNG/Desktop/new/빅콘테스트/2019빅콘테스트_챔피언스리그_데이터_수정/train_pledge.csv'

def preprocessing_pledge(pledge_dir, pay_dir):
    """Build one feature row per account from a pledge log CSV and a payment CSV.

    Processing steps, in order:

    1. Drop ``non_combat_play_time`` from the pledge log.
    2. Add two per-row pledge features: ``pledge_rank`` (rank of the row's
       pledge by number of log rows, 1 = most rows, ties share the lowest
       rank) and ``pledge_member_num`` (number of log rows that carry the
       same ``pledge_id``).
    3. Sum the log per ``(acc_id, day)`` and then per ``acc_id``.
    4. Add ``log_in_freq`` (number of distinct days on which the account has
       at least one pledge row) and ``join_pledge_num`` (number of distinct
       ``pledge_id`` values seen for the account).
    5. Left-join the payment log summed per ``acc_id`` (its ``day`` column is
       dropped) and fill every missing value with 0.
    6. Append per-day columns named ``<feature>_<day>`` for days 1..28.

    Parameters
    ----------
    pledge_dir : str
        Path passed to ``pd.read_csv`` for the pledge log. The code reads the
        columns ``acc_id``, ``day``, ``pledge_id`` and ``non_combat_play_time``.
    pay_dir : str
        Path passed to ``pd.read_csv`` for the payment log. The code reads the
        columns ``acc_id`` and ``day``.

    Returns
    -------
    pandas.DataFrame
        One row per account, with ``acc_id`` as a regular column.

    Raises
    ------
    KeyError
        If a required column is missing, or if any day in 1..28 has no rows
        in the pledge log (``get_group`` fails for that day).
    """
    # file load
    tr_pledge = pd.read_csv(pledge_dir)
    pay = pd.read_csv(pay_dir)

    # Remove the column reported as erroneous in the data notice.
    tr_pledge = tr_pledge.drop(columns=['non_combat_play_time'])

    # Rows per pledge drive both pledge features, so count them once.
    pledge_row_counts = tr_pledge['pledge_id'].value_counts()
    pledge_rank = pledge_row_counts.rank(ascending=False, method='min')
    tr_pledge['pledge_rank'] = tr_pledge['pledge_id'].map(pledge_rank)
    tr_pledge['pledge_member_num'] = tr_pledge['pledge_id'].map(pledge_row_counts)

    # Compress the log to one row per account.
    # NOTE(review): sum() is called without numeric_only, so the treatment of
    # any non-numeric column (e.g. `server`, if it holds strings) depends on
    # the installed pandas version - confirm the resulting columns still match
    # what the saved models expect.
    group = tr_pledge.groupby(['acc_id', 'day']).sum().reset_index()
    groups = group.groupby(['acc_id']).sum().reset_index()
    merge_df = groups.copy()

    # log_in_freq: `group` holds one row per (acc_id, day), so the group size
    # is the number of active days. groupby sorts by acc_id.
    log_in_freq = group.groupby('acc_id').size().reset_index(name='log_in_freq')
    merge_df = pd.merge(merge_df, log_in_freq, how='left', on='acc_id')

    # Number of distinct pledges each account appears in.
    join_pledge_num = tr_pledge.groupby('acc_id')['pledge_id'].nunique()
    merge_df['join_pledge_num'] = merge_df['acc_id'].map(join_pledge_num)

    # Attach the payment totals; accounts without payments get 0.
    pay = pay.groupby('acc_id').sum().reset_index().drop('day', axis=1)
    merge_pay = pd.merge(merge_df, pay, on='acc_id', how='left')
    merge_pay = merge_pay.fillna(0)

    # Flatten the daily pledge activity into one column per (feature, day).
    # Per the original author's note, leaving pledge_rank and combat_char_cnt
    # un-flattened gave better model performance.
    not_flattened = ['server', 'char_id', 'pledge_id', 'pledge_rank']
    df = tr_pledge[[col for col in tr_pledge.columns if col not in not_flattened]
                  ].groupby(['day', 'acc_id']).sum().reset_index()
    df_grouped = df.groupby('day')
    p = df_grouped.get_group(1)
    # merge() applies suffixes only to overlapping column names. On odd i the
    # incoming day's columns do not collide with anything (everything already
    # in `p` is suffixed), so they arrive unsuffixed and receive '_i' on the
    # next, even iteration together with day i+1. Every column ends up
    # suffixed only because the last day (28) is even; do not change the
    # range to an odd upper day without reworking this.
    for i in range(2, 29):
        p = pd.merge(p, df_grouped.get_group(i), on='acc_id', how='outer',
                     suffixes=('_'+str(i-1), '_'+str(i)))
    # Drop the per-day `day_*` and `combat_char_cnt_*` columns.
    p = p[[col for col in p.columns
           if ('day' not in col) and ('combat_char_cnt' not in col)]]
    p = p.fillna(0).set_index('acc_id')
    df = p.reset_index()
    merge_flatten_df = pd.merge(merge_pay, df, on = 'acc_id')

    return merge_flatten_df

In [4]:
def log_weight(df, n_days=28):
    """Return a copy of ``df`` whose per-day columns are scaled by ``log1p(day)``.

    A column counts as a per-day column when its name is a string ending in a
    run of ASCII digits whose integer value lies in ``1..n_days`` (for example
    ``play_char_cnt_17``). All other columns are copied unchanged.

    The whole trailing number is parsed. Looking only at the last character,
    as the earlier implementation did, leaves ``*_10`` and ``*_20`` unweighted
    and gives ``*_11``..``*_19`` and ``*_21``..``*_28`` the weight of their
    final digit.

    NOTE(review): any model that was fit on features produced by the earlier
    last-character logic should be re-fit on the corrected features - confirm
    how the saved models were trained.

    Parameters
    ----------
    df : pandas.DataFrame
        Input feature table; it is not modified.
    n_days : int, default 28
        Highest day number that receives a weight.

    Returns
    -------
    pandas.DataFrame
        Weighted copy of ``df``.
    """
    df_weight = df.copy()
    # weight[d - 1] is the multiplier for day d.
    weight = [np.log1p(d) for d in range(1, n_days + 1)]

    for col in df_weight.columns:
        if not isinstance(col, str):
            continue
        stem = col.rstrip('0123456789')
        digits = col[len(stem):]
        if not digits:
            continue
        day = int(digits)
        if 1 <= day <= n_days:
            df_weight[col] = df_weight[col] * weight[day - 1]

    return df_weight

In [46]:
# Directory that holds the competition CSV files. Set the BIGCON_DATA_DIR
# environment variable to point somewhere else instead of editing this cell;
# the default is the location the notebook was originally run from.
DATA_DIR = os.environ.get(
    'BIGCON_DATA_DIR',
    'C:/Users/SAMSUNG/Desktop/new/빅콘테스트/2019빅콘테스트_챔피언스리그_데이터_수정',
)

pledge_dir_1 = DATA_DIR + '/test1_pledge.csv'
pay_dir_1 = DATA_DIR + '/test1_payment.csv'
pledge_dir_2 = DATA_DIR + '/test2_pledge.csv'
pay_dir_2 = DATA_DIR + '/test2_payment.csv'

# Build the per-account feature table for each of the two test sets.
test1_df = preprocessing_pledge(pledge_dir_1, pay_dir_1)
test2_df = preprocessing_pledge(pledge_dir_2, pay_dir_2)

In [47]:
# Apply the per-day log1p weighting (28 days of columns) to both test sets.
test1_df_log, test2_df_log = (
    log_weight(features) for features in (test1_df, test2_df)
)

In [48]:
# Move acc_id from a column into the index of every feature table.
# The names are rebound in place, so running this cell a second time raises
# KeyError because acc_id is no longer a column.
test1_df, test1_df_log, test2_df, test2_df_log = (
    features.set_index('acc_id')
    for features in (test1_df, test1_df_log, test2_df, test2_df_log)
)

In [49]:
# char_id and pledge_id are not needed for the analysis, so drop them.
id_columns = ['char_id', 'pledge_id']
X_test1 = test1_df.drop(columns=id_columns)
X_test1_log = test1_df_log.drop(columns=id_columns)
X_test2 = test2_df.drop(columns=id_columns)
X_test2_log = test2_df_log.drop(columns=id_columns)

# Predictions from the raw (unweighted) features

In [50]:
# 64 categorical
from joblib import dump, load

# joblib.load unpickles the file, so only load model files from a trusted source.
churn_model_path = '0001.joblib'
model_1 = load(churn_model_path)

# Predict on the raw-feature tables of both test sets with the same model.
y_pred_churn_1, y_pred_churn_2 = (
    model_1.predict(features) for features in (X_test1, X_test2)
)

In [51]:
# amount spent
# Create an empty Booster limited to 4 threads, then restore the saved model into it.
booster_params = {'nthread' : 4}
bst_2 = xgb.Booster(booster_params)
bst_2.load_model('0002.model')

# Booster.predict takes a DMatrix, so wrap each raw-feature table first.
dmatrix_test1 = xgb.DMatrix(pd.DataFrame(X_test1))
dmatrix_test2 = xgb.DMatrix(pd.DataFrame(X_test2))
y_pred_spent_1 = bst_2.predict(dmatrix_test1)
y_pred_spent_2 = bst_2.predict(dmatrix_test2)



# Predictions from the log-weighted features

In [52]:
# 64 categorical
from joblib import dump, load

# joblib.load unpickles the file, so only load model files from a trusted source.
churn_log_model_path = '0003.joblib'
model_3 = load(churn_log_model_path)

# Predict on the log-weighted tables of both test sets with the same model.
y_pred_churn_log_1, y_pred_churn_log_2 = (
    model_3.predict(features) for features in (X_test1_log, X_test2_log)
)

In [53]:
# amount spent
# Create an empty Booster limited to 4 threads, then restore the saved model into it.
booster_log_params = {'nthread' : 4}
bst_4 = xgb.Booster(booster_log_params)
bst_4.load_model('0004.model')

# Booster.predict takes a DMatrix, so wrap each log-weighted table first.
dmatrix_test1_log = xgb.DMatrix(pd.DataFrame(X_test1_log))
dmatrix_test2_log = xgb.DMatrix(pd.DataFrame(X_test2_log))
y_pred_spent_log_1 = bst_4.predict(dmatrix_test1_log)
y_pred_spent_log_2 = bst_4.predict(dmatrix_test2_log)



## spent에 음수로 예측한 값 0으로 바꾸기

In [65]:
def find_negative(lst):
    """Overwrite every negative entry of ``lst`` with 0 and return ``lst``.

    The sequence is modified in place; the returned object is the same one
    that was passed in. Entries that do not compare as ``< 0`` are left as
    they are.
    """
    negative_positions = [pos for pos, value in enumerate(lst) if value < 0]
    for pos in negative_positions:
        lst[pos] = 0
    return lst

In [67]:
# Replace negative amount-spent predictions with 0 (find_negative edits each array in place).
y_pred_spent_1, y_pred_spent_2, y_pred_spent_log_1, y_pred_spent_log_2 = [
    find_negative(predictions)
    for predictions in (
        y_pred_spent_1,
        y_pred_spent_2,
        y_pred_spent_log_1,
        y_pred_spent_log_2,
    )
]

# save csv

In [54]:
# Turn the acc_id index back into a regular column so it can be written out
# next to the predictions. The names are rebound in place, so run this cell once.
X_test1, X_test1_log, X_test2, X_test2_log = (
    features.reset_index()
    for features in (X_test1, X_test1_log, X_test2, X_test2_log)
)

In [68]:
def _submission_frame(features, survival_time, amount_spent):
    """Combine the acc_id column of ``features`` with the two prediction arrays."""
    return pd.DataFrame({'acc_id' : features['acc_id'],
                         'survival_time' : survival_time,
                         'amount_spent' : amount_spent})


# One submission table per (test set, feature variant).
test1_raw = _submission_frame(X_test1, y_pred_churn_1, y_pred_spent_1)
test1_log = _submission_frame(X_test1_log, y_pred_churn_log_1, y_pred_spent_log_1)
test2_raw = _submission_frame(X_test2, y_pred_churn_2, y_pred_spent_2)
test2_log = _submission_frame(X_test2_log, y_pred_churn_log_2, y_pred_spent_log_2)

In [71]:
# Save the raw-feature predictions for test set 1; acc_id is a regular column, so the index is not written.
test1_raw_csv = "C:/Users/SAMSUNG/Desktop/test1_predict.csv"
test1_raw.to_csv(test1_raw_csv, index=False)

In [72]:
# Save the raw-feature predictions for test set 2; acc_id is a regular column, so the index is not written.
test2_raw_csv = "C:/Users/SAMSUNG/Desktop/test2_predict.csv"
test2_raw.to_csv(test2_raw_csv, index=False)

In [None]:
# The log-weighted predictions get their own file. Writing them to
# test1_predict.csv would silently overwrite the raw-feature predictions
# saved to that path earlier in the notebook.
# NOTE(review): if the log-weighted run is the one to submit, rename this
# file explicitly - confirm which variant is the final output.
test1_log.to_csv("C:/Users/SAMSUNG/Desktop/test1_predict_log.csv", index = False)

In [None]:
# The log-weighted predictions get their own file. Writing them to
# test2_predict.csv would silently overwrite the raw-feature predictions
# saved to that path earlier in the notebook.
# NOTE(review): if the log-weighted run is the one to submit, rename this
# file explicitly - confirm which variant is the final output.
test2_log.to_csv("C:/Users/SAMSUNG/Desktop/test2_predict_log.csv", index = False)