In [0]:
# Mount Google Drive at /content/drive (Colab-only); later cells read files
# through paths under 'drive/My Drive/'.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Display the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [0]:
import os
import pandas as pd
import numpy as np
import gc
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
import lightgbm as lgb
from sklearn.ensemble import RandomForestRegressor

In [0]:
#train

import sys
import os
import pandas as pd
import numpy as np
import gc

# Reference to the module object this code is executing in.
mod = sys.modules[__name__]

# data load
# Root folder of the competition CSVs, relative to the working directory.
# NOTE(review): a later cell rebinds `path` and calls os.chdir, so the loading
# cells only work if they run before that cell.
path = 'drive/My Drive/bigcontest2019/data/'

# Data Merge


In [0]:
# function define

def week_transform(row):
    """Map a day number to a week bucket.

    Values strictly between 0 and 8 give 1, between 7 and 15 give 2 and
    between 14 and 22 give 3.  Everything else (including 0, negatives,
    values of 22 and above, and NaN) gives 4.
    """
    # (exclusive lower bound, exclusive upper bound) for weeks 1, 2 and 3;
    # checked in order so the first matching range wins.
    open_ranges = ((0, 8), (7, 15), (14, 22))
    for week, (low, high) in enumerate(open_ranges, start=1):
        if low < row < high:
            return week
    return 4

def merge_by_char_id(df_list):
    '''
    Concatenate several DataFrames side by side (axis=1), aligning on index.

    df_list : sequence whose items are either DataFrames or, for backward
              compatibility, names (str) of module-level DataFrames.

    Returns the single frame unchanged when only one item is given, otherwise
    one frame holding the columns of every input in order.

    Raises IndexError for an empty df_list and NameError for an unknown name.
    '''
    frames = []
    for item in df_list:
        if isinstance(item, str):
            # Legacy call style: look the name up in the module namespace
            # instead of eval()-ing arbitrary text.
            try:
                item = globals()[item]
            except KeyError:
                raise NameError("name '{}' is not defined".format(item))
        frames.append(item)

    if not frames:
        raise IndexError('df_list must contain at least one DataFrame')
    if len(frames) == 1:
        return frames[0]

    # One concat over the whole list instead of re-copying the growing result
    # once per input.
    return pd.concat(frames, axis=1)

def pivoting_df(df):
    """Pivot a long frame into one wide row per key.

    The first column of `df` becomes the index, the second column supplies the
    pivot labels, and every remaining column is pivoted separately.  Missing
    (key, label) combinations are filled with 0 and the output columns are
    named '<value column>_day<label>'.

    Each (first column, second column) pair must be unique, otherwise
    DataFrame.pivot raises ValueError.

    Returns a DataFrame with all pivoted value columns side by side; when `df`
    has no value columns the result is an empty frame indexed by the sorted
    unique keys.
    """
    cols = list(df.columns)
    ind, col = cols[0], cols[1]
    names = cols[2:]

    base = df.reset_index(drop=True)

    if not names:
        return pd.DataFrame(index=pd.Index(base[ind].unique(), name=ind).sort_values())

    # Keep the per-column pivots in a local list; nothing is written to module
    # globals, so notebook variables cannot be overwritten by a column name.
    pivots = []
    for name in names:
        tmp = base.pivot(index=ind, columns=col, values=name).fillna(0)
        tmp.columns = [name + '_day' + str(x) for x in tmp.columns]
        pivots.append(tmp)

    final_df = pd.concat(pivots, axis=1)

    return final_df

def merge_final_df(df,*args):
    '''
    Left-join any number of frames onto a base frame by index.

    df    : base DataFrame (use the one that contains every char_id!)
    *args : variadic; pass as many DataFrames as you like.

    Returns the base frame with the columns of every extra frame attached;
    rows of the base frame are always kept.
    '''
    merged = df
    for other in args:
        merged = merged.merge(other, how='left', left_index=True, right_index=True)
    return merged

In [0]:
#train
# Load the five raw training logs; missing values are replaced with 0.
train_activity = pd.read_csv(path + 'train/train_activity.csv').sort_values(['acc_id','day']).fillna(0)
train_payment = pd.read_csv(path + 'train/train_payment.csv').sort_values(['acc_id','day']).fillna(0)
train_trade = pd.read_csv(path + 'train/train_trade.csv').fillna(0)
train_pledge = pd.read_csv(path + 'train/train_pledge.csv').sort_values(['acc_id','day']).fillna(0)
train_combat = pd.read_csv(path + 'train/train_combat.csv').sort_values(['acc_id','day']).fillna(0)


# Collapse the day number into a week bucket (1-4) before aggregating.
train_activity['day'] = train_activity['day'].apply(week_transform)
train_payment['day'] = train_payment['day'].apply(week_transform)
train_trade['day'] = train_trade['day'].apply(week_transform)
train_pledge['day'] = train_pledge['day'].apply(week_transform)
train_combat['day'] = train_combat['day'].apply(week_transform)

# activity
# Character-level key: '<acc_id>_<char_id>'.
train_activity['custom_acc_id'] = train_activity['acc_id'].astype(str)+'_'+train_activity['char_id'].astype(str)
train_activity['game_money_change'] = np.abs(train_activity['game_money_change'])
train_activity_group = train_activity.groupby(['custom_acc_id','day']).agg({
                                                             'server':'nunique', 
                                                             'playtime':['sum','mean','std'], 
                                                             'npc_kill':['sum','mean','std'], 
                                                             'solo_exp':'sum',
                                                             'party_exp':'sum', 
                                                             'quest_exp':'sum',
                                                             'rich_monster':'sum', 
                                                             'death':'sum', 
                                                             'revive':'sum',
                                                             'exp_recovery':'sum',
                                                             'fishing':'sum',
                                                             'private_shop':'sum',
                                                             'game_money_change':['sum','std'],
                                                             'enchant_count':'sum'}).reset_index(drop=False)
# Flatten the (column, aggregation) MultiIndex into 'activity_<column>_<aggregation>'.
train_activity_group.columns = ['activity_'+'_'.join(x) for x in train_activity_group.columns]
train_activity_char = pivoting_df(train_activity_group)

# create char_id for payment
# The payment log has no char_id, so each account's spend is split across its
# characters in proportion to each character's share of the account's activity rows.
char_rows = train_activity.groupby(['acc_id', 'char_id']).size()
id_label = char_rows.rename('day').reset_index(drop=False)

label = (char_rows / char_rows.groupby(level='acc_id').transform('sum')).rename('weight').reset_index(drop=False)

# Inner join: characters of accounts without payments would only contribute
# rows with a missing day, which the groupby below would drop anyway.
# Accounts that paid but have no activity rows get no payment features.
train_weight = pd.merge(label,train_payment,how='inner',on='acc_id')
# NOTE(review): the factor 100 turns the share into a percentage; confirm the scaling is intended.
train_weight['amount_spent'] *= train_weight['weight']*100
train_weight.drop(['weight'],axis=1,inplace=True)

# payment
# Aggregate the weighted, character-level payments (not the raw account-level log).
train_weight['custom_acc_id'] = train_weight['acc_id'].astype(str)+'_'+train_weight['char_id'].astype(str)
train_payment_group = train_weight.groupby(['custom_acc_id','day']).agg({'amount_spent':'sum'}).reset_index(drop=False)
train_payment_group.columns = ['payment_'+i for i in train_payment_group.columns]

train_payment_char = pivoting_df(train_payment_group)

# trade
## seller table
train_trade['source_custom_acc_id'] = train_trade['source_acc_id'].astype(str)+'_'+train_trade['source_char_id'].astype(str)
train_trade_seller = train_trade.groupby(['source_custom_acc_id','day']).agg({
                                         'type':['nunique','count','sum'], # captures the kinds of trade: nunique == 2 means both kinds occurred / count
                                         'server':'nunique', 
                                         'target_char_id':'nunique',
                                         'item_type':'nunique',
                                         'item_amount':'sum',
                                         'item_price':'sum'}).reset_index(drop=False)
train_trade_seller.columns = ['trade_seller_'+'_'.join(x) for x in train_trade_seller.columns]
# count - sum becomes the "personal" count and sum the "exchange" count
# (assumes `type` is coded 0/1 -- TODO confirm).
train_trade_seller['trade_seller_type_count'] -= train_trade_seller['trade_seller_type_sum']
train_trade_seller = train_trade_seller.rename(columns = {'trade_seller_type_count':'trade_seller_type_personal','trade_seller_type_sum':'trade_seller_type_exchange'})

train_trade_seller_char = pivoting_df(train_trade_seller)

## buyer table
train_trade['target_custom_acc_id'] = train_trade['target_acc_id'].astype(str)+'_'+train_trade['target_char_id'].astype(str)
train_trade_buyer = train_trade.groupby(['target_custom_acc_id','day']).agg({ 
                                         'type':['nunique','count','sum'], # captures the kinds of trade: nunique == 2 means both kinds occurred / count
                                         'server':'nunique', 
                                         'source_char_id':'nunique', # how many counterpart characters were involved
                                         'item_type':'nunique', 
                                         'item_amount':'sum',
                                          'item_price':'sum'}).reset_index(drop=False)

train_trade_buyer.columns = ['trade_buyer_'+'_'.join(x) for x in train_trade_buyer.columns]
train_trade_buyer['trade_buyer_type_count'] -= train_trade_buyer['trade_buyer_type_sum']
train_trade_buyer = train_trade_buyer.rename(columns = {'trade_buyer_type_count':'trade_buyer_type_personal','trade_buyer_type_sum':'trade_buyer_type_exchange'})

train_trade_buyer_char= pivoting_df(train_trade_buyer)

# pledge
train_pledge['custom_acc_id'] = train_pledge['acc_id'].astype(str)+'_'+train_pledge['char_id'].astype(str)
train_pledge_group = train_pledge.groupby(['custom_acc_id','day']).agg({
        'pledge_id' : 'nunique',            # pledge (clan) id
        'play_char_cnt' : 'sum',        # pledge members who logged in
        'combat_char_cnt' : 'sum',      # pledge members who took part in combat
        'pledge_combat_cnt': 'sum',     # total fights between pledges
        'random_attacker_cnt' : 'sum',  # total random (unprovoked) attacks made by pledge members
        'random_defender_cnt': 'sum',   # total times pledge members were hit by random attackers
        'same_pledge_cnt': 'sum',       # total fights between members of the same pledge
        'temp_cnt' : 'sum',             # total one-off fights of pledge members
        'etc_cnt' : 'sum',              # total other fights of pledge members
        'combat_play_time': 'sum',      # total play time of the pledge's combat characters
        'non_combat_play_time' : 'sum' # total play time of the pledge's non-combat characters
    }).reset_index(drop=False)

train_pledge_group.columns = ['pledge_'+x for x in train_pledge_group.columns]
train_pledge_char = pivoting_df(train_pledge_group)

# combat
train_combat['custom_acc_id'] = train_combat['acc_id'].astype(str)+'_'+train_combat['char_id'].astype(str)
train_combat_group = train_combat.groupby(['custom_acc_id','day']).agg(
    {
        'server' : 'nunique',          # character server
        'class' : 'nunique',           # character class
        'pledge_cnt' : 'sum',          # fights between pledges the character joined
        'random_attacker_cnt' : 'sum', # random attacks made by the character
        'random_defender_cnt' : 'sum', # attacks received from random attackers
        'temp_cnt' : 'sum',            # one-off fights
        'same_pledge_cnt' : 'sum',     # fights against members of the same pledge
        'etc_cnt' : 'sum',             # other fights
        'num_opponent' : 'sum'         # number of opponent characters
    }).reset_index(drop=False)
train_combat_group.columns = ['combat_'+x for x in train_combat_group.columns]
train_combat_char = pivoting_df(train_combat_group)

In [0]:
train_payment

Unnamed: 0,day,acc_id,amount_spent
0,1.0,8,1.056123
1,4.0,8,0.348521
2,1.0,8,1.056123
3,4.0,8,0.348521
4,1.0,8,1.056123
5,4.0,8,0.348521
6,1.0,8,1.056123
7,4.0,8,0.348521
8,1.0,8,1.056123
9,4.0,8,0.348521


In [0]:
# Final Merge
# Left-join every per-character feature table onto the activity table by index;
# the activity table is the base, so only characters present in it are kept.
train_char = merge_final_df(train_activity_char ,
                train_payment_char,
                train_trade_seller_char,
                train_trade_buyer_char ,
                train_pledge_char,
                train_combat_char)

In [0]:
np.min(train_payment['amount_spent'])

0.01173470155204053

In [0]:
train_char.shape

(152483, 1568)

In [0]:
#test1
# Load the five raw test1 logs; missing values are replaced with 0.
test1_activity = pd.read_csv(path + 'test/test1_activity.csv').sort_values(['acc_id','day']).fillna(0)
test1_payment = pd.read_csv(path + 'test/test1_payment.csv').sort_values(['acc_id','day']).fillna(0)
test1_trade = pd.read_csv(path + 'test/test1_trade.csv').fillna(0)
test1_pledge = pd.read_csv(path + 'test/test1_pledge.csv').sort_values(['acc_id','day']).fillna(0)
test1_combat = pd.read_csv(path + 'test/test1_combat.csv').sort_values(['acc_id','day']).fillna(0)

# Collapse the day number into a week bucket (1-4), exactly as the train cell
# does, so the pivoted feature columns match between train and test.
test1_activity['day'] = test1_activity['day'].apply(week_transform)
test1_payment['day'] = test1_payment['day'].apply(week_transform)
test1_trade['day'] = test1_trade['day'].apply(week_transform)
test1_pledge['day'] = test1_pledge['day'].apply(week_transform)
test1_combat['day'] = test1_combat['day'].apply(week_transform)

# activity
# Character-level key: '<acc_id>_<char_id>'.
test1_activity['custom_acc_id'] = test1_activity['acc_id'].astype(str)+'_'+test1_activity['char_id'].astype(str)
test1_activity['game_money_change'] = np.abs(test1_activity['game_money_change'])
test1_activity_group = test1_activity.groupby(['custom_acc_id','day']).agg({
                                                             'server':'nunique', 
                                                             'playtime':['sum','mean','std'], 
                                                             'npc_kill':['sum','mean','std'], 
                                                             'solo_exp':'sum',
                                                             'party_exp':'sum', 
                                                             'quest_exp':'sum',
                                                             'rich_monster':'sum', 
                                                             'death':'sum', 
                                                             'revive':'sum',
                                                             'exp_recovery':'sum',
                                                             'fishing':'sum',
                                                             'private_shop':'sum',
                                                             'game_money_change':['sum','std'],
                                                             'enchant_count':'sum'}).reset_index(drop=False)
# Flatten the (column, aggregation) MultiIndex into 'activity_<column>_<aggregation>'.
test1_activity_group.columns = ['activity_'+'_'.join(x) for x in test1_activity_group.columns]
test1_activity_char = pivoting_df(test1_activity_group)

# create char_id for payment
# The payment log has no char_id, so each account's spend is split across its
# characters in proportion to each character's share of the account's activity rows.
char_rows = test1_activity.groupby(['acc_id', 'char_id']).size()
id_label = char_rows.rename('day').reset_index(drop=False)
test1_label = (char_rows / char_rows.groupby(level='acc_id').transform('sum')).rename('weight').reset_index(drop=False)

# Accounts that paid but have no activity rows get no payment features.
test1_weight = pd.merge(test1_label, test1_payment, how='inner', on='acc_id')
# Same percentage scaling (x100) that the train cell applies to its weighted payments.
test1_weight['amount_spent'] *= test1_weight['weight']*100
test1_weight.drop(['weight'],axis=1,inplace=True)

# payment
test1_weight['custom_acc_id'] = test1_weight['acc_id'].astype(str)+'_'+test1_weight['char_id'].astype(str)
test1_payment_group = test1_weight.groupby(['custom_acc_id','day']).agg({'amount_spent':'sum'}).reset_index(drop=False)
test1_payment_group.columns = ['payment_'+i for i in test1_payment_group.columns]

test1_payment_char = pivoting_df(test1_payment_group)

# trade
## seller table
test1_trade['source_custom_acc_id'] = test1_trade['source_acc_id'].astype(str)+'_'+test1_trade['source_char_id'].astype(str)
test1_trade_seller = test1_trade.groupby(['source_custom_acc_id','day']).agg({
                                         'type':['nunique','count','sum'], # captures the kinds of trade: nunique == 2 means both kinds occurred / count
                                         'server':'nunique', 
                                         'target_char_id':'nunique',
                                         'item_type':'nunique',
                                         'item_amount':'sum',
                                         'item_price':'sum'}).reset_index(drop=False)
test1_trade_seller.columns = ['trade_seller_'+'_'.join(x) for x in test1_trade_seller.columns]
# count - sum becomes the "personal" count and sum the "exchange" count
# (assumes `type` is coded 0/1 -- TODO confirm).
test1_trade_seller['trade_seller_type_count'] -= test1_trade_seller['trade_seller_type_sum']
test1_trade_seller = test1_trade_seller.rename(columns = {'trade_seller_type_count':'trade_seller_type_personal','trade_seller_type_sum':'trade_seller_type_exchange'})

test1_trade_seller_char = pivoting_df(test1_trade_seller)

## buyer table
test1_trade['target_custom_acc_id'] = test1_trade['target_acc_id'].astype(str)+'_'+test1_trade['target_char_id'].astype(str)
test1_trade_buyer = test1_trade.groupby(['target_custom_acc_id','day']).agg({ 
                                         'type':['nunique','count','sum'], # captures the kinds of trade: nunique == 2 means both kinds occurred / count
                                         'server':'nunique', 
                                         'source_char_id':'nunique', # how many counterpart characters were involved
                                         'item_type':'nunique', 
                                         'item_amount':'sum',
                                          'item_price':'sum'}).reset_index(drop=False)

test1_trade_buyer.columns = ['trade_buyer_'+'_'.join(x) for x in test1_trade_buyer.columns]
test1_trade_buyer['trade_buyer_type_count'] -= test1_trade_buyer['trade_buyer_type_sum']
test1_trade_buyer = test1_trade_buyer.rename(columns = {'trade_buyer_type_count':'trade_buyer_type_personal','trade_buyer_type_sum':'trade_buyer_type_exchange'})

test1_trade_buyer_char= pivoting_df(test1_trade_buyer)

# pledge
test1_pledge['custom_acc_id'] = test1_pledge['acc_id'].astype(str)+'_'+test1_pledge['char_id'].astype(str)
test1_pledge_group = test1_pledge.groupby(['custom_acc_id','day']).agg({
        'pledge_id' : 'nunique',            # pledge (clan) id
        'play_char_cnt' : 'sum',        # pledge members who logged in
        'combat_char_cnt' : 'sum',      # pledge members who took part in combat
        'pledge_combat_cnt': 'sum',     # total fights between pledges
        'random_attacker_cnt' : 'sum',  # total random (unprovoked) attacks made by pledge members
        'random_defender_cnt': 'sum',   # total times pledge members were hit by random attackers
        'same_pledge_cnt': 'sum',       # total fights between members of the same pledge
        'temp_cnt' : 'sum',             # total one-off fights of pledge members
        'etc_cnt' : 'sum',              # total other fights of pledge members
        'combat_play_time': 'sum',      # total play time of the pledge's combat characters
        'non_combat_play_time' : 'sum' # total play time of the pledge's non-combat characters
    }).reset_index(drop=False)

test1_pledge_group.columns = ['pledge_'+x for x in test1_pledge_group.columns]
test1_pledge_char = pivoting_df(test1_pledge_group)

# combat
test1_combat['custom_acc_id'] = test1_combat['acc_id'].astype(str)+'_'+test1_combat['char_id'].astype(str)
test1_combat_group = test1_combat.groupby(['custom_acc_id','day']).agg(
    {
        'server' : 'nunique',          # character server
        'class' : 'nunique',           # character class
        'pledge_cnt' : 'sum',          # fights between pledges the character joined
        'random_attacker_cnt' : 'sum', # random attacks made by the character
        'random_defender_cnt' : 'sum', # attacks received from random attackers
        'temp_cnt' : 'sum',            # one-off fights
        'same_pledge_cnt' : 'sum',     # fights against members of the same pledge
        'etc_cnt' : 'sum',             # other fights
        'num_opponent' : 'sum'         # number of opponent characters
    }).reset_index(drop=False)

test1_combat_group.columns = ['combat_'+x for x in test1_combat_group.columns]
test1_combat_char = pivoting_df(test1_combat_group)

In [0]:
# Final Merge
# Left-join every per-character test1 feature table onto the activity table by index.
test1_char = merge_final_df(test1_activity_char ,
                test1_payment_char,
                test1_trade_seller_char,
                test1_trade_buyer_char ,
                test1_pledge_char,
                test1_combat_char)

In [0]:
#test2
# Load the five raw test2 logs; missing values are replaced with 0.
test2_activity = pd.read_csv(path + 'test/test2_activity.csv').sort_values(['acc_id','day']).fillna(0)
test2_payment = pd.read_csv(path + 'test/test2_payment.csv').sort_values(['acc_id','day']).fillna(0)
test2_trade = pd.read_csv(path + 'test/test2_trade.csv').fillna(0)
test2_pledge = pd.read_csv(path + 'test/test2_pledge.csv').sort_values(['acc_id','day']).fillna(0)
test2_combat = pd.read_csv(path + 'test/test2_combat.csv').sort_values(['acc_id','day']).fillna(0)

# Collapse the day number into a week bucket (1-4), exactly as the train cell
# does, so the pivoted feature columns match between train and test.
test2_activity['day'] = test2_activity['day'].apply(week_transform)
test2_payment['day'] = test2_payment['day'].apply(week_transform)
test2_trade['day'] = test2_trade['day'].apply(week_transform)
test2_pledge['day'] = test2_pledge['day'].apply(week_transform)
test2_combat['day'] = test2_combat['day'].apply(week_transform)

# activity
# Character-level key: '<acc_id>_<char_id>'.
test2_activity['custom_acc_id'] = test2_activity['acc_id'].astype(str)+'_'+test2_activity['char_id'].astype(str)
test2_activity['game_money_change'] = np.abs(test2_activity['game_money_change'])
test2_activity_group = test2_activity.groupby(['custom_acc_id','day']).agg({
                                                             'server':'nunique', 
                                                             'playtime':['sum','mean','std'], 
                                                             'npc_kill':['sum','mean','std'], 
                                                             'solo_exp':'sum',
                                                             'party_exp':'sum', 
                                                             'quest_exp':'sum',
                                                             'rich_monster':'sum', 
                                                             'death':'sum', 
                                                             'revive':'sum',
                                                             'exp_recovery':'sum',
                                                             'fishing':'sum',
                                                             'private_shop':'sum',
                                                             'game_money_change':['sum','std'],
                                                             'enchant_count':'sum'}).reset_index(drop=False)
# Flatten the (column, aggregation) MultiIndex into 'activity_<column>_<aggregation>'.
test2_activity_group.columns = ['activity_'+'_'.join(x) for x in test2_activity_group.columns]
test2_activity_char = pivoting_df(test2_activity_group)

# create char_id for payment
# The payment log has no char_id, so each account's spend is split across its
# characters in proportion to each character's share of the account's activity rows.
char_rows = test2_activity.groupby(['acc_id', 'char_id']).size()
id_label = char_rows.rename('day').reset_index(drop=False)
test2_label = (char_rows / char_rows.groupby(level='acc_id').transform('sum')).rename('weight').reset_index(drop=False)

# Accounts that paid but have no activity rows get no payment features.
test2_weight = pd.merge(test2_label, test2_payment, how='inner', on='acc_id')
# Same percentage scaling (x100) that the train cell applies to its weighted payments.
test2_weight['amount_spent'] *= test2_weight['weight']*100
test2_weight.drop(['weight'],axis=1,inplace=True)

# payment
test2_weight['custom_acc_id'] = test2_weight['acc_id'].astype(str)+'_'+test2_weight['char_id'].astype(str)
test2_payment_group = test2_weight.groupby(['custom_acc_id','day']).agg({'amount_spent':'sum'}).reset_index(drop=False)
test2_payment_group.columns = ['payment_'+i for i in test2_payment_group.columns]

test2_payment_char = pivoting_df(test2_payment_group)

# trade
## seller table
test2_trade['source_custom_acc_id'] = test2_trade['source_acc_id'].astype(str)+'_'+test2_trade['source_char_id'].astype(str)
test2_trade_seller = test2_trade.groupby(['source_custom_acc_id','day']).agg({
                                         'type':['nunique','count','sum'], # captures the kinds of trade: nunique == 2 means both kinds occurred / count
                                         'server':'nunique', 
                                         'target_char_id':'nunique',
                                         'item_type':'nunique',
                                         'item_amount':'sum',
                                         'item_price':'sum'}).reset_index(drop=False)
test2_trade_seller.columns = ['trade_seller_'+'_'.join(x) for x in test2_trade_seller.columns]
# count - sum becomes the "personal" count and sum the "exchange" count
# (assumes `type` is coded 0/1 -- TODO confirm).
test2_trade_seller['trade_seller_type_count'] -= test2_trade_seller['trade_seller_type_sum']
test2_trade_seller = test2_trade_seller.rename(columns = {'trade_seller_type_count':'trade_seller_type_personal','trade_seller_type_sum':'trade_seller_type_exchange'})

test2_trade_seller_char = pivoting_df(test2_trade_seller)

## buyer table
test2_trade['target_custom_acc_id'] = test2_trade['target_acc_id'].astype(str)+'_'+test2_trade['target_char_id'].astype(str)
test2_trade_buyer = test2_trade.groupby(['target_custom_acc_id','day']).agg({ 
                                         'type':['nunique','count','sum'], # captures the kinds of trade: nunique == 2 means both kinds occurred / count
                                         'server':'nunique', 
                                         'source_char_id':'nunique', # how many counterpart characters were involved
                                         'item_type':'nunique', 
                                         'item_amount':'sum',
                                          'item_price':'sum'}).reset_index(drop=False)

test2_trade_buyer.columns = ['trade_buyer_'+'_'.join(x) for x in test2_trade_buyer.columns]
test2_trade_buyer['trade_buyer_type_count'] -= test2_trade_buyer['trade_buyer_type_sum']
test2_trade_buyer = test2_trade_buyer.rename(columns = {'trade_buyer_type_count':'trade_buyer_type_personal','trade_buyer_type_sum':'trade_buyer_type_exchange'})

test2_trade_buyer_char= pivoting_df(test2_trade_buyer)

# pledge
test2_pledge['custom_acc_id'] = test2_pledge['acc_id'].astype(str)+'_'+test2_pledge['char_id'].astype(str)
test2_pledge_group = test2_pledge.groupby(['custom_acc_id','day']).agg({
        'pledge_id' : 'nunique',            # pledge (clan) id
        'play_char_cnt' : 'sum',        # pledge members who logged in
        'combat_char_cnt' : 'sum',      # pledge members who took part in combat
        'pledge_combat_cnt': 'sum',     # total fights between pledges
        'random_attacker_cnt' : 'sum',  # total random (unprovoked) attacks made by pledge members
        'random_defender_cnt': 'sum',   # total times pledge members were hit by random attackers
        'same_pledge_cnt': 'sum',       # total fights between members of the same pledge
        'temp_cnt' : 'sum',             # total one-off fights of pledge members
        'etc_cnt' : 'sum',              # total other fights of pledge members
        'combat_play_time': 'sum',      # total play time of the pledge's combat characters
        'non_combat_play_time' : 'sum' # total play time of the pledge's non-combat characters
    }).reset_index(drop=False)

test2_pledge_group.columns = ['pledge_'+x for x in test2_pledge_group.columns]
test2_pledge_char = pivoting_df(test2_pledge_group)

# combat
test2_combat['custom_acc_id'] = test2_combat['acc_id'].astype(str)+'_'+test2_combat['char_id'].astype(str)
test2_combat_group = test2_combat.groupby(['custom_acc_id','day']).agg(
    {
        'server' : 'nunique',          # character server
        'class' : 'nunique',           # character class
        'pledge_cnt' : 'sum',          # fights between pledges the character joined
        'random_attacker_cnt' : 'sum', # random attacks made by the character
        'random_defender_cnt' : 'sum', # attacks received from random attackers
        'temp_cnt' : 'sum',            # one-off fights
        'same_pledge_cnt' : 'sum',     # fights against members of the same pledge
        'etc_cnt' : 'sum',             # other fights
        'num_opponent' : 'sum'         # number of opponent characters
    }).reset_index(drop=False)

test2_combat_group.columns = ['combat_'+x for x in test2_combat_group.columns]
test2_combat_char = pivoting_df(test2_combat_group)

# Final Merge
# Left-join every per-character test2 feature table onto the activity table by index.
test2_char = merge_final_df(test2_activity_char ,
                test2_payment_char,
                test2_trade_seller_char,
                test2_trade_buyer_char ,
                test2_pledge_char,
                test2_combat_char)

In [0]:
# Load the character-level training labels.
# NOTE(review): the model cell pairs these rows with train_char by position, so
# the row order presumably has to match train_char -- confirm.
train_label = pd.read_csv('drive/My Drive/bigcontest2019/scripts/model/BJW/Final/train_label_char.csv').reset_index(drop=True)

In [0]:
train_char.shape
test1_char.shape
test2_char.shape

train_label.shape

(152483, 1568)

(75303, 1568)

(85813, 1568)

(152483, 4)

In [0]:
 def reduce_mem_usage(df, verbose=True, use_float16=True):
     """Downcast numeric columns of `df` to the narrowest dtype that holds their range.

     Parameters
     ----------
     df : DataFrame; its columns are replaced in place and the same object is returned.
     verbose : when True, print the final memory footprint and the reduction.
     use_float16 : when True (default, the historical behaviour) float columns
         may be stored as float16.  float16 keeps only about three significant
         decimal digits, so pass False for columns whose precision matters
         (for example regression targets).

     Only columns whose dtype is int16/int32/int64/float16/float32/float64 are
     considered.  The range checks are inclusive, so a column whose maximum is
     exactly 127 is stored as int8.  Columns whose min/max is NaN are left
     unchanged (ints) or stored as float64 (floats).
     """
     numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
     int_candidates = (np.int8, np.int16, np.int32, np.int64)
     float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)

     start_mem = df.memory_usage().sum() / 1024**2
     for col in df.columns:
         col_type = df[col].dtypes
         if col_type not in numerics:
             continue
         c_min = df[col].min()
         c_max = df[col].max()
         if str(col_type)[:3] == 'int':
             # First (narrowest) integer type whose range contains [c_min, c_max].
             for candidate in int_candidates:
                 bounds = np.iinfo(candidate)
                 if bounds.min <= c_min and c_max <= bounds.max:
                     df[col] = df[col].astype(candidate)
                     break
         else:
             for candidate in float_candidates:
                 bounds = np.finfo(candidate)
                 if bounds.min <= c_min and c_max <= bounds.max:
                     df[col] = df[col].astype(candidate)
                     break
             else:
                 # Out of float32 range, infinite, or NaN min/max.
                 df[col] = df[col].astype(np.float64)
     end_mem = df.memory_usage().sum() / 1024**2
     if verbose:
         # Guard against a zero starting footprint.
         reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
         print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
     return df

In [0]:
# Downcast numeric columns of the feature frames and the label frame to save memory.
# NOTE(review): train_label goes through the same downcasting, so its float
# columns may end up as float16 -- confirm that precision is acceptable for targets.
train_char = reduce_mem_usage(train_char)
test1_char = reduce_mem_usage(test1_char)
test2_char = reduce_mem_usage(test2_char)
train_label = reduce_mem_usage(train_label)

Mem. usage decreased to 462.20 Mb (74.7% reduction)
Mem. usage decreased to 228.28 Mb (74.7% reduction)
Mem. usage decreased to 259.80 Mb (74.8% reduction)
Mem. usage decreased to  1.60 Mb (65.6% reduction)


# Model

In [0]:
FOLDS=5;SEED=42


def lgb_model(train, test, label, folds=FOLDS, seed=SEED):
    """Train a K-fold LightGBM regressor for `amount_spent`.

    Parameters
    ----------
    train : feature DataFrame; row i must correspond to row i of `label`.
    test  : feature DataFrame with the same columns as `train`.
    label : DataFrame holding an 'amount_spent' column.
    folds : number of cross-validation folds.
    seed  : seed used to shuffle rows into folds.

    Returns
    -------
    (oof_spent, pred_spent, oof_time, pred_time)
        oof_spent  : out-of-fold prediction for every train row.
        pred_spent : test prediction averaged over the fold models.
        oof_time, pred_time : all zeros; the survival_time model is currently
        disabled and the arrays are returned only to keep the signature stable.

    Rows are addressed by position, so the inputs are not modified and keep
    their original index.
    """
    # shuffle=True is required for random_state to have any effect (recent
    # scikit-learn versions raise when random_state is set without it).
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

    # The malformed "num_iterations " key (trailing space) and the
    # 'min_child_samples' alias that conflicted with 'min_data_in_leaf' were
    # removed; the number of rounds is set by the lgb.train call below.
    params = {
            'objective':'regression',
            "boosting": "gbdt",
            'num_leaves': 10000,
            'max_depth': 8,
            'learning_rate': 0.1,
            'min_data_in_leaf': 32,
            'min_child_weight': 0.5,
            'min_split_gain': 0.005,
            "feature_fraction": 0.9,
            "bagging_fraction": 0.9 ,
            "bagging_freq": 2,
            "bagging_seed": 42,
            "metric": 'rmse',
            "lambda_l1": 0.1,
            "lambda_l2": 0.1
        }

    oof_time = np.zeros(len(train))
    oof_spent = np.zeros(len(train))
    pred_time = np.zeros(len(test))
    pred_spent = np.zeros(len(test))

    target = label['amount_spent']

    for trn_idx, val_idx in kf.split(train):
        trn_x, val_x = train.iloc[trn_idx], train.iloc[val_idx]
        train_df = lgb.Dataset(trn_x, label=target.iloc[trn_idx])
        valid_df = lgb.Dataset(val_x, label=target.iloc[val_idx])

        # NOTE(review): early_stopping_rounds / verbose_eval are keyword
        # arguments of older LightGBM releases; newer releases expect callbacks.
        booster = lgb.train(params, train_df, 5000, valid_sets = [train_df, valid_df], early_stopping_rounds = 500, verbose_eval=5000)
        # Each train row is predicted by exactly one fold model, so the
        # out-of-fold value is stored as-is; only test predictions are averaged.
        oof_spent[val_idx] = booster.predict(val_x)
        pred_spent += booster.predict(test)/folds

    return  oof_spent, pred_spent, oof_time,  pred_time

In [0]:
# Fit the fold models on the train features and predict for test1.
oof_spent_1, pred_spent_1, oof_time_1,  pred_time_1 = lgb_model(train_char, test1_char, train_label, folds=FOLDS, seed=SEED)

In [0]:
# lgb_model returns NumPy arrays, which have no .describe(); wrap them in a Series.
pd.Series(oof_spent_1).describe()
pd.Series(oof_time_1).describe()

In [0]:
# Fit the fold models on the train features and predict for test2.
# errors='ignore' keeps the call working whether or not test2_char carries a
# 'char_id' column (the feature cell shown does not create one).
oof_spent_2, pred_spent_2, oof_time_2,  pred_time_2 = lgb_model(train_char, test2_char.drop(columns='char_id', errors='ignore'), train_label, folds=FOLDS, seed=SEED)

In [0]:
# Switch the working directory to the project's scripts folder so that the
# local `model` package can be imported.
# NOTE(review): this rebinds `path` (used by the data-loading cells) and changes
# the working directory, so the relative 'drive/My Drive/...' paths in earlier
# cells will no longer resolve if those cells are re-run afterwards.
path = 'drive/My Drive/bigcontest2019/scripts'
os.chdir(path)
from model.metrics import score_function

In [0]:
# Out-of-fold prediction frame in the same layout as train_label, for scoring.
# NOTE(review): assumes score_function expects the label frame's columns with
# predicted 'survival_time' and 'amount_spent' -- confirm against model.metrics.
# oof_time_1 is all zeros while the survival_time model is disabled.
ensembel_oof = train_label.copy()
ensembel_oof['survival_time'] = oof_time_1
ensembel_oof['amount_spent'] = oof_spent_1

In [0]:
# Score the true labels against themselves (presumably a sanity check or the
# best attainable score -- confirm).
score_function.score_function(train_label, train_label, path=False)

In [0]:
# Score the out-of-fold predictions against the true labels.
# `ensembel_oof` must have been defined by an earlier cell before this runs.
score_function.score_function(ensembel_oof, train_label, path=False)