In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import pandas as pd
import numpy as np
import os
import gc

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

from sklearn.metrics import f1_score

from tqdm import tqdm

# model
import lightgbm as lgb

SEED=42
LABEL=None

In [0]:
path = '/content/drive/My Drive/bigcontest2019/'
os.chdir(path)

In [0]:
train_activity = pd.read_csv('data/train/train_activity.csv')
# train_combat = pd.read_csv('data/train/train_combat.csv')
train_payment = pd.read_csv('data/train/train_payment.csv')
# train_pledge = pd.read_csv('data/train/train_pledge.csv')
# train_trade = pd.read_csv('data/train/train_trade.csv')

test1_activity = pd.read_csv('data/test/test1_activity.csv')
# test1_combat = pd.read_csv('data/test/test1_combat.csv')
test1_payment = pd.read_csv('data/test/test1_payment.csv')
# test1_pledge = pd.read_csv('data/test/test1_pledge.csv')
# test1_trade = pd.read_csv('data/test/test1_trade.csv')

test2_activity = pd.read_csv(path + 'data/test/test2_activity.csv')
# test2_combat = pd.read_csv(path + 'data/test/test2_combat.csv')
test2_payment = pd.read_csv(path + 'data/test/test2_payment.csv')
# test2_pledge = pd.read_csv(path + 'data/test/test2_pledge.csv')
# test2_trade = pd.read_csv(path + 'data/test/test2_trade.csv')

train_label = pd.read_csv('data/train/train_label.csv')

In [0]:
class data_transform(object):
    def __init__(self, data):
        self.data = data
        
    def create_week(self):
        self.data['week'] = (self.data['day']-1)//7 + 1
        
        return self.data
    
#     def ():
#         return 
    
    def groupby_week(self):
        temp_df = self.data
        
        activity_agg = {'day':'nunique', 
               #'char_id':'nunique',
               #'server':'nunique', 
               'playtime':'sum', 
               'npc_kill':'sum', 
               'solo_exp':'sum',
               'party_exp':'sum', 
               'quest_exp':'sum',
               'rich_monster':'sum', 
               'death':'sum', 
               'revive':'sum',
               'exp_recovery':'sum',
               'fishing':'sum',
               'private_shop':'sum',
               'game_money_change':'sum',
               'enchant_count':'sum'}
        
        temp_df = temp_df.groupby(['acc_id', 'week']).agg(activity_agg).reset_index()
        
        return temp_df
    
    #survival time용
    def under_sampling(self):
        temp_df = self.data
        output_df = pd.DataFrame()
        for week in [1, 2, 3, 4]:
            _, under_sample = train_test_split(temp_df[(temp_df['week']==week) & (temp_df['survival_time']==64)], test_size=0.01, random_state=42, shuffle=True, 
                                               stratify=temp_df.loc[(temp_df['week']==week) & (temp_df['survival_time']==64), 'day'])
            output_df = pd.concat([output_df, under_sample]).reset_index(drop=True)
        
        return output_df
    
    # amount_time 용
    def under_sampling2(self):
        temp_df = self.data
        _, under_sample = train_test_split(temp_df[(temp_df['survival_time']==64)], test_size=0.01, random_state=42, shuffle=True)
#                                            stratify=temp_df.loc[(temp_df['survival_time']==64), 'daynunique'])
        
        return under_sample

In [0]:
|

In [0]:
def feature_extraction(data, feature, method, week):
    train = pd.DataFrame(np.unique(data['acc_id']), columns=['acc_id'])
    
    standard_feature = 'acc_id'
    if method=='nunique':
        data = data.groupby(standard_feature)[feature].nunique()
    elif method=='sum':
        data = data.groupby(standard_feature)[feature].sum()
    elif method=='min':
        data = data.groupby(standard_feature)[feature].min()
    elif method=='max':
        data = data.groupby(standard_feature)[feature].max()
    elif method=='mean':
        data = data.groupby(standard_feature)[feature].mean()
    elif method=='median':
        data = data.groupby(standard_feature)[feature].median()
    elif method=='skew':
        data = data.groupby(standard_feature)[feature].skew()
    elif method=='kurt':
        data = data.groupby(standard_feature)[feature].apply(pd.DataFrame.kurt)
    elif method=='count':
        data = data.groupby(standard_feature)[feature].count()
    elif method=='std':
        data = data.groupby(standard_feature)[feature].std()
    else:
        return print('method is no return')
    
    data = data.reset_index().rename(columns={feature:feature+'_'+method})
    train = pd.merge(train, data, how='left', on='acc_id')
    
    return train

# train

## def function

In [0]:
def activity_merge(train_week, data, week=1, on='acc_id'):
    WEEK=week
    ON=on
    for method in ['nunique', 'count']:
        train_week = pd.merge(train_week, activity_proprocess(DATA, feature='server', method=method, week=WEEK), how='left', on=ON)

    for feature in ['playtime', 'npc_kill', 'solo_exp', 'party_exp', 'quest_exp', 'rich_monster', 'death', 'revive', 'exp_recovery', 'fishing', 'private_shop', 'game_money_change', 'enchant_count']:
        for method in ['nunique', 'max', 'min', 'mean', 'median', 'skew', 'kurt', 'sum', 'std']:
            train_week = pd.merge(train_week, activity_proprocess(DATA, feature=feature, method=method, week=WEEK), how='left', on=ON)
    
    return train_week

In [0]:
def payment_merge(train_week, data, week=1, on='acc_id'):
    WEEK=week
    ON=on
    feature = 'amount_spent'
    
    for method in ['nunique', 'max', 'min', 'mean', 'median', 'skew', 'kurt', 'sum', 'std']:
        train_week = pd.merge(train_week, activity_proprocess(DATA, feature=feature, method=method, week=WEEK), how='left', on=ON)
    
    return train_week

## activity

In [0]:
transform = data_transform(train_activity)
transform.create_week()
train = transform.groupby_week()

## Payment

In [0]:
train2 = train_payment.groupby('acc_id').agg({'amount_spent':['sum', 'max', 'mean'], 'day':'nunique'}).reset_index()
train2.columns = [i+j for i,j in train2.columns.ravel()]
train2 = pd.merge(train2, train_label, how='left', on='acc_id').fillna(0)

In [0]:
# under sampling
transform = data_transform(train2)
under_train = transform.under_sampling2()

train2 = pd.concat([under_train, train2[train2['survival_time']!=64]]).reset_index(drop=True)

In [0]:
np.round(train_label['amount_spent'], 2).value_counts()

0.00     17494
0.03      2027
0.04      1979
0.02      1812
0.05      1369
0.01      1333
0.06       936
0.07       870
0.08       773
0.09       683
0.10       657
0.11       601
0.12       572
0.14       494
0.13       472
0.16       416
0.15       396
0.18       377
0.17       365
0.21       316
0.19       307
0.20       301
0.22       291
0.23       275
0.25       275
0.24       237
0.26       232
0.29       223
0.27       211
0.28       206
         ...  
2.53         1
1.54         1
2.09         1
3.25         1
31.25        1
25.44        1
3.90         1
2.57         1
3.67         1
2.47         1
3.94         1
1.81         1
12.53        1
1.65         1
4.02         1
6.30         1
2.21         1
8.71         1
4.77         1
6.97         1
5.41         1
4.63         1
6.33         1
10.58        1
1.50         1
3.00         1
2.50         1
9.50         1
2.89         1
4.39         1
Name: amount_spent, Length: 348, dtype: int64

In [0]:
train_label.head(10)

Unnamed: 0,acc_id,survival_time,amount_spent
0,27835,64,0.002559
1,12351,64,0.120154
2,125437,55,0.182593
3,104483,64,0.016241
4,4704,20,0.226396
5,3277,50,0.058863
6,118874,64,0.030777
7,7678,5,0.0
8,124997,46,0.009187
9,116489,23,0.0


try
1. under sampling
2. weight
3. total amount spent

weaknees
1. data loss
2. weak
3. unknown

# test

## activity

In [0]:
transform = data_transform(test1_activity)
transform.create_week()
test1 = transform.groupby_week()

## payment

In [0]:
test12 = test1_payment.groupby('acc_id').agg({'amount_spent':['sum', 'max', 'mean'], 'day':'nunique'}).reset_index()
test12.columns = [i+j for i,j in test12.columns.ravel()]

# model

## survival_time

In [0]:
train = pd.merge(train, train_label[['acc_id', 'survival_time']], how='left', on='acc_id')

for week in range(1, 5):
    train.loc[train['week']==week, 'survival_time'] = np.minimum(64, train.loc[train['week']==week, 'survival_time'] + 7*(4-week))

In [0]:
# under sampling
transform = data_transform(train)
under_train = transform.under_sampling()

train = pd.concat([under_train, train[train['survival_time']!=64]]).reset_index(drop=True)

In [0]:
seed=42
LABEL='survival_time'
train_df, valid_df = train_test_split(train.fillna(0), test_size=0.2, random_state=seed, shuffle=True, stratify=train[LABEL])

In [0]:
feature_ = train.drop(columns=['acc_id', 'week', LABEL]).columns
params = {
    'objective':'multiclass',
    'num_class':64,
    "boosting": "gbdt",
    'learning_rate': 0.03,
    'subsample' : 0.6,
    'sumsample_freq':1,
    'colsample_bytree':0.221856,
    'max_depth': 16,
    'max_bin':255,
    "lambda_l1": 0.25,
    "lambda_l2": 1,
    'min_child_weight': 0.2,
    'min_child_samples': 20,
    'min_gain_to_split':0.02,
    'min_data_in_bin':3,
    'bin_construct_sample_cnt':5000,
    'cat_l2':10,
    'verbose':-1,
    'nthread':-1,
    'seed':seed
}

trn_label = train_df[LABEL] - 1
val_label = valid_df[LABEL] - 1

ttt = lgb.Dataset(train_df[feature_], label=trn_label)
vvv = lgb.Dataset(valid_df[feature_], label=val_label)

lgb_model = lgb.train(params, ttt, 5000, valid_sets = [ttt, vvv], early_stopping_rounds = 50, verbose_eval=100)

preds_st = lgb_model.predict(test1[test1['week']==4][feature_].fillna(0))
preds_st = np.argmax(preds_st, axis=1)+1

# preds_st2 = lgb_model.predict(test2_week[feature_].fillna(0))
# preds_st2 = np.argmax(preds_st2, axis=1)+1

Training until validation scores don't improve for 50 rounds.
[100]	training's multi_logloss: 3.396	valid_1's multi_logloss: 3.67305
[200]	training's multi_logloss: 3.10243	valid_1's multi_logloss: 3.58588
[300]	training's multi_logloss: 2.89313	valid_1's multi_logloss: 3.55213
[400]	training's multi_logloss: 2.72491	valid_1's multi_logloss: 3.53701
[500]	training's multi_logloss: 2.58186	valid_1's multi_logloss: 3.53209
Early stopping, best iteration is:
[525]	training's multi_logloss: 2.54919	valid_1's multi_logloss: 3.53167


## amount spent

In [0]:
seed=42
LABEL='amount_spent'
train_df, valid_df = train_test_split(train2.fillna(0), test_size=0.2, random_state=seed, shuffle=True)

In [0]:
feature_ = train_week.drop(columns=['acc_id', 'survival_time', LABEL]).columns
params = {
    'objective':'regression',
    "boosting": "gbdt",
    'learning_rate': 0.03,
    'subsample' : 0.6,
    'sumsample_freq':1,
    'colsample_bytree':0.221856,
    'max_depth': 16,
    'max_bin':255,
    "lambda_l1": 0.25,
    "lambda_l2": 1,
    'min_child_weight': 0.2,
    'min_child_samples': 20,
    'min_gain_to_split':0.02,
    'min_data_in_bin':3,
    'bin_construct_sample_cnt':5000,
    'cat_l2':10,
    'verbose':-1,
    'nthread':-1,
    'metrics':'mse',
    'seed':seed
}

trn_label = train_df[LABEL]
val_label = valid_df[LABEL]

ttt = lgb.Dataset(train_df[feature_], label=trn_label)
vvv = lgb.Dataset(valid_df[feature_], label=val_label)

lgb_model = lgb.train(params, ttt, 5000, valid_sets = [ttt, vvv], early_stopping_rounds = 50, verbose_eval=100)
preds_spent = lgb_model.predict(test1_week[feature_].fillna(0))
preds2_spent = lgb_model.predict(test2_week[feature_].fillna(0))

Training until validation scores don't improve for 50 rounds.
[100]	training's l2: 0.33375	valid_1's l2: 0.385441
Early stopping, best iteration is:
[97]	training's l2: 0.334721	valid_1's l2: 0.385333


# submission

In [0]:
test1_pred = pd.concat([test1_week['acc_id'], pd.DataFrame(preds, columns=['survival_time']), pd.DataFrame(preds_spent*25, columns=['amount_spent'])], 1)
test1_pred.to_csv('/content/test1_predict.csv', index=False)

In [0]:
test2_pred = pd.concat([test2_week['acc_id'], pd.DataFrame(preds2, columns=['survival_time']), pd.DataFrame(preds2_spent*10, columns=['amount_spent'])], 1)
test2_pred.to_csv('/content/test2_predict.csv', index=False)