In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import pandas as pd
import numpy as np
from collections import defaultdict
import os
import gc

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

from sklearn.metrics import f1_score

from tqdm import tqdm

# model
import lightgbm as lgb

SEED=42
LABEL=None

In [0]:
path = '/content/drive/My Drive/bigcontest2019/'
os.chdir(path)

In [0]:
train_activity = pd.read_csv('data/train/train_activity.csv')
train_combat = pd.read_csv('data/train/train_combat.csv')
train_payment = pd.read_csv('data/train/train_payment.csv')
train_pledge = pd.read_csv('data/train/train_pledge.csv')
train_trade = pd.read_csv('data/train/train_trade.csv')

test1_activity = pd.read_csv('data/test/test1_activity.csv')
test1_combat = pd.read_csv('data/test/test1_combat.csv')
test1_payment = pd.read_csv('data/test/test1_payment.csv')
test1_pledge = pd.read_csv('data/test/test1_pledge.csv')
test1_trade = pd.read_csv('data/test/test1_trade.csv')

test2_activity = pd.read_csv(path + 'data/test/test2_activity.csv')
test2_combat = pd.read_csv(path + 'data/test/test2_combat.csv')
test2_payment = pd.read_csv(path + 'data/test/test2_payment.csv')
test2_pledge = pd.read_csv(path + 'data/test/test2_pledge.csv')
test2_trade = pd.read_csv(path + 'data/test/test2_trade.csv')

train_label = pd.read_csv('data/train/train_label.csv')

In [0]:
class data_transform(object):
    def __init__(self, data):
        self.data = data
        
    def create_week(self):
        self.data['week'] = (self.data['day']-1)//7 + 1
        return self.data
    
    def activity_transform(self):
        temp_df = self.data
        groupby_dict = defaultdict()
        
        temp_df = pd.get_dummies(temp_df)
        
        for feature in temp_df.columns:
            if feature == 'acc_id' or feature == 'week':
                pass
            elif feature == 'day' or feature == 'char_id':
                groupby_dict[feature] = 'nunique'
            else:
                groupby_dict[feature] = 'sum'
        else:        
            temp_df = temp_df.groupby(['acc_id', 'week']).agg(groupby_dict).reset_index()
        
        return temp_df
    
    def payment_transform(self):
        output_df = self.data
        groupby_dict = defaultdict()
        
        for feature in output_df.columns:
            if feature == 'acc_id' or feature == 'week':
                pass
            elif feature == 'day':
                groupby_dict[feature] = 'nunique'
            else:
                groupby_dict[feature] = ['sum', 'count', 'max', 'std']
        else:
            output_df = output_df.groupby(['acc_id', 'week']).agg(groupby_dict).reset_index()
            output_df.columns = [i+j for i,j in output_df.columns.ravel()]
        
        return output_df
    
    def trade_transform(self):
        output_df = self.data
        groupby_dict = defaultdict()
        groupby_dict2 = defaultdict()
        
        output_df['time'] = output_df['time'].apply(lambda x: str(x)[:2])
        output_df[['time', 'type', 'server']] = output_df[['time', 'type', 'server']].astype(object)
        output_df = pd.get_dummies(output_df)
        output_df2 = output_df.copy()
        
        output_df = output_df.rename(columns={'source_acc_id':'acc_id'})
        output_df2 = output_df2.rename(columns={'target_acc_id':'acc_id'})
        
        for feature in output_df.columns:
            if feature == 'acc_id' or feature == 'week':
                pass
            elif feature in ['day', 'item_type', 'source_char_id', 'target_char_id', 'target_acc_id']:
                groupby_dict[feature] = 'nunique'
            else:
                groupby_dict[feature] = 'sum'
        else:
            output_df = output_df.groupby(['acc_id', 'week']).agg(groupby_dict).reset_index()
        
        for feature in output_df2.columns:
            if feature == 'acc_id' or feature == 'week':
                pass
            elif feature in ['day', 'item_type', 'source_char_id', 'target_char_id', 'source_acc_id']:
                groupby_dict2[feature] = 'nunique'
            else:
                groupby_dict2[feature] = 'sum'
        else:
            output_df2 = output_df2.groupby(['acc_id', 'week']).agg(groupby_dict2).reset_index()
        
        output_df = pd.merge(output_df, output_df2, how='left', on=['acc_id', 'week'])
        
        return output_df
    
    def combat_transform(self):
        output_df = self.data
        groupby_dict = defaultdict()
        
        output_df[['server', 'class', 'level']] = output_df[['server', 'class', 'level']].astype(str)
        output_df = pd.get_dummies(output_df)
        
        for feature in output_df.columns:
            if feature == 'acc_id' or feature == 'week':
                pass
            elif feature == 'day':
                groupby_dict[feature] = 'nunique'
            elif feature == 'char_id':
                groupby_dict[feature] = ['nunique', 'size']
            else:
                groupby_dict[feature] = ['sum']
        else:
            output_df = output_df.groupby(['acc_id', 'week']).agg(groupby_dict).reset_index()
            output_df.columns = [i+j for i,j in output_df.columns.ravel()]
        
        return output_df
    
    def pledge_transform(self):
        output_df = self.data
        groupby_dict = defaultdict()
        
        output_df[['server']] = output_df[['server']].astype(str)
        output_df = pd.get_dummies(output_df)
        
        for feature in output_df.columns:
            if feature == 'acc_id' or feature == 'week':
                pass
            elif feature in ['day', 'pledge_id']:
                groupby_dict[feature] = 'nunique'
            elif feature == 'char_id':
                groupby_dict[feature] = ['nunique', 'size']
            else:
                groupby_dict[feature] = 'sum'
        else:
            output_df = output_df.groupby(['acc_id', 'week']).agg(groupby_dict).reset_index()
            output_df.columns = [i+j for i,j in output_df.columns.ravel()]
        return output_df
    
    
    
    def under_sampling(self):
        temp_df = self.data
        output_df = pd.DataFrame()
        for week in [1, 2, 3, 4]:
            _, under_sample = train_test_split(temp_df[(temp_df['week']==week) & (temp_df['survival_time']==64)], test_size=0.001, random_state=42, shuffle=True, 
                                               stratify=temp_df.loc[(temp_df['week']==week) & (temp_df['survival_time']==64), 'day'])
            output_df = pd.concat([output_df, under_sample]).reset_index(drop=True)
        
        return output_df
    
    def under_sampling2(self):
        temp_df = self.data
        _, under_sample = train_test_split(temp_df[(temp_df['survival_time']==64)], test_size=0.01, random_state=42, shuffle=True)
#                                            stratify=temp_df.loc[(temp_df['survival_time']==64), 'daynunique'])
        
        return under_sample

# survival time

## train

### acitivity

In [0]:
transform = data_transform(train_activity)
transform.create_week()
train = transform.activity_transform()

### payment

In [0]:
transform = data_transform(train_payment)
transform.create_week()
train = transform.payment_transform()

### trade

In [0]:
trandform = data_transform(train_trade)
trandform.create_week()
train = trandform.trade_transform()

### combat

In [0]:
trandform = data_transform(train_combat)
trandform.create_week()
train = trandform.combat_transform()

### pledge

In [0]:
trandform = data_transform(train_pledge)
trandform.create_week()
train = trandform.pledge_transform()

## test

### activity

In [0]:
transform = data_transform(test1_activity)
transform.create_week()
test1 = transform.activity_transform()

transform = data_transform(test2_activity)
transform.create_week()
test2 = transform.activity_transform()

### payment

In [0]:
transform = data_transform(test1_payment)
transform.create_week()
test1 = transform.payment_transform()

transform = data_transform(test2_payment)
transform.create_week()
test2 = transform.payment_transform()

### trade

In [0]:
trandform = data_transform(test1_trade)
trandform.create_week()
test1 = trandform.combat_transform()

trandform = data_transform(test2_trade)
trandform.create_week()
test2 = trandform.combat_transform()

### combat

In [0]:
trandform = data_transform(test1_combat)
trandform.create_week()
test1 = trandform.combat_transform()

trandform = data_transform(test2_combat)
trandform.create_week()
test2 = trandform.combat_transform()

### pledge

In [0]:
trandform = data_transform(test1_pledge)
trandform.create_week()
test1 = trandform.pledge_transform()

trandform = data_transform(test2_pledge)
trandform.create_week()
test2 = trandform.pledge_transform()

## model

In [0]:
class model(object):
    def __init__(self, train_data, train_label, test1_data, test2_data, kind):
        self.train_data = train_data.fillna(0)
        self.train_label = train_label
        self.test1_data = test1_data.fillna(0)
        self.test2_data = test2_data.fillna(0)
        self.features_ = train_data[list(set(train_data.columns) & set(test1_data.columns) & set(test2_data.columns))].drop(columns=['acc_id', 'week']).columns
        self.kind = kind
        
    def labeling(self):
        self.train_data = pd.merge(self.train_data, self.train_label[['acc_id', 'survival_time']], how='left', on='acc_id')
        self.train_data = self.train_data.dropna()
        for week in range(1, 5):
            self.train_data.loc[train['week']==week, 'survival_time'] = np.minimum(64, self.train_data.loc[self.train_data['week']==week, 'survival_time'] + 7*(4-week))
        
    def under_sampling(self):
        all_week_under_sample = pd.DataFrame()
        for week in range(1, 5):
            if self.kind == 'activity':
                _, under_sample = train_test_split(self.train_data[(self.train_data['week']==week) & (self.train_data['survival_time']==64)], test_size=0.01, random_state=42, shuffle=True, 
                                                   stratify=self.train_data.loc[(self.train_data['week']==week) & (self.train_data['survival_time']==64), 'day'])
            elif self.kind == 'payment':
                _, under_sample = train_test_split(self.train_data[(self.train_data['week']==week) & (self.train_data['survival_time']==64)], test_size=0.01, random_state=42, shuffle=True)
                
            elif self.kind == 'trade':
                _, under_sample = train_test_split(self.train_data[(self.train_data['week']==week) & (self.train_data['survival_time']==64)], test_size=0.01, random_state=42, shuffle=True,
                                                  stratify=self.train_data.loc[(self.train_data['week']==week) & (self.train_data['survival_time']==64), ['day_x', 'day_y']])
            elif self.kind == 'combat':
                _, under_sample = train_test_split(self.train_data[(self.train_data['week']==week) & (self.train_data['survival_time']==64)], test_size=0.01, random_state=42, shuffle=True,
                                                  stratify=self.train_data.loc[(self.train_data['week']==week) & (self.train_data['survival_time']==64), 'daynunique'])
            elif self.kind == 'pledge':
                _, under_sample = train_test_split(self.train_data[(self.train_data['week']==week) & (self.train_data['survival_time']==64)], test_size=0.01, random_state=42, shuffle=True,
                                                  stratify=self.train_data.loc[(self.train_data['week']==week) & (self.train_data['survival_time']==64), 'daynunique'])
                
            all_week_under_sample = pd.concat([all_week_under_sample, under_sample]).reset_index(drop=True)
        else:
            self.train_data = pd.concat([self.train_data[self.train_data['survival_time']!=64], all_week_under_sample]).reset_index(drop=True)
            
    def st_lgb_model(self, params, iteration, seed):
        LABEL='survival_time'
        train_df, valid_df = train_test_split(self.train_data, test_size=0.2, random_state=seed, shuffle=True, stratify=self.train_data[['week', LABEL]])
        
        trn_label = train_df[LABEL] - 1
        val_label = valid_df[LABEL] - 1
        ttt = lgb.Dataset(train_df[self.features_], label=trn_label)
        vvv = lgb.Dataset(valid_df[self.features_], label=val_label)

        lgb_model = lgb.train(params, ttt, iteration, valid_sets = [ttt, vvv], early_stopping_rounds = 50, verbose_eval=100)
        self.st_lgb_model = lgb_model
        
        preds_st = lgb_model.predict(self.test1_data.loc[self.test1_data['week']==4, self.features_].fillna(0))
        preds_st = np.argmax(preds_st, axis=1)+1

        preds_st2 = lgb_model.predict(self.test2_data.loc[self.test2_data['week']==4, self.features_].fillna(0))
        preds_st2 = np.argmax(preds_st2, axis=1)+1
        
        preds_st = pd.concat([self.test1_data.loc[self.test1_data['week']==4, 'acc_id'].reset_index(drop=True), pd.DataFrame(preds_st, columns=['survival_time'])], 1)
        preds_st2 = pd.concat([self.test2_data.loc[self.test2_data['week']==4, 'acc_id'].reset_index(drop=True), pd.DataFrame(preds_st2, columns=['survival_time'])], 1)
    
        return preds_st, preds_st2
    
    def feature_importance(self):
        feature_imp = pd.DataFrame(sorted(zip(self.st_lgb_model.feature_importance(), self.features_)), columns=['Value','Feature'])
        plt.figure(figsize=(20, 10))
        sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value", ascending=False))
        plt.title('LightGBM Features (avg over folds)')
        plt.tight_layout()
        plt.show()
        
        
    def oof(self):
        oof = np.zeros(len(self.train_data))
        
        skf = StratifiedKFold(n_splits=5, random_state=42)
        
        for trn_idx, val_idx in skt.split(self.train_data, self.train_data['LABEL']):
            
        
        return oof

In [15]:
PARAMS = {
    'objective':'multiclass',
    'num_class':64,
    "boosting": "gbdt",
    'learning_rate': 0.03,
    'subsample' : 0.6,
    'sumsample_freq':1,
    'colsample_bytree':0.221856,
    'max_depth': 16,
    'max_bin':255,
    "lambda_l1": 0.25,
    "lambda_l2": 1,
    'min_child_weight': 0.2,
    'min_child_samples': 20,
    'min_gain_to_split':0.02,
    'min_data_in_bin':3,
    'bin_construct_sample_cnt':5000,
    'cat_l2':10,
    'verbose':-1,
    'nthread':-1,
    'seed':SEED
}

# 주의!! model object 마지막 kinds arguments 명시해줄것!!
model_ = model(train, train_label, test1, test2, 'combat')
model_.labeling()
model_.under_sampling()
preds_st, preds_st2 = model_.st_lgb_model(PARAMS, 5000, SEED)

Training until validation scores don't improve for 50 rounds.
[100]	training's multi_logloss: 3.42337	valid_1's multi_logloss: 3.67941
[200]	training's multi_logloss: 3.14848	valid_1's multi_logloss: 3.61143
[300]	training's multi_logloss: 2.95039	valid_1's multi_logloss: 3.58971
[400]	training's multi_logloss: 2.79396	valid_1's multi_logloss: 3.58446
Early stopping, best iteration is:
[403]	training's multi_logloss: 2.78984	valid_1's multi_logloss: 3.58438


In [0]:
model_.feature_importance()

# amount spent

In [0]:
train_label['amount_spent'] = np.round(train_label['amount_spent'], 5)

In [0]:
train_label[train_label['survival_time']!=64]

# submission

In [0]:
test1_pred = pd.concat([test1_week['acc_id'], pd.DataFrame(preds, columns=['survival_time']), pd.DataFrame(preds_spent*25, columns=['amount_spent'])], 1)
test1_pred.to_csv('/content/test1_predict.csv', index=False)

In [0]:
test2_pred = pd.concat([test2_week['acc_id'], pd.DataFrame(preds2, columns=['survival_time']), pd.DataFrame(preds2_spent*10, columns=['amount_spent'])], 1)
test2_pred.to_csv('/content/test2_predict.csv', index=False)

In [0]:
test1_pred = pd.concat([test1[test1['week']==4]['acc_id'].reset_index(drop=True), pd.DataFrame(preds_st, columns=['survival_time']), pd.DataFrame(preds_spent, columns=['amount_spent'])], 1)
test1_pred.to_csv('/content/test1_predict.csv', index=False)

In [0]:
test2_pred = pd.concat([test2[test2['week']==4]['acc_id'].reset_index(drop=True), pd.DataFrame(preds_st2, columns=['survival_time']), pd.DataFrame(preds_spent2, columns=['amount_spent'])], 1)
test2_pred.to_csv('/content/test2_predict.csv', index=False)