# Basic Settings

In [0]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import os
import pandas as pd
import numpy as np
import gc
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
import lightgbm as lgb
from sklearn.ensemble import RandomForestRegressor

In [0]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [0]:
path = 'drive/My Drive/bigcontest2019/scripts/model/BJW'
# print(os.listdir(path + 'train'),'\n', 
#       os.listdir(path + 'test'),'\n', 
#     #   os.listdir(path + '../metrics'),'\n', 
#       os.listdir(path + '../scripts'))

# Load Data

In [0]:
train = pd.read_csv(path+'/train.csv').reset_index(drop=True).sort_values('acc_id')
test1 = pd.read_csv(path+'/test1.csv').reset_index(drop=True).sort_values('acc_id')
test2 = pd.read_csv(path+'/test2.csv').reset_index(drop=True).sort_values('acc_id')

In [0]:
t1 = pd.DataFrame{'acc_id':test1['acc_id'],}

In [0]:
train.shape
test1.shape
test2.shape

(40000, 70)

(20000, 68)

(20000, 68)

# Model

In [0]:
FOLDS = 5
SEED = 42

In [0]:
# model
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge

import xgboost as xgb
import warnings

class model(object):
    def __init__(self, train, test, folds=FOLDS, seed=SEED, log=True):
        self.train = train
        self.test = test
        self.kf = KFold(n_splits=folds, random_state=seed)
        self.skf = StratifiedKFold(n_splits=folds,random_state=seed)
        self.log = log
    
    def ridge_model(self):
        oof_spent = np.zeros(len(self.train))
        pred_spent = np.zeros(len(self.test))

        oof_time = np.zeros(len(self.train))
        pred_time = np.zeros(len(self.test))
        
        for trn_idx, val_idx in self.kf.split(self.train):
            train_df = self.train.loc[trn_idx]
            valid_df = self.train.loc[val_idx].drop(['amount_spent','survival_time'], axis=1)
            if self.log:
                spent_ridge_model = Ridge().fit(train_df.drop(['amount_spent','survival_time'], axis=1), np.log1p(train_df['amount_spent']))
                time_ridge_model = Ridge().fit(train_df.drop(['amount_spent','survival_time'], axis=1), np.log1p(train_df['survival_time']))
            else:
                spent_ridge_model = Ridge().fit(train_df.drop(['amount_spent','survival_time'], axis=1), train_df['amount_spent'])
                time_ridge_model = Ridge().fit(train_df.drop(['amount_spent','survival_time'], axis=1), train_df['survival_time'])
                
            oof_spent[val_idx] = spent_ridge_model.predict(valid_df)
            oof_time[val_idx] = time_ridge_model.predict(valid_df)

            pred_spent += spent_ridge_model.predict(self.test)/self.kf.n_splits
            pred_time += time_ridge_model.predict(self.test)/self.kf.n_splits

        return oof_time, oof_spent, pred_time, pred_spent
    
    def rf_model(self):
        oof_spent = np.zeros(len(self.train))
        pred_spent = np.zeros(len(self.test))

        oof_time = np.zeros(len(self.train))
        pred_time = np.zeros(len(self.test))

        for trn_idx, val_idx in self.kf.split(self.train):
            train_df = self.train.loc[trn_idx]
            valid_df = self.train.loc[val_idx].drop(['amount_spent','survival_time'], axis=1)
            if self.log:
                spent_rf = RandomForestRegressor().fit(X=train_df.drop(['amount_spent','survival_time'], axis=1), y=np.log1p(train_df['amount_spent']))
                time_rf = RandomForestRegressor().fit(X=train_df.drop(['amount_spent','survival_time'], axis=1), y=np.log1p(train_df['survival_time']))

            else:
                spent_rf = RandomForestRegressor().fit(X= train_df.drop(['amount_spent','survival_time'], axis=1), y = train_df['amount_spent'])
                time_rf = RandomForestRegressor().fit(X= train_df.drop(['amount_spent','survival_time'], axis=1), y = train_df['survival_time'])

            oof_spent[val_idx] = spent_rf.predict(valid_df)
            oof_time[val_idx] = time_rf.predict(valid_df)

            pred_spent += spent_rf.predict(self.test)/self.kf.n_splits
            pred_time += time_rf.predict(self.test)/self.kf.n_splits
        return oof_time, oof_spent, pred_time, pred_spent
        
    def xgb_model(self):
        warnings.filterwarnings(action='ignore')
        
        # params = {
        #     'objective':'reg:squarederror',
        #     'n_estimators':10000,
        #     'max_depth':16,
        #     'learning_rate':0.001,
        #     'subsample':0.7,
        #     'colsample_bytree':0.7,
        #     'reg_alpha':0.2,
        #     'seed':42
        # }
            
        oof_spent = np.zeros(len(self.train))
        pred_spent = np.zeros(len(self.test))

        oof_time = np.zeros(len(self.train))
        pred_time = np.zeros(len(self.test))

        for trn_idx, val_idx in self.kf.split(self.train):
            if self.log:
                spent_train_df = xgb.DMatrix(self.train.loc[trn_idx].drop(['amount_spent','survival_time'], axis=1), label=np.log1p(self.train.loc[trn_idx, 'amount_spent' ]))
                time_train_df = xgb.DMatrix(self.train.loc[trn_idx].drop(['amount_spent','survival_time'], axis=1), label=np.log1p(self.train.loc[val_idx,'survival_time' ]))
                
                spent_valid_df = xgb.DMatrix(self.train.loc[val_idx].drop(['amount_spent','survival_time'], axis=1), label=np.log1p(self.train.loc[trn_idx, 'amount_spent' ]))
                time_valid_df = xgb.DMatrix(self.train.loc[val_idx].drop(['amount_spent','survival_time'], axis=1), label=np.log1p(self.train.loc[val_idx,'survival_time' ]))

            else:
                spent_train_df = xgb.DMatrix(self.train.loc[trn_idx].drop(['amount_spent','survival_time'], axis=1), label=self.train.loc[trn_idx, 'amount_spent' ])
                time_train_df = xgb.DMatrix(self.train.loc[trn_idx].drop(['amount_spent','survival_time'], axis=1), label=self.train.loc[val_idx,'survival_time' ])
                
                spent_valid_df = xgb.DMatrix(self.train.loc[val_idx].drop(['amount_spent','survival_time'], axis=1), label=self.train.loc[trn_idx, 'amount_spent' ])
                time_valid_df = xgb.DMatrix(self.train.loc[val_idx].drop(['amount_spent','survival_time'], axis=1), label=self.train.loc[val_idx,'survival_time' ])

            spent_xgb_model = xgb.train(params, spent_train_df, num_boost_round=5000, evals=[(spent_train_df, 'train'), (spent_valid_df, 'val')], verbose_eval=100, early_stopping_rounds=10)
            time_xgb_model = xgb.train(params, time_train_df, num_boost_round=5000, evals=[(time_train_df, 'train'), (time_valid_df, 'val')], verbose_eval=100, early_stopping_rounds=10)
            
            oof_spent[val_idx] = spent_xgb_model.predict(xgb.DMatrix(self.train.loc[val_idx].drop(['amount_spent','survival_time'], axis=1)))
            oof_time[val_idx] = time_xgb_model.predict(xgb.DMatrix(self.train.loc[val_idx].drop(['amount_spent','survival_time'], axis=1)))

            pred_spent += spent_xgb_model.predict(xgb.DMatrix(self.test))/self.kf.n_splits
            pred_time += time_xgb_model.predict(xgb.DMatrix(self.test))/self.kf.n_splits

        return oof_time, oof_spent, pred_time, pred_spent
    
    def rmse(self, true, pred):
        
        if self.log:
            true = np.expm1(pred)
            mse = mean_squared_error(true, pred)
            rmse = np.round(np.sqrt(mse), 2)
        else:
            mse = mean_squared_error(true, pred)
            rmse = np.round(np.sqrt(mse), 2)
    
        return rmse

    def lgb_model(self):
        
        params = {
            'objective':'regression',
            "boosting": "gbdt",
            "num_iterations ":100,
            'num_leaves': 10000,
            'max_depth': 8,
            'learning_rate': 0.1,
            'min_data_in_leaf': 32, 
            'min_child_samples': 30,
            'min_child_weight': 0.5,
            'min_split_gain': 0.005,
            "feature_fraction": 0.9,
            "bagging_fraction": 0.9 ,
            "bagging_freq": 2,
            "bagging_seed": 42,
            "metric": 'rmse',
            "lambda_l1": 0.1,
            "lambda_l2": 0.1
        }
                
        oof_time = np.zeros(len(self.train))
        oof_spent = np.zeros(len(self.train))
        pred_time = np.zeros(len(self.test))
        pred_spent = np.zeros(len(self.test))
        
        for trn_idx, val_idx in self.kf.split(train):
            
            train_df = lgb.Dataset(self.train.loc[trn_idx].drop(['amount_spent','survival_time'], axis=1), label=self.train.loc[trn_idx, 'amount_spent'])
            valid_df = lgb.Dataset(self.train.loc[val_idx].drop(['amount_spent','survival_time'], axis=1), label=self.train.loc[val_idx, 'amount_spent'])
            
            lgb_model = lgb.train(params, train_df, 5000, valid_sets = [train_df, valid_df], early_stopping_rounds = 500, verbose_eval=5000)
            oof_spent[val_idx] = lgb_model.predict(self.train.loc[val_idx])/self.skf.n_splits
            pred_spent += lgb_model.predict(self.test)/self.kf.n_splits
        
        # params = {
        #     'objective':'multiclass',
        #     'num_class':65,
        #     "boosting": "gbdt",
        #     "num_iterations ":100,
        #     'num_leaves': 10000,
        #     'max_depth': 8,
        #     'learning_rate': 0.1,
        #     'min_data_in_leaf': 32, 
        #     'min_child_samples': 30,
        #     'min_child_weight': 0.5,
        #     'min_split_gain': 0.005,
        #     "feature_fraction": 0.9,
        #     "bagging_fraction": 0.9 ,
        #     "bagging_freq": 2,
        #     "bagging_seed": 42,
        #     "metric": 'multi_logloss',
        #     "lambda_l1": 0.1,
        #     "lambda_l2": 0.1
        # }

        for trn_idx, val_idx in self.skf.split(self.train, self.train['survival_time']):
            
            train_df = lgb.Dataset(self.train.loc[trn_idx].drop(['amount_spent','survival_time'], axis=1), label=self.train.loc[trn_idx, 'survival_time'])
            valid_df = lgb.Dataset(self.train.loc[val_idx].drop(['amount_spent','survival_time'], axis=1), label=self.train.loc[val_idx, 'survival_time'])
            
            lgb_model = lgb.train(params, train_df, 5000, valid_sets = [train_df, valid_df], early_stopping_rounds = 500, verbose_eval=5000)
            oof_time[val_idx] = lgb_model.predict(self.train.loc[val_idx])/self.skf.n_splits
            pred_time += lgb_model.predict(self.test)/self.skf.n_splits
        
        return oof_time, oof_spent, pred_time, pred_spent

In [0]:
m = model(train = train,test=test1,log=False)

In [0]:
oof_time_lgb, oof_spent_lgb, pred_time_lgb, pred_spent_lgb= m.lgb_model()

Training until validation scores don't improve for 500 rounds.
Early stopping, best iteration is:
[18]	training's rmse: 0.635757	valid_1's rmse: 0.757798
Training until validation scores don't improve for 500 rounds.
Early stopping, best iteration is:
[13]	training's rmse: 0.65501	valid_1's rmse: 0.722533
Training until validation scores don't improve for 500 rounds.
Early stopping, best iteration is:
[24]	training's rmse: 0.582365	valid_1's rmse: 0.876314
Training until validation scores don't improve for 500 rounds.
Early stopping, best iteration is:
[20]	training's rmse: 0.665116	valid_1's rmse: 0.601347
Training until validation scores don't improve for 500 rounds.
Early stopping, best iteration is:
[2]	training's rmse: 0.751147	valid_1's rmse: 0.493337
Training until validation scores don't improve for 500 rounds.
Early stopping, best iteration is:
[118]	training's rmse: 14.2913	valid_1's rmse: 18.2638
Training until validation scores don't improve for 500 rounds.
Early stopping, 

In [0]:
oof_time_xgb, oof_spent_xgb, pred_time_xgb, pred_spent_xgb = m.xgb_model() # 잘좀봐야될듯....

In [0]:
oof_time_rf, oof_spent_rf, pred_time_rf, pred_spent_rf = m.rf_model()
# oof_time_ridge, oof_spent_ridge, pred_time_ridge, pred_spent_ridge = m.ridge_model()

In [0]:
# rf_oof = pd.DataFrame({'acc_id':train['acc_id'],'survival_time':oof_time_rf.astype(int),'amount_spent':oof_spent_rf})
# rf_pred = pd.DataFrame({'acc_id':test1['acc_id'],'survival_time':pred_time_rf.astype(int),'amount_spent':pred_spent_rf})

lgb_oof = pd.DataFrame({'acc_id':train['acc_id'],'survival_time':oof_time_lgb.astype(int),'amount_spent':oof_spent_lgb})
lgb_pred = pd.DataFrame({'acc_id':test1['acc_id'],'survival_time':pred_time_lgb.astype(int),'amount_spent':pred_spent_lgb})

# xgb_oof = pd.DataFrame({'acc_id':train['acc_id'],'survival_time':oof_time_xgb,'amount_spent':oof_spent_xgb})
# xgb_pred = pd.DataFrame({'acc_id':test1['acc_id'],'survival_time':pred_time_xgb,'amount_spent':pred_spent_xgb})

In [0]:
lgb_pred.to_csv('test1_predict.csv',index=False,encoding='utf-8')

In [0]:
lgb_pred.loc[lgb_pred['survival_time']<1,'survival_time'] = 1
lgb_pred.loc[lgb_pred['amount_spent']<0,'amount_spent'] = 0

In [0]:
test1_predict = pd.DataFrame({'acc_id':lgb_pred['acc_id'],'survival_time':np.zeros(len(lgb_pred))+4,'amount_spent':np.zeros(len(lgb_pred))+4})
test2_predict = pd.DataFrame({'acc_id':test2['acc_id'],'survival_time':np.zeros(len(test2['acc_id']))+4,'amount_spent':np.zeros(len(test2['acc_id']))+4})


In [0]:
os.getcwd()

'/content'

In [0]:
from google.colab import files

# test1_predict.to_csv('test1_predict.csv',encoding='utf-8')
test2_predict.to_csv('test2_predict.csv',encoding='utf-8')

files.download('test1_predict.csv')
files.download('test2_predict.csv')

# submit

In [0]:
ensembel_pred = (rf_pred.loc[:,['survival_time','amount_spent']] + lgb_pred.loc[:,['survival_time','amount_spent']])/2
ensembel_pred = pd.concat([test1['acc_id'],ensembel_pred],axis=1)
ensembel_pred['survival_time'] = ensembel_pred['survival_time'].astype(int)
ensembel_pred['amount_spent'] = ensembel_pred['amount_spent']/100

In [0]:
path

'drive/My Drive/bigcontest2019/scripts'

In [0]:
ensembel_pred.to_csv('test1_pred.csv',index=False)


# score

In [0]:
path = 'drive/My Drive/bigcontest2019/scripts'
os.chdir(path)
from model.metrics import score_function

In [0]:
ensembel_oof = (rf_oof.loc[:,['survival_time','amount_spent']] + lgb_oof.loc[:,['survival_time','amount_spent']])/2
ensembel_oof = pd.concat([train['acc_id'],ensembel_oof],axis=1)
ensembel_oof['survival_time'] = ensembel_oof['survival_time'].astype(int)

In [0]:
ensembel_oof.head()

Unnamed: 0,acc_id,survival_time,amount_spent
22410,2,25,0.000289
12651,5,33,0.001262
5494,8,22,0.001814
6811,17,27,0.000653
16854,20,31,0.000735


In [0]:
train_label = train.loc[:,['acc_id','survival_time','amount_spent']]

In [0]:
score_function.score_function(ensembel_oof, train_label, path=False)

6.530866741169218


6.530866741169218

# 여기부터 잡다한 거시여



In [0]:
# # model
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.model_selection import KFold
# from sklearn.linear_model import Ridge 

# import xgboost as xgb
# import warnings

# class model(object):
#     def __init__(self, train, test, label, folds=FOLDS, seed=SEED, log=True):
#         self.train = train
#         self.test = test #.drop(['amount_spent','survival_time'], axis=1)
#         self.label = label
#         self.kf = KFold(n_splits=folds, random_state=seed)
#         self.skf = StratifiedKFold(n_splits=folds,random_state=seed)
#         self.log = log
    
#     def ridge_model(self):
#         oof = np.zeros(len(self.train))
#         pred = np.zeros(len(self.test))
#         for trn_idx, val_idx in self.kf.split(self.train):
#             train_df = self.train.loc[trn_idx]
#             valid_df = self.train.loc[val_idx].drop(['amount_spent','survival_time'], axis=1)
#             if self.log:
#                 ridge_model = Ridge().fit(train_df.drop(['amount_spent','survival_time'], axis=1), np.log1p(train_df[self.label]))
#             else:
#                 ridge_model = Ridge().fit(train_df.drop(['amount_spent','survival_time'], axis=1), train_df[self.label])
#             oof[val_idx] = ridge_model.predict(valid_df)
#             pred += ridge_model.predict(self.test)/self.kf.n_splits
#         return oof, pred
    
#     def rf_model(self):
        
#         oof = np.zeros(len(self.train))
#         pred = np.zeros(len(self.test))
#         for trn_idx, val_idx in self.kf.split(self.train):
#             train_df = self.train.loc[trn_idx]
#             valid_df = self.train.loc[val_idx].drop(['amount_spent','survival_time'], axis=1)
#             if self.log:
#                 rf = RandomForestRegressor().fit(X=train_df.drop(['amount_spent','survival_time'], axis=1), y=np.log1p(train_df[self.label]))
#             else:
#                 rf = RandomForestRegressor().fit(X= train_df.drop(['amount_spent','survival_time'], axis=1), y = train_df[self.label])
#             oof[val_idx] = rf.predict(valid_df)
#             pred += rf.predict(self.test)/self.kf.n_splits
#         return oof, pred
        
#     def xgb_model(self):
#         warnings.filterwarnings(action='ignore')
        
#         params = {
#             'objective':'survival:cox',
#             'n_estimators':10000,
#             'max_depth':16,
#             'learning_rate':0.001,
#             'subsample':0.7,
#             'colsample_bytree':0.7,
#             'reg_alpha':0.2,
#             'tree_method':'gpu_hist',
#             'seed':42
#         }
        
#         def xgb_f1_score(y_hat, data):
#             y_true = data.get_label()
#             y_hat = custom_round(y_hat, np.quantile(y_hat, 0.85))
#             return 'f1', -f1_score(y_true, y_hat)
    
#         oof = np.zeros(len(self.train))
#         pred = np.zeros(len(self.test))
#         for trn_idx, val_idx in self.kf.split(self.train):
#             if self.log:
#                 train_df = xgb.DMatrix(self.train.loc[trn_idx].drop(['amount_spent','survival_time'], axis=1), label=np.log1p(self.train.loc[trn_idx, self.label]))
#                 valid_df = xgb.DMatrix(self.train.loc[val_idx].drop(['amount_spent','survival_time'], axis=1), label=np.log1p(self.train.loc[val_idx, self.label]))
#             else:
#                 train_df = xgb.DMatrix(self.train.loc[trn_idx].drop(['amount_spent','survival_time'], axis=1), label=self.train.loc[trn_idx, self.label])
#                 valid_df = xgb.DMatrix(self.train.loc[val_idx].drop(['amount_spent','survival_time'], axis=1), label=self.train.loc[val_idx, self.label])
#             xgb_model = xgb.train(params, train_df, num_boost_round=5000, evals=[(train_df, 'train'), (valid_df, 'val')], verbose_eval=100, early_stopping_rounds=10,feval=xgb_f1_score)
#             oof[val_idx] = xgb_model.predict(xgb.DMatrix(self.train.loc[val_idx].drop(['amount_spent','survival_time'], axis=1)))
#             pred += xgb_model.predict(xgb.DMatrix(self.test))/self.kf.n_splits
#         return oof, pred
    
#     def rmse(self, true, pred):
        
#         if self.log:
#             true = np.expm1(pred)
#             mse = mean_squared_error(true, pred)
#             rmse = np.round(np.sqrt(mse), 2)
#         else:
#             mse = mean_squared_error(true, pred)
#             rmse = np.round(np.sqrt(mse), 2)
    
#         return rmse

#     def lgb_model(self):
        
#         params = {
#             'objective':'regression',
#             "boosting": "gbdt",
#             "num_iterations ":100,
#             'num_leaves': 10000,
#             'max_depth': 8,
#             'learning_rate': 0.1,
#             'min_data_in_leaf': 32, 
#             'min_child_samples': 30,
#             'min_child_weight': 0.5,
#             'min_split_gain': 0.005,
#             "feature_fraction": 0.9,
#             "bagging_fraction": 0.9 ,
#             "bagging_freq": 2,
#             "bagging_seed": 42,
#             "metric": 'rmse',
#             "lambda_l1": 0.1,
#             "lambda_l2": 0.1
#         }
                
#         oof_time = np.zeros(len(self.train))
#         oof_spent = np.zeros(len(self.train))
#         pred_time = np.zeros(len(self.test))
#         pred_spent = np.zeros(len(self.test))
        
#         for trn_idx, val_idx in self.kf.split(train):
            
#             train_df = lgb.Dataset(self.train.loc[trn_idx].drop(['amount_spent','survival_time'], axis=1), label=self.train.loc[trn_idx, 'amount_spent'])
#             valid_df = lgb.Dataset(self.train.loc[val_idx].drop(['amount_spent','survival_time'], axis=1), label=self.train.loc[val_idx, 'amount_spent'])
            
#             lgb_model = lgb.train(params, train_df, 5000, valid_sets = [train_df, valid_df], early_stopping_rounds = 500, verbose_eval=5000)
#             oof_spent[val_idx] = lgb_model.predict(train.loc[val_idx])/self.skf.n_splits
#             pred_spent += lgb_model.predict(self.test)/self.kf.n_splits
        
#         params = {
#             'objective':'multiclass',
#             'num_class':65,
#             "boosting": "gbdt",
#             "num_iterations ":100,
#             'num_leaves': 10000,
#             'max_depth': 8,
#             'learning_rate': 0.1,
#             'min_data_in_leaf': 32, 
#             'min_child_samples': 30,
#             'min_child_weight': 0.5,
#             'min_split_gain': 0.005,
#             "feature_fraction": 0.9,
#             "bagging_fraction": 0.9 ,
#             "bagging_freq": 2,
#             "bagging_seed": 42,
#             "metric": 'multi_logloss',
#             "lambda_l1": 0.1,
#             "lambda_l2": 0.1
#         }

#         for trn_idx, val_idx in self.skf.split(self.train, self.train['survival_time']):
            
#             train_df = lgb.Dataset(self.train.loc[trn_idx].drop(['amount_spent','survival_time'], axis=1), label=self.train.loc[trn_idx, 'survival_time'])
#             valid_df = lgb.Dataset(self.train.loc[val_idx].drop(['amount_spent','survival_time'], axis=1), label=self.train.loc[val_idx, 'survival_time'])
            
#             lgb_model = lgb.train(params, train_df, 5000, valid_sets = [train_df, valid_df], early_stopping_rounds = 500, verbose_eval=5000)
#             oof_time[val_idx] = lgb_model.predict(train.loc[val_idx])/self.skf.n_splits
#             pred_time += lgb_model.predict(self.test)/self.skf.n_splits
        
#         return oof_time, oof_spent, pred_time, pred_spent

# amount_spent

In [0]:
# test1
as_model1 = model(train=train,test=test1,label='amount_spent' ,log=False)

# randomforest
as_rf_oof1, as_rf_pred1 = as_model1.rf_model()
as_rf_pred_df1 = pd.DataFrame([test1['acc_id'],as_rf_pred1]).transpose()
as_rf_pred_df1.columns = ['acc_id','amount_spent']

# ridge
as_ridge_oof1, as_ridge_pred1 = as_model1.ridge_model()
as_ridge_pred_df1 = pd.DataFrame([test1['acc_id'],as_ridge_oof1]).transpose()
as_ridge_pred_df1.columns = ['acc_id','amount_spent']




In [0]:
# test2
as_model2 = model(train=train,test=test2,label = 'amount_spent' ,log=False)

# randomforest
as_rf_oof2, as_rf_pred2 = as_model2.rf_model()
as_rf_pred_df2 = pd.DataFrame([test2['acc_id'],as_rf_pred2]).transpose()
as_rf_pred_df2.columns = ['acc_id','amount_spent']

# ridge
as_ridge_oof2, as_ridge_pred2 = as_model2.ridge_model()
as_ridge_pred_df2 = pd.DataFrame([test2['acc_id'],as_ridge_oof2]).transpose()
as_ridge_pred_df2.columns = ['acc_id','amount_spent']

# as oof's dataframe2
as_oof_df2 = pd.DataFrame([train['acc_id'],train['amount_spent'],as_rf_oof2,as_ridge_oof2]).transpose()
as_oof_df2.columns = ['acc_id','amount_spent','rf_amount_spent','rid_amount_spent']



In [0]:
as_rf_pred_df1.loc[as_rf_pred_df1['amount_spent']<0,'amount_spent'] = 0
as_rf_pred_df2.loc[as_rf_pred_df2['amount_spent']<0,'amount_spent'] = 0

as_ridge_pred_df1.loc[as_ridge_pred_df1['amount_spent']<0,'amount_spent'] = 0
as_ridge_pred_df2.loc[as_ridge_pred_df2['amount_spent']<0,'amount_spent'] = 0

# Survival time

In [0]:
# test1
sv_model1 = model(train=train,test=test1,label = 'survival_time' ,log=False)

# randomforest
sv_rf_oof1, sv_rf_pred1 = sv_model1.rf_model()
sv_rf_pred_df1 = pd.DataFrame([test1['acc_id'],sv_rf_pred1.astype(int)]).transpose()
sv_rf_pred_df1.columns = ['acc_id','survival_time']

# ridge
sv_ridge_oof1, sv_ridge_pred1 = sv_model1.ridge_model()
sv_ridge_pred_df1 = pd.DataFrame([test1['acc_id'],sv_ridge_oof1.astype(int)]).transpose()
sv_ridge_pred_df1.columns = ['acc_id','survival_time']

# sv oof's dataframe1
sv_oof_df1 = pd.DataFrame([train['acc_id'],train['survival_time'],sv_rf_oof1.astype(int),sv_ridge_oof1.astype(int)]).transpose()
sv_oof_df1.columns = ['acc_id','survival_time','rf_survival_time','rid_survival_time']


In [0]:
# test2
sv_model2 = model(train=train,test=test2,label = 'survival_time' ,log=False)

# randomforest
sv_rf_oof2, sv_rf_pred2 = sv_model2.rf_model()
sv_rf_pred_df2 = pd.DataFrame([test2['acc_id'],sv_rf_pred2.astype(int)]).transpose()
sv_rf_pred_df2.columns = ['acc_id','survival_time']

# ridge
sv_ridge_oof2, sv_ridge_pred2 = sv_model2.ridge_model()
sv_ridge_pred_df2 = pd.DataFrame([test2['acc_id'],sv_ridge_oof2.astype(int)]).transpose()
sv_ridge_pred_df2.columns = ['acc_id','survival_time']


In [0]:
sv_rf_pred_df1.loc[sv_rf_pred_df1['survival_time']<0,'survival_time'] = 0
sv_rf_pred_df2.loc[sv_rf_pred_df2['survival_time']<0,'survival_time'] = 0

sv_ridge_pred_df1.loc[sv_ridge_pred_df1['survival_time']<0,'survival_time'] = 0
sv_ridge_pred_df2.loc[sv_ridge_pred_df2['survival_time']<0,'survival_time'] = 0

NameError: ignored

In [0]:
# OOF dataframe
rf_oof = pd.DataFrame({'acc_id':train['acc_id'],'survival_time':sv_rf_oof1.astype(int),'amount_spent':as_rf_oof1})
rf_oof.head()

rid_oof = pd.DataFrame({'acc_id':train['acc_id'],'survival_time':as_ridge_oof1.astype(int),'amount_spent':as_ridge_oof1})
rid_oof.head()

Unnamed: 0,acc_id,survival_time,amount_spent
0,27835,50,0.056183
1,12351,58,0.198623
2,125437,64,0.152184
3,104483,46,0.068319
4,4704,62,0.197726


Unnamed: 0,acc_id,survival_time,amount_spent
0,27835,0,0.165782
1,12351,0,0.140505
2,125437,0,0.171106
3,104483,0,0.075382
4,4704,0,0.121095


# Merge

In [0]:
rf_test1 = pd.merge(sv_rf_pred_df1,as_rf_pred_df1,on='acc_id',how='inner')
rf_test2 = pd.merge(sv_rf_pred_df2,as_rf_pred_df2,on='acc_id',how='inner')

ridge_test1 = pd.merge(sv_ridge_pred_df1,as_ridge_pred_df1,on='acc_id',how='inner')
ridge_test2 = pd.merge(sv_ridge_pred_df2,as_ridge_pred_df2,on='acc_id',how='inner')

train_oof = pd.merge(sv_oof_df2,as_oof_df1,on='acc_id')

In [0]:
train_label.describe()

Unnamed: 0,acc_id,survival_time,amount_spent
count,40000.0,40000.0,40000.0
mean,65281.10555,45.7067,0.124931
std,37525.623536,23.265907,0.722272
min,2.0,1.0,0.0
25%,32792.75,24.0,0.0
50%,65359.0,64.0,0.020343
75%,97685.75,64.0,0.106119
max,130473.0,64.0,39.412632


In [0]:
rf_test1['amount_spent'] = [i/100 for i in rf_test1['amount_spent']]

In [0]:
rf_test2['amount_spent'] = [i/100 for i in rf_test2['amount_spent']]

In [0]:
rf_test1

Unnamed: 0,acc_id,survival_time,amount_spent
0,7,35,0.023401
1,15,36,0.165975
2,16,27,0.082206
3,18,26,0.149132
4,19,43,0.029035
5,22,36,0.014787
6,24,30,0.051779
7,28,29,0.149756
8,39,34,0.014978
9,51,25,0.123885


In [0]:
rf_test1['acc_id'] = rf_test1['acc_id'].astype(int)
rf_test2['acc_id'] = rf_test2['acc_id'].astype(int)

ridge_test1['acc_id'] = ridge_test1['acc_id'].astype(int)
ridge_test2['acc_id'] = ridge_test2['acc_id'].astype(int)

In [0]:
# output_path = 'drive/My Drive/bigcontest2019/scripts/model/metrics/bjw_inference/'

rf_test1.to_csv('test1_predict.csv', index=False,encoding= 'utf-8')
rf_test2.to_csv('test2_predict.csv', index=False,encoding = 'utf-8')

# ridge_test1.to_csv(output_path+'test1_ridge_predict.csv', index=False)
# ridge_test2.to_csv(output_path+'test2_ridge_predict.csv', index=False)

In [0]:
files.download('test1_predict.csv')
files.download('test2_predict.csv')

In [0]:
train.to_csv('train.csv', index=False)
test1.to_csv('test1.csv', index=False)
test2.to_csv('test2.csv', index=False)

!ls

'data merge'  'explortary data'   model   test1.csv   test2.csv   train.csv


In [0]:
test1.head()

Unnamed: 0,acc_id,activity_day_nunique,activity_char_id_nunique,activity_server_nunique,activity_playtime_sum,activity_playtime_mean,activity_npc_kill_sum,activity_npc_kill_mean,activity_solo_exp_sum,activity_party_exp_sum,activity_quest_exp_sum,activity_rich_monster_sum,activity_death_sum,activity_revive_sum,activity_exp_recovery_sum,activity_fishing_sum,activity_private_shop_sum,activity_game_money_change_sum,activity_enchant_count_sum,payment_day,payment_amount_spent,combat_day,combat_char_id,combat_server,combat_class,combat_pledge_cnt,combat_random_attacker_cnt,combat_random_defender_cnt,combat_temp_cnt,combat_same_pledge_cnt,combat_etc_cnt,combat_num_opponent,pledge_day,pledge_char_id,pledge_server,pledge_pledge_id,pledge_play_char_cnt,pledge_combat_char_cnt,pledge_pledge_combat_cnt,pledge_random_attacker_cnt,pledge_random_defender_cnt,pledge_same_pledge_cnt,pledge_temp_cnt,pledge_etc_cnt,pledge_combat_play_time,pledge_non_combat_play_time,trade_seller_day_count,trade_seller_type_nunique,trade_seller_type_personal,trade_seller_type_exchange,trade_seller_server_nunique,trade_seller_source_char_id_nunique,trade_seller_target_acc_id_nunique,trade_seller_target_char_id_nunique,trade_seller_item_type_nunique,trade_seller_item_amount_sum,trade_seller_item_price_sum,trade_buyer_day_count,trade_buyer_type_nunique,trade_buyer_type_personal,trade_buyer_type_exchange,trade_buyer_server_nunique,trade_buyer_target_char_id_nunique,trade_buyer_source_acc_id_nunique,trade_buyer_source_char_id_nunique,trade_buyer_item_type_nunique,trade_buyer_item_amount_sum,trade_buyer_item_price_sum
0,7,10,3,2,1.252251,0.104354,0.719474,0.059956,8.62429,0.0,34.556743,0,0.73765,0.494674,0.0,0.0,0.0,0.001291,0.0,1.0,0.035204,10,3,2,2,0.0,0.0,0.0,1.918911,0.0,0.0,0.294386,10.0,2.0,2.0,2.0,19.560411,7.614846,0.506312,19.527071,4.721068,0.0,12.558147,4.098316,12.579598,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,2.396984e-08,0.002893
1,15,28,3,3,85.705431,1.032596,18.207875,0.219372,0.630317,0.0,0.0,0,7.868271,6.183422,0.0,0.0,0.0,0.023029,0.0,0.0,0.0,28,3,3,2,0.0,0.0,39.041128,0.0,0.0,2.027007,2.256961,28.0,3.0,3.0,3.0,99.317807,15.410139,0.0,0.0,48.355185,0.0,4.054205,1.797879,107.054983,16.478587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,16,3,7,1,0.081923,0.009103,0.0,0.0,0.0,0.0,0.100846,0,0.0,0.0,0.0,0.0,0.0,0.034526,0.0,1.0,0.035204,3,7,1,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,7.0,1.0,8.0,1.190948,0.036089,0.0,0.0,0.0,0.0,0.0,0.007078,0.074364,0.056062,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,18,13,1,1,40.00882,3.077602,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,36.945877,0.921976,0.0,0.0,0.0,13,1,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,48.0,2.0,47.0,1.0,1.0,1.0,32.0,32.0,3.0,0.304424,2.569245,5.0,1.0,0.0,5.0,1.0,1.0,1.0,1.0,3.0,4.817938e-06,0.0
4,19,28,4,4,84.139533,2.052184,79.866399,1.947961,0.271431,0.954236,0.007157,2,0.73765,0.494674,0.0,37.465503,0.0,1.863396,0.0,0.0,0.0,28,4,4,3,0.0,3.131975,9.464516,1.918911,0.0,2.815288,4.219536,28.0,2.0,4.0,1.0,17.034159,2.526252,0.0,0.710075,1.430627,1.591434,5.339685,0.481322,10.91221,2.733728,12.0,1.0,0.0,12.0,1.0,1.0,10.0,10.0,4.0,4.077989,0.0,53.0,2.0,36.0,17.0,1.0,1.0,29.0,29.0,5.0,4.237634,2.005447


In [0]:
from google.colab import files

files.download('train.csv')
files.download('test1.csv')
files.download('test2.csv')

# Score

In [0]:
score_path = 'drive/My Drive/bigcontest2019/scripts'
os.chdir(score_path)
from model.metrics import score_function

In [0]:
score_function.score_function(train_label, train_label, path=False) # 91693.76240883442
print()
score_function.score_function(rf_oof, train_label, path=False)

91693.76240883442


91693.76240883442


10780.063262817888


10780.063262817888

In [0]:
# from google.colab import files

# output_path = 'drive/My Drive/bigcontest2019/scripts/model/metrics/bjw_inference/'
# os.chdir(output_path)

# files.download('test1_rf_predict.csv')
# files.download('test2_rf_predict.csv')


In [0]:
lst = ['a','b','c','d']

In [0]:
import numpy as np

index = np.random.randint(0,len(lst)-1)
try:
    

2

In [0]:
word = np.random.choice(lst,1)

In [0]:
if word in lst:
    

True

In [0]:
    def rf_train_st(self, params, iteration, seed):
        
        def under_sampling(self, data):
            all_week_under_sample = pd.DataFrame()

            for week in range(1, 5):
                if self.kind == 'activity':
                    _, under_sample = train_test_split(data[(data['week']==week) & (data['survival_time']==64)], test_size=0.01, random_state=42, shuffle=True, 
                                                       stratify=data.loc[(data['week']==week) & (data['survival_time']==64), 'day'])
                elif self.kind == 'payment':
                    _, under_sample = train_test_split(data[(data['week']==week) & (data['survival_time']==64)], test_size=0.01, random_state=42, shuffle=True)

                elif self.kind == 'trade':
                    _, under_sample = train_test_split(data[(data['week']==week) & (data['survival_time']==64)], test_size=0.01, random_state=42, shuffle=True)

                elif self.kind == 'combat':
                    _, under_sample = train_test_split(data[(data['week']==week) & (data['survival_time']==64)], test_size=0.01, random_state=42, shuffle=True,
                                                      stratify=data.loc[(data['week']==week) & (data['survival_time']==64), 'daynunique'])
                elif self.kind == 'pledge':
                    _, under_sample = train_test_split(data[(data['week']==week) & (data['survival_time']==64)], test_size=0.01, random_state=42, shuffle=True,
                                                      stratify=data.loc[(data['week']==week) & (data['survival_time']==64), 'daynunique'])

                all_week_under_sample = pd.concat([all_week_under_sample, under_sample])
            else:
                data = pd.concat([data[data['survival_time']!=64], all_week_under_sample]).reset_index(drop=True)
                return data
            
        LABEL='survival_time'
        skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

        for idx, (trn_idx, val_idx) in enumerate(skf.split(self.train_data, self.train_data[LABEL])):
            
            temp_train_data = under_sampling(self, self.train_data.loc[trn_idx])
            
            trn_label = temp_train_data[LABEL] - 1
            val_label = self.train_data.loc[val_idx, LABEL] - 1
            train_df = temp_train_data[self.features_]
            valid_df = self.train_data.loc[val_idx, self.features_]
            
            rf_model = RandomForestClassifier(n_estimators=100).fit(train_df, trn_label)
            self.rf_model_st['model'+str(idx)] = rf_model
    
    def rf_train_tas(self, params, iteration, seed):
        
        LABEL='total_amount_spent'
        kf = KFold(n_splits=5, random_state=42, shuffle=True)
        self.features_.extend(['survival_time'])
        for idx, (trn_idx, val_idx) in enumerate(kf.split(self.train_data)):
            
            trn_label = self.train_data.loc[trn_idx, LABEL]
            val_label = self.train_data.loc[val_idx, LABEL]
            
            train_df = self.train_data.loc[trn_idx, self.features_]
            valid_df = self.train_data.loc[val_idx, self.features_]

            rf_model = RandomForestRegressor(n_estimators=100).fit(train_df, trn_label.values)
            self.rf_model_tas['model'+str(idx)] = rf_model


