In [12]:
import pandas as pd
import numpy as np

# DRAGONS
import xgboost as xgb
import lightgbm as lgb
import catboost as cat
# plots
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# pandas / plt options
pd.options.display.max_columns = 999
plt.rcParams['figure.figsize'] = (14, 7)
font = {'family' : 'verdana',
        'weight' : 'bold',
        'size'   : 14}
plt.rc('font', **font)

# remove warnings
import warnings
warnings.simplefilter("ignore")
import json
# garbage collector
import gc
gc.enable()
from pandas.io.json import json_normalize
import pickle
import pytz
from datetime import datetime

from sklearn.metrics import mean_squared_error,roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing



In [2]:
def load_df(csv_path):
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']
    ans = pd.DataFrame()
    dfs = pd.read_csv(csv_path, sep=',',usecols=lambda col: col not in ["hits"],
            converters={column: json.loads for column in JSON_COLUMNS}, 
            dtype={'fullVisitorId': 'str'}, # Important!!
            chunksize=100000)
    for df in dfs:
        df.reset_index(drop = True,inplace = True)
        for column in JSON_COLUMNS:
            column_as_df = json_normalize(df[column])
            column_as_df.columns = [column+"."+subcolumn for subcolumn in column_as_df.columns]
            df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)

        #print(f"Loaded {os.path.basename(csv_path)}. Shape: {df.shape}")
#         use_df = df[features]
#         del df
        gc.collect()
        ans = pd.concat([ans,df], axis=0).reset_index(drop=True)
        #print(ans.shape)
    return ans

train = load_df("/home/mediwhale-2/GAC_kaggle/data/train.csv")
test= load_df("/home/mediwhale-2/GAC_kaggle/data/test.csv")


print(train.shape, test.shape)

(903653, 55) (804684, 53)


In [3]:
train["totals.transactionRevenue"] = train["totals.transactionRevenue"].astype(float)

In [4]:
train.fillna(0, inplace=True)
test.fillna(0, inplace=True)

In [5]:
# label encode the categorical variables and convert the numerical variables to float
cat_cols = ["channelGrouping", "device.browser", 
            "device.deviceCategory", "device.operatingSystem", 
            "geoNetwork.city", "geoNetwork.continent", 
            "geoNetwork.country", "geoNetwork.metro",
            "geoNetwork.networkDomain", "geoNetwork.region", 
            "geoNetwork.subContinent", "trafficSource.adContent", 
            "trafficSource.adwordsClickInfo.adNetworkType", 
            "trafficSource.adwordsClickInfo.gclId", 
            "trafficSource.adwordsClickInfo.page", 
            "trafficSource.adwordsClickInfo.slot", "trafficSource.campaign",
            "trafficSource.keyword", "trafficSource.medium", 
            "trafficSource.referralPath", "trafficSource.source",
            'trafficSource.adwordsClickInfo.isVideoAd', 'trafficSource.isTrueDirect']

num_cols = ["totals.hits", "totals.pageviews", "visitNumber", 'totals.bounces',  'totals.newVisits']   

## Feature Engineering

In [15]:
train["hits_per_pageviews"] = train["totals.hits"] / train["totals.pageviews"]

num_cols = num_cols + ["hits_per_pageviews"]

In [6]:

for col in cat_cols:
    print(col)
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(train[col].values.astype('str')) + list(test[col].values.astype('str')))
    train[col] = lbl.transform(list(train[col].values.astype('str')))
    test[col] = lbl.transform(list(test[col].values.astype('str')))


for col in num_cols:
    train[col] = train[col].astype(float)
    test[col] = test[col].astype(float)

channelGrouping
device.browser
device.deviceCategory
device.operatingSystem
geoNetwork.city
geoNetwork.continent
geoNetwork.country
geoNetwork.metro
geoNetwork.networkDomain
geoNetwork.region
geoNetwork.subContinent
trafficSource.adContent
trafficSource.adwordsClickInfo.adNetworkType
trafficSource.adwordsClickInfo.gclId
trafficSource.adwordsClickInfo.page
trafficSource.adwordsClickInfo.slot
trafficSource.campaign
trafficSource.keyword
trafficSource.medium
trafficSource.referralPath
trafficSource.source
trafficSource.adwordsClickInfo.isVideoAd
trafficSource.isTrueDirect


In [8]:
dev_df = train[train['date']<=20170531]
val_df = train[train['date']>20170531]

dev_df = dev_df.reset_index()
val_df = val_df.reset_index()

In [9]:
from sklearn.model_selection import GroupKFold

class KFoldValidation_non_zero_clf():
    def __init__(self, data, n_splits=5):
        unique_vis = np.array(sorted(data['fullVisitorId'].astype(str).unique()))
        folds = GroupKFold(n_splits)
        ids = np.arange(data.shape[0])
        
        self.fold_ids = []
        for trn_vis, val_vis in folds.split(X=unique_vis, y=unique_vis, groups=unique_vis):
            self.fold_ids.append([
                    ids[data['fullVisitorId'].astype(str).isin(unique_vis[trn_vis])],
                    ids[data['fullVisitorId'].astype(str).isin(unique_vis[val_vis])]
                ])
            
    def validate(self, train, test, features, model, name="", prepare_stacking=False, 
                 fit_params={"early_stopping_rounds": 50, "verbose": 100, "eval_metric": "auc"}):
        model.FI = pd.DataFrame(index=features)
        full_score = 0
        
        if prepare_stacking:
            test[name] = 0
            train[name] = np.NaN
        
        for fold_id, (trn, val) in enumerate(self.fold_ids):
            devel = train[features].iloc[trn]
            y_devel = train["totals.transactionRevenue"].iloc[trn] > 0
            valid = test[features]
            y_valid = test["totals.transactionRevenue"] > 0
                       
            print("Fold ", fold_id, ":")
            if name == "catclf" :
                model.fit(devel, y_devel, eval_set=[(valid, y_valid)],early_stopping_rounds= 50,verbose= 100)
            else :
                model.fit(devel, y_devel, eval_set=[(valid, y_valid)], **fit_params)

                
            
            if len(model.feature_importances_) == len(features):  # some bugs in catboost?
                model.FI['fold' + str(fold_id)] = model.feature_importances_ / model.feature_importances_.sum()

            predictions = model.predict_proba(valid)[:,1]
            predictions[predictions < 0] = 0
            
            val_predictions = model.predict_proba(train[features].iloc[val])[:,1]
            val_predictions[val_predictions < 0] = 0
            
            print("Fold ", fold_id, " error: ", roc_auc_score(y_valid, predictions))
            
            fold_score = roc_auc_score(y_valid, predictions)
            full_score += fold_score / len(self.fold_ids)
            print("Fold ", fold_id, " score: ", fold_score)
            
            if prepare_stacking:
                train[name].iloc[val] = val_predictions
                
                test_predictions = model.predict_proba(test[features])[:,1]
                test_predictions[test_predictions < 0] = 0
                test[name] += test_predictions / len(self.fold_ids)
                
        print("Final score: ", full_score)
        return full_score

In [13]:
Kfolder_clf = KFoldValidation_non_zero_clf(dev_df,5)



lgbmodel_clf = lgb.LGBMClassifier(n_estimators=1000, objective="binary", metric="auc", num_leaves=35, min_child_samples=100,
                      learning_rate=0.01, bagging_fraction=0.7, feature_fraction=0.5, bagging_frequency=5, 
                      bagging_seed=2019, subsample=.9, colsample_bytree=.9, use_best_model=True)

Kfolder_clf.validate(dev_df, val_df, num_cols + cat_cols, lgbmodel_clf, "lgb_zero_clf", prepare_stacking=True)

Fold  0 :
Training until validation scores don't improve for 50 rounds.
[100]	valid_0's auc: 0.987806
[200]	valid_0's auc: 0.988104
[300]	valid_0's auc: 0.988317
[400]	valid_0's auc: 0.988481
[500]	valid_0's auc: 0.988538
Early stopping, best iteration is:
[495]	valid_0's auc: 0.988539
Fold  0  error:  0.9885393632207856
Fold  0  score:  0.9885393632207856
Fold  1 :
Training until validation scores don't improve for 50 rounds.
[100]	valid_0's auc: 0.98777
[200]	valid_0's auc: 0.988063
[300]	valid_0's auc: 0.988248
[400]	valid_0's auc: 0.988414
[500]	valid_0's auc: 0.988515
Early stopping, best iteration is:
[526]	valid_0's auc: 0.988522
Fold  1  error:  0.9885223056721457
Fold  1  score:  0.9885223056721457
Fold  2 :
Training until validation scores don't improve for 50 rounds.
[100]	valid_0's auc: 0.987856
[200]	valid_0's auc: 0.988129
[300]	valid_0's auc: 0.988332
[400]	valid_0's auc: 0.988445
[500]	valid_0's auc: 0.988489
Early stopping, best iteration is:
[496]	valid_0's auc: 0.988

0.98851859077952

In [14]:
from sklearn.model_selection import GroupKFold

from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score
def score(data, y):
    validation_res = pd.DataFrame(
    {"fullVisitorId": data["fullVisitorId"].values,
     "transactionRevenue": data["totals.transactionRevenue"].values,
     "predictedRevenue": np.expm1(y)})

    validation_res = validation_res.groupby("fullVisitorId")["transactionRevenue", "predictedRevenue"].sum().reset_index()
    return np.sqrt(mean_squared_error(np.log1p(validation_res["transactionRevenue"].values), 
                                     np.log1p(validation_res["predictedRevenue"].values)))

class KFoldValidation():
    def __init__(self, data, n_splits=5):
        unique_vis = np.array(sorted(data['fullVisitorId'].astype(str).unique()))
        folds = GroupKFold(n_splits)
        ids = np.arange(data.shape[0])
        
        self.fold_ids = []
        for trn_vis, val_vis in folds.split(X=unique_vis, y=unique_vis, groups=unique_vis):
            self.fold_ids.append([
                    ids[data['fullVisitorId'].astype(str).isin(unique_vis[trn_vis])],
                    ids[data['fullVisitorId'].astype(str).isin(unique_vis[val_vis])]
                ])
            
    def validate(self, train, test, features, model, name="", prepare_stacking=False, 
                 fit_params={"early_stopping_rounds": 50, "verbose": 100, "eval_metric": "rmse"}):
        model.FI = pd.DataFrame(index=features)
        full_score = 0
        
        if prepare_stacking:
            test[name] = 0
            train[name] = np.NaN
        
        for fold_id, (trn, val) in enumerate(self.fold_ids):
            devel = train[features].iloc[trn]
            y_devel = np.log1p(train["totals.transactionRevenue"].iloc[trn])
            valid = test[features]
            y_valid = np.log1p(test["totals.transactionRevenue"])
                       
            print("Fold ", fold_id, ":")
            model.fit(devel, y_devel, eval_set=[(valid, y_valid)], **fit_params)

            if len(model.feature_importances_) == len(features):  # some bugs in catboost?
                model.FI['fold' + str(fold_id)] = model.feature_importances_ / model.feature_importances_.sum()

            predictions = model.predict(valid)
            predictions[predictions < 0] = 0
            
            val_predictions = model.predict(train[features].iloc[val])
            val_predictions[val_predictions < 0] = 0
            
            print("Fold ", fold_id, " error: ", mean_squared_error(y_valid, predictions)**0.5)
            
            fold_score = score(test, predictions)
            full_score += fold_score / len(self.fold_ids)
            print("Fold ", fold_id, " score: ", fold_score)
            
            
            
            if prepare_stacking:
                train[name].iloc[val] = val_predictions
                
                test_predictions = model.predict(test[features])
                test_predictions[test_predictions < 0] = 0
                test[name] += test_predictions / len(self.fold_ids)
                test["prediction_"+str(fold_id)] = model.predict(test[features])
                
        print("Final score: ", full_score)
        return full_score

In [15]:
Kfolder = KFoldValidation(dev_df,5)



lgbmodel = lgb.LGBMRegressor(n_estimators=1000, objective="regression", metric="rmse", num_leaves=11, min_child_samples=100,
                      learning_rate=0.02, bagging_fraction=0.7, feature_fraction=0.5, bagging_frequency=5, 
                      bagging_seed=2019, subsample=.9, colsample_bytree=.9, use_best_model=True)

Kfolder.validate(dev_df, val_df, num_cols +["lgb_zero_clf"] + cat_cols, lgbmodel, "lgbpred", prepare_stacking=True)



Fold  0 :
Training until validation scores don't improve for 50 rounds.
[100]	valid_0's rmse: 1.74564
[200]	valid_0's rmse: 1.7134
[300]	valid_0's rmse: 1.70775
[400]	valid_0's rmse: 1.70661
[500]	valid_0's rmse: 1.70613
Early stopping, best iteration is:
[522]	valid_0's rmse: 1.70609
Fold  0  error:  1.7060483507567654
Fold  0  score:  1.7490531569179502
Fold  1 :
Training until validation scores don't improve for 50 rounds.
[100]	valid_0's rmse: 1.74695
[200]	valid_0's rmse: 1.71524
[300]	valid_0's rmse: 1.70978
[400]	valid_0's rmse: 1.70844
Early stopping, best iteration is:
[396]	valid_0's rmse: 1.70834
Fold  1  error:  1.7082999504880658
Fold  1  score:  1.7524473622050152
Fold  2 :
Training until validation scores don't improve for 50 rounds.
[100]	valid_0's rmse: 1.7492
[200]	valid_0's rmse: 1.71787
[300]	valid_0's rmse: 1.71289
[400]	valid_0's rmse: 1.7118
Early stopping, best iteration is:
[392]	valid_0's rmse: 1.7117
Fold  2  error:  1.711684614584181
Fold  2  score:  1.75668

1.75269977107052