In [56]:
import os
import json
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
import lightgbm as lgb

In [3]:
# dirs
input_dir = '../input/'
working_dir = '../working/'
output_dir = '../output/'

In [19]:
# https://www.kaggle.com/julian3833/1-quick-start-read-csv-and-flatten-json-fields

def load_df(csv_path=os.path.join(input_dir, 'train.csv'), nrows=None):
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']
    
    df = pd.read_csv(csv_path, 
                     converters={column: json.loads for column in JSON_COLUMNS}, 
                     dtype={'fullVisitorId': 'str'}, # Important!!
                     nrows=nrows)
    
    for column in JSON_COLUMNS:
        column_as_df = json_normalize(df[column])
        column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns]
        df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
    print(f"Loaded {os.path.basename(csv_path)}. Shape: {df.shape}")
    
    shorten_col_names = [i.split('.')[-1] for i in df.columns]
    df.columns = shorten_col_names
    return df

In [28]:
train = load_df(csv_path=os.path.join(input_dir, 'train.csv'), nrows=100000)
test = load_df(csv_path=os.path.join(input_dir, 'test.csv'), nrows=100000)

Loaded train.csv. Shape: (100000, 55)
Loaded test.csv. Shape: (100000, 53)


In [None]:
# train = load_df(csv_path=os.path.join(input_dir, 'train.csv'))
# test = load_df(csv_path=os.path.join(input_dir, 'test.csv'))

In [8]:
sample_submission = pd.read_csv(os.path.join(input_dir, 'sample_submission.csv'))
sample_submission.shape

(617242, 2)

In [33]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 35 columns):
channelGrouping       100000 non-null object
date                  100000 non-null int64
fullVisitorId         100000 non-null object
sessionId             100000 non-null object
visitId               100000 non-null int64
visitNumber           100000 non-null int64
visitStartTime        100000 non-null int64
browser               100000 non-null object
deviceCategory        100000 non-null object
isMobile              100000 non-null bool
operatingSystem       100000 non-null object
city                  100000 non-null object
continent             100000 non-null object
country               100000 non-null object
metro                 100000 non-null object
networkDomain         100000 non-null object
region                100000 non-null object
subContinent          100000 non-null object
bounces               48916 non-null object
hits                  100000 non-null obje

In [29]:
const_cols = [c for c in train.columns if train[c].nunique(dropna=False)==1 ]
print(const_cols)
train = train.drop(const_cols, axis=1)
test = test.drop(const_cols, axis=1)

['socialEngagementType', 'browserSize', 'browserVersion', 'flashVersion', 'language', 'mobileDeviceBranding', 'mobileDeviceInfo', 'mobileDeviceMarketingName', 'mobileDeviceModel', 'mobileInputSelector', 'operatingSystemVersion', 'screenColors', 'screenResolution', 'cityId', 'latitude', 'longitude', 'networkLocation', 'visits', 'criteriaParameters']


In [None]:
# campaignCode is not in the test data
train = train.drop('campaignCode', axis=1)

In [32]:
train.shape

(100000, 35)

## bounce が not null、もしくは hits が '1' の場合は Revenue は必ず 0
よってこれらに該当するサンプルは学習に用いず、予測にも含めない

In [42]:
train_rev_isnotnull = ~train['transactionRevenue'].isnull()
train_bounces_isnotnull = ~train['bounces'].isnull()
pd.crosstab(train_rev_isnotnull, train_bounces_isnotnull)

bounces,False,True
transactionRevenue,Unnamed: 1_level_1,Unnamed: 2_level_1
False,49685,48916
True,1399,0


In [53]:
train_hits_isnotnull = train['hits']=='1'
pd.crosstab(train_rev_isnotnull, train_hits_isnotnull)

hits,False,True
transactionRevenue,Unnamed: 1_level_1,Unnamed: 2_level_1
False,50087,48514
True,1399,0


In [51]:
train.hits[0]

'1'

In [54]:
for col in train.columns:
    if train[col].dtype == 'O':
        train[col] = train[col].astype('category')

In [58]:
is_tobe0 = (train['transactionRevenue'].isnull() | train['hits']=='1')

ValueError: fill value must be in categories

In [55]:
X_train, X_test, y_train, y_test = train_test_split()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 35 columns):
channelGrouping       100000 non-null category
date                  100000 non-null int64
fullVisitorId         100000 non-null category
sessionId             100000 non-null category
visitId               100000 non-null int64
visitNumber           100000 non-null int64
visitStartTime        100000 non-null int64
browser               100000 non-null category
deviceCategory        100000 non-null category
isMobile              100000 non-null bool
operatingSystem       100000 non-null category
city                  100000 non-null category
continent             100000 non-null category
country               100000 non-null category
metro                 100000 non-null category
networkDomain         100000 non-null category
region                100000 non-null category
subContinent          100000 non-null category
bounces               48916 non-null category
hits          