# Extend colmuns

In [1]:
import os
import json
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
import matplotlib.pyplot as plt
%matplotlib inline
import lightgbm as lgb

In [2]:
# dirs
input_dir = '../input/'
working_dir = '../working/'
output_dir = '../output/'

In [3]:
# https://www.kaggle.com/julian3833/1-quick-start-read-csv-and-flatten-json-fields

def load_df(csv_path=os.path.join(input_dir, 'train.csv'), nrows=None):
    JSON_COLUMNS = ['device', 'geoNetwork', 'totals', 'trafficSource']
    
    df = pd.read_csv(csv_path, 
                     converters={column: json.loads for column in JSON_COLUMNS}, 
                     dtype={'fullVisitorId': 'str'}, # Important!!
                     nrows=nrows)
    
    for column in JSON_COLUMNS:
        column_as_df = json_normalize(df[column])
        column_as_df.columns = [f"{column}.{subcolumn}" for subcolumn in column_as_df.columns]
        df = df.drop(column, axis=1).merge(column_as_df, right_index=True, left_index=True)
    print(f"Loaded {os.path.basename(csv_path)}. Shape: {df.shape}")
    
    # shorten col names
    short_col_names = [i.split('.')[-1] for i in df.columns]
    df.columns = short_col_names
    return df

In [4]:
train = load_df(csv_path=os.path.join(input_dir, 'train.csv'))
test = load_df(csv_path=os.path.join(input_dir, 'test.csv'))

Loaded train.csv. Shape: (903653, 55)
Loaded test.csv. Shape: (804684, 53)


## Drop columns

In [6]:
# drop constant columns
# https://www.kaggle.com/sudalairajkumar/simple-exploration-baseline-ga-customer-revenue
const_cols = [c for c in train.columns if train[c].nunique(dropna=False)==1]
print(const_cols)
train = train.drop(const_cols, axis=1)
test = test.drop(const_cols, axis=1)

['socialEngagementType', 'browserSize', 'browserVersion', 'flashVersion', 'language', 'mobileDeviceBranding', 'mobileDeviceInfo', 'mobileDeviceMarketingName', 'mobileDeviceModel', 'mobileInputSelector', 'operatingSystemVersion', 'screenColors', 'screenResolution', 'cityId', 'latitude', 'longitude', 'networkLocation', 'visits', 'criteriaParameters']


In [7]:
# campaignCode is not in the test data
train = train.drop('campaignCode', axis=1)

## dtype encoding

In [9]:
train['transactionRevenue'] = train['transactionRevenue'].astype('float')

In [10]:
# encode data type
dtype_dict = {
    # 'transactionRevenue' is not contained
    'channelGrouping':'category',
    'date':'datetime64',
    'fullVisitorId':'str',
    'sessionId':'str',
    'visitId':'str',
    'visitNumber':'int',
    'visitStartTime':'str',
    'browser':'category',
    'deviceCategory':'category',
    'isMobile':'bool',
    'operatingSystem':'category',
    'city':'category',
    'continent':'category',
    'country':'category',
    'metro':'category',
    'networkDomain':'category',
    'region':'category',
    'subContinent':'category',
    'bounces':'bool',
    'hits':'int',
    'newVisits':'bool',
    'pageviews':'float',
    'adContent':'category',
    'adNetworkType':'category',
    'gclId':'category',
    'isVideoAd':'bool',
    'page':'category',
    'slot':'category',
    'campaign':'category',
    'isTrueDirect':'bool',
    'keyword':'category',
    'medium':'category',
    'referralPath':'category',
    'source':'category'
}

In [11]:
def dtype_encode(df):
    # data type handling
    df['date'] = pd.to_datetime(df['date'], format='%Y%m%d')
    df['bounces'] = df['bounces']==1
    df['newVisits'] = df['newVisits']==1
    df['isVideoAd'] = df['isVideoAd']!=False
    df['page'] = df['page'].astype('category') #I think page number should be category.
    df['isTrueDirect'] = df['isTrueDirect']!=False

    for col_name, dtype in dtype_dict.items():
        df[col_name] = df[col_name].astype(dtype)
        
    return df

train = dtype_encode(train)
test = dtype_encode(test)

In [12]:
train.to_pickle(os.path.join(input_dir, 'train.pkl'))
test.to_pickle(os.path.join(input_dir, 'test.pkl'))

In [13]:
train = pd.read_pickle(os.path.join(input_dir, 'train.pkl'))
test = pd.read_pickle(os.path.join(input_dir, 'test.pkl'))
sample_submission = pd.read_csv(os.path.join(input_dir, 'sample_submission.csv'), dtype={'fullVisitorId': 'str'})

In [11]:
tmp = 'newVisits'
print(train[tmp][:10])
print(test[tmp].unique())

0    False
1    False
2    False
3    False
4    False
5    False
6    False
7    False
8    False
9    False
Name: newVisits, dtype: bool
[False]


In [12]:
train[tmp].unique()[:50]

array([False])

In [13]:
train.shape[0] - (train[tmp].isnull()).sum()

903653

In [14]:
len(train[tmp].unique())

1