# 1. SETTINGS

In [1]:
##### LIBRARIES

import pandas as pd
import numpy as np

from pandas.io.json import json_normalize
import json
from ast import literal_eval

import os
import gc
import warnings

In [2]:
##### CONFIGURATION

# pandas options
pd.set_option("display.max_columns", None)

# ignore warnings
warnings.filterwarnings("ignore")

# garbage collection
gc.enable()

In [None]:
##### MODULES

sys.path.append('../codes')
from preprocessing import *

# 2. IMPORT

In [None]:
# JSON columns
json_cols = ['device', 'geoNetwork', 'totals', 'trafficSource']

# import data
train = read_csv_with_json("../data/train_v2.csv", json_cols = json_cols)
test = read_csv_with_json("../data/test_v2.csv",   json_cols = json_cols)

In [None]:
# check data
train.head()

In [None]:
# check data
test.head()

In [13]:
# drop hits [TEMPORARY]
del train['hits']
del test['hits']

# 3. MERGER

In [14]:
# align columns
train = train.reindex_axis(sorted(train.columns), axis = 1)
test  = test.reindex_axis(sorted(test.columns),   axis = 1)

# delete vars not in test
del train['trafficSource_campaignCode']

# check equality
train.columns == test.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True])

In [15]:
# concatenate
df = pd.concat([train, test], axis = 0)
del train, test
print(df.shape)

(2109926, 58)


# 4. PROCESSING

In [16]:
train.hits[0]

"[{'hitNumber': '1', 'time': '0', 'hour': '17', 'minute': '0', 'isInteraction': True, 'isEntrance': True, 'isExit': True, 'referer': 'https://www.google.co.uk/search?q=water+bottle&ie=utf-8&num=100&oe=utf-8&hl=en&gl=GB&uule=w+CAIQIFISCamRx0IRO1oCEXoliDJDoPjE&glp=1&gws_rd=cr&fg=1', 'page': {'pagePath': '/google+redesign/bags/water+bottles+and+tumblers', 'hostname': 'shop.googlemerchandisestore.com', 'pageTitle': 'Water Bottles & Tumblers | Drinkware | Google Merchandise Store', 'pagePathLevel1': '/google+redesign/', 'pagePathLevel2': '/bags/', 'pagePathLevel3': '/water+bottles+and+tumblers', 'pagePathLevel4': ''}, 'transaction': {'currencyCode': 'USD'}, 'item': {'currencyCode': 'USD'}, 'appInfo': {'screenName': 'shop.googlemerchandisestore.com/google+redesign/bags/water+bottles+and+tumblers', 'landingScreenName': 'shop.googlemerchandisestore.com/google+redesign/bags/water+bottles+and+tumblers', 'exitScreenName': 'shop.googlemerchandisestore.com/google+redesign/bags/water+bottles+and+tum

In [16]:
# unfold custom dimensions
print(df.shape)
df = add_custom_dim(df)
print(df.shape)

(2109926, 58)
(2109926, 59)


In [17]:
# fill missings
df = fill_na(df)

In [18]:
# convert to integers
to_int = ['customDimensions_index',
          'totals_bounces',
          'totals_newVisits',
          'totals_pageviews',
          'totals_hits',
          'totals_sessionQualityDim',
          'totals_visits',
          'totals_timeOnSite',
          'trafficSource_adwordsClickInfo.page',
          'totals_transactions',
          'totals_transactionRevenue',
          'totals_totalTransactionRevenue']
for col in to_int :
    df[col] = df[col].astype('int64')

In [19]:
# convert date
df['date'] = pd.to_datetime(df['date'].astype('str'), infer_datetime_format = True)

In [20]:
# remove columns with a single value
print(df.shape)
df = df.loc[:, df.nunique(dropna = False) != 1]
print(df.shape)

(2109926, 59)
(2109926, 40)


In [21]:
# convert boolean to objects
bools = ['device_isMobile', 'trafficSource_adwordsClickInfo.isVideoAd', 'trafficSource_isTrueDirect']
for var in bools:
    df[var] = df[var].astype('object')

In [22]:
# check data types
df.dtypes

channelGrouping                                         object
date                                            datetime64[ns]
device_browser                                          object
device_deviceCategory                                   object
device_isMobile                                         object
device_operatingSystem                                  object
fullVisitorId                                           object
geoNetwork_city                                         object
geoNetwork_continent                                    object
geoNetwork_country                                      object
geoNetwork_metro                                        object
geoNetwork_networkDomain                                object
geoNetwork_region                                       object
geoNetwork_subContinent                                 object
totals_bounces                                           int64
totals_hits                                            

# 5. EXPORT

In [23]:
# export CSV
df.to_csv("../data/data_v1.csv.gz", index = False, compression = "gzip")
df.shape

(2109926, 40)