# 1. SETTINGS

In [None]:
import pandas as pd
import numpy as np

from pandas.io.json import json_normalize
import json
from ast import literal_eval

import os
import getpass

In [None]:
# pandas options: do not truncate the column list when a frame is displayed
pd.options.display.max_columns = None

In [None]:
# ignore warnings: install a catch-all "ignore" filter for the whole session
import warnings
warnings.simplefilter("ignore")

In [None]:
# garbage collection: make sure the automatic collector is switched on
import gc
if not gc.isenabled():
    gc.enable()

# 2. FUNCTIONS

In [None]:
##### FUNCTION 1A: LOADING ZIPPED DATA WITH JSON
def read_csv_with_json_zipped(path, json_cols, nrows = None):
    """Load a zip-compressed CSV file and flatten its JSON columns.

    Every column listed in ``json_cols`` is parsed with ``json.loads`` while
    the file is read, expanded with ``json_normalize`` and replaced by one
    column per JSON key named ``<column>_<key>``. ``fullVisitorId`` is read
    as a string.

    Parameters
    ----------
    path : str
        Location of the zipped CSV file.
    json_cols : iterable of str
        Names of the columns that hold JSON documents.
    nrows : int, optional
        Number of rows to read; ``None`` (default) reads the whole file.

    Returns
    -------
    pandas.DataFrame
        The non-JSON columns in file order, followed by the flattened
        columns of each JSON column in the order given by ``json_cols``.
    """
    json_cols = list(json_cols)

    # pandas >= 1.0 exposes json_normalize at top level and deprecates the
    # pandas.io.json alias; fall back to the file-level import on older pandas
    normalize = getattr(pd, "json_normalize", None) or json_normalize

    # import data frame
    df = pd.read_csv(path,
                     converters = {column: json.loads for column in json_cols},
                     compression = 'zip',
                     dtype = {'fullVisitorId': 'str'},
                     nrows = nrows)

    # extract values: flatten each JSON column and attach all of them with a
    # single concat instead of re-merging the growing frame once per column
    pieces = [df.drop(json_cols, axis = 1)]
    for column in json_cols:
        column_as_df = normalize(df[column].tolist())
        column_as_df.columns = [f"{column}_{subcolumn}" for subcolumn in column_as_df.columns]
        # the records are in row order, so line them up with df explicitly
        column_as_df.index = df.index
        pieces.append(column_as_df)
    df = pd.concat(pieces, axis = 1)

    # return data
    print(f"Loaded {os.path.basename(path)}: {df.shape}")
    return df

In [None]:
##### FUNCTION 1B: LOADING DATA WITH JSON
def read_csv_with_json(path, json_cols, nrows = None):
    """Load a CSV file and flatten its JSON columns.

    Every column listed in ``json_cols`` is parsed with ``json.loads`` while
    the file is read, expanded with ``json_normalize`` and replaced by one
    column per JSON key named ``<column>_<key>``. ``fullVisitorId`` is read
    as a string.

    Parameters
    ----------
    path : str
        Location of the CSV file.
    json_cols : iterable of str
        Names of the columns that hold JSON documents.
    nrows : int, optional
        Number of rows to read; ``None`` (default) reads the whole file.

    Returns
    -------
    pandas.DataFrame
        The non-JSON columns in file order, followed by the flattened
        columns of each JSON column in the order given by ``json_cols``.
    """
    json_cols = list(json_cols)

    # pandas >= 1.0 exposes json_normalize at top level and deprecates the
    # pandas.io.json alias; fall back to the file-level import on older pandas
    normalize = getattr(pd, "json_normalize", None) or json_normalize

    # import data frame
    df = pd.read_csv(path,
                     converters = {column: json.loads for column in json_cols},
                     dtype = {'fullVisitorId': 'str'},
                     nrows = nrows)

    # extract values: flatten each JSON column and attach all of them with a
    # single concat instead of re-merging the growing frame once per column
    pieces = [df.drop(json_cols, axis = 1)]
    for column in json_cols:
        column_as_df = normalize(df[column].tolist())
        column_as_df.columns = [f"{column}_{subcolumn}" for subcolumn in column_as_df.columns]
        # the records are in row order, so line them up with df explicitly
        column_as_df.index = df.index
        pieces.append(column_as_df)
    df = pd.concat(pieces, axis = 1)

    # return data
    print(f"Loaded {os.path.basename(path)}: {df.shape}")
    return df

In [None]:
##### FUNCTION 2: UNFOLD CUSTOM DIMENSIONS
def add_custom_dim(df):
    """Replace ``customDimensions`` by one column per key of its first entry.

    Each cell of ``customDimensions`` is parsed with ``literal_eval``; the
    first element of the parsed list is kept and an empty list is treated as
    ``{'index': NaN, 'value': NaN}``. The kept dictionaries are flattened
    with ``json_normalize`` into ``customDimensions_<key>`` columns.

    The new columns are attached by row position, so the result is correct
    even when ``df`` carries duplicate index labels (for example right after
    ``pd.concat([train, test])``). ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``customDimensions`` column of list literals as text.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without ``customDimensions`` and with the unfolded
        columns appended; the index of ``df`` is kept.
    """
    # pandas >= 1.0 exposes json_normalize at top level and deprecates the
    # pandas.io.json alias; fall back to the file-level import on older pandas
    normalize = getattr(pd, "json_normalize", None) or json_normalize

    # np.nan rather than np.NaN: the capitalised alias is gone in NumPy 2.0
    missing = {'index': np.nan, 'value': np.nan}

    def first_entry(raw):
        # keep the first element of the parsed list, or the placeholder
        entries = literal_eval(raw)
        if not entries or entries[0] is None:
            return missing
        return entries[0]

    # extract custom dimensions
    records = [first_entry(raw) for raw in df['customDimensions']]
    column_as_df = normalize(records)

    # attach by position: the normalized frame is numbered 0..n-1, which
    # need not match the labels of df, so an index-based merge is unsafe
    out = df.drop('customDimensions', axis = 1)
    for subcolumn in column_as_df.columns:
        out[f"customDimensions_{subcolumn}"] = column_as_df[subcolumn].values

    return out

In [None]:
##### FUNCTION 3: FILL NA
def fill_na(df):
    """Impute missing values and unify "not available" placeholder levels.

    Missing values are filled per column group with 'NA', 0, True or False.
    Afterwards the placeholder levels listed in ``cols_to_replace`` are
    replaced by the string 'NA'. Columns named below that are absent from
    ``df`` are skipped in both steps, and ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns as ``df`` and the imputations and
        replacements applied.
    """

    ##### IMPUTE NA DIFFERENTLY

    # NA = unknown
    to_NA_cols = ['trafficSource_adContent',
                  'trafficSource_adwordsClickInfo.adNetworkType',
                  'trafficSource_adwordsClickInfo.slot',
                  'trafficSource_adwordsClickInfo.gclId',
                  'trafficSource_keyword',
                  'trafficSource_referralPath',
                  'customDimensions_value']

    # NA = zero
    to_0_cols = ['totals_transactionRevenue',
                 'trafficSource_adwordsClickInfo.page',
                 'totals_sessionQualityDim','totals_bounces',
                 'totals_timeOnSite',
                 'totals_newVisits',
                 'totals_pageviews',
                 'customDimensions_index',
                 'totals_transactions',
                 'totals_totalTransactionRevenue']

    # NA = TRUE / FALSE
    to_true_cols  = ['trafficSource_adwordsClickInfo.isVideoAd']
    to_false_cols = ['trafficSource_isTrueDirect']

    # impute missings: a dict-valued fillna only touches the columns that
    # exist and returns a new frame, so the caller's data stays untouched
    fill_values = {}
    fill_values.update(dict.fromkeys(to_NA_cols, 'NA'))
    fill_values.update(dict.fromkeys(to_0_cols, 0))
    fill_values.update(dict.fromkeys(to_true_cols, True))
    fill_values.update(dict.fromkeys(to_false_cols, False))
    df = df.fillna(fill_values)



    ##### REPLACE SOME LEVELS WITH NA

    # not available, not provided, etc.
    cols_to_replace = {
        'socialEngagementType' : 'Not Socially Engaged',
        'device_browserSize' : 'not available in demo dataset', 
        'device_flashVersion' : 'not available in demo dataset', 
        'device_browserVersion' : 'not available in demo dataset', 
        'device_language' : 'not available in demo dataset',
        'device_mobileDeviceBranding' : 'not available in demo dataset',
        'device_mobileDeviceInfo' : 'not available in demo dataset',
        'device_mobileDeviceMarketingName' : 'not available in demo dataset',
        'device_mobileDeviceModel' : 'not available in demo dataset',
        'device_mobileInputSelector' : 'not available in demo dataset',
        'device_operatingSystemVersion' : 'not available in demo dataset',
        'device_screenColors' : 'not available in demo dataset',
        'device_screenResolution' : 'not available in demo dataset',
        'geoNetwork_city' : 'not available in demo dataset',
        'geoNetwork_cityId' : 'not available in demo dataset',
        'geoNetwork_latitude' : 'not available in demo dataset',
        'geoNetwork_longitude' : 'not available in demo dataset',
        'geoNetwork_metro' : ['not available in demo dataset', '(not set)'], 
        'geoNetwork_networkDomain' : ['unknown.unknown', '(not set)'], 
        'geoNetwork_networkLocation' : 'not available in demo dataset',
        'geoNetwork_region' : 'not available in demo dataset',
        'trafficSource_adwordsClickInfo.criteriaParameters' : 'not available in demo dataset',
        'trafficSource_campaign' : '(not set)', 
        'trafficSource_keyword' : ['(not provided)', '(not set)'], 
        'networkDomain': '(not set)', 
        'city': '(not set)'
    }
    df = df.replace(cols_to_replace,'NA')

    return df

# 3. IMPORT

In [None]:
# names of the columns that store JSON documents
json_cols = ['device', 'geoNetwork', 'totals', 'trafficSource']

# load the data: every account except 'zinovyee.hub' reads only the first
# 100 rows of the plain CSV files, that account reads the zipped files in full
if getpass.getuser() != 'zinovyee.hub':
    train = read_csv_with_json("../data/train_v2.csv", json_cols = json_cols, nrows = 100)
    test  = read_csv_with_json("../data/test_v2.csv",  json_cols = json_cols, nrows = 100)
else:
    train = read_csv_with_json_zipped("../../train_v2.csv.zip", json_cols = json_cols)
    test  = read_csv_with_json_zipped("../../test_v2.csv.zip",  json_cols = json_cols)

In [None]:
# preview the first five rows of the training data
train.head(5)

In [None]:
# preview the first five rows of the test data
test.head(5)

In [None]:
# drop hits [TEMPORARY]
# Both branches of the former user check removed the column in place, so a
# single code path does the same job on every machine.
del train['hits']
del test['hits']

# 4. MERGER

In [None]:
# align columns (DataFrame.reindex_axis was removed in pandas 1.0;
# reindex(columns=...) is the supported equivalent)
train = train.reindex(columns = sorted(train.columns))
test  = test.reindex(columns = sorted(test.columns))

# delete vars not in test; errors = 'ignore' keeps this step from failing
# when the loaded training rows never produced the column
train = train.drop('trafficSource_campaignCode', axis = 1, errors = 'ignore')

# check equality: lists the columns found in only one of the two frames,
# an empty result means the columns are identical
train.columns.symmetric_difference(test.columns)

In [None]:
# concatenate; ignore_index renumbers the rows 0..n-1 so that train and test
# rows do not share index labels (later steps join new columns by index)
df = pd.concat([train, test], axis = 0, ignore_index = True)
del train, test
print(df.shape)

# 5. PROCESSING

In [None]:
# unfold custom dimensions, printing the shape before and after
print(df.shape)
df = df.pipe(add_custom_dim)
print(df.shape)

In [None]:
# impute missing values and unify placeholder levels
df = df.pipe(fill_na)

In [None]:
# convert to integers
to_int = ['customDimensions_index',
          'totals_bounces',
          'totals_newVisits',
          'totals_pageviews',
          'totals_hits',
          'totals_sessionQualityDim',
          'totals_visits',
          'totals_timeOnSite',
          'trafficSource_adwordsClickInfo.page',
          'totals_transactions',
          'totals_transactionRevenue',
          'totals_totalTransactionRevenue']

# a column exists only if its JSON key occurred in the loaded rows, so report
# the absent ones instead of stopping with a KeyError on the first of them
missing_int = [col for col in to_int if col not in df.columns]
if missing_int:
    print(f"Skipped missing columns: {missing_int}")

for col in to_int:
    if col in df.columns:
        df[col] = df[col].astype('int64')

In [None]:
# convert date: cast to text first, then let pandas infer the date format
df['date'] = df['date'].astype('str').pipe(pd.to_datetime, infer_datetime_format = True)

In [None]:
# drop every column that holds exactly one distinct value
# (NaN counts as a value because of dropna = False)
print(df.shape)
df = df.drop(df.columns[df.nunique(dropna = False) == 1], axis = 1)
print(df.shape)

In [None]:
# store the boolean flags as generic objects
bools = ['device_isMobile', 'trafficSource_adwordsClickInfo.isVideoAd', 'trafficSource_isTrueDirect']
df = df.astype({var: 'object' for var in bools})

In [None]:
# check data types: display the dtype of every column after the conversions
df.dtypes

# 6. EXPORT

In [None]:
# export CSV: gzip-compressed and without the row index, then show the shape
df.to_csv(
    "../data/data_v1.csv.gz",
    compression = "gzip",
    index = False,
)
df.shape

In [None]:
# display the version of pandas that ran this notebook
import pandas as pd
pd.__version__