In [None]:
#!pip install tensorflow==1.12
import google.datalab.bigquery as bq
import matplotlib.pyplot as plot
import numpy as np
import pandas as pd
import datetime
import math
from pandas.io import gbq
import tensorflow as tf
import re
from tensorflow import keras
import seaborn as sns

In [None]:
query = """#standardSQL
CREATE TEMP FUNCTION
  customDimensionByIndex(indx INT64,
    arr ARRAY<STRUCT<index INT64,
    value STRING>>) AS ( (
    SELECT
      x.value
    FROM
      UNNEST(arr) x
    WHERE
      indx=x.index) );
SELECT
  fullVisitorId,
  visitId,
  EXTRACT(HOUR from TIMESTAMP_SECONDS(visitStartTime)) AS VisitingHour,
  EXTRACT(DAYOFWEEK from TIMESTAMP_SECONDS(visitStartTime)) AS VisitingDayOfWeek,
  trafficSource.source, 
  device.browser,
  device.operatingSystem,
  device.LANGUAGE,
  device.deviceCategory,
  
 ---session scope custom dimension---
  customDimensionByIndex(3,
    t.customDimensions) AS contentCategory, 
  ---User scope custom dimension---
  EXTRACT(YEAR FROM CURRENT_DATE())- CAST(SUBSTR(customDimensionByIndex(6,t.customDimensions),1,4) AS NUMERIC) AS userAge

FROM
  `tencent-ga-bigquery-217708.18845258.ga_sessions_*`t
WHERE
  _TABLE_SUFFIX BETWEEN '20181001' AND '20181201' 
    --filter out rows that gender is null
   AND customDimensionByIndex(6,t.customDimensions) IS NOT NULL
   AND(
    --filter for correct syntax of birthdays 
    REGEXP_CONTAINS(customDimensionByIndex(6,t.customDimensions),r'^\\d{8}$')
    OR REGEXP_CONTAINS(customDimensionByIndex(6,t.customDimensions),r'^\\d{4}-\\d{2}-\\d{2}$')
  ) 
  
limit 1000000
"""

data_original = gbq.read_gbq(query,project_id = "tencent-ga-bigquery-217708" )

In [None]:
data = data_original.copy()
print(data.columns)

In [None]:
plot.hist(data.userAge.astype('int'), bins = range(5,75,5))

In [None]:
# remove rows with null values
data.dropna(axis=0, inplace=True)
# remove rows where userAge=5 or userAge>=70
data = data[(data.userAge>=5) & (data.userAge<70)]

data = data.drop(['visitId','VisitingHour','VisitingDayOfWeek'],axis=1)

# prepare column browser, change browers that are not specified below to 'other'
data.loc[(data.browser != 'Chrome') &
         (data.browser != 'Safari') &
         (data.browser != 'Firefox') &
         (data.browser != 'Samsung Internet') &
         (data.browser != 'Android Webview') &
         (data.browser != 'Edge') &
         (data.browser !='Internet Explorer'),'browser'] = 'other'

# prepare column language
data.rename(columns = {'LANGUAGE':'language'}, inplace=True)
data.loc[data.language.str.contains('en'),'language'] = 'en'  #data has inconsistent values e.g. 'en','en-en','en-bg'
data.loc[data.language.str.contains('th'),'language'] = 'th'
data.language.str.contains('en|th')
data.loc[(data.language!='th') & 
         (data.language!='en'),'language'] = 'other'

# prepare column operatingSystem
data.loc[(data.operatingSystem != 'Windows') &
         (data.operatingSystem != 'Android') &
         (data.operatingSystem != 'Macintosh') &
         (data.operatingSystem !='iOS'),'operatingSystem'] = 'other'

# prepare column source 
data.loc[data.source.str.contains('facebook'),'source'] = 'facebook' #data has inconsistent value such as 'facebook','facebook.com'
data.loc[data.source.str.contains('google'),'source'] = 'google'
data.loc[data.source.str.contains('sanook'),'source'] = 'sanook'
data.loc[data.source.str.contains('direct'),'source'] = 'direct'
data.loc[(data.source != 'facebook') &
         (data.source != 'google') &
         (data.source != 'sanook') &
         (data.source !='direct'),'source'] = 'other'

# remove rows where contentCategory contains non-english characters
data = data[data.contentCategory.str.contains('^[A-Za-z]+$', regex=True)]
data.loc[:,'contentCategory'] = data.contentCategory.str.lower()
print(data.shape)

In [None]:
def make_pivot_table(columns):
    result =None
    for column in columns:
        pivot = data.loc[:,['fullVisitorId',column]]
        pivot = pivot.pivot_table(index='fullVisitorId',columns=column,aggfunc= any)
        pivot.columns = ['_'.join((column,i)) for i  in pivot.columns]
        if result is None:
          result = pivot
        else:
          result = result.join(pivot)
        
    result = result.replace({
      True:1,
      None:0
    })
    return result

age = data.loc[:,['fullVisitorId','userAge']].drop_duplicates().set_index('fullVisitorId')    
data = make_pivot_table(data.columns[1:-1])
data = data.join(age)

In [None]:
#remove columns with less than 0.1% of 1s  
#data = data[data.columns[data.sum()>data.shape[0]*0.001]]


In [None]:
############## without bins  #################

train_dataset = data.sample(frac=0.8,random_state=0).astype('float64')   #############
test_dataset = data.drop(train_dataset.index).astype('float64')

train_labels = train_dataset['userAge']
train_dataset = train_dataset.drop('userAge',axis = 1)
test_labels = test_dataset['userAge']
test_dataset = test_dataset.drop('userAge',axis = 1)

#covnert dataframe to ndarrays for tensorflow input 
train_dataset = train_dataset.values
train_labels = train_labels.values
test_dataset = test_dataset.values
test_labels = test_labels.values

In [44]:
  # regression, for age before binning 
    
model1 = keras.Sequential([
    keras.layers.Dense(16,activation = tf.nn.relu, input_shape = (train_dataset.shape[1],) ),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(16,activation = tf.nn.relu),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(1)   
  ])
  
  
model1.compile(loss= 'mse',  # mean square error
               optimizer = tf.train.RMSPropOptimizer(0.001),
               metrics=['mae'])
# automatically stop epochs when val_loss stops decreasing
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss',patience =20)
history1 = model1.fit(train_dataset,train_labels, epochs = 20, batch_size = 1000,validation_split = 0.2, verbose=1, 
                              callbacks = [early_stop])



Train on 288096 samples, validate on 72024 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [45]:
model2 = keras.Sequential([
    keras.layers.Dense(256,activation = tf.nn.relu, input_shape = (train_dataset.shape[1],) ),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(128,activation = tf.nn.relu),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(1)   
  ])
  
  
model2.compile(loss= 'mse',  # mean square error
               optimizer = tf.train.RMSPropOptimizer(0.001),
               metrics=['mae'])
# automatically stop epochs when val_loss stops decreasing
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss',patience =20)
histor2 = model2.fit(train_dataset,train_labels, epochs = 20, batch_size = 1000,validation_split = 0.2, verbose=1, 
                              callbacks = [early_stop])


Train on 288096 samples, validate on 72024 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [46]:
def plot_history(histories, key='mean_absolute_error'):
  plot.figure(figsize=(16,10))
  for name, history in histories:
    val = plot.plot(history.epoch, history.history['val_'+key],
                   '--', label=name.title()+' Validation')
    plot.plot(history.epoch, history.history[key], color=val[0].get_color(),
             label=name.title()+' Train')

  plot.xlabel('Epochs')
  plot.ylabel(key.replace('_',' ').title())
  plot.legend()
  plot.xlim([0,max(history.epoch)])
  
  
plot_history([('model 1',history1),('model 2',history2)])
plot.title('comparison of different models')

NameError: name 'history2' is not defined

In [None]:
test_predictions = model2.predict(test_dataset)
plot.scatter(test_labels, test_predictions)
plot.plot(range(10,60), range(10,60), 'r')
plot.xlabel('testset - actual age')
plot.ylabel('testset - predicted age')
plot.title('actual vs prediction')


In [None]:
############## data with age binned ##############
# put age into different bins, 5 years per bin, from 5 to 70, total 13 bins 
# [5-10) => 0
# [10-15) => 1
# [65-70) => 12
data['userAgeBins'] = ((1+data.userAge)/5 ).apply(math.ceil)-2

data.drop('userAge',axis =1)
train_dataset_bin = data.sample(frac=0.8,random_state=0).astype('float64')   #############
test_dataset_bin = data.drop(train_dataset_bin.index).astype('float64')

train_labels_bin = train_dataset_bin['userAgeBins']
train_dataset_bin = train_dataset_bin.drop('userAgeBins',axis = 1)
test_labels_bin = test_dataset_bin['userAgeBins']
test_dataset_bin = test_dataset_bin.drop('userAgeBins',axis = 1)

#covnert dataframe to ndarrays for tensorflow input 
train_dataset_bin = train_dataset_bin.values
train_labels_bin = train_labels_bin.values
test_dataset_bin = test_dataset_bin.values
test_labels_bin = test_labels_bin.values

In [None]:
plot.figure()
plot.hist(data.userAgeBins, bins = range(0,13))
plot.figure()
plot.hist(data.userAge.astype('int'), bins = range(5,75,5))

In [None]:
model_bin = keras.Sequential([
    keras.layers.Dense(64, activation=tf.nn.relu, input_shape = (train_dataset_bin.shape[1],)),
    keras.layers.Dense(64, activation=tf.nn.relu),
    keras.layers.Dense(13, activation=tf.nn.softmax)
])

model_bin.compile(optimizer=tf.train.AdamOptimizer(), 
              loss= 'sparse_categorical_crossentropy',
              metrics=['accuracy'])

history_bin = model_bin.fit(train_dataset_bin,train_labels_bin, epochs =30, batch_size = 1000 ,validation_split=0.1, verbose=0)
results = model_bin.evaluate(test_dataset_bin, test_labels_bin)
print(results)

In [None]:
plot.figure(figsize = (20,10))
plot.xlabel('Epoch')
plot.ylabel('accuracy')
plot.plot(history_bin.epoch, history_bin.history['acc'],'g',label='Train accuracy')
plot.plot(history_bin.epoch, history_bin.history['val_acc'], 'g--',label='Val accuracy')
plot.legend()
plot.title('accuracy for the model with user age binned')

In [None]:

test_predictions_bin = model_bin.predict(test_dataset_bin)
test_predictions_bin = pd.DataFrame(test_predictions_bin).apply(np.argmax, axis =1)


In [None]:
plot.subplot(2,1,1)
plot.hist(test_predictions_bin)
plot.title('predcition')

plot.subplot(2,1,2)
plot.hist(test_labels_bin)
plot.title('test data')

In [None]:
# without jitter, lots of overlap 
plot.figure()
plot.scatter(test_labels_bin, test_predictions_bin)
plot.plot(range(0,13), range(0,13), 'r')
plot.xlabel('testset - actual age (bin)')
plot.ylabel('testset - predicted age (bin)')
plot.title('actual vs prediction (bin, without jitter)')
plot.xticks(np.arange(0,13,1))
plot.yticks(np.arange(0,13,1))


# manually add jitter 
plot.figure()
test_labels_bin_jitter = test_labels_bin + np.random.rand(test_labels_bin.shape[0])
test_predictions_bin_jitter = test_predictions_bin + np.random.rand(test_predictions_bin.shape[0])
plot.scatter(test_labels_bin_jitter, test_predictions_bin_jitter)
plot.plot(range(0,13), range(0,13), 'r')
plot.xlabel('testset - actual age (bin)')
plot.ylabel('testset - predicted age (bin)')
plot.title('actual vs prediction (bin, with jitter)')
plot.xticks(np.arange(0,13,1))
plot.yticks(np.arange(0,13,1))

In [None]:
np.random.rand(test_labels_bin.shape[0])*0.1
# test_labels_bin
plot.scatter(test_labels_bin + np.random.rand(test_labels_bin.shape[0])*1, test_predictions_bin+np.random.rand(test_labels_bin.shape[0])*1)

In [None]:
print('train_dataset has the shape:',train_dataset.shape)
print('train_labels has the shape:',train_labels.shape)