# Preprocessing Google Analytics Data

# Imports

In [0]:
!pip install markovclick

Collecting markovclick
  Downloading https://files.pythonhosted.org/packages/1a/ff/8be808d320bed494f310aaed44406df1502e306724b2d19d059cf01cf363/markovclick-0.1.1-py3-none-any.whl
Installing collected packages: markovclick
Successfully installed markovclick-0.1.1


In [0]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
import os
import pandas as pd
import dask

auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
import ast
import json

#Visualization
import matplotlib.pyplot as plt

#plt.style.use('seaborn-white')
#plt.style.use('fivethirtyeight')
%matplotlib inline

In [0]:
import sklearn as skl
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
import nltk
from gensim import corpora

import markovclick
import seaborn as sns

In [0]:
pd.set_option('display.max_colwidth', -1)

## Import Data (Seperate Tables)

In [0]:
id = '1mGi2ZF8OKq4pSFcHU0IVpnXeBXSZTKOo'

### Flatten JSON Columns
(totals, trafficSource, device, geoNetwordk, customDimensions, hits)

In [0]:
cols_to_parse = ['device', 'geoNetwork', 'totals', 'trafficSource', 'hits',
               'customDimensions']
  
other_cols = ['visitorId', 'visitNumber', 'visitStartTime','date','userId',
                'channelGrouping', 'socialEngagementType']
downloaded = drive.CreateFile({'id':id}) 
downloaded.GetContentFile('Filename.csv')  
   
  #read the data file, convert the columns in the list of columns to parse using json loader,
  #convert the `fullVisitorId` field as a string
data_df = pd.read_csv('Filename.csv', converters={column: ast.literal_eval for column in cols_to_parse}, dtype={'fullVisitorId': 'str'})

### Get Hits DF

In [0]:
hits_df = pd.DataFrame(data_df['hits'])

In [0]:
### Helper function to get paths per session

def get_paths(mylist):
  path = []
  for element in mylist:
      try: 
        path.append(element['page']['pageTitle'])
      except:
        pass
  return path

In [0]:
hits_df['paths'] = hits_df.hits.apply(get_paths)

In [0]:
#list of unique pages
pages = list(set(hits_df.paths.sum()))

#### Page Value

In [0]:
hits_df.hits[0]

[{'appInfo': {'appId': None,
   'appInstallerId': None,
   'appName': None,
   'appVersion': None,
   'exitScreenName': 'shop.googlemerchandisestore.com/signin.html',
   'id': None,
   'installerId': None,
   'landingScreenName': 'shop.googlemerchandisestore.com/home',
   'name': None,
   'screenDepth': '0',
   'screenName': 'shop.googlemerchandisestore.com/home',
   'version': None},
  'contentGroup': {'contentGroup1': '(not set)',
   'contentGroup2': '(not set)',
   'contentGroup3': '(not set)',
   'contentGroup4': '(not set)',
   'contentGroup5': '(not set)',
   'contentGroupUniqueViews1': None,
   'contentGroupUniqueViews2': None,
   'contentGroupUniqueViews3': None,
   'contentGroupUniqueViews4': None,
   'contentGroupUniqueViews5': None,
   'previousContentGroup1': '(entrance)',
   'previousContentGroup2': '(entrance)',
   'previousContentGroup3': '(entrance)',
   'previousContentGroup4': '(entrance)',
   'previousContentGroup5': '(entrance)'},
  'contentInfo': None,
  'customDim

### Encode Paths

In [0]:
#helper functions to encode paths
def encode_paths(list_of_pages):
  
  le = skl.preprocessing.LabelEncoder()
  le.fit(list_of_pages)
  
  encoded_paths =hits_df['paths'].apply(lambda x: le.transform(x))

  #encoding_dict = dict(zip(le.transform(le.classes_),le.classes_))
  
  return encoded_paths

In [0]:
hits_df['paths_encoded']= encode_paths(pages)

In [0]:
#paths_df = hits_df[['paths','paths_encoded']]
paths_df['paths'] = hits_df['paths']

In [0]:
type(paths_df)

pandas.core.series.Series

### Make Pairs (Bigrams)

In [0]:
def get_bigrams(mylist):
  bigrams = nltk.bigrams(mylist)
  return list(bigrams)
 

In [0]:
hits_df['path_links'] = hits_df.paths.apply(get_bigrams)

In [0]:
paths_df = hits_df[['paths','path_links']]

In [0]:
paths_df.head(1)

Unnamed: 0,paths,path_links
0,"[Home, Gift Cards, The Google Merchandise Store - Log In]","[(Home, Gift Cards), (Gift Cards, The Google Merchandise Store - Log In)]"


In [0]:
pairs_bigrams = paths_df.path_links.sum()

In [0]:
len(pairs_bigrams)

11295

In [0]:
pairs = pd.DataFrame(pairs_bigrams, columns=['state1', 'state2'])
counts = pairs.groupby('state1')['state2'].value_counts()
probs = (counts / counts.sum()).unstack()

In [0]:
probs = (pairs_gb / pairs_gb.sum()).unstack()

In [0]:
probs.head()

state2,20 oz Stainless Steel Insulated Tumbler,26 oz Double Wall Insulated Bottle,Accessories,Android,Android Glass Water Bottle with Black Sleeve,Android Men's Vintage Henley,Android Men's Vintage Tee,Android Men's Zip Hoodie,Android Rise 14 oz Mug,Android Toddler Short Sleeve T-shirt Aqua,...,Yoga Mat,YouTube,YouTube Men's 3/4 Sleeve Henley,YouTube Men's Short Sleeve Hero Tee Black,YouTube Men's Short Sleeve Hero Tee White,YouTube Men's Vintage Tank,YouTube Men's Vintage Tee,YouTube Trucker Hat,Your Wishlist,Дома
state1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20 oz Stainless Steel Insulated Tumbler,,,,,,,,,,,...,,,,,,,,,,
26 oz Double Wall Insulated Bottle,,,,,8.9e-05,,,,,,...,,,,,,,,,,
Accessories,,,0.002479,,,,,,,,...,,,,,,,,,,
Android,,,,0.003807,,8.9e-05,8.9e-05,0.000177,,,...,,0.000266,,,,,,,,
Android Glass Water Bottle with Black Sleeve,,,,,8.9e-05,,,,,,...,,,,,,,,,,


In [0]:
page_id = list(range(0,176))

In [0]:
page_names = probs.index

In [0]:
id2page_dict = dict(zip(page_names, page_id))

In [0]:
id2page_dict;

In [0]:
transition_df = probs

In [0]:
#probs = probs.set_index([page_id,page_names])

In [0]:
actions_prob_dict = probs.apply(lambda x: x.dropna().to_dict(),axis=1)

In [0]:
actions_prob_dict

state1
20 oz Stainless Steel Insulated Tumbler                               {'Drinkware': 8.853474988933157e-05, 'Electronics': 8.853474988933157e-05}                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 

In [0]:
probs.index

MultiIndex(levels=[['20 oz Stainless Steel Insulated Tumbler', '26 oz Double Wall Insulated Bottle', 'Accessories', 'Android', 'Android Glass Water Bottle with Black Sleeve', 'Android Men's Vintage Henley', 'Android Men's Vintage Tee', 'Android Men's Zip Hoodie', 'Android Rise 14 oz Mug', 'Android Toddler Short Sleeve T-shirt Pewter', 'Android Toddler Short Sleeve T-shirt Pink', 'Android Women's Fleece Hoodie', 'Android Women's Short Sleeve Hero Tee Black', 'Android Youth Short Sleeve T-shirt Pewter', 'Apparel', 'Backpacks', 'Badge Holder', 'Bags', 'Bic Intensity Clic Gel Pen', 'Bic Leather Pen', 'Brands', 'Checkout Confirmation', 'Checkout Review', 'Checkout Your Information', 'Color Changing Grip Pen', 'Compact Eco Journal', 'Drinkware', 'Electronics', 'Frequently Asked Questions', 'Fun', 'Gift Cards', 'Google', 'Google Baby Essentials Set', 'Google Electronics Accessory Pouch', 'Google Heavyweight Long Sleeve Hero Tee Burgundy', 'Google Heavyweight Long Sleeve Hero Tee Navy', 'Googl

In [0]:
pages_transition_matrix = probs.columns

In [0]:
transition_matrix = probs.values

In [0]:
import pickle

In [0]:
f=open('actions_prob_dict.pkl','wb')

In [0]:
pickle.dump(actions_prob_dict, f)

In [0]:
f.close()

In [0]:
from google.colab import files
files.download('actions_prob_dict.pkl') 

In [0]:
possible_actions_dict = pairs.groupby('state1')['state2'].apply(list).to_dict()

SyntaxError: ignored

In [0]:
pickle_file(possible_actions_dict)

AttributeError: ignored

In [0]:
def get_data_d3(pairs_bigrams):
  links_dict = []
  for bigram in range(len(pairs_bigrams)):
    d = {}
    d['source'] = pairs_bigrams[bigram][0]
    d['target'] = pairs_bigrams[bigram][1]
    
    links_dict.append(d)
    
  return links_dict

In [0]:
links_dictionary = get_data_d3(pairs_bigrams)

In [0]:
print(links_dictionary)

[{'source': 88, 'target': 31}, {'source': 31, 'target': 118}, {'source': 88, 'target': 102}, {'source': 102, 'target': 104}, {'source': 88, 'target': 88}, {'source': 88, 'target': 15}, {'source': 15, 'target': 88}, {'source': 88, 'target': 15}, {'source': 15, 'target': 88}, {'source': 88, 'target': 88}, {'source': 88, 'target': 88}, {'source': 88, 'target': 16}, {'source': 16, 'target': 16}, {'source': 88, 'target': 88}, {'source': 88, 'target': 174}, {'source': 174, 'target': 88}, {'source': 88, 'target': 27}, {'source': 27, 'target': 28}, {'source': 28, 'target': 102}, {'source': 88, 'target': 88}, {'source': 88, 'target': 104}, {'source': 104, 'target': 28}, {'source': 28, 'target': 174}, {'source': 88, 'target': 27}, {'source': 27, 'target': 109}, {'source': 109, 'target': 31}, {'source': 31, 'target': 29}, {'source': 88, 'target': 28}, {'source': 28, 'target': 18}, {'source': 18, 'target': 18}, {'source': 18, 'target': 18}, {'source': 88, 'target': 88}, {'source': 88, 'target': 16

In [0]:
def clean_keys(paths_dict):
  for dict in paths_dict:
    for k,v in dict.items():
      k = k.replace("'",'')
    
  return paths_dict

In [0]:
links_dictionary

AttributeError: ignored

## Transition Matrix

In [0]:
#helper function to format label string

def get_label(list):
  return ["P{}".format(word) for word in list]

In [0]:
def format_paths(encoded_paths_series):
  path_labels = encoded_paths_series.apply(lambda x: x.astype('str').tolist())
  path_labels = path_labels.apply(get_label)
  
  return path_labels

In [0]:
paths_df['path_labels'] = format_paths(paths_df.paths_encoded)

In [0]:
from markovclick.models import MarkovClickstream
paths_clickstreams = paths_df.path_labels.tolist()
m = MarkovClickstream(paths_clickstreams)

In [0]:
transition_matrix = m.prob_matrix

In [0]:
states = m.pages

# Markov

In [0]:
class MarkovChain(object):
    def __init__(self, transition_matrix, states):
        """
        Initialize the MarkovChain instance.
 
        Parameters
        ----------
        transition_matrix: 2-D array
            A 2-D array representing the probabilities of change of 
            state in the Markov Chain.
 
        states: 1-D array 
            An array representing the states of the Markov Chain. It
            needs to be in the same order as transition_matrix.
        """
        self.transition_matrix = np.atleast_2d(transition_matrix)
        self.states = states
        self.index_dict = {self.states[index]: index for index in 
                           range(len(self.states))}
        self.state_dict = {index: self.states[index] for index in
                           range(len(self.states))}
 
    def next_state(self, current_state):
        """
        Returns the state of the random variable at the next time 
        instance.
 
        Parameters
        ----------
        current_state: str
            The current state of the system.
        """
        return np.random.choice(
         self.states, 
         p=self.transition_matrix[self.index_dict[current_state], :]
        )
 
    def generate_states(self, current_state, no=10):
        """
        Generates the next states of the system.
 
        Parameters
        ----------
        current_state: str
            The state of the current random variable.
 
        no: int
            The number of future states to generate.
        """
        future_states = []
        for i in range(no):
            next_state = self.next_state(current_state)
            future_states.append(next_state)
            current_state = next_state
        return future_states

In [0]:
webpages_chain = MarkovChain(transition_matrix=transition_matrix,
                                states=states)

In [0]:
webpages_chain.next_state(current_state='P0')

'P28'

In [0]:
webpages_chain.generate_states(current_state='P0', no=2)

['P28', 'P114']

### Converting to Datetime

In [0]:
##Helper function to make datetime objects

def get_datetime(visitors_df):
  #Convert date column to datetime object
  visitors_df['date'] = pd.to_datetime(visitors_df['date'])
  
  #Convert visitStartTime to datetime
  visitors_df['visitStartTime'] = pd.to_datetime(visitors_df['visitStartTime'], unit='s')
  
  return visitors_df


In [0]:
visitors = get_datetime(visitors)

# EDA

In [0]:
features.keys()

dict_keys(['device', 'geoNetwork', 'totals', 'trafficSource', 'hits', 'customDimensions'])

In [0]:
features['hits'][4][0]['page']['pageTitle']

TypeError: ignored

In [0]:
path_1 = []

for col in range(0, 230):
  try: 
    path_1.append(features['hits'][col][0]['page']['pageTitle'])
  except:
    pass

In [0]:
len(features['hits'])

1711

In [0]:

def get_paths():
  session_paths = {}
  for session in range(len(features['hits'])):
    path = []
    for col in range(0, 230):
      try: 
        path.append(features['hits'][col][session]['page']['pageTitle'])
      except:
        pass
    session_paths[session] = path
  return session_paths