In [110]:
from __future__ import division
import pandas as pd
import uuid
import random
import numpy as np
import datetime, time
import matplotlib.pyplot as plt

%matplotlib inline

In [3]:
raw_df = pd.read_csv('/home/jasper/Downloads/user-actions000000000000', sep=',')

print raw_df.info()

raw_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8354396 entries, 0 to 8354395
Data columns (total 3 columns):
date             int64
fullVisitorId    uint64
transactions     float64
dtypes: float64(1), int64(1), uint64(1)
memory usage: 191.2 MB
None


Unnamed: 0,date,fullVisitorId,transactions
0,20160930,1769651648414523235,
1,20160930,1867790925252608084,
2,20160930,2083829884801365723,
3,20160930,272111073307289500,
4,20160930,291837269665288281,


For each day in the range, we need to calculate how many different visits there were.

For each visit we need:
    - Who the user was, if they were returning or not
    - Session start time
    - If they purchased
    - How many items they purchased
    - Which items they purchased (Product IDs)
    - The total transaction revenue
    - Google search terms
    - Channel
    - Onsite search terms
    - User Agent Stuff [
        device broser,
        browser version,
        OS,
        geo - town,
        lat,
        long]    

In [338]:
def how_many_visits_today(lower_bound, upper_bound):
    return np.random.randint(lower_bound, upper_bound)

def generate_session_id():
    return uuid.uuid4()

def generate_time_of_day():
    return {
        'hours': random.randrange(0, 23),
        'minutes': random.randrange(0, 59),
        'seconds': random.randrange(0, 59)
    }

def combine_times(input_date):
    yield input_date + datetime.timedelta(**generate_time_of_day())
    
def was_product_purchased():
    X = random.random()
    if X >= .95:
        return True
    else:
        return False
    
def which_product_was_purchased():
    p_id = np.random.multinomial(1, [1/7.]*5 + [2/7.], size=1).argmax()
    return {'id': '{0}'.format(p_id)}

def how_much_was_the_product(p_id):
    X = {
        '1': 10,
        '2': 20,
        '3': 30,
        '4': 40,
        '5': 50,
        '0': 5}
    return X[p_id]

def which_marketing_channel():
    channels = {
        0: 'direct',
        1: 'seo',
        2: 'cpc',
        3: 'display',
        4: 'other'
    }
    return channels[np.random.multinomial(1, [1/5.]*5, size=1).argmax()]

def which_browser():
    browser = {
        0: 'edge',
        1: 'chrome',
        2: 'firefox',
        3: 'apple',
        4: 'other'
    }
    return browser[np.random.multinomial(1, [1/5.]*5, size=1).argmax()]


def which_OS():
    OS = {
        0: 'windows',
        1: 'linux',
        2: 'apple',
        3: 'gentoo'
    }
    return OS[np.random.multinomial(1, [5/10., 3/10., 3/20., 1/20.], size=1).argmax()]


def generate_user_record(date):
    return {
        'session_id': generate_session_id(),
        'visitStartTime': combine_times(d),
        'transaction': was_product_purchased(),
        'marketing_channel': which_marketing_channel(),
        'deviceBrowser': which_browser(),
        'which_OS': which_OS()}

In [331]:
which_OS()

'linux'

In [340]:
pd.date_range()

datetime.datetime(2018, 1, 31, 0, 0)

In [339]:
d

Timestamp('2018-01-31 00:00:00', freq='D')

In [336]:
for d in pd.date_range(start=start_date, end='2018-01-31'):
    user_record = generate_user_record(d)

In [337]:
user_record

{'deviceBrowser': 'other',
 'marketing_channel': 'cpc',
 'session_id': UUID('a2e55c85-77e6-4790-a7c0-e0d3f70af16f'),
 'transaction': False,
 'visitStartTime': <generator object combine_times at 0x7f95996ef780>,
 'which_OS': 'windows'}

In [175]:
how_much_was_the_product(which_product_was_purchased()['id'])

30

In [195]:
which_product_was_purchased()

{'id': '5'}