### Table:
|visitor_id|page_name|visit_datetime|conversion_flag|
|:--------:|:--------:|:--------:|:--------:|
|123|A|11/1/2019 9:00:00|0|
|123|A|11/1/2019 9:20:00|1|
|123|B|11/1/2019 9:30:00|1|
|...|...|...|...|

## Visitor Data Generation

Load Python Libraries

In [1]:
from datetime import datetime as dt
import random as prandom
from random import random as rd
from random import randint as rdint
import string
import csv

Randomizer Functions:  
* random_id()
* random_string()
* random_date()
* random_boolean()

In [2]:
def random_id(from_id=1,to_id=100):
    """Return a random integer ID rendered as a string.

    The ID is drawn uniformly from the inclusive range [from_id, to_id].
    """
    picked = rdint(from_id, to_id)
    return f'{picked}'

def random_string(l=5,st='letters'):
    match st:
        case 'alphanumeric':
            return ''.join(prandom.SystemRandom().choice(string.ascii_letters + string.digits) for _ in range(l))
        case 'numbers':
            return ''.join(prandom.SystemRandom().choice(string.digits) for _ in range(l))
        case 'all':
            return ''.join(prandom.SystemRandom().choice(string.ascii_letters + string.digits + string.punctuation) for _ in range(l))
        case _:
            return ''.join(prandom.SystemRandom().choice(string.ascii_letters) for _ in range(l))
        
def random_date(start=(2010,1,1),end=(2024,1,31)):
    """Return a random timestamp string formatted as 'YYYY-MM-DD HH:MM:SS'.

    ``start`` and ``end`` are (year, month, day) tuples. The value is picked
    between 00:00:00 on the start day and 23:59:59 on the end day. Naive
    datetimes are used, so the conversion to and from epoch seconds goes
    through the local timezone.
    """
    fmt = '%Y-%m-%d %H:%M:%S'
    lower = dt(start[0], start[1], start[2], 0, 0, 0).timestamp()
    upper = dt(end[0], end[1], end[2], 23, 59, 59).timestamp()
    span = upper - lower
    # Scale the window by a uniform fraction in [0, 1) and offset from its start.
    moment = dt.fromtimestamp(lower + (span * rd()))
    return moment.strftime(fmt)

def random_boolean(true_rate=None):
    """Return True roughly ``true_rate`` percent of the time.

    Parameters:
        true_rate: chance of True as a percentage on a 1-100 scale. ``None``
            (the default) means 50. Values <= 0 always give False and
            values >= 100 always give True.
    """
    # Identity check: None is a singleton, and `==` can be overridden by the operand.
    if true_rate is None:
        true_rate = 50
    # The comparison already yields a bool; no conditional expression is needed.
    return rdint(1, 100) <= true_rate

Upload and Process in PostgreSQL

In [3]:
# Build the visitor table in PostgreSQL: (re)create it, fill it with randomized
# page-visit rows, then propagate the conversion flag forward in time per visitor.
import os

import psycopg2
from psycopg2.extras import execute_values

# Connection settings come from the standard libpq environment variables when
# they are set; the fallbacks are the original local-development values.
con = psycopg2.connect(
    database=os.environ.get('PGDATABASE', 'postgres'),
    user=os.environ.get('PGUSER', 'postgres'),
    password=os.environ.get('PGPASSWORD', 'postgrepassword'),
    host=os.environ.get('PGHOST', 'localhost')
)
table_name = 'visitor_table'
row_total = 10000000
batch_size = 10000  # rows generated and sent per progress message

try:
    # The cursor context manager closes the cursor even if a statement fails.
    with con.cursor() as cur:
        # Drop table if exists (the name is passed as a bound parameter here).
        cur.execute('select * from information_schema.tables where table_name=%s;', (table_name,))
        if cur.rowcount:
            print(f'Table {table_name} exists, dropping...')
            cur.execute(f'DROP TABLE {table_name};')

        # Create Table
        print(f'Creating table {table_name}...')
        cur.execute(
            f'CREATE TABLE {table_name} (id serial PRIMARY KEY,  visitor_id varchar(3), '
            'page_name varchar(2), visit_datetime timestamp, conversion_flag varchar(1));'
        )

        # Insert Randomized data.
        # Rows are sent in multi-row INSERTs with bound values instead of one
        # string-built statement per row: far fewer round-trips and no manual quoting.
        print(f'Inserting {row_total:,} records, please wait...')
        insert_sql = (
            f'INSERT INTO {table_name} '
            '(visitor_id, page_name, visit_datetime, conversion_flag) VALUES %s'
        )
        inserted = 0
        while inserted < row_total:
            batch_len = min(batch_size, row_total - inserted)
            rows = [
                (
                    random_id(111,255),
                    random_string(1,"letters").upper(),
                    random_date((2024,2,1),(2024,2,29)),
                    "1" if random_boolean(15) else "0",
                )
                for _ in range(batch_len)
            ]
            execute_values(cur, insert_sql, rows, page_size=1000)
            inserted += batch_len
            # Report the number of rows actually sent so far, including the last batch.
            print(f'Inserted {inserted:,} rows...')

        # Check table entries
        # cur.execute(f'SELECT COUNT(*) FROM {table_name};')
        # print(f'Created {cur.fetchone()[0]:,} rows!')

        # Fix the data - Once they're converted, put the rest of the record of that visitor as converted.
        # The subquery finds each visitor's earliest converted visit (MIN, despite
        # the "latest_visit_datetime" alias); every row at or after it is set to '1'.
        data_update = (
            f'UPDATE {table_name} AS base SET conversion_flag = \'1\' '
            f'FROM (SELECT visitor_id, MIN(visit_datetime) AS "latest_visit_datetime" '
            f'FROM {table_name} WHERE conversion_flag = \'1\' GROUP BY visitor_id) AS identifiers '
            'WHERE base.visitor_id = identifiers.visitor_id '
            'AND base.visit_datetime >= identifiers.latest_visit_datetime;'
        )
        cur.execute(data_update)

        print('Committing...')
        con.commit()
        print('Committed!')
finally:
    # Always release the connection; closing without a commit discards the transaction.
    con.close()


Table visitor_table exists, dropping...
Creating table visitor_table...
Inserting 10,000,000 records, please wait...
Inserted 10,000 rows...
Inserted 20,000 rows...
Inserted 30,000 rows...
Inserted 40,000 rows...
Inserted 50,000 rows...
Inserted 60,000 rows...
Inserted 70,000 rows...
Inserted 80,000 rows...
Inserted 90,000 rows...
Inserted 100,000 rows...
Inserted 110,000 rows...
Inserted 120,000 rows...
Inserted 130,000 rows...
Inserted 140,000 rows...
Inserted 150,000 rows...
Inserted 160,000 rows...
Inserted 170,000 rows...
Inserted 180,000 rows...
Inserted 190,000 rows...
Inserted 200,000 rows...
Inserted 210,000 rows...
Inserted 220,000 rows...
Inserted 230,000 rows...
Inserted 240,000 rows...
Inserted 250,000 rows...
Inserted 260,000 rows...
Inserted 270,000 rows...
Inserted 280,000 rows...
Inserted 290,000 rows...
Inserted 300,000 rows...
Inserted 310,000 rows...
Inserted 320,000 rows...
Inserted 330,000 rows...
Inserted 340,000 rows...
Inserted 350,000 rows...
Inserted 360,000 