### Exercise 04 : A/B-testing

* create a connection to the database using the library sqlite3

In [1]:
import pandas as pd
import sqlite3

conn = sqlite3.connect('../data/checking-logs.sqlite')

* using only one query for each of the groups, create two dataframes: test_results
and control_results with the columns time and avg_diff and only two rows
  * time should have the values: after and before
  * avg_diff contains the average delta among all the users for the time period
  before each of them made their first visit to the page and afterward
  * only take into account the users that have observations before and after

In [2]:
# test results
query_test = """
WITH diffs AS (
    SELECT 
        uid,
        CASE 
            WHEN first_commit_ts < first_view_ts THEN 'before'
            ELSE 'after'
        END AS time,
        (julianday(first_commit_ts) - julianday(datetime(d.deadlines, 'unixepoch'))) * 24 AS diff
    FROM test t
    JOIN deadlines d ON t.labname = d.labs
    WHERE t.labname != 'project1'
),
user_avg AS (
    SELECT uid, time, AVG(diff) AS avg_diff
    FROM diffs
    GROUP BY uid, time
),
valid_users AS (
    SELECT uid
    FROM user_avg
    GROUP BY uid
    HAVING COUNT(DISTINCT time) = 2
)
SELECT time, AVG(avg_diff) AS avg_diff
FROM user_avg
WHERE uid IN (SELECT uid FROM valid_users)
GROUP BY time
ORDER BY time;
"""


test_results = pd.read_sql(query_test, conn)
test_results

Unnamed: 0,time,avg_diff
0,after,-100.178032
1,before,-66.679398


In [3]:
query_control = """
WITH avg_view_ts AS (
    SELECT AVG(first_view_ts) AS ts FROM test
),
diffs AS (
    SELECT 
        uid,
        CASE 
            WHEN first_commit_ts < (SELECT ts FROM avg_view_ts) THEN 'before'
            ELSE 'after'
        END AS time,
        (julianday(first_commit_ts) - julianday(datetime(d.deadlines, 'unixepoch'))) * 24 AS diff
    FROM control c
    JOIN deadlines d ON c.labname = d.labs
    WHERE c.labname != 'project1'
),
user_avg AS (
    SELECT uid, time, AVG(diff) AS avg_diff
    FROM diffs
    GROUP BY uid, time
)
SELECT time, ROUND(AVG(avg_diff), 4) AS avg_diff
FROM user_avg
GROUP BY time
ORDER BY time;
"""

control_results = pd.read_sql(query_control, conn)


In [4]:
control_results

Unnamed: 0,time,avg_diff
0,after,-91.8034


In [5]:
print("Результаты для test:")
print(test_results)

print("\nРезультаты для control:")
print(control_results)

Результаты для test:
     time    avg_diff
0   after -100.178032
1  before  -66.679398

Результаты для control:
    time  avg_diff
0  after  -91.8034


In [6]:
conn.close()