In [16]:
import sqlite3
import pandas as pd

conn = sqlite3.connect("../data/checking-logs.sqlite")

query = """
CREATE TABLE IF NOT EXISTS datamart AS
SELECT
    c.uid,
    c.labname,
    c.timestamp AS first_commit_ts,
    MIN(p.datetime) AS first_view_ts
FROM checker c
LEFT JOIN pageviews p ON c.uid = p.uid
WHERE c.status = 'ready'
AND c.numTrials = 1
AND c.labname IN ('laba04', 'laba04s', 'laba05', 'laba06', 'laba06s', 'project1')
AND c.uid LIKE 'user_%'
GROUP BY c.uid, c.labname, c.timestamp;
"""

conn.execute(query)
conn.commit()

datamart = pd.read_sql("SELECT * FROM datamart", conn)

datamart["first_commit_ts"] = pd.to_datetime(datamart["first_commit_ts"])
datamart["first_view_ts"] = pd.to_datetime(datamart["first_view_ts"])

test = datamart.dropna(subset=["first_view_ts"])
control = datamart[datamart["first_view_ts"].isna()]

mean_first_view_ts = test["first_view_ts"].mean()
control["first_view_ts"].fillna(mean_first_view_ts, inplace=True)

test.to_sql("test_group", conn, if_exists="replace", index=False)
control.to_sql("control_group", conn, if_exists="replace", index=False)

conn.close()

print("The datamart table has been created and processed.")
print(f"Total users: {len(datamart)}")
print(f"Test group (with first_view_ts): {len(test)}")
print(f"Control group (without first_view_ts): {len(control)}")
print("Avg value of first_view_ts for the control group:", mean_first_view_ts)



The datamart table has been created and processed.
Total users: 140
Test group (with first_view_ts): 59
Control group (without first_view_ts): 81
Avg value of first_view_ts for the control group: 2020-04-27 00:40:05.761783552


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  control["first_view_ts"].fillna(mean_first_view_ts, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  control["first_view_ts"].fillna(mean_first_view_ts, inplace=True)
