**Create SQLight Database**

In [1]:
import sqlite3

# Create or connect to a SQLite database
conn = sqlite3.connect("telemetry.db")  # creates telemetry.db file
cur = conn.cursor()

**create the table for telemetry data**

In [2]:
cur.execute("""
CREATE TABLE IF NOT EXISTS telemetry (
    timestamp INTEGER,
    user_id INTEGER,
    version TEXT,
    event TEXT
)
""")
conn.commit()

**load .bz2 file into SQLight**

In [6]:
import bz2

BATCH_SIZE = 100_000_000
batch = []

with bz2.open(r"C:\Users\katsi\OneDrive\Business_Analytics\Thesis\Data\master-telemetry-distilled-sorted.bz2", "rt", encoding="utf-8") as f:
    for i, line in enumerate(f, 1):
        fields = line.rstrip("\n").split("\t")
        if len(fields) < 4:
            continue

        ts, user_id, version, event = fields[:4]

        # Cleaning
        try:
            ts = int(float(ts))
            if ts >= 1767225600:  # year >= 2025 #exclude timestamps referred to years over 2026
                continue
        except ValueError:
            continue

        try:
            user_id = int(user_id)
            if user_id < 0: #remove rows with negative user_ids
                continue
        except ValueError:
            continue

        valid_versions = {
            "5.0.0.34","5.0.0.46","5.1.0.2","5.4.0.14","5.4.0.100",
            "5.6.0.6","5.6.0.14","7.0.0.16","7.1.0.2","7.1.0.28",
            "7.1.0.64","7.2.0.50","7.4.0.4","7.5.0.20","7.6.0.2",
            "7.6.0.4","7.6.0.24","7.7.0.100","8.0.0.8","8.1.0.2",
            "8.1.0.4","8.1.0.18","8.1.0.22","8.2.0.72","8.3.0.16",
            "9.1.0.16","9.1.0.46"
        }
        if version not in valid_versions: #keep only valid versions
            continue

        batch.append((ts, user_id, version, event))

        # Insert in batches
        if len(batch) >= BATCH_SIZE:
            cur.executemany("INSERT INTO telemetry (timestamp, user_id, version, event) VALUES (?, ?, ?, ?)", batch)
            conn.commit()
            batch.clear()
            print(f"Inserted {i} rows...")

# Insert remaining rows
if batch:
    cur.executemany("INSERT INTO telemetry (timestamp, user_id, version, event) VALUES (?, ?, ?, ?)", batch)
    conn.commit()

Inserted 100365297 rows...
Inserted 200490623 rows...
Inserted 300525685 rows...
Inserted 400596308 rows...
Inserted 500697728 rows...
Inserted 600739595 rows...
Inserted 700769753 rows...
