# Introduction
This notebook sets up and seeds the database (Part 1) and then runs the exercise tasks against it (Part 2).

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [47]:
from migrator import Migrator
from database import Database
from environs import Env
import tasks as t
import pandas as pd

## Part 1
Setting up and seeding the database.

In [3]:
# Load settings from the local ".env" file; the DB_* values used to build the
# Database connection are read from this Env object.
env = Env()
env.read_env(".env")

In [72]:
# Open the database connection. Host, user, password and database name are read
# from the environment loaded into `env`.
# NOTE(review): the port is hard-coded to 3307 while every other setting comes
# from the environment — presumably a host-mapped container port; confirm, and
# consider reading it from the environment as well.
database = Database(
    host=env.str("DB_HOST"),
    port=3307,
    user=env.str("DB_USER"),
    password=env.str("DB_PASSWORD"),
    database=env.str("DB_DATABASE")
)

Connected to: 8.1.0
You are connected to the database: ('mysql',)
-----------------------------------------------



### Create tables

In [33]:
# Build the migration/seeding helper on top of the open connection.
# NOTE(review): the meaning of 2500 is not visible here (presumably an insert
# batch size) — confirm against Migrator and give the constant a name.
migrator = Migrator(database, 2500)

In [34]:
# Run the SQL migration files (users, activities, trackpoints in the recorded run).
migrator.migrate()

--------------------------------Starting migrate--------------------------------
Found migration files:  ['001_users.sql', '002_activities.sql', '003_trackpoints.sql']
Running migration:   001_users.sql       ✅
Running migration:   002_activities.sql  ✅
Running migration:   003_trackpoints.sql ✅
-------------------------Finished migrate in 58.146 ms--------------------------



### (DANGER) Wipe the DB
🚨 THIS WILL WIPE ALL DATA IN THE TABLES 🚨

In [32]:
# Destructive: wipes the tables (TrackPoints, Activities and Users in the recorded run).
# NOTE(review): this cell sits between "Create tables" and the seeding cells, so a
# top-to-bottom "Run All" executes it — make sure that is intended before re-running.
migrator.wipe()

---------------------------------Starting wipe----------------------------------
Wiping TrackPoints
Wiping Activities
Wiping Users
--------------------------Finished wipe in 1134.808 ms--------------------------



### Seed Database from Data Set

In [35]:
# Seed the users (the recorded run reports 182 seeded users).
migrator.seed_users()

------------------------------Starting seed_users-------------------------------
Seeded [(182,)] Users
------------------------Finished seed_users in 16.647 ms------------------------



In [36]:
# Seed the activities; the recorded log shows seed data being generated per user.
migrator.seed_activities()

----------------------------Starting seed_activities----------------------------
Generating seed data for user: 135	✅
Generating seed data for user: 132	✅
Generating seed data for user: 104	✅
Generating seed data for user: 103	✅
Generating seed data for user: 168	✅
Generating seed data for user: 157	✅
Generating seed data for user: 150	✅
Generating seed data for user: 159	✅
Generating seed data for user: 166	✅
Generating seed data for user: 161	✅
Generating seed data for user: 102	✅
Generating seed data for user: 105	✅
Generating seed data for user: 133	✅
Generating seed data for user: 134	✅
Generating seed data for user: 160	✅
Generating seed data for user: 158	✅
Generating seed data for user: 167	✅
Generating seed data for user: 151	✅
Generating seed data for user: 169	✅
Generating seed data for user: 156	✅
Generating seed data for user: 024	✅
Generating seed data for user: 023	✅
Generating seed data for user: 015	✅
Generating seed data for user: 012	✅
Generating seed data for user: 

In [37]:
# Seed the track points; the recorded log shows seed data being generated per user.
migrator.seed_track_points()

---------------------------Starting seed_track_points---------------------------
Generating seed data for user: 135	✅
Generating seed data for user: 132	✅
Generating seed data for user: 104	✅
Generating seed data for user: 103	✅
Generating seed data for user: 168	✅
Generating seed data for user: 157	✅
Generating seed data for user: 150	✅
Generating seed data for user: 159	✅
Generating seed data for user: 166	✅
Generating seed data for user: 161	✅
Generating seed data for user: 102	✅
Generating seed data for user: 105	✅
Generating seed data for user: 133	✅
Generating seed data for user: 134	✅
Generating seed data for user: 160	✅
Generating seed data for user: 158	✅
Generating seed data for user: 167	✅
Generating seed data for user: 151	✅
Generating seed data for user: 169	✅
Generating seed data for user: 156	✅
Generating seed data for user: 024	✅
Generating seed data for user: 023	✅
Generating seed data for user: 015	✅
Generating seed data for user: 012	✅
Generating seed data for user: 

In [38]:
# Create secondary indices after seeding; the recorded run adds datetime_idx on
# TrackPoints(datetime) and took roughly 12 seconds.
migrator.create_indices()

----------------------------Starting create_indices-----------------------------
Executing statement
 ALTER TABLE TrackPoints ADD INDEX datetime_idx (datetime); ✅
--------------------Finished create_indices in 12093.948 ms---------------------



## Part 2

In [None]:
# Task runner from the local `tasks` module (imported as `t`), sharing the open
# database connection; the cells below call its task methods one by one.
tasks = t.Task(database)

### Task 1

In [None]:
# Run Task 1 (implemented in the `tasks` module).
tasks.task1()

### Task 2

In [None]:
# Run Task 2 (implemented in the `tasks` module).
tasks.task2()

### Task 3

In [None]:
# Run Task 3 (implemented in the `tasks` module).
tasks.task3()

### Task 4

In [None]:
# Run Task 4 (implemented in the `tasks` module).
tasks.task4()

### Task 5

In [None]:
# Run Task 5 (implemented in the `tasks` module).
tasks.task5()

### Task 7

In [None]:
# Run Task 7, part a (implemented in the `tasks` module).
# NOTE(review): this notebook has no Task 6 cell — confirm that is intentional.
tasks.task7a()

In [None]:
# Run Task 7, part b (implemented in the `tasks` module).
tasks.task7b()

### Task 8

In [None]:
# Run Task 8 (implemented in the `tasks` module); the cells that follow are the
# exploratory queries used while developing it.
tasks.task8()



In [None]:
# Scratch cell for the "users close to each other in space and time" query.
# The commented-out draft uses `left` / `right` as table aliases, which are reserved
# words in MySQL; the active query below is a cut-down smoke test that only pairs
# rows belonging to different users and returns at most 10 rows.
# NOTE(review): the table `full` is not created anywhere in the visible notebook —
# confirm it exists (the temporary tables created in later cells are named p1 and p2).
# query = """
#     SELECT DISTINCT left.user_id AS UserID1, right.user_id AS UserID2
#     FROM full AS left
#     INNER JOIN full AS right
#         ON left.user_id != right.user_id
#         AND left.datetime <= right.datetime
#         AND MBRContains(ST_BUFFER(left.geom, 50), right.geom)
#         AND TIME_TO_SEC(TIMEDIFF(right.datetime, left.datetime)) <= 30
# """



query = """
SELECT DISTINCT p1.user_id as UserID1, p2.user_id AS UserID2
FROM full AS p1
INNER JOIN full AS p2
    ON p1.user_id != p2.user_id
LIMIT 10
"""

database.query(query)

In [None]:
# Fetch whatever rows are still pending on the shared cursor
# (presumably to inspect or drain the result of the previous statement — confirm).
database.cursor.fetchall()


In [None]:
# Materialise a flat (user_id, datetime, geom) projection of every track point as the
# temporary table p1 by joining TrackPoints -> Activities -> Users.
# NOTE(review): the statement has no IF NOT EXISTS / DROP, so re-running it in the same
# session presumably fails because p1 already exists — confirm.
left_table = """
    CREATE TEMPORARY TABLE p1 AS
        SELECT u.id as user_id, tp.datetime as datetime, tp.geom as geom
        FROM TrackPoints as tp
        INNER JOIN Activities as a
            ON a.id = tp.activity_id
        INNER JOIN Users as u
            on u.id = a.user_id
"""
database.query(left_table)

In [None]:
# Second temporary table, p2, built from exactly the same SELECT as p1.
# NOTE(review): presumably duplicated because MySQL does not allow one TEMPORARY table
# to be referenced twice in the same query (needed for a self-join) — confirm.
right_table = """
    CREATE TEMPORARY TABLE p2 AS
        SELECT u.id as user_id, tp.datetime as datetime, tp.geom as geom
        FROM TrackPoints as tp
        INNER JOIN Activities as a
            ON a.id = tp.activity_id
        INNER JOIN Users as u
            on u.id = a.user_id
"""
database.query(right_table)

In [None]:
# Direct formulation: pair activities of different users whose time windows overlap,
# then keep user pairs having two track points within 50 (ST_Distance_Sphere) and
# within 30 seconds of each other.
# The user inequality is symmetric (!=), so each pair can appear in both orders, and the
# activity overlap test has no 30-second margin (the later variants add one).
query = """
SELECT DISTINCT a1.user_id, a2.user_id
FROM Activities a1
JOIN TrackPoints tp1 ON a1.id = tp1.activity_id
JOIN Activities a2 ON a1.user_id != a2.user_id
JOIN TrackPoints tp2 ON a2.id = tp2.activity_id
WHERE a1.start_datetime <= a2.end_datetime
AND a1.end_datetime >= a2.start_datetime
AND ST_Distance_Sphere(tp1.geom, tp2.geom) <= 50
AND ABS(TIMESTAMPDIFF(SECOND, tp1.datetime, tp2.datetime)) <= 30
"""

database.query(query)

In [12]:
# Probe, activity level only: users that have an activity whose time window overlaps
# (with a 30-second margin) an activity of another user. No track points are involved.
# `a1.user_id < a2.user_id` keeps each user pair once; the UNION flattens both columns
# of the pair into a single list of user ids.
query = """
WITH user_pairs AS (
    SELECT DISTINCT a1.user_id AS user_id1, a2.user_id AS user_id2
    FROM Activities a1
    JOIN Activities a2 ON a1.id < a2.id
        AND TIME_TO_SEC(TIMEDIFF(a2.start_datetime, a1.end_datetime)) <= 30
        AND TIME_TO_SEC(TIMEDIFF(a1.start_datetime, a2.end_datetime)) <= 30
        AND a1.user_id < a2.user_id
)
SELECT DISTINCT user_id
FROM (
    SELECT user_id1 AS user_id FROM user_pairs
    UNION
    SELECT user_id2 FROM user_pairs
) AS user_ids;
"""

database.query(query)

Running statement:
 
WITH user_pairs AS (
    SELECT DISTINCT a1.user_id AS user_id1, a2.user_id AS user_id2
    FROM Activities a1
    JOIN Activities a2 ON a1.id < a2.id
        AND TIME_TO_SEC(TIMEDIFF(a2.start_datetime, a1.end_datetime)) <= 30
        AND TIME_TO_SEC(TIMEDIFF(a1.start_datetime, a2.end_datetime)) <= 30
        AND a1.user_id < a2.user_id
)
SELECT DISTINCT user_id
FROM (
    SELECT user_id1 AS user_id FROM user_pairs
    UNION
    SELECT user_id2 FROM user_pairs
) AS user_ids;

Query Finished


Unnamed: 0,user_id
0,000
1,001
2,002
3,003
4,004
...,...
142,166
143,171
144,152
145,139


In [18]:
# Probe: same activity-overlap query, additionally joining the track points of the
# first activity (p1). No condition is placed on p1 yet, so this only measures the
# cost/effect of the join itself.
query = """
WITH user_pairs AS (
    SELECT DISTINCT a1.user_id AS user_id1, a2.user_id AS user_id2
    FROM Activities a1
    JOIN Activities a2 ON a1.id < a2.id
        AND TIME_TO_SEC(TIMEDIFF(a2.start_datetime, a1.end_datetime)) <= 30
        AND TIME_TO_SEC(TIMEDIFF(a1.start_datetime, a2.end_datetime)) <= 30
        AND a1.user_id < a2.user_id
        JOIN TrackPoints p1 ON p1.activity_id = a1.id
)
SELECT DISTINCT user_id
FROM (
    SELECT user_id1 AS user_id FROM user_pairs
    UNION
    SELECT user_id2 FROM user_pairs
) AS user_ids;
"""

database.query(query)

Running statement:
 
WITH user_pairs AS (
    SELECT DISTINCT a1.user_id AS user_id1, a2.user_id AS user_id2
    FROM Activities a1
    JOIN Activities a2 ON a1.id < a2.id
        AND TIME_TO_SEC(TIMEDIFF(a2.start_datetime, a1.end_datetime)) <= 30
        AND TIME_TO_SEC(TIMEDIFF(a1.start_datetime, a2.end_datetime)) <= 30
        AND a1.user_id < a2.user_id
        JOIN TrackPoints p1 ON p1.activity_id = a1.id
)
SELECT DISTINCT user_id
FROM (
    SELECT user_id1 AS user_id FROM user_pairs
    UNION
    SELECT user_id2 FROM user_pairs
) AS user_ids;

Query Finished


Unnamed: 0,user_id
0,000
1,001
2,002
3,003
4,004
...,...
142,166
143,171
144,152
145,139


In [19]:
# Probe: same activity-overlap query, additionally joining the track points of the
# second activity (p2). No condition is placed on p2 yet.
query = """
WITH user_pairs AS (
    SELECT DISTINCT a1.user_id AS user_id1, a2.user_id AS user_id2
    FROM Activities a1
    JOIN Activities a2 ON a1.id < a2.id
        AND TIME_TO_SEC(TIMEDIFF(a2.start_datetime, a1.end_datetime)) <= 30
        AND TIME_TO_SEC(TIMEDIFF(a1.start_datetime, a2.end_datetime)) <= 30
        AND a1.user_id < a2.user_id
        JOIN TrackPoints p2 ON p2.activity_id = a2.id
)
SELECT DISTINCT user_id
FROM (
    SELECT user_id1 AS user_id FROM user_pairs
    UNION
    SELECT user_id2 FROM user_pairs
) AS user_ids;
"""

database.query(query)

Running statement:
 
WITH user_pairs AS (
    SELECT DISTINCT a1.user_id AS user_id1, a2.user_id AS user_id2
    FROM Activities a1
    JOIN Activities a2 ON a1.id < a2.id
        AND TIME_TO_SEC(TIMEDIFF(a2.start_datetime, a1.end_datetime)) <= 30
        AND TIME_TO_SEC(TIMEDIFF(a1.start_datetime, a2.end_datetime)) <= 30
        AND a1.user_id < a2.user_id
        JOIN TrackPoints p2 ON p2.activity_id = a2.id
)
SELECT DISTINCT user_id
FROM (
    SELECT user_id1 AS user_id FROM user_pairs
    UNION
    SELECT user_id2 FROM user_pairs
) AS user_ids;

Query Finished


Unnamed: 0,user_id
0,000
1,001
2,002
3,003
4,004
...,...
142,166
143,171
144,152
145,139


In [20]:
# Probe: activity-overlap query with the track points of both activities joined in
# (p1 and p2), still without any time or distance condition on the points.
query = """
WITH user_pairs AS (
    SELECT DISTINCT a1.user_id AS user_id1, a2.user_id AS user_id2
    FROM Activities a1
    JOIN Activities a2 ON a1.id < a2.id
        AND TIME_TO_SEC(TIMEDIFF(a2.start_datetime, a1.end_datetime)) <= 30
        AND TIME_TO_SEC(TIMEDIFF(a1.start_datetime, a2.end_datetime)) <= 30
        AND a1.user_id < a2.user_id
    JOIN TrackPoints p1 ON p1.activity_id = a1.id
    JOIN TrackPoints p2 ON p2.activity_id = a2.id
)
SELECT DISTINCT user_id
FROM (
    SELECT user_id1 AS user_id FROM user_pairs
    UNION
    SELECT user_id2 FROM user_pairs
) AS user_ids;
"""

database.query(query)

Running statement:
 
WITH user_pairs AS (
    SELECT DISTINCT a1.user_id AS user_id1, a2.user_id AS user_id2
    FROM Activities a1
    JOIN Activities a2 ON a1.id < a2.id
        AND TIME_TO_SEC(TIMEDIFF(a2.start_datetime, a1.end_datetime)) <= 30
        AND TIME_TO_SEC(TIMEDIFF(a1.start_datetime, a2.end_datetime)) <= 30
        AND a1.user_id < a2.user_id
    JOIN TrackPoints p1 ON p1.activity_id = a1.id
    JOIN TrackPoints p2 ON p2.activity_id = a2.id
)
SELECT DISTINCT user_id
FROM (
    SELECT user_id1 AS user_id FROM user_pairs
    UNION
    SELECT user_id2 FROM user_pairs
) AS user_ids;

Query Finished


Unnamed: 0,user_id
0,000
1,001
2,002
3,003
4,004
...,...
142,166
143,171
144,152
145,139


In [39]:
# Full Task 8 query: users that have at least one track point within 30 seconds and
# within 50 (ST_Distance_Sphere) of a track point belonging to another user.
# The step-by-step reasoning is documented in the SQL comments inside the statement.
# NOTE(review): the "Running statement" text recorded under this cell differs from the
# SQL below (fewer comments, no ORDER BY visible before truncation), so the recorded
# output may stem from an earlier revision of this cell.
query = """
WITH user_pairs AS (
    SELECT DISTINCT a1.user_id AS user_id1, a2.user_id AS user_id2
    FROM Activities a1
    -- Make a combination of all activities
    JOIN Activities a2 ON a1.id < a2.id
        -- We restrict the search space to activities that overlap with a 30 second margin
        -- to limit the number of track point comparisons that we have to perform.
        -- Activities that do not overlap within at least a 30 second margin
        -- should not contain track points that are within 30 seconds of each other.
        AND TIME_TO_SEC(TIMEDIFF(a2.start_datetime, a1.end_datetime)) <= 30
        AND TIME_TO_SEC(TIMEDIFF(a1.start_datetime, a2.end_datetime)) <= 30
        -- Avoid comparing a user to themselves
        AND a1.user_id < a2.user_id
    -- Join in the track points on the two sets of activities
    JOIN TrackPoints p1 ON p1.activity_id = a1.id
    JOIN TrackPoints p2 ON p2.activity_id = a2.id
    -- Then, after restricting the search space, we check for
    -- track points that are close both in time
    WHERE ABS(TIME_TO_SEC(TIMEDIFF(p1.datetime, p2.datetime))) <= 30
    -- and in space
    AND ST_Distance_Sphere(p1.geom, p2.geom) <= 50
)
-- Finally, we select the list of distinct user_ids of users
-- who have been near other users in space and time
SELECT DISTINCT user_id
FROM (
    -- Combine the two columns of user ID pairs into a single column of user IDs
    -- to find the total count of unique users who have been near others
    SELECT user_id1 AS user_id FROM user_pairs
    UNION
    SELECT user_id2 FROM user_pairs
) AS user_ids
-- Order the results by ascending ID
ORDER BY user_id ASC;
"""

database.query(query)

Running statement:
 
WITH user_pairs AS (
    SELECT DISTINCT a1.user_id AS user_id1, a2.user_id AS user_id2
    FROM Activities a1
    JOIN Activities a2 ON a1.id < a2.id
        -- We restrict the search space to activities that overlap with a 30 second margin
        -- to limit the number of track point comparisons that we have to perform.
        AND TIME_TO_SEC(TIMEDIFF(a2.start_datetime, a1.end_datetime)) <= 30
        AND TIME_TO_SEC(TIMEDIFF(a1.start_datetime, a2.end_datetime)) <= 30
        -- Avoid comparing a user to themselves
        AND a1.user_id < a2.user_id
    JOIN TrackPoints p1 ON p1.activity_id = a1.id
    JOIN TrackPoints p2 ON p2.activity_id = a2.id
    -- Then, after restricting the search space, we check for
    -- track points that are close both in time and space
    WHERE ABS(TIME_TO_SEC(TIMEDIFF(p1.datetime, p2.datetime))) <= 30
    AND ST_Distance_Sphere(p1.geom, p2.geom) <= 50
)
-- Finally, we select the list of distinct user_ids of users
-- who have bee

Unnamed: 0,user_id
0,000
1,001
2,003
3,004
4,005
...,...
116,173
117,174
118,175
119,176


In [None]:
# Probe: the Task 8 query with only the time condition on the track points
# (points within 30 seconds of each other); the distance condition is left out.
query = """
WITH user_pairs AS (
    SELECT DISTINCT a1.user_id AS user_id1, a2.user_id AS user_id2
    FROM Activities a1
    JOIN Activities a2 ON a1.id < a2.id
        -- We restrict the search space to activities that overlap with a 30 second margin
        -- to limit the number of track point comparisons that we have to perform.
        AND TIME_TO_SEC(TIMEDIFF(a2.start_datetime, a1.end_datetime)) <= 30
        AND TIME_TO_SEC(TIMEDIFF(a1.start_datetime, a2.end_datetime)) <= 30
        -- Avoid comparing a user to themselves
        AND a1.user_id < a2.user_id
    JOIN TrackPoints p1 ON p1.activity_id = a1.id
    JOIN TrackPoints p2 ON p2.activity_id = a2.id
    WHERE ABS(TIME_TO_SEC(TIMEDIFF(p1.datetime, p2.datetime))) <= 30
)
SELECT DISTINCT user_id
FROM (
    SELECT user_id1 AS user_id FROM user_pairs
    UNION
    SELECT user_id2 FROM user_pairs
) AS user_ids;
"""

database.query(query)

In [23]:
# Probe: enumerate every (p1.id, p2.id) track-point pair produced by the activity-overlap
# join, with no time or distance filter on the points. This shows the size of the
# candidate space: the recorded run returned roughly 213.6 million pairs.
# NOTE(review): the whole result is pulled into the notebook — avoid re-running casually.
query = """
SELECT DISTINCT p1.id, p2.id
FROM Activities a1
JOIN Activities a2 ON a1.id < a2.id
    AND TIME_TO_SEC(TIMEDIFF(a2.start_datetime, a1.end_datetime)) <= 30
    AND TIME_TO_SEC(TIMEDIFF(a1.start_datetime, a2.end_datetime)) <= 30
    AND a1.user_id < a2.user_id
JOIN TrackPoints p1 ON p1.activity_id = a1.id
JOIN TrackPoints p2 ON p2.activity_id = a2.id
"""

database.query(query)

Running statement:
 
SELECT DISTINCT p1.id, p2.id
FROM Activities a1
JOIN Activities a2 ON a1.id < a2.id
    AND TIME_TO_SEC(TIMEDIFF(a2.start_datetime, a1.end_datetime)) <= 30
    AND TIME_TO_SEC(TIMEDIFF(a1.start_datetime, a2.end_datetime)) <= 30
    AND a1.user_id < a2.user_id
JOIN TrackPoints p1 ON p1.activity_id = a1.id
JOIN TrackPoints p2 ON p2.activity_id = a2.id

Query Finished


Unnamed: 0,id,id.1
0,911,1063368
1,911,1063369
2,911,1063370
3,911,1063371
4,911,1063372
...,...,...
213598906,1962424,688266
213598907,1962424,688267
213598908,1962424,688268
213598909,1962424,688269


In [41]:
# Capture the most recent cell output as the Task 8 result.
# NOTE(review): `_` is IPython's "last displayed output", so this binding depends on
# which cell happened to run immediately before it (recorded as In [41], which is not
# the cell that precedes it in the file). On a fresh top-to-bottom run it would pick up
# a different result. Assign the Task 8 query result to a named variable directly instead.
task_8_result = _

In [42]:
# Display the captured Task 8 result (120 rows in the recorded output).
task_8_result

Unnamed: 0,user_id
0,000
1,001
2,003
3,004
4,005
...,...
116,173
117,174
118,175
119,176


In [44]:
# Column-wise truthiness check: reports True for a column when all of its values are truthy.
task_8_result.all()

user_id    True
dtype: bool

In [49]:
# Show every row of the result: display.max_rows=None lifts pandas' row truncation,
# and option_context restores the previous setting when the block exits.
with pd.option_context("display.max_rows", None):
    display(task_8_result)

Unnamed: 0,user_id
0,0
1,1
2,3
3,4
4,5
5,6
6,7
7,8
8,9
9,10


In [110]:
# Ids of the candidate track points for the batched proximity search: every track point
# whose activity (a1) overlaps, within a 30-second margin, an activity (a2) that has a
# greater id and a greater user_id.
# The correlated columns in the EXISTS subquery are qualified explicitly (a2.id,
# a2.user_id). Unqualified names already resolved to a2, the innermost table, so the
# result is unchanged; the qualification only removes the ambiguity.
# NOTE(review): the filter is one-sided (a1 < a2), so an activity that only ever appears
# on the "greater" side of an overlapping pair is not selected — confirm this is intended
# for the way the batches are consumed.
track_points_of_interest = database.query(
    """
        SELECT id
        FROM TrackPoints
        WHERE activity_id IN (
            SELECT a1.id AS id
            FROM Activities a1
            WHERE EXISTS (
                SELECT 1
                FROM Activities a2
                WHERE a2.start_datetime <= a1.end_datetime + INTERVAL 30 SECOND
                AND a2.end_datetime >= a1.start_datetime - INTERVAL 30 SECOND
                AND a1.id < a2.id
                AND a1.user_id < a2.user_id
            )
        )
        ORDER BY id;
    """
)

Running statement:
 
        SELECT id, activity_id, datetime, ST_LATITUDE(geom) as latitude, ST_LONGITUDE(geom) AS longitude
        FROM TrackPoints
        WHERE activity_id IN (
            SELECT a1.id AS id
            FROM Activities a1
            WHERE EXISTS (
                SELECT 1
                FROM Activities a2
                WHERE a2.start_datetime <= a1.end_datetime + INTERVAL 30 SECOND
                AND a2.end_datetime >= a1.start_datetime - INTERVAL 30 SECOND
                AND a1.id < id
                AND a1.user_Id < user_id
            )
        )
        ORDER BY id;
    
Query Finished


In [111]:
# Display the candidate track points.
# NOTE(review): the recorded output shows id, activity_id, datetime, latitude and longitude,
# whereas the query cell above now selects only `id` — the output is from an earlier revision.
track_points_of_interest

Unnamed: 0,id,activity_id,datetime,latitude,longitude
0,1,135-20090103012134,2009-01-03 01:21:34,39.974294,116.399741
1,2,135-20090103012134,2009-01-03 01:21:35,39.974292,116.399592
2,3,135-20090103012134,2009-01-03 01:21:36,39.974309,116.399523
3,4,135-20090103012134,2009-01-03 01:21:38,39.974320,116.399588
4,5,135-20090103012134,2009-01-03 01:21:39,39.974365,116.399730
...,...,...,...,...,...
5927774,9681391,176-20071208013029,2007-12-08 03:13:54,39.971000,116.304200
5927775,9681392,176-20071208013029,2007-12-08 03:15:01,39.971100,116.304933
5927776,9681393,176-20071208013029,2007-12-08 03:15:54,39.970817,116.304250
5927777,9681394,176-20071208013029,2007-12-08 03:16:57,39.970867,116.303250


In [141]:
import multiprocessing
from multi import get_id_pairs

In [131]:


# Worker pool and batch sizing for the parallel pair search.
# NOTE(review): the worker count is hard-coded, and the pool is not closed anywhere in the
# visible cells, so re-running this cell creates another pool.
processes = 11
pool = multiprocessing.Pool(processes)
# Floor division: `processes * batch_size` can be up to `processes - 1` rows short of the frame length.
batch_size = track_points_of_interest.shape[0] // processes




In [132]:
def get_batch_ids(batch_idx, batch_size, df, database):
    """Return the id bounds of one positional batch of ``df``.

    The batch consists of the rows at positions
    ``[batch_size * batch_idx, batch_size * (batch_idx + 1))``; a batch that runs past
    the end of the frame is simply shorter.

    Args:
        batch_idx: Zero-based index of the batch.
        batch_size: Number of rows per batch.
        df: DataFrame with an ``id`` column holding integer ids
            (assumed to be sorted by id so that first/last bound the batch — TODO confirm).
        database: Passed through unchanged as the first element of the result.

    Returns:
        ``(database, first, last, first, last)`` where ``first`` and ``last`` are the
        ``id`` values of the first and last row of the batch, as plain Python ints.

    Raises:
        IndexError: If the batch contains no rows.
    """
    start = batch_size * batch_idx
    current_batch = df.iloc[start:start + batch_size]
    if current_batch.empty:
        raise IndexError(
            f"batch {batch_idx} with batch_size {batch_size} is empty for a frame of {len(df)} rows"
        )
    ids = current_batch["id"]
    # pandas hands back numpy integer scalars; convert to built-in ints so the bounds can
    # be bound as SQL parameters and compared/printed like ordinary integers.
    first, last = int(ids.iloc[0]), int(ids.iloc[-1])
    return database, first, last, first, last

# `batch_size` is a floor division, so `processes` batches can fall short of the frame
# length by up to `processes - 1` rows. Derive the number of batches from the frame itself
# (ceiling division) so that a final, shorter batch picks up those trailing rows instead of
# silently dropping them; pool.map accepts more tasks than there are workers.
# Requires batch_size >= 1, i.e. the frame has at least `processes` rows.
n_batches = -(-len(track_points_of_interest) // batch_size)
batch_input = [
    get_batch_ids(i, batch_size, track_points_of_interest, database)
    for i in range(n_batches)
]

In [140]:
# Fan the batches out to the worker pool.
# NOTE(review): the recorded run failed with "Can't pickle local object
# 'make_get_id_pairs.<locals>.get_id_pairs'". pool.map has to pickle the callable, so
# get_id_pairs must be a module-level function in `multi` rather than a closure returned
# by a factory. Each batch tuple also carries `database`, which would have to be
# picklable as well — confirm.
results = pool.map(get_id_pairs, batch_input)

AttributeError: Can't pickle local object 'make_get_id_pairs.<locals>.get_id_pairs'

In [130]:
# Inspect the generated batch tuples.
# NOTE(review): the recorded output shows 4-tuples, while get_batch_ids as defined in this
# notebook returns 5-tuples starting with `database` — the output is from an earlier revision.
batch_input

[(1, 1085465, 1, 1085465),
 (1085466, 1677769, 1085466, 1677769),
 (1677770, 2374860, 1677770, 2374860),
 (2374861, 3067020, 2374861, 3067020),
 (3067021, 3652122, 3067021, 3652122),
 (3652123, 4597806, 3652123, 4597806),
 (4597807, 6632630, 4597807, 6632630),
 (6632631, 7535069, 6632631, 7535069),
 (7535070, 8107145, 7535070, 8107145),
 (8107146, 8771418, 8107146, 8771418),
 (8771419, 9681395, 8771419, 9681395)]

In [81]:
# Inspect the number of rows per batch (493981 in the recorded run).
batch_size

493981

In [75]:
# Manual single-call check of get_id_pairs on a two-id range.
# NOTE(review): this passes a 4-tuple without `database`, which differs from the 5-tuples
# that get_batch_ids produces in this notebook — confirm which shape get_id_pairs expects.
get_id_pairs((3021481, 3021482, 3021481, 3021482))

Running statement:
 
        WITH acts AS (
            SELECT a1.id AS id, a1.user_id AS user_id
            FROM Activities a1
            WHERE EXISTS (
                SELECT 1
                FROM Activities a2
                WHERE a2.start_datetime <= a1.end_datetime + INTERVAL 30 SECOND
                AND a2.end_datetime >= a1.start_datetime - INTERVAL 30 SECOND
                AND a1.id < id
                AND a1.user_Id < user_id
            )
        ),
        batch_track_points AS (
            SELECT id, activity_id, datetime, geom
            FROM TrackPoints
            WHERE activity_id IN (
                SELECT id
                FROM acts
            )
            AND id BETWEEN %s AND %s
        ),
        track_points_of_interest AS (
            SELECT id, activity_id, datetime, geom
            FROM TrackPoints
            WHERE activity_id IN (
                SELECT id
                FROM acts
            )
            AND id NOT BETWEEN %s AND %s
        

Unnamed: 0,user_id,user_id.1
