# Introduction
This notebook performs the tasks in the exercise

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from migrator import Migrator
from database import Database
from environs import Env
import tasks as t

## Part 1
Setting up and seeding the database.

In [3]:
env = Env()
env.read_env(".env")

In [8]:
database = Database(
    host=env.str("DB_HOST"),
    port=env.int("DB_PORT"),
    user=env.str("DB_USER"),
    password=env.str("DB_PASSWORD"),
    database=env.str("DB_DATABASE")
)

Connected to: 8.1.0
You are connected to the database: ('mysql',)
-----------------------------------------------



### Create tables

In [5]:
migrator = Migrator(database)

In [14]:
migrator.migrate()

--------------------------------Starting migrate--------------------------------
Found migration files:  ['001_users.sql', '002_activities.sql', '003_trackpoints.sql']
Running migration:   001_users.sql       ✅
Running migration:   002_activities.sql  ✅
Running migration:   003_trackpoints.sql ✅
-------------------------Finished migrate in 55.593 ms--------------------------



### (DANGER) Wipe the DB
🚨 THIS WILL WIPE ALL DATA IN THE TABLES 🚨

In [13]:
# migrator.wipe()

---------------------------------Starting wipe----------------------------------
Wiping TrackPoints
Wiping Activities
Wiping Users
---------------------------Finished wipe in 23.851 ms---------------------------



### Seed Database from Data Set

In [15]:
migrator.seed_users()

------------------------------Starting seed_users-------------------------------
Seeded [(182,)] Users
------------------------Finished seed_users in 11.671 ms------------------------



In [16]:
migrator.seed_activities()

----------------------------Starting seed_activities----------------------------
Generating seed data for user: 135	✅
Generating seed data for user: 132	✅
Generating seed data for user: 104	✅
Generating seed data for user: 103	✅
Generating seed data for user: 168	✅
Generating seed data for user: 157	✅
Generating seed data for user: 150	✅
Generating seed data for user: 159	✅
Generating seed data for user: 166	✅
Generating seed data for user: 161	✅
Generating seed data for user: 102	✅
Generating seed data for user: 105	✅
Generating seed data for user: 133	✅
Generating seed data for user: 134	✅
Generating seed data for user: 160	✅
Generating seed data for user: 158	✅
Generating seed data for user: 167	✅
Generating seed data for user: 151	✅
Generating seed data for user: 169	✅
Generating seed data for user: 156	✅
Generating seed data for user: 024	✅
Generating seed data for user: 023	✅
Generating seed data for user: 015	✅
Generating seed data for user: 012	✅
Generating seed data for user: 

In [18]:
migrator.seed_track_points()

---------------------------Starting seed_track_points---------------------------
autocommit False
Generating seed data for user: 135	✅
Generating seed data for user: 132	✅
Generating seed data for user: 104	✅
Generating seed data for user: 103	✅
Generating seed data for user: 168	✅
Generating seed data for user: 157	✅
Generating seed data for user: 150	✅
Generating seed data for user: 159	✅
Generating seed data for user: 166	✅
Generating seed data for user: 161	✅
Generating seed data for user: 102	✅
Generating seed data for user: 105	✅
Generating seed data for user: 133	✅
Generating seed data for user: 134	✅
Generating seed data for user: 160	✅
Generating seed data for user: 158	✅
Generating seed data for user: 167	✅
Generating seed data for user: 151	✅
Generating seed data for user: 169	✅
Generating seed data for user: 156	✅
Generating seed data for user: 024	✅
Generating seed data for user: 023	✅
Generating seed data for user: 015	✅
Generating seed data for user: 012	✅
Generating see

In [19]:
migrator.create_indices()

----------------------------Starting create_indices-----------------------------
-------------------Finished create_indices in 2138095.149 ms--------------------



## Part 2

In [6]:
tasks = t.Task(database)

### Task 1

In [21]:
tasks.task1()

---------------------------------Starting task1---------------------------------
Running statement:
 
        SELECT
            (SELECT Count(*) AS UsersCount FROM Users) AS '# Users',
            (SELECT Count(*) AS UsersCount FROM Activities) AS '# Activities',
            (SELECT Count(*) AS UsersCount FROM TrackPoints) AS '# TrackPoints';
        
Query Finished
--------------------------Finished task1 in 967.701 ms--------------------------



Unnamed: 0,# Users,# Activities,# TrackPoints
0,182,16048,9681756


### Task 2

In [22]:
tasks.task2()

---------------------------------Starting task2---------------------------------
Running statement:
 
            SELECT CAST(ROUND(AVG(count), 0) AS SIGNED) AS Avg, MAX(count) AS Max, MIN(count) AS Min
            FROM (
                SELECT COUNT(*) AS count
                FROM TrackPoints as tp
                LEFT JOIN Activities as a
                    ON tp.activity_id = a.id
                GROUP BY a.user_id
            ) as counts;
        
Query Finished
-------------------------Finished task2 in 4666.632 ms--------------------------



Unnamed: 0,Avg,Max,Min
0,55964,1010325,17


### Task 3

In [23]:
tasks.task3()

---------------------------------Starting task3---------------------------------
Running statement:
 
            SELECT UserId, ActivityCount
            FROM
                (
                    SELECT Count(*) as ActivityCount, u.id as UserId
                    FROM Activities as a
                    LEFT JOIN Users as u
                        on a.user_id = u.id
                    GROUP BY u.id
                ) as activityCounts
            ORDER BY ActivityCount DESC
            LIMIT 15;
        
Query Finished
---------------------------Finished task3 in 28.7 ms----------------------------



Unnamed: 0,UserId,ActivityCount
0,128,2102
1,153,1793
2,25,715
3,163,704
4,62,691
5,144,563
6,41,399
7,85,364
8,4,346
9,140,345


### Task 4

In [24]:
tasks.task4()

---------------------------------Starting task4---------------------------------
Running statement:
 
            SELECT DISTINCT u.id AS UserId
            FROM Users AS u
            LEFT JOIN Activities AS a
                ON u.id = a.user_id
            WHERE a.transportation_mode LIKE 'Bus';
        
Query Finished
--------------------------Finished task4 in 11.734 ms---------------------------



Unnamed: 0,UserId
0,10
1,20
2,52
3,62
4,68
5,73
6,80
7,81
8,84
9,85


### Task 5

In [25]:
tasks.task5()

---------------------------------Starting task5---------------------------------
Running statement:
 
            SELECT DISTINCT u.id AS UserID, Count(DISTINCT a.transportation_mode) as '# Transportation Modes', GROUP_CONCAT(DISTINCT a.transportation_mode SEPARATOR ', ') AS 'Transportation Modes'
            FROM Activities AS a
            LEFT JOIN Users AS u
                ON a.user_id = u.id
            WHERE a.transportation_mode != ""
            GROUP BY u.id
            ORDER BY Count(DISTINCT a.transportation_mode) DESC
            LIMIT 10;
        
Query Finished
--------------------------Finished task5 in 15.486 ms---------------------------



Unnamed: 0,UserID,# Transportation Modes,Transportation Modes
0,128,9,"airplane, bike, boat, bus, car, subway, taxi, ..."
1,62,9,"bike, boat, bus, car, run, subway, taxi, train..."
2,85,4,"bus, subway, taxi, walk"
3,65,4,"bike, subway, taxi, walk"
4,112,3,"bike, bus, walk"
5,78,3,"subway, taxi, walk"
6,80,3,"bike, bus, taxi"
7,81,3,"bike, bus, walk"
8,111,3,"bike, car, taxi"
9,10,3,"bus, taxi, train"


### Task 7

In [26]:
tasks.task7a()

--------------------------------Starting task7a---------------------------------
Running statement:
 
            SELECT COUNT(DISTINCT user_id) as '# Users With Overnight Activities'
            FROM Activities
            WHERE DATEDIFF(end_datetime, start_datetime) = 1;
        
Query Finished
--------------------------Finished task7a in 12.294 ms--------------------------



Unnamed: 0,# Users With Overnight Activities
0,98


In [27]:
tasks.task7b()

--------------------------------Starting task7b---------------------------------
Running statement:
 
            SELECT transportation_mode AS 'Transportation Mode', user_id AS UserId, TIMEDIFF(end_datetime, start_datetime) as Duration
            FROM Activities
            WHERE DATEDIFF(end_datetime, start_datetime) = 1;
        
Query Finished
--------------------------Finished task7b in 23.346 ms--------------------------



Unnamed: 0,Transportation Mode,UserId,Duration
0,,000,0 days 04:09:15
1,,000,0 days 00:10:45
2,,000,0 days 11:03:55
3,,000,0 days 01:43:20
4,,000,0 days 16:24:06
...,...,...,...
1006,,168,0 days 01:13:02
1007,,172,0 days 04:20:47
1008,,174,0 days 02:19:51
1009,,174,0 days 01:31:59


### Task 8

In [8]:
tasks.task8()



---------------------------------Starting task8---------------------------------
Running statement:
 
            SELECT DISTINCT left.user_id
            FROM full AS left
            INNER JOIN full AS right
                ON left.user_id != right.user_id
                AND left.datetime <= right.datetime
                -- Out of these, find track points that are witihin 50 meters of each other, using a minimum
                -- bounding rectangle
                -- e.g.
                -- All points inside this rectangle, where we have sides of 50 meters
                --    50 m
                -- ---------
                -- |       |
                -- |   .   |  50 m
                -- |       |
                -- ---------
                -- 
                -- After making a first pass (which is very efficient as we use the spatial index of geom)
                -- we find the ones that are actually inside the circle.
                AND MBRContains(ST_BUFFER(left.geom, 5

ProgrammingError: 1064 (42000): You have an error in your SQL syntax; check the manual that corresponds to your MySQL server version for the right syntax to use near 'left
            INNER JOIN full AS right
                ON left.user_id != rig' at line 2

In [19]:
# query = """
#     SELECT DISTINCT left.user_id AS UserID1, right.user_id AS UserID2
#     FROM full AS left
#     INNER JOIN full AS right
#         ON left.user_id != right.user_id
#         AND left.datetime <= right.datetime
#         AND MBRContains(ST_BUFFER(left.geom, 50), right.geom)
#         AND TIME_TO_SEC(TIMEDIFF(right.datetime, left.datetime)) <= 30
# """



query = """
SELECT DISTINCT p1.user_id as UserID1, p2.user_id AS UserID2
FROM full AS p1
INNER JOIN full AS p2
    ON p1.user_id != p2.user_id
LIMIT 10
"""

database.query(query)

Running statement:
 
SELECT DISTINCT p1.user_id as UserID1, p2.user_id AS UserID2
FROM full AS p1
INNER JOIN full AS p2
    ON p1.user_id != p2.user_id
LIMIT 10



DatabaseError: 1137 (HY000): Can't reopen table: 'p1'

In [18]:
database.cursor.fetchall()


[]

In [21]:
left_table = """
    CREATE TEMPORARY TABLE p1 AS
        SELECT u.id as user_id, tp.datetime as datetime, tp.geom as geom
        FROM TrackPoints as tp
        INNER JOIN Activities as a
            ON a.id = tp.activity_id
        INNER JOIN Users as u
            on u.id = a.user_id
"""
database.query(left_table)

Running statement:
 
    CREATE TEMPORARY TABLE p1 AS
        SELECT u.id as user_id, tp.datetime as datetime, tp.geom as geom
        FROM TrackPoints as tp
        INNER JOIN Activities as a
            ON a.id = tp.activity_id
        INNER JOIN Users as u
            on u.id = a.user_id

Query Finished


In [22]:
right_table = """
    CREATE TEMPORARY TABLE p2 AS
        SELECT u.id as user_id, tp.datetime as datetime, tp.geom as geom
        FROM TrackPoints as tp
        INNER JOIN Activities as a
            ON a.id = tp.activity_id
        INNER JOIN Users as u
            on u.id = a.user_id
"""
database.query(right_table)

Running statement:
 
    CREATE TEMPORARY TABLE p2 AS
        SELECT u.id as user_id, tp.datetime as datetime, tp.geom as geom
        FROM TrackPoints as tp
        INNER JOIN Activities as a
            ON a.id = tp.activity_id
        INNER JOIN Users as u
            on u.id = a.user_id

Query Finished


In [9]:
# query = """
# SELECT COUNT(DISTINCT u1.id, u2.id) as num_users
# FROM Users AS u1
# JOIN Users AS u2 ON u1.id < u2.id
# JOIN Activities AS a1 ON a1.user_id = u1.id
# JOIN Activities AS a2 ON a2.user_id = u2.id
# JOIN TrackPoints AS tp1 ON a1.id = tp1.activity_id
# JOIN TrackPoints AS tp2 ON a2.id = tp2.activity_id
# WHERE MBRContains(ST_BUFFER(tp1.geom, 50), tp2.geom)
# AND ABS(TIME_TO_SEC(TIMEDIFF(tp1.datetime, tp2.datetime))) <= 30
# AND ST_Distance_Sphere(tp1.geom, tp2.geom) <= 50
# """
query = """
WITH user_pairs AS (
    SELECT a1.user_id AS user_id1, a2.user_id AS user_id2
    FROM Activities AS a1
    JOIN Activities AS a2 ON a1.user_id < a2.user_id
    JOIN TrackPoints AS tp1 ON a1.id = tp1.activity_id
    JOIN TrackPoints AS tp2 ON a2.id = tp2.activity_id AND tp1.id < tp2.id
    WHERE ST_Distance_Sphere(tp1.geom, tp2.geom) <= 50
    AND ABS(TIME_TO_SEC(TIMEDIFF(tp1.datetime, tp2.datetime))) <= 30
)
SELECT COUNT(DISTINCT user_id) as num_users
FROM (
    SELECT user_id1 AS user_id FROM user_pairs
    UNION
    SELECT user_id2 FROM user_pairs
) AS user_ids;
"""


database.query(query)

Running statement:
 
WITH user_pairs AS (
    SELECT a1.user_id AS user_id1, a2.user_id AS user_id2
    FROM Activities AS a1
    JOIN Activities AS a2 ON a1.user_id < a2.user_id
    JOIN TrackPoints AS tp1 ON a1.id = tp1.activity_id
    JOIN TrackPoints AS tp2 ON a2.id = tp2.activity_id AND tp1.id < tp2.id
    WHERE ST_Distance_Sphere(tp1.geom, tp2.geom) <= 50
    AND ABS(TIME_TO_SEC(TIMEDIFF(tp1.datetime, tp2.datetime))) <= 30
)
SELECT COUNT(DISTINCT user_id) as num_users
FROM (
    SELECT user_id1 AS user_id FROM user_pairs
    UNION
    SELECT user_id2 FROM user_pairs
) AS user_ids;

