# Introduction
This notebook sets up and seeds the database (Part 1) and then runs the exercise's query tasks against it (Part 2).

In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to the imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from migrator import Migrator
from database import Database
from environs import Env
import tasks as t

## Part 1
Setting up and seeding the database.

In [3]:
# Read settings from the ".env" file so the database credentials used in the
# next cell are not hardcoded in the notebook.
# NOTE(review): ".env" is a relative path — presumably resolved against the
# notebook's working directory; confirm when running from elsewhere.
env = Env()
env.read_env(".env")

In [4]:
# Build the shared database handle from the DB_* settings in the environment.
# Per the output below, constructing it reports the server version and the
# currently selected database.
database = Database(
    host=env.str("DB_HOST"),
    port=env.int("DB_PORT"),
    user=env.str("DB_USER"),
    password=env.str("DB_PASSWORD"),
    database=env.str("DB_DATABASE")
)

Connected to: 8.1.0
You are connected to the database: ('mysql',)
-----------------------------------------------



### Create tables

In [5]:
# Migrator built on the shared database handle; the cells below use it to
# create tables, seed data and add indices.
migrator = Migrator(database)

In [6]:
# Run the SQL migration files the migrator finds (users, activities and
# trackpoints in the recorded run below).
migrator.migrate()

--------------------------------Starting migrate--------------------------------
Found migration files:  ['001_users.sql', '002_activities.sql', '003_trackpoints.sql']
Running migration:   001_users.sql       ✅
Running migration:   002_activities.sql  ✅
Running migration:   003_trackpoints.sql ✅
-------------------------Finished migrate in 38.838 ms--------------------------



### (DANGER) Wipe the DB
🚨 THIS WILL WIPE ALL DATA IN THE TABLES 🚨

In [None]:
# Kept commented out so that "Restart & Run All" never wipes the tables.
# Uncomment and run this cell manually only when a clean re-seed is needed.
# migrator.wipe()

### Seed Database from Data Set

In [7]:
# Seed the Users table from the data set.
# NOTE(review): the log line below prints the raw query result ("[(182,)]")
# instead of the plain number; the formatting lives in the migrator module.
migrator.seed_users()

------------------------------Starting seed_users-------------------------------
Seeded [(182,)] Users
------------------------Finished seed_users in 7.831 ms-------------------------



In [8]:
# Seed the activities; the migrator logs one progress line per user.
migrator.seed_activities()

----------------------------Starting seed_activities----------------------------
Generating seed data for user: 135	✅
Generating seed data for user: 132	✅
Generating seed data for user: 104	✅
Generating seed data for user: 103	✅
Generating seed data for user: 168	✅
Generating seed data for user: 157	✅
Generating seed data for user: 150	✅
Generating seed data for user: 159	✅
Generating seed data for user: 166	✅
Generating seed data for user: 161	✅
Generating seed data for user: 102	✅
Generating seed data for user: 105	✅
Generating seed data for user: 133	✅
Generating seed data for user: 134	✅
Generating seed data for user: 160	✅
Generating seed data for user: 158	✅
Generating seed data for user: 167	✅
Generating seed data for user: 151	✅
Generating seed data for user: 169	✅
Generating seed data for user: 156	✅
Generating seed data for user: 024	✅
Generating seed data for user: 023	✅
Generating seed data for user: 015	✅
Generating seed data for user: 012	✅
Generating seed data for user: 

In [9]:
# Seed the track points; the migrator logs one progress line per user.
migrator.seed_track_points()

---------------------------Starting seed_track_points---------------------------
Generating seed data for user: 135	✅
Generating seed data for user: 132	✅
Generating seed data for user: 104	✅
Generating seed data for user: 103	✅
Generating seed data for user: 168	✅
Generating seed data for user: 157	✅
Generating seed data for user: 150	✅
Generating seed data for user: 159	✅
Generating seed data for user: 166	✅
Generating seed data for user: 161	✅
Generating seed data for user: 102	✅
Generating seed data for user: 105	✅
Generating seed data for user: 133	✅
Generating seed data for user: 134	✅
Generating seed data for user: 160	✅
Generating seed data for user: 158	✅
Generating seed data for user: 167	✅
Generating seed data for user: 151	✅
Generating seed data for user: 169	✅
Generating seed data for user: 156	✅
Generating seed data for user: 024	✅
Generating seed data for user: 023	✅
Generating seed data for user: 015	✅
Generating seed data for user: 012	✅
Generating seed data for user: 

In [10]:
# Add the secondary index reported in the output (datetime_idx on
# TrackPoints.datetime). This took roughly 11 s in the recorded run.
migrator.create_indices()

----------------------------Starting create_indices-----------------------------
Executing statement
 ALTER TABLE TrackPoints ADD INDEX datetime_idx (datetime); ✅
--------------------Finished create_indices in 11427.037 ms---------------------



## Part 2
Running the query tasks against the seeded database.

In [11]:
# Task runner on the shared database handle; each taskN() call below prints
# the SQL statement it executes and displays the result as a table.
tasks = t.Task(database)

### Task 1

In [12]:
# Row counts of the Users, Activities and TrackPoints tables.
tasks.task1()

---------------------------------Starting task1---------------------------------
Running statement:
 
        SELECT
            (SELECT Count(*) AS UsersCount FROM Users) AS '# Users',
            (SELECT Count(*) AS UsersCount FROM Activities) AS '# Activities',
            (SELECT Count(*) AS UsersCount FROM TrackPoints) AS '# TrackPoints';
        
Query Finished
--------------------------Finished task1 in 823.754 ms--------------------------



Unnamed: 0,# Users,# Activities,# TrackPoints
0,182,16048,9681756


### Task 2

In [13]:
# Average, maximum and minimum number of track points per user.
# NOTE(review): the printed query groups rows that start from TrackPoints, so
# users without any track point form no group and are left out of Avg/Min.
tasks.task2()

---------------------------------Starting task2---------------------------------
Running statement:
 
            SELECT CAST(ROUND(AVG(count), 0) AS SIGNED) AS Avg, MAX(count) AS Max, MIN(count) AS Min
            FROM (
                SELECT COUNT(*) AS count
                FROM TrackPoints as tp
                LEFT JOIN Activities as a
                    ON tp.activity_id = a.id
                GROUP BY a.user_id
            ) as counts;
        
Query Finished
-------------------------Finished task2 in 5014.978 ms--------------------------



Unnamed: 0,Avg,Max,Min
0,55964,1010325,17


### Task 3

In [14]:
# Users ranked by number of activities, highest first.
# NOTE(review): the printed query uses LIMIT 15 but the table below shows
# only 10 rows — presumably the rendered output is truncated; confirm.
tasks.task3()

---------------------------------Starting task3---------------------------------
Running statement:
 
            SELECT UserId, ActivityCount
            FROM
                (
                    SELECT Count(*) as ActivityCount, u.id as UserId
                    FROM Activities as a
                    LEFT JOIN Users as u
                        on a.user_id = u.id
                    GROUP BY u.id
                ) as activityCounts
            ORDER BY ActivityCount DESC
            LIMIT 15;
        
Query Finished
--------------------------Finished task3 in 12.835 ms---------------------------



Unnamed: 0,UserId,ActivityCount
0,128,2102
1,153,1793
2,25,715
3,163,704
4,62,691
5,144,563
6,41,399
7,85,364
8,4,346
9,140,345


### Task 4

In [15]:
# Distinct users that have at least one activity labelled as bus.
# NOTE(review): the printed query uses LIKE 'Bus' without wildcards, so it
# behaves like an equality test; the modes shown elsewhere in this notebook
# are lowercase ("bus"), so the match presumably relies on a case-insensitive
# column collation — confirm.
tasks.task4()

---------------------------------Starting task4---------------------------------
Running statement:
 
            SELECT DISTINCT user_id AS UserId
            FROM Activities
            WHERE transportation_mode LIKE 'Bus'
            ORDER BY user_id;
        
Query Finished
--------------------------Finished task4 in 12.909 ms---------------------------



Unnamed: 0,UserId
0,10
1,52
2,62
3,73
4,81
5,84
6,85
7,91
8,92
9,112


### Task 5

In [16]:
# Top 10 users by number of distinct (non-empty) transportation modes,
# together with the list of modes each of them used.
tasks.task5()

---------------------------------Starting task5---------------------------------
Running statement:
 
            SELECT DISTINCT u.id AS UserID, Count(DISTINCT a.transportation_mode) as '# Transportation Modes', GROUP_CONCAT(DISTINCT a.transportation_mode SEPARATOR ', ') AS 'Transportation Modes'
            FROM Activities AS a
            LEFT JOIN Users AS u
                ON a.user_id = u.id
            WHERE a.transportation_mode != ""
            GROUP BY u.id
            ORDER BY Count(DISTINCT a.transportation_mode) DESC
            LIMIT 10;
        
Query Finished
---------------------------Finished task5 in 5.903 ms---------------------------



Unnamed: 0,UserID,# Transportation Modes,Transportation Modes
0,128,9,"airplane, bike, boat, bus, car, subway, taxi, ..."
1,62,7,"bike, bus, car, run, taxi, train, walk"
2,85,4,"bus, subway, taxi, walk"
3,112,3,"bike, bus, walk"
4,84,3,"bus, subway, walk"
5,81,3,"bike, bus, walk"
6,78,3,"subway, taxi, walk"
7,58,3,"car, taxi, walk"
8,163,3,"bike, taxi, walk"
9,86,2,"car, walk"


### Task 6

In [17]:
# Duplicate check: pairs of activities of the same user with identical start
# and end datetimes. The recorded result is empty.
tasks.task6()

---------------------------------Starting task6---------------------------------
Running statement:
 
            SELECT a1.id as activity1_id, a2.id AS activity2_id
            FROM Activities AS a1
            JOIN Activities AS a2
            ON a1.id < a2.id
            AND a1.user_id = a2.user_id
            AND a1.start_datetime = a2.start_datetime
            AND a1.end_datetime = a2.end_datetime;
        
Query Finished
--------------------------Finished task6 in 46.446 ms---------------------------



Unnamed: 0,activity1_id,activity2_id


### Task 7

In [18]:
# Number of distinct users with an activity whose end date is exactly one
# calendar day after its start date (DATEDIFF(...) = 1 in the printed query).
# NOTE(review): activities spanning two or more days are not counted by this
# filter — confirm that matches the task definition.
tasks.task7a()

--------------------------------Starting task7a---------------------------------
Running statement:
 
            SELECT COUNT(DISTINCT user_id) as '# Users With Overnight Activities'
            FROM Activities
            WHERE DATEDIFF(end_datetime, start_datetime) = 1;
        
Query Finished
--------------------------Finished task7a in 4.455 ms---------------------------



Unnamed: 0,# Users With Overnight Activities
0,98


In [19]:
# Transportation mode, user and duration for the first 10 activities that
# match the same overnight filter as task 7a.
tasks.task7b()

--------------------------------Starting task7b---------------------------------
Running statement:
 
            SELECT transportation_mode AS 'Transportation Mode', user_id AS UserId, TIMEDIFF(end_datetime, start_datetime) as Duration
            FROM Activities
            WHERE DATEDIFF(end_datetime, start_datetime) = 1
            LIMIT 10;
        
Query Finished
--------------------------Finished task7b in 3.624 ms---------------------------



Unnamed: 0,Transportation Mode,UserId,Duration
0,,0,0 days 04:09:15
1,,0,0 days 00:10:45
2,,0,0 days 11:03:55
3,,0,0 days 01:43:20
4,,0,0 days 16:24:06
5,,1,0 days 06:54:46
6,,1,0 days 00:09:30
7,,1,0 days 00:24:57
8,,1,0 days 14:17:28
9,,1,0 days 08:26:34


### Task 8

In [None]:
# Users who have been close to another user: the printed query looks for track
# points of different users within 30 seconds and 50 meters of each other.
# NOTE(review): this cell has no execution count and its output lacks the
# start/finish banner the other task cells print, so the table below may come
# from an earlier run — re-run to confirm.
tasks.task8()

Running statement:
 
WITH user_pairs AS (
    SELECT DISTINCT a1.user_id AS user_id1, a2.user_id AS user_id2
    FROM Activities a1
    JOIN Activities a2 ON a1.id < a2.id
        -- We restrict the search space to activities that overlap with a 30 second margin
        -- to limit the number of track point comparisons that we have to perform.
        AND TIME_TO_SEC(TIMEDIFF(a2.start_datetime, a1.end_datetime)) <= 30
        AND TIME_TO_SEC(TIMEDIFF(a1.start_datetime, a2.end_datetime)) <= 30
        -- Avoid comparing a user to themselves
        AND a1.user_id < a2.user_id
    JOIN TrackPoints p1 ON p1.activity_id = a1.id
    JOIN TrackPoints p2 ON p2.activity_id = a2.id
    -- Then, after restricting the search space, we check for
    -- track points that are close both in time and space
    WHERE ABS(TIME_TO_SEC(TIMEDIFF(p1.datetime, p2.datetime))) <= 30
    AND ST_Distance_Sphere(p1.geom, p2.geom) <= 50
)
-- Finally, we select the list of distinct user_ids of users
-- who have bee

Unnamed: 0,user_id
0,000
1,001
2,003
3,004
4,005
...,...
116,173
117,174
118,175
119,176


### Task 9

In [20]:
# Users ranked by total altitude gain: sums the positive altitude differences
# between track points with adjacent ids in the same activity, skipping the
# -777 altitude value. Took roughly 24 s in the recorded run.
# NOTE(review): pairing on "id + 1" assumes track point ids are contiguous and
# time-ordered within an activity — confirm against the seeding code.
tasks.task9()

---------------------------------Starting task9---------------------------------
Running statement:
 
            SELECT a1.user_id, SUM(tp2.altitude - tp1.altitude) AS 'Altitude Gain'
            FROM TrackPoints tp1
            JOIN TrackPoints tp2 ON tp2.id = tp1.id + 1
                AND tp2.altitude > tp1.altitude
                AND tp1.activity_id = tp2.activity_id
            JOIN Activities a1 ON a1.id = tp1.activity_id
            WHERE tp1.altitude != -777
            AND tp2.altitude != -777
            GROUP BY a1.user_id
            ORDER BY SUM(tp2.altitude - tp1.altitude) DESC
            LIMIT 15;
        
Query Finished
-------------------------Finished task9 in 23646.309 ms-------------------------



Unnamed: 0,user_id,Altitude Gain
0,128,2135455
1,153,1820766
2,4,1089358
3,41,789890
4,3,766613
5,85,714049
6,163,673439
7,62,596103
8,144,588771
9,30,576377


### Task 10

In [25]:
# Longest distance (km) per transportation mode, and the user who covered it;
# the printed query sums point-to-point distances per user, mode and day.
# NOTE(review): this cell's execution count (25) is out of sequence with its
# neighbours, so it was re-run after the later cells; verify with
# Restart & Run All.
tasks.task10()

--------------------------------Starting task10---------------------------------
Running statement:
 
            WITH distances AS (
                SELECT a1.user_id AS user_id, a1.transportation_mode AS transportation_mode, ROUND(SUM(ST_DISTANCE_SPHERE(tp1.geom, tp2.geom)) / 1000, 2) AS distance
                FROM Activities a1
                JOIN TrackPoints tp1 ON tp1.activity_id = a1.id
                JOIN TrackPoints tp2 ON tp2.id = tp1.id + 1 
                    AND tp1.activity_id = tp2.activity_id
                WHERE a1.transportation_mode != ""
                GROUP BY a1.user_id, a1.transportation_mode, DATE(tp1.datetime)
                ORDER BY distance DESC
            )
            SELECT max.transportation_mode AS 'Transportation Mode', max.distance AS 'Max Distance (km)', MAX(d2.user_id) AS UserID
            FROM (
                SELECT d1.transportation_mode, MAX(d1.distance) AS distance
                FROM distances d1
                GROUP BY d1.transport

Unnamed: 0,Transportation Mode,Max Distance (km),UserID
0,airplane,2527.12,128
1,car,398.17,128
2,train,277.26,62
3,bus,207.41,128
4,boat,65.55,128
5,bike,63.11,128
6,taxi,40.22,128
7,subway,33.94,128
8,walk,22.81,139
9,run,0.03,78


### Task 11

In [22]:
# Top 10 users by number of invalid activities, where an activity is invalid
# if two track points with adjacent ids are at least 5 minutes apart.
# Took roughly 49 s in the recorded run.
tasks.task11()

--------------------------------Starting task11---------------------------------
Running statement:
 
            SELECT a1.user_id as UserID, COUNT(DISTINCT a1.id) as '# Invalid Activities'
            FROM Activities a1
            JOIN TrackPoints p1 ON a1.id = p1.activity_id
            JOIN TrackPoints p2 ON p2.id = p1.id + 1
                AND p2.activity_id = p1.activity_id
            WHERE ABS(TIME_TO_SEC(TIMEDIFF(p1.datetime, p2.datetime))) >= 5 * 60
            GROUP BY a1.user_id
            ORDER BY COUNT(DISTINCT a1.id) DESC
            LIMIT 10;
        
Query Finished
------------------------Finished task11 in 49217.174 ms-------------------------



Unnamed: 0,UserID,# Invalid Activities
0,128,720
1,153,557
2,25,263
3,62,249
4,163,233
5,4,219
6,41,201
7,85,184
8,3,179
9,144,157


### Task 12

In [23]:
# Most used transportation mode per user (first 10 users shown).
# NOTE(review): the printed query joins "counts c2" on the count value only
# (max.count = c2.count), not on user_id, so the reported mode can come from a
# different user who happens to have the same count. The join likely also
# needs "AND max.user_id = c2.user_id"; fix in the tasks module and re-run.
tasks.task12()

--------------------------------Starting task12---------------------------------
Running statement:
 
        WITH counts AS (
            SELECT user_id, transportation_mode, COUNT(transportation_mode) as count
            FROM Activities
            WHERE transportation_mode != ""
            GROUP BY user_id, transportation_mode
            ORDER BY user_id, COUNT(transportation_mode) DESC
        )
        SELECT max.user_id AS user_id, MAX(c2.transportation_mode) AS most_used_transportation_mode
        FROM (
            SELECT c1.user_id, MAX(c1.count) AS count
            FROM counts c1
            GROUP BY c1.user_id
        ) AS max
        LEFT JOIN counts c2 ON max.count = c2.count
        GROUP BY max.user_id
        LIMIT 10;
        
Query Finished
--------------------------Finished task12 in 20.466 ms--------------------------



Unnamed: 0,user_id,most_used_transportation_mode
0,10,walk
1,20,bike
2,21,walk
3,52,walk
4,56,taxi
5,58,walk
6,60,walk
7,62,walk
8,64,walk
9,65,car
