## create a connection to the database using the library sqlite3

In [40]:
from pandas.io.sql import read_sql, to_sql
import sqlite3

con = sqlite3.connect('../data/checking-logs.sqlite')

## get the schema of the table checker

In [41]:
read_sql('PRAGMA table_info(checker)', con)

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,index,INTEGER,0,,0
1,1,status,TEXT,0,,0
2,2,success,INTEGER,0,,0
3,3,timestamp,TIMESTAMP,0,,0
4,4,numTrials,INTEGER,0,,0
5,5,labname,TEXT,0,,0
6,6,uid,TEXT,0,,0


## get only 10 first rows of the table checker to check how the table looks like

In [42]:
read_sql('SELECT * FROM checker LIMIT 10', con)

Unnamed: 0,index,status,success,timestamp,numTrials,labname,uid
0,0,checking,0,2020-04-16 21:12:50.740474,5,,admin_1
1,1,ready,0,2020-04-16 21:12:54.708365,5,code_rvw,admin_1
2,2,checking,0,2020-04-16 21:46:47.769088,7,,admin_1
3,3,ready,0,2020-04-16 21:46:48.121217,7,lab02,admin_1
4,4,checking,0,2020-04-16 21:53:01.862637,6,code_rvw,admin_1
5,5,ready,0,2020-04-16 21:53:05.373389,6,code_rvw,admin_1
6,6,checking,0,2020-04-17 05:18:51.965864,1,,
7,7,ready,0,2020-04-17 05:19:02.744528,1,project1,user_4
8,8,checking,0,2020-04-17 05:22:35.249331,2,project1,user_4
9,9,ready,1,2020-04-17 05:22:45.549397,2,project1,user_4


## count how many rows satisfy the following conditions using only one query with any number of subqueries:
### count the rows from the pageviews table but only with the users from the checker table with:
* status = ’ready’, we do not want to analyze the logs that are in status checking
* numTrials = 1, we want to analyze only the first commits, because only
they can tell us when a student started working on a lab
* labnames should be from the list: ’laba04’, ’laba04s’, ’laba05’, ’laba06’,
’laba06s’, ’project1’, only they were active during the experiment

In [43]:
query = '''
    SELECT checker.*, T.cnt FROM checker
    CROSS JOIN (
        SELECT count() as cnt FROM (SELECT * FROM checker
        INNER JOIN pageviews on checker.uid = pageviews.uid
        WHERE status='ready' 
        AND numTrials=1 
        AND labname in ("laba04", "laba04s", "laba05", "laba06", "laba06s", "project1"))
    ) T
'''
read_sql(query, con)

Unnamed: 0,index,status,success,timestamp,numTrials,labname,uid,cnt
0,0,checking,0,2020-04-16 21:12:50.740474,5,,admin_1,5584
1,1,ready,0,2020-04-16 21:12:54.708365,5,code_rvw,admin_1,5584
2,2,checking,0,2020-04-16 21:46:47.769088,7,,admin_1,5584
3,3,ready,0,2020-04-16 21:46:48.121217,7,lab02,admin_1,5584
4,4,checking,0,2020-04-16 21:53:01.862637,6,code_rvw,admin_1,5584
...,...,...,...,...,...,...,...,...
3397,3397,ready,0,2020-05-21 20:19:06.872761,7,laba06s,user_1,5584
3398,3398,checking,0,2020-05-21 20:22:41.785725,8,laba06s,user_1,5584
3399,3399,ready,0,2020-05-21 20:22:41.877806,8,laba06s,user_1,5584
3400,3400,checking,0,2020-05-21 20:37:00.129678,9,laba06s,user_1,5584


close the connection

In [44]:
con.close()