In [1]:
# SQLite driver, pandas and matplotlib for the analysis.
import sqlite3
import pandas as pd
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the `sql` IPython extension, which provides the %sql and %%sql magics
# used by the cells that follow.
%load_ext sql

In [2]:
# Connect the %sql magic to the SQLite database file results.db (relative path);
# the later %sql / %%sql cells run against this connection.
%sql sqlite:///results.db

In [12]:
%%sql top50 <<
with
    -- Purpose - collect example aliases for the most frequently used commands.
    -- For each of the 50 most common command names, take its 10 most common
    -- argument strings, and for each of those the 3 most common aliases.
    -- The result (alias_name, alias_value) is stored in the variable top50.
    --
    -- Every ranking carries explicit tiebreaker columns so that equal counts
    -- are resolved the same way on every run. Window functions need
    -- SQLite 3.25 or newer.

    -- The 50 command names with the most rows in the command table.
    top_commands as (
        select name as cmd_name, count(*) as cmd_count
        from command
        group by cmd_name
        order by cmd_count desc, cmd_name
        limit 50
    ),
    -- How often each argument string occurs for each of the top commands.
    top_arguments as (
        select name as cmd_name, arguments as cmd_args, count(*) as arg_count
        from command
        where cmd_name in (select cmd_name from top_commands)
        group by cmd_name, cmd_args
    ),
    -- Rank the argument strings inside each command, most frequent first.
    X as (
        select *, row_number() over (partition by cmd_name order by arg_count desc, cmd_args) as num
        from top_arguments
    ),
    -- How often each alias definition maps to a given command and arguments.
    -- alias_value is part of the grouping key so that the reported value is
    -- the one that was actually counted. Grouping by alias_name alone would
    -- leave alias_value as a bare column, for which SQLite returns the value
    -- of an arbitrary row in the group.
    top_aliases as (
        select
            command.name as cmd_name,
            command.arguments as cmd_args,
            alias.name as alias_name,
            alias.value as alias_value,
            count(*) as alias_count
        from alias join command using (alias_id)
        where cmd_name in (select cmd_name from top_commands)
        group by cmd_name, cmd_args, alias_name, alias_value
    ),
    -- Rank the aliases inside each (command, arguments) pair.
    Y as (
        select *, row_number() over (partition by cmd_name, cmd_args order by alias_count desc, alias_name, alias_value) as num
        from top_aliases
    )

-- Keep the top 10 argument strings per command and the top 3 aliases per
-- argument string, listed from most to least popular.
select alias_name, alias_value
from top_commands join X using (cmd_name) join Y using (cmd_name, cmd_args)
where X.num <= 10 and Y.num <= 3
order by cmd_count desc, arg_count desc, alias_count desc, cmd_name, cmd_args, alias_name, alias_value;

 * sqlite:///results.db
Done.
Returning data to local variable top50


In [13]:
# Write the result set stored in top50 by the %%sql cell to the file top50.csv.
top50.csv(filename='top50.csv')

In [14]:
longtail = %sql select alias.name, alias.value from alias group by alias.value, alias.name having count(*) = 1 order by random() limit 200;
longtail.csv(filename='longtail.csv')

 * sqlite:///results.db
Done.
