In [1]:
import sqlite3
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
%load_ext sql

In [2]:
# Point the SQL magic at the SQLite database file results.db (path relative to
# the working directory); the SQL cells in this notebook run against it.
%sql sqlite:///results.db

# Substituting Commands

First, we need a list of known, real commands. We use the lists of standard Unix tools from Wikipedia (https://en.wikipedia.org/wiki/List_of_Unix_commands and https://en.wikipedia.org/wiki/List_of_GNU_Core_Utilities_commands) as well as frequently used command names from our dataset. The names taken from the dataset must be checked to be actual commands rather than aliases that are themselves used like commands, so some manual curation is necessary. Afterwards, we create a table of known commands.

Note: The dataset from Zenodo already contains the `known_commands` table, so it does not need to be created again when working with that dataset.

In [None]:
%%sql
/* Candidate names for the manual curation of known commands. Lists every
   name in the command table that contains no equals sign, hyphen or dollar
   sign and that occurs more than 1000 times, most frequent first. */
SELECT name, count(*)
FROM command
WHERE NOT (name LIKE '%=%' OR name LIKE '%-%' OR name LIKE '%$%')
GROUP BY name
HAVING count(*) > 1000
ORDER BY count(*) DESC;

In [None]:
%%sql
/* Curated list of names that are treated as real commands.
   The cell is safe to run repeatedly and against a database that already
   ships the table. The table is only created when it is missing and names
   that are already present are skipped instead of raising a primary key
   conflict. */
create table if not exists known_commands (name text primary key);
insert or ignore into known_commands (name) values
('.'),('['),('ack'),('adb'),('admin'),('alias'),('apt'),('aptitude'),('ar'),('arch'),('asa'),('at'),('atom'),('awk'),('aws'),('b2sum'),('base32'),('base64'),('basename'),('bash'),('bat'),('batch'),('bc'),('bg'),('brew'),('bundle'),('c99'),('cal'),('cat'),('cc'),('cd'),('cflow'),('chcon'),('chgrp'),('chmod'),('chown'),('chroot'),('cksum'),('clear'),('cmp'),('code'),('colordiff'),('colorls'),('colourify'),('column'),('comm'),('command'),('composer'),('compress'),('cp'),('crontab'),('csplit'),('ctags'),('curl'),('cut'),('cxref'),('date'),('dd'),('defaults'),('delta'),('df'),('diff'),('dig'),('dir'),('dircolors'),('dirname'),('dirs'),('dnf'),('do'),('docker'),('done'),('drush'),('dscacheutil'),('du'),('echo'),('ed'),('edit'),('egrep'),('emacs'),('emacsclient'),('env'),('eval'),('ex'),('exa'),('exec'),('exit'),('expand'),('export'),('expr'),('factor'),('false'),('fasd'),('fc'),('feh'),('fg'),('fgrep'),('file'),('find'),('fmt'),('fold'),('for'),('fort77'),('free'),('function'),('fuser'),('fzf'),('g++'),('gcc'),('gem'),('gencat'),('get'),('getconf'),('getopts'),('git'),('gitk'),('grep'),('groups'),('gvim'),('hash'),('head'),('heroku'),('hg'),('history'),('hostid'),('htop'),('hub'),('iconv'),('id'),('ifconfig'),('install'),('ipconfig'),('ipcrm'),('ipcs'),('ipython'),('java'),('jobs'),('join'),('journalctl'),('jupyter'),('kill'),('killall'),('kubectl'),('launchctl'),('less'),('lex'),('link'),('ln'),('locale'),('localedef'),('logger'),('logname'),('lp'),('ls'),('lsof'),('m4'),('mailx'),('make'),('man'),('mate'),('md5sum'),('mdutil'),('mesg'),('mix'),('mkdir'),('mkfifo'),('mkfile'),('mknod'),('mktemp'),('more'),('mount'),('mplayer'),('mpv'),('mv'),('mvim'),('mvn'),('mysql'),('nano'),('netstat'),('newgrp'),('nice'),('nl'),('nm'),('nocorrect'),('node'),('noglob'),('nohup'),('npm'),('nproc'),('numfmt'),('nvim'),('od'),('open'),('openssl'),('osascript'),('pacaur'),('pacman'),('paste'),('patch'),('pathchk'),('pax'),('pbcopy'),('perl'),('pg_ctl'),('php'),('ping'),('pinky'),('pip'),
('pip3'),('pkill'),('popd'),('pr'),('printenv'),('printf'),('prs'),('ps'),('ptx'),('pushd'),('pwd'),('pygmentize'),('python'),('python2'),('python3'),('qalter'),('qdel'),('qhold'),('qmove'),('qmsg'),('qrerun'),('qrls'),('qselect'),('qsig'),('qstat'),('qsub'),('rails'),('rake'),('ranger'),('read'),('readlink'),('realpath'),('reboot'),('renice'),('rg'),('rlwrap'),('rm'),('rmdel'),('rmdir'),('rsync'),('ruby'),('runcon'),('sact'),('sccs'),('scp'),('screen'),('sed'),('seq'),('service'),('set'),('setxkbmap'),('sh'),('sha1sum'),('sha224sum'),('sha256sum'),('sha384sum'),('sha512sum'),('shred'),('shuf'),('shutdown'),('sleep'),('sort'),('source'),('split'),('spring'),('ssh'),('sshfs'),('stat'),('stdbuf'),('strings'),('strip'),('stty'),('su'),('subl'),('sudo'),('sum'),('svn'),('sync'),('systemctl'),('tabs'),('tac'),('tail'),('talk'),('tar'),('task'),('tcpdump'),('tee'),('terraform'),('test'),('then'),('time'),('timeout'),('tmux'),('tmuxinator'),('top'),('touch'),('tput'),('tr'),('tree'),('true'),('truncate'),('tsort'),('tty'),('type'),('ulimit'),('umask'),('umount'),('unalias'),('uname'),('uncompress'),('unexpand'),('unget'),('uniq'),('unlink'),('unset'),('uptime'),('users'),('uucp'),('uudecode'),('uuencode'),('uustat'),('uux'),('vagrant'),('val'),('vdir'),('vi'),('vim'),('wait'),('watch'),('wc'),('wget'),('what'),('who'),('whoami'),('write'),('xargs'),('xclip'),('xrandr'),('xsel'),('yacc'),('yaourt'),('yarn'),('yay'),('yes'),('yum'),('zcat'),('zeus')

---

In [25]:
%%sql
/* Number and percentage (relative to all rows of the alias table) of aliases
   whose name is a known command while the value neither equals that name
   nor contains it as a space-delimited word at the start, at the end or in
   the middle. */
WITH alias_total AS (SELECT count(*) AS n FROM alias)
SELECT count(*) AS "#",
       round(count(*) * 100.0 / alias_total.n, 2) AS "%"
FROM alias
JOIN alias_total
WHERE alias.name IN (SELECT name FROM known_commands)
  AND NOT (
        alias.value = alias.name
     OR alias.value LIKE alias.name || ' %'
     OR alias.value LIKE '% ' || alias.name
     OR alias.value LIKE '% ' || alias.name || ' %'
  )

 * sqlite:///results.db
Done.


#,%
100564,4.56


In [26]:
%%sql
/* The 20 most frequent name and value pairs among aliases whose name is a
   known command while the value neither equals that name nor contains it
   as a space-delimited word at the start, at the end or in the middle.
   No total row count is needed here, so the alias table is scanned once.
   Ties in the count are broken by name and value so that the rows kept by
   the limit are reproducible. */
select alias.name, alias.value, count(*)
from alias
where alias.name in known_commands
and alias.value != alias.name
and alias.value not like alias.name || ' %'
and alias.value not like '% ' || alias.name
and alias.value not like '% ' || alias.name || ' %'
group by alias.name, alias.value
order by count(*) desc, alias.name, alias.value
limit 20;

 * sqlite:///results.db
Done.


name,value,count(*)
vi,vim,9648
vim,nvim,6936
vi,nvim,2771
python,python3,1947
more,less,1900
open,xdg-open,1647
git,hub,1626
diff,colordiff,1424
cat,bat,1321
pip,pip3,1013
