# Gathering system data - Python for System Administrators 

## Goals:
    - Gathering System Data with multiplatform and platform-dependent tools
    - Get infos from files, /proc, /sys
    - Capture command output
    - Use psutil to get IO, CPU and memory data
    - Parse files with a strategy
    
## Non-goals for this lesson:
    - use with, yield or pipes

## Modules

In [1]:
import psutil
import glob
import sys
import subprocess


In [2]:
#
# Our code is p3-ready
#
from __future__ import print_function, unicode_literals

In [3]:
def grep(needle, fpath):
    """A simple grep implementation

       goal: open() is iterable and doesn't
             need splitlines()
       goal: comprehension can filter lists
    """
    return [x for x in open(fpath) if needle in x]

# Do we have localhost?
grep("localhost", "/etc/hosts")

['127.0.0.1\tlocalhost\n', '::1\tlocalhost ip6-localhost ip6-loopback\n']

In [4]:
#The psutil module is very nice
import psutil

#Works on Windows, Linux and MacOS
psutil.cpu_percent() 

59.7

In [5]:
#And its output is very easy to manage
ret = psutil.disk_io_counters()
print(ret)

sdiskio(read_count=2365882, write_count=3881668, read_bytes=51355675136, write_bytes=91856771072, read_time=16088324, write_time=132931849, read_merged_count=49442, write_merged_count=422233, busy_time=9600992)


In [6]:
# Exercise: Which other informations 
# does psutil provide? 
# Use this cell and the tab-completion jupyter functionalities.

In [7]:
# Exercise
def multiplatform_vmstat(count):
    # Write a vmstat-like function printing every second:
    # - cpu usage%
    # - bytes read and written in the given interval
    # Hint: use psutil and time.sleep(1)
    # Hint: use this cell or try on ipython and *then* write the function
    #       using %edit vmstat.py
    for i in range(count):
        raise NotImplementedError
        print(cpu_usage, bytes_rw)

multiplatform_vmstat(5)

NotImplementedError: 

In [12]:
# %load course/multiplatform_vmstat.py
def multiplatform_vmstat(count):
    """Get data in a multiplatform way

    """
    import psutil
    import time
    cpu_percent, io_stat, io_stat_0 = 0, 0, 0
    print("cpu%", "iops(r+w)", sep="\t")
    for x in range(-count, 1):
        cpu_percent = psutil.cpu_percent()
        read_io, write_io = psutil.disk_io_counters()[:2]
        io_stat = read_io + write_io
        print(cpu_percent, io_stat - io_stat_0, sep='\t')
        io_stat_0 = io_stat
        if x:
            time.sleep(1)


In [13]:
# Run your vmstat implementation.

multiplatform_vmstat(5)

cpu% iops(r+w)
5.8	6250358
10.4	204
3.0	58
2.0	0
29.8	80
34.3	122


In [14]:
#
# subprocess
#
# The check_output function returns the command stdout
from subprocess import check_output

# It takes a *list* as an argument!
out = check_output("ping -w1  -c1 www.google.com".split())

# and returns a string
print(out)

PING www.google.com (216.58.208.36) 56(84) bytes of data.
64 bytes from fra15s12-in-f4.1e100.net (216.58.208.36): icmp_seq=1 ttl=49 time=29.8 ms

--- www.google.com ping statistics ---
1 packets transmitted, 1 received, 0% packet loss, time 0ms
rtt min/avg/max/mdev = 29.802/29.802/29.802/0.000 ms



In [None]:
# If you want to stream command output, use subprocess.Popen
#  and check carefully subprocess documentation!

In [22]:
def sh(cmd, shell=False, timeout=0):
    """"Returns an iterable output of a command string
        checking...
    """
    from sys import version_info as python_version
    if python_version < (3, 3): # ..before using..
        if timeout:
            raise ValueError("Timeout not supported until Python 3.3")
        output = check_output(cmd.split(), shell=shell)
    else:
        output = check_output(cmd.split(), shell=shell, timeout=timeout)
    return output.splitlines()

In [None]:
# Exercise:
# implement a multiplatform pgrep-like function.
def ppgrep(program):
    """
    A multiplatform pgrep-like function.
    Prints a list of processes executing 'program'
    @param program - eg firefox, explorer.exe
    
    Hint: use subprocess, os and list-comprehension
    eg. items = [x for x in a_list if 'firefox' in x] 
    """
    raise NotImplementedError

In [None]:
%load course/pgrep.py


## Parsing /proc

Linux /proc filesystem is a cool place to get data

In the next example we'll see how to get:
 - thread informations;
 - disk statistics;
 
 

In [32]:
# Parsing /proc - 1
def linux_threads(pid):
    """Retrieving data from /proc
    """
    from glob import glob
    # glob emulates shell expansion of * and ?
    path = "/proc/{}/task/*/status".format(pid)
    
         
    # pick a set of fields to gather
    t_info = ('Pid', 'Tgid', 'voluntary')  # this is a tuple!
    for t in glob(path):
        # ... and use comprehension to get 
        # intersting data.
        t_info = [x 
                  for x in open(t) 
                  if x.startswith(t_info)] # startswith accepts tuples!
        print(t_info)

In [33]:
# If you're on linux try linux_threads
pid_of_init = 1 # or systemd ?
linux_threads(pid_of_init)

['Tgid:\t1\n', 'Pid:\t1\n', 'voluntary_ctxt_switches:\t143\n']


In [40]:
# On linux /proc/diskstats is the source of I/O infos
disk_l = grep("sda", "/proc/diskstats")
print(''.join(disk_l))

   8       0 sda 1005394 53134 42170585 6272865 1790286 437093 74710584 53596891 0 4656802 59872828
   8       1 sda1 119 0 1816 1258 0 0 0 0 0 1258 1258
   8       2 sda2 51 0 1272 1234 0 0 0 0 0 1231 1234
   8       3 sda3 113 0 1768 1565 0 0 0 0 0 1564 1565
   8       4 sda4 115 0 1784 1835 0 0 0 0 0 1830 1835
   8       5 sda5 425 206 2282 2599 0 0 0 0 0 2002 2599
   8       6 sda6 117 0 1800 1014 0 0 0 0 0 1008 1014
   8       7 sda7 249 0 3794 2854 7 1 64 36 0 2575 2890
   8       8 sda8 994166 39099 41963093 6127853 1581563 166403 72503680 52067403 0 3666400 58198493
   8       9 sda9 9904 13829 191008 130101 5113 270689 2206840 222559 0 167888 352736



In [47]:
# To gather that data we put the header in a multiline string
from course import diskstats_headers as headers
print(*headers, sep='\n')

major
minor
device
reads
reads_merged
reads_sectors
reads_ms
writes
writes_merged
writes_sectors
writes_ms
io_in_progress
io_ms_spent
io_ms_weight


In [50]:
#Take the 1st entry (sda), split the data...
disk_info = disk_l[0].split()
# ... and tie them with the header
ret = zip(headers, disk_info)

# On py3 we need to iterate over the generators
print(list(ret))

[(u'major', '8'), (u'minor', '0'), (u'device', 'sda'), (u'reads', '1005394'), (u'reads_merged', '53134'), (u'reads_sectors', '42170585'), (u'reads_ms', '6272865'), (u'writes', '1790286'), (u'writes_merged', '437093'), (u'writes_sectors', '74710584'), (u'writes_ms', '53596891'), (u'io_in_progress', '0'), (u'io_ms_spent', '4656802'), (u'io_ms_weight', '59872828')]


In [53]:
# Try to mangle ret
print('\n'.join(str(x) for x in ret))
# Exercise: trasform ret in a dict.

(u'major', '8')
(u'minor', '0')
(u'device', 'sda')
(u'reads', '1005394')
(u'reads_merged', '53134')
(u'reads_sectors', '42170585')
(u'reads_ms', '6272865')
(u'writes', '1790286')
(u'writes_merged', '437093')
(u'writes_sectors', '74710584')
(u'writes_ms', '53596891')
(u'io_in_progress', '0')
(u'io_ms_spent', '4656802')
(u'io_ms_weight', '59872828')


In [38]:
# We can create a reusable commodity class with
from collections import namedtuple

# using the imported `headers` as attributes
# like the one provided by psutil
DiskStats = namedtuple('DiskStat', headers)

# ... and disk_info as values
dstat = DiskStats(*disk_info)
print(dstat.device, dstat.writes_ms)

# Homework: check further features with
# help(collections)

sda 53585975


In [None]:
# Exercise
# Write the following function 
def linux_diskstats(partition):
    """Print every second I/O information from /proc/diskstats
    
        @param: partition - eg sda1 or vdx1
        
        Hint: use the above `grep` function
        Hint: use zip, time.sleep, print() and *magic
    """
    diskstats_headers = ('reads reads_merged reads_sectors reads_ms'
            ' writes writes_merged writes_sectors writes_ms'
            ' io_in_progress io_ms_weight').split()
    
    while True:
        raise NotImplementedError
        print(values, sep="\t")

In [None]:
# Solution
%load course/linux_diskstats.py

In [None]:
# Using check_output with split() doesn't always work
from os import makedirs
makedirs('/tmp/course/b l a n k s', exist_ok=True)

check_output('ls "/tmp/course/b l a n k s"'.split())

In [None]:
# You can use
from shlex import split
# and
cmd = split('dir -a "/tmp/course/b l a n k s"')
check_output(cmd)

# Pandas 

Pandas is an useful (and fat) library for data management.

You can use it to import and export data from csv and excel, or for basic plotting.



In [72]:
# Pandas 101# Pandas 101
import pandas as pd
pd.DataFrame(ret)
import pandas as pd
df = pd.DataFrame(zip(*ret[3:]), columns=zip(*ret[3:])[0], index=ret[2])
df[1:]

Unnamed: 0,reads,reads_merged,reads_sectors,reads_ms,writes,writes_merged,writes_sectors,writes_ms,io_in_progress,io_ms_spent,io_ms_weight
sda,1005394,53134,42170585,6272865,1790286,437093,74710584,53596891,0,4656802,59872828


## zip on py3 is a generator 


In [None]:
# zip_iterables():
"""The zip method joins list elements pairwise
    like a zip fastener
"""
from sys import version_info as python_version
a_list = [0, 1, 2, 3]
b_list = ["a", "b", "c", "d"]
zipper = zip(a_list, b_list)
print(zipper)

In [None]:
if python_version >= (3,):
    zipper = list(zipper)
assert zipper == [(0, "a"), (1, "b"), (2, "c"), (3, "d")]