In [1]:
%%time
#Count num of bytes transferred in Apache lo file using for loop
with open("./data/access-log.log") as wwwlog:
    total = 0
    for line in wwwlog:
        bytes_sent = line.rsplit(None,1)[1]
        if bytes_sent != '-':
            total += int(bytes_sent)
    print("Total", total)

Total 18094164
Wall time: 6 ms


In [2]:
%%time
#using generators:
with open('./data/access-log.log') as fh:
    bytecolumn = (line.rsplit(None,1)[1] for line in fh)
    bytes_sent = ( int(x) for x in bytecolumn if x != '-')
    print("Total: %r"%( sum(bytes_sent)))

Total: 18094164
Wall time: 5 ms


In [3]:
%%time
from pathlib import Path

#Search file name in the given directory
def gen_find(filepat, top):
    yield from Path(top).rglob(filepat)

# Example use

lognames = gen_find("access*",".\\data")

for name in lognames:
    print(name)

data\access-log-0108.bz2
data\access-log-0108.gz
data\access-log.log
Wall time: 3 ms


In [4]:
%%time
#open allfile's

import gzip,bz2
from pathlib import Path

def gen_open(paths):
    for path in paths:
        if path.suffix == '.gz':
            yield gzip.open(path, 'rt')
        elif path.suffix == '.bz2':
            yield bz2.open(path, 'rt')
        else:
            yield open(path,'rt')
            
            
#main
lognames = Path('data').rglob('access*')
logfiles = gen_open(lognames)
for f in logfiles:
    print(f)


<_io.TextIOWrapper encoding='cp1252'>
<_io.TextIOWrapper name='data\\access-log-0108.gz' encoding='cp1252'>
<_io.TextIOWrapper name='data\\access-log.log' mode='rt' encoding='cp1252'>
Wall time: 12 ms


In [5]:
%time
from pathlib import Path
#read all files

def gen_cat(sources):
    for src in sources:
        yield from src
        


lognames = Path('data').rglob('access-log*')
print(list(lognames))

logfiles = gen_open(lognames)

loglines = gen_cat(logfiles)
for line in loglines:
    #print(line,end='') #print of all lines of all files
    pass

Wall time: 0 ns
[WindowsPath('data/access-log-0108.bz2'), WindowsPath('data/access-log-0108.gz'), WindowsPath('data/access-log.log')]


In [6]:
%time
#search a specific lines that match a pattern
import re

def gen_grep(pat,lines):
    patc = re.compile(pat)
    return ( line for line in lines if patc.search(line))

#main
lognames = Path("data").rglob('access-log*')
logfiles = gen_open(lognames)
loglines = gen_cat(logfiles)

#Look for ply downloads
plylines = gen_grep(r'ply-.*\.gz',loglines)
for line in plylines:
    #print(line,end='') #print matched lines
    pass


Wall time: 0 ns


In [7]:
%time
#Parsing and Processing Data;

#sum the bytes if ply-.*\.gz file matches

pat = r'ply-.*\.gz'
logdir = '.\data'

filenames = gen_find("access*", logdir )

logfiles = gen_open(filenames)
#print(list(logfiles))

loglines = gen_cat(logfiles)


    
patlines = gen_grep(pat,loglines)


bytecol = ( line.rsplit(None, 1)[1] for line in patlines )


bytes_sent = ( int(x) for x in bytecol if x != '-')

print("Total:%r"%( sum(bytes_sent)))



Wall time: 0 ns
Total:38792864


In [8]:
#Parse lines into tuples using regular expressions

loglines = open("./data/access-log.log")

import re

logpats  = r'(\S+) (\S+) (\S+) \[(.*?)\] ' \
           r'"(\S+) (\S+) (\S+)" (\S+) (\S+)'

logpat   = re.compile(logpats)

groups   = (logpat.match(line) for line in loglines)
tuples   = (g.groups() for g in groups if g)

#main 
for t in tuples:
    #print(t) #print lines in a tuple
    pass

In [9]:
#Parse lines into a dict using regular expressions;

import re
loglines = open("./data/access-log.log")

logpats  = r'(\S+) (\S+) (\S+) \[(.*?)\] ' \
           r'"(\S+) (\S+) (\S+)" (\S+) (\S+)'

logpat   = re.compile(logpats)

groups   = (logpat.match(line) for line in loglines)
tuples   = (g.groups() for g in groups if g)

colnames = ('host','referrer','user','datetime',
            'method', 'request','proto','status','bytes')

log      = (dict(zip(colnames, t)) for t in tuples)

#main
for x in log:
    #print(x) #print lines in a dict
    pass

In [10]:
#Re-map fields in a sequence of dict;
def field_map(dictseq, name, func):
    for d in dictseq:
        d[name] = func(d[name])
        yield d

# main



loglines = open("./data/access-log.log")

import re

logpats  = r'(\S+) (\S+) (\S+) \[(.*?)\] ' \
           r'"(\S+) (\S+) (\S+)" (\S+) (\S+)'

logpat   = re.compile(logpats)

groups   = (logpat.match(line) for line in loglines)
tuples   = (g.groups() for g in groups if g)

colnames = ('host','referrer','user','datetime',
            'method', 'request','proto','status','bytes')

log      = (dict(zip(colnames, t)) for t in tuples)

log      = field_map(log,"status",int)


log      = field_map(log,"bytes", lambda s: int(s) if s != '-' else 0)


for x in log:
    #print(x) # {'host': '140.180.132.213', 'referrer': '-', 'user': '-', 'datetime': '24/Feb/2008:00:08:59 -0600', 'method': 'GET', 'request': '/favicon.ico', 'proto': 'HTTP/1.1', 'status': 404, 'bytes': 133}
    pass

In [11]:
#Generate lines from files in a directory;

def lines_from_dir(filepat, dirname):
    names = Path(dirname).rglob(filepat)
    files = gen_open(names)
    lines = gen_cat(files)
    return lines

#main
loglines = lines_from_dir("access*", 'data')
for line in loglines:
    #print(line) #75.54.118.139 - - [24/Feb/2008:00:15:40 -0600] "GET / HTTP/1.1" 200 4447
    pass

In [12]:
#parse apche log files into sequence of dict

import re

logpats  = r'(\S+) (\S+) (\S+) \[(.*?)\] ' \
           r'"(\S+) (\S+) (\S+)" (\S+) (\S+)'

logpat = re.compile(logpats)

def apache_log(lines):
    groups = ( logpat.match(line) for line in lines )
    tuples = ( g.groups() for g in groups if g )
    
    colnames = ( 'host','referrer', 'user','datetime','method','request', 'proto', 'status','bytes' )
    
    log = ( dict(zip(colnames,t)) for t in tuples)
    log = field_map( log, 'status', int) #type-cast to int
    log = field_map(log, "bytes", lambda s:int(s) if s != '-' else 0 ) #remove - in the lines
    
    return log

#main
lines = lines_from_dir("access*", 'data')
log = apache_log(lines)

for r in log:
   # print(r)  #{'host': '140.180.132.213', 'referrer': '-', 'user': '-', 'datetime': '24/Feb/2008:00:08:59 -0600', 'method': 'GET', 'request': '/ply/ply.html', 'proto': 'HTTP/1.1', 'status': 200, 'bytes': 97238}
    pass

In [15]:
#find the set of all docuemnts that 404 in a log file

lines = lines_from_dir("access*", 'data')
log = apache_log(lines)

stat404 = {  r['request'] for r in log if r['status'] == 404 }

for r in sorted(stat404):
    #print(r) # /Doc/index.html

    pass

In [16]:
#find all transfers over a megabyte

lines = lines_from_dir("access*", "data")
log = apache_log(lines)

large = ( r for r in log  if r['bytes'] > 1000000 )

for r in large:
    #print( r['request'], r['bytes']) #/dynamic/01Introduction.pdf 3110734
    pass

In [17]:
# find the largest document

lines = lines_from_dir('access*','data')
log = apache_log(lines)

print("%d %s"%( max(( r['bytes'], r['request']) for r in log )))

4919642 /dynamic/ffcache.zip


In [18]:
#find unique host IP address

lines = lines_from_dir('access-log.log', 'data')
log = apache_log(lines)

hosts = set( r['host'] for r in log )

for h in hosts:
    #print(h, end='') # print IP address
    pass

In [19]:
#find the specific downloads of a specific request;

lines = lines_from_dir("access*", "data")
log = apache_log(lines)

request = 'ply/ply-2.3.tar.gz'

total = sum( 1 for r in log if r['request'] == '/ply/ply-2.3.tar.gz'  ) # if matches print one and finally sum the ones
print("Total", total)


Total 234


In [None]:
import socket

lines = lines_from_dir('access*','data')
log = apache_log(lines)

addrs = { r['host'] for r in log if 'robots.txt' in r['request'] }


for addr in addrs:
    try:
        #print(socket.gethostbyaddr(addr)[0]) # print host name from IP address
        pass
    except socket.herror:
        #print(addr)
        pass


In [24]:
#Trace a generators by printing items received

def trace(source):
    for item in source:
        #print(item)
        yield item
        
#main
lines = open("./data/access-log.log")
log = trace( apache_log(lines))

r404 = ( r for r in log if r['status'] == 404 )

for r in r404:
    #print matched lines
    pass


In [None]:
# tail -f functions

import time
import os

def follow(thefile):
    thefile.seek(0, os.SEEK_END)
    while True:
        line = thefile.readline()
        
        if not line:
            time.sleep(0.1)
            continue
        yield line
        
        
#main
logfile=open('./data/access-log.log')
loglines = follow(logfile)
for line in loglines:
    print(line, end='')

In [None]:

import time
import os

def follow(thefile):
    thefile.seek(0, os.SEEK_END)
    while True:
        line = thefile.readline()
        
        if not line:
            time.sleep(0.1)
            continue
        yield line
        
#main
lines = follow(open('./data/access-log.log'))

for i,line in enumerate(lines):
    print(line, end = '')
    if i == 10:
        lines.close()

In [10]:
def fun1(l): 
    yield from l
    

def fun2(l):
    for i in l:
        yield i
  
# fun1 and fun2 are same;

l = [ 1,2,3,4]

print( list( fun1(l) ) )

print( list( fun2(l)) )

[1, 2, 3, 4]
[1, 2, 3, 4]
