In [5]:
#Generator and StopIteration use
def squares(N):
    """Yield the squares 0*0, 1*1, ..., N*N in increasing order.

    Args:
        N: largest base value to square (inclusive). A negative N yields
            nothing.

    Yields:
        int: i * i for every integer i from 0 through N.
    """
    i = 0
    # The generator ends simply by falling off the end of the function.
    # Raising StopIteration by hand inside a generator body is turned into
    # RuntimeError by PEP 479 (the default from Python 3.7 on).
    while i <= N:
        yield i * i
        i += 1

# list() consumes the generator's end-of-iteration signal internally, so a
# StopIteration never reaches this code and no exception handler is needed.
squared_values = list(squares(20))

print(squared_values)
    

[0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100, 121, 144, 169, 196, 225, 256, 289, 324, 361, 400]


  # This is added back by InteractiveShellApp.init_path()


In [6]:
#Reading files with generator

# Open the sample access log; the handle is iterated line by line by parse_log below.
log = open('example_log.txt')

def parse_log(log_obj):
    """Lazily convert whitespace-separated log lines into 8-field tuples.

    Each line is split on whitespace and the tokens at fixed positions are
    picked out; tokens 1, 2 and 7 are ignored.

    Args:
        log_obj: any iterable of text lines (e.g. an open file).

    Yields:
        tuple: (token 0, tokens 3 and 4 joined by a space, token 5, token 6,
        token 8, token 9, token 10, all remaining tokens joined by spaces).
        Values are the raw strings; quotes and brackets are left in place.

    Raises:
        IndexError: when a line has fewer than 11 whitespace-separated tokens.
    """
    for raw in log_obj:
        fields = raw.split()
        yield (
            fields[0],
            f"{fields[3]} {fields[4]}",   # timestamp is split in two by its space
            fields[5],
            fields[6],
            fields[8],
            fields[9],
            fields[10],
            " ".join(fields[11:]),        # user agent spans the rest of the line
        )
        
# Only the first parsed record is needed here, so release the log handle as
# soon as it has been read (or if reading fails).
try:
    first_line = next(parse_log(log))
finally:
    log.close()

In [2]:
#More sophisticated parsing of the log
from datetime import datetime


def parse_log(log):
    """Generate one cleaned 8-tuple per line of ``log``.

    Lines are split on whitespace. The timestamp (tokens 3 and 4) is passed
    through ``parse_time``; the request type, referrer and user agent are
    passed through ``strip_quotes``. Other fields stay as raw strings.

    Args:
        log: any iterable of text lines (e.g. an open file).

    Yields:
        tuple: (remote_addr, time_local, request_type, request_path,
        status, body_bytes_sent, http_referrer, http_user_agent).
    """
    for line in log:
        parts = line.split()

        remote_addr = parts[0]
        # The timestamp contains a space, so it arrives as two tokens.
        time_local = parse_time(parts[3] + " " + parts[4])
        request_type = strip_quotes(parts[5])
        request_path, status, body_bytes_sent = parts[6], parts[8], parts[9]
        http_referrer = strip_quotes(parts[10])
        # Everything after the referrer belongs to the user agent.
        http_user_agent = strip_quotes(" ".join(parts[11:]))

        yield (
            remote_addr, time_local, request_type, request_path,
            status, body_bytes_sent, http_referrer, http_user_agent
        )
def parse_time(time_str):
    """
    Convert a bracketed timestamp such as
    ``[30/Nov/2017:11:59:54 +0000]`` into a timezone-aware datetime.

    The surrounding square brackets are part of the expected format; a
    string that does not match makes ``strptime`` raise ValueError.
    """
    fmt = '[%d/%b/%Y:%H:%M:%S %z]'
    return datetime.strptime(time_str, fmt)

def strip_quotes(s):
    """Return ``s`` with every double-quote character removed."""
    return ''.join(s.split('"'))

# Read just the first parsed record; the with-block closes the log handle
# afterwards instead of leaving it open for the rest of the session.
with open('example_log.txt') as log:
    first_line = next(parse_log(log))

In [13]:
#From parsing process to making a csv file
import csv


def build_csv(lines, header=None, file=None):
    """Write rows to ``file`` as comma-separated values and rewind it.

    Args:
        lines: iterable of rows, each row an iterable of field values.
        header: optional row written before ``lines``; skipped when falsy.
        file: writable, seekable text file object to write into.

    Returns:
        The same ``file`` object, positioned at offset 0 so it can be read
        back immediately.
    """
    if header:
        # Concatenating lists reads the whole ``lines`` iterable into memory
        # before anything is written.
        lines = [header] + list(lines)

    writer = csv.writer(file, delimiter=',')
    writer.writerows(lines)

    # Writing into an existing file (e.g. opened with 'r+') overwrites from the
    # start but keeps any old bytes beyond the new end; cut them off so stale
    # rows from an earlier, longer write cannot leak into the result.
    file.truncate()
    file.seek(0)
    return file


# Parse the log and write it to temporary.csv, then read the CSV text back.
# Mode 'w+' creates the file when it is missing and empties it when it already
# exists, so no FileNotFoundError fallback is required. The with-block closes
# both handles once the contents have been read.
with open('example_log.txt') as log, open("temporary.csv", 'w+') as file:
    parsed = parse_log(log)
    csv_file = build_csv(
        parsed,
        header = [
            'ip', 'time_local', 'request_type',
            'request_path', 'status', 'bytes_sent',
            'http_referrer', 'http_user_agent'
        ],
        file = file)

    contents = csv_file.readlines()

print(contents[:5])


['ip,time_local,request_type,request_path,status,bytes_sent,http_referrer,http_user_agent\n', '200.155.108.44,2017-11-30 11:59:54+00:00,PUT,/categories/categories/categories,401,963,http://www.yates.com/list/tags/category/,"Mozilla/5.0 (Windows CE) AppleWebKit/5332 (KHTML, like Gecko) Chrome/13.0.864.0 Safari/5332"\n', '36.139.255.202,2017-11-30 11:59:54+00:00,PUT,/search,404,171,https://www.butler.org/main/tag/category/home.php,"Mozilla/5.0 (Macintosh; PPC Mac OS X 10_5_0) AppleWebKit/5332 (KHTML, like Gecko) Chrome/15.0.813.0 Safari/5332"\n', '50.112.115.219,2017-11-30 11:59:54+00:00,POST,/main/blog,404,743,http://deleon-bender.com/categories/category.html,"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_5_5 rv:2.0; apn-IN) AppleWebKit/531.48.1 (KHTML, like Gecko) Version/4.0 Safari/531.48.1"\n', '204.132.56.4,2017-11-30 11:59:54+00:00,POST,/list,404,761,http://smith.com/category.htm,Opera/9.39.(Windows 98; Win 9x 4.90; mn-MN) Presto/2.9.163 Version/12.00\n']


In [14]:
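#Inserting the header by building a list (as above) reads the whole generator into memory, so the lines are no longer processed lazily
#Instead, use itertools.chain() to put the header in front of the generator without consuming it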

import csv
import itertools


# Re-open the log and create a fresh parse_log generator; it is handed to
# build_csv further down in this cell.
log = open('example_log.txt')
parsed = parse_log(log)

def build_csv(lines, header=None, file=None):
    """Stream rows to ``file`` as comma-separated values and rewind it.

    Args:
        lines: iterable of rows, each row an iterable of field values. It is
            consumed lazily, one row at a time, while writing.
        header: optional row written before ``lines``; skipped when falsy.
        file: writable, seekable text file object to write into.

    Returns:
        The same ``file`` object, positioned at offset 0 so it can be read
        back immediately.
    """
    if header:
        # chain() puts the header in front without materializing ``lines``.
        lines = itertools.chain([header], lines)

    writer = csv.writer(file, delimiter=',')
    writer.writerows(lines)

    # Writing into an existing file (e.g. opened with 'r+') overwrites from the
    # start but keeps any old bytes beyond the new end; cut them off so stale
    # rows from an earlier, longer write cannot leak into the result.
    file.truncate()
    file.seek(0)
    return file

# Write the parsed rows to temporary.csv and read the text back. The CSV handle
# is closed by the with-block; the log handle opened earlier in this cell is
# closed in the finally clause once build_csv has consumed the generator.
try:
    with open('temporary.csv', 'r+') as file:
        csv_file = build_csv(
            parsed,
            header=[
                'ip', 'time_local', 'request_type',
                'request_path', 'status', 'bytes_sent',
                'http_referrer', 'http_user_agent'
            ],
            file=file
        )
        contents = csv_file.readlines()
finally:
    log.close()

print(contents[:5])



['ip,time_local,request_type,request_path,status,bytes_sent,http_referrer,http_user_agent\n', '200.155.108.44,2017-11-30 11:59:54+00:00,PUT,/categories/categories/categories,401,963,http://www.yates.com/list/tags/category/,"Mozilla/5.0 (Windows CE) AppleWebKit/5332 (KHTML, like Gecko) Chrome/13.0.864.0 Safari/5332"\n', '36.139.255.202,2017-11-30 11:59:54+00:00,PUT,/search,404,171,https://www.butler.org/main/tag/category/home.php,"Mozilla/5.0 (Macintosh; PPC Mac OS X 10_5_0) AppleWebKit/5332 (KHTML, like Gecko) Chrome/15.0.813.0 Safari/5332"\n', '50.112.115.219,2017-11-30 11:59:54+00:00,POST,/main/blog,404,743,http://deleon-bender.com/categories/category.html,"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_5_5 rv:2.0; apn-IN) AppleWebKit/531.48.1 (KHTML, like Gecko) Version/4.0 Safari/531.48.1"\n', '204.132.56.4,2017-11-30 11:59:54+00:00,POST,/list,404,761,http://smith.com/category.htm,Opera/9.39.(Windows 98; Win 9x 4.90; mn-MN) Presto/2.9.163 Version/12.00\n']


In [16]:
#Create a task
#From the parsed log file, build a summary that counts the rows per request type
import csv
from collections import defaultdict

# Rebuild temporary.csv from the log. build_csv returns the file rewound to
# the start, so csv_file can be read directly by count_unique_requests below.
# Both handles have to stay open until that call has finished.
log = open('example_log.txt')
parsed = parse_log(log)
file = open('temporary.csv', 'r+')
csv_file = build_csv(
    parsed,
    header=[
        'ip', 'time_local', 'request_type',
        'request_path', 'status', 'bytes_sent',
        'http_referrer', 'http_user_agent'
    ],
    file=file
)

def count_unique_requests(csv_file):
    """Count how many rows of a CSV carry each ``request_type`` value.

    Args:
        csv_file: iterable of CSV text lines (e.g. an open file) whose first
            row is a header containing a ``request_type`` column.

    Returns:
        dict: request type -> number of data rows with that value, in the
        order the values are first seen.

    Raises:
        StopIteration: when the input has no header row.
        ValueError: when the header has no ``request_type`` column.
    """
    rows = csv.reader(csv_file)
    column = next(rows).index('request_type')

    tally = {}
    for row in rows:
        kind = row[column]
        tally[kind] = tally.get(kind, 0) + 1
    return tally

# Counting reads csv_file to the end, so the CSV handle and the log handle
# opened earlier in this cell can be closed right afterwards.
try:
    uniques = count_unique_requests(csv_file)
finally:
    csv_file.close()
    log.close()

print(uniques)



{'PUT': 3367, 'POST': 3299, 'GET': 3334}


In [None]:
#Make count_unique_request() return a generator of rows, so its result can be passed to build_csv like the other pipeline steps

import csv

def count_unique_request(csv_file):
    """Count rows per ``request_type`` and return the counts as row tuples.

    Args:
        csv_file: iterable of CSV text lines (e.g. an open file) whose first
            row is a header containing a ``request_type`` column.

    Returns:
        generator: (request_type, count) tuples in the order the request
        types are first seen. The input is read completely before the
        generator is returned; only the emission of the tuples is lazy.

    Raises:
        StopIteration: when the input has no header row.
        ValueError: when the header has no ``request_type`` column.
    """
    reader = csv.reader(csv_file)
    header = next(reader)
    idx = header.index('request_type')
    res = defaultdict(int)

    for line in reader:
        res[line[idx]] += 1

    # Iterate over the counts computed here, not over a module-level variable.
    return ((key, value) for key, value in res.items())


# Full pipeline: log -> temporary.csv -> per-request-type counts -> summarized.csv.
# The with-blocks close every handle when the cell finishes.
with open('example_log.txt') as log, open('temporary.csv', 'r+') as file:
    parsed = parse_log(log)
    csv_file = build_csv(
        parsed,
        header=[
            'ip', 'time_local', 'request_type',
            'request_path', 'status', 'bytes_sent',
            'http_referrer', 'http_user_agent'
        ],
        file=file
    )
    uniques = count_unique_request(csv_file)

    # 'w+' creates summarized.csv when it does not exist yet ('r+' would raise
    # FileNotFoundError) and empties it when it does.
    with open('summarized.csv', 'w+') as summarized_file:
        summarized_csv = build_csv(uniques, header=['request_type', 'count'], file=summarized_file)
        print(summarized_file.readlines())