# Looking at the HTTPS Log files 
* Split the data on the carriage return / line feed sequence `\r\n\r`, which yields ~2k lines


# GDI notes for the algorithm 

* You are a Site Reliability Engineer, and you have a giant pile of logs to look through.
* We need to know what the most frequent error is,
* what kinds of errors there are,
* and under which HTTP response code they fall.


# Future thoughts
* Now that I have an understanding of generators, the next step would be to use `yield` to walk through the log one line at a time instead of materializing all of the data, and then to measure the gain in efficiency with `%%timeit` or pyflame.




In [None]:
# Looking at data
# Sample log text used by the cells below: one entry per line, in the form
#   [LEVEL] <status code> <status text>: <message>
# Reviewed the 4.6K lines - narrowed down to 2k lines and punctuation with the 1st split
raw_logs = """
[WARNING] 403 Forbidden: No token in request parameters
[ERROR] 500 Server Error: int is not subscriptable
[INFO] 200 OK: Login Successful
[INFO] 200 OK: User sent a message
[ERROR] 500 Server Error: int is not subscriptable
[WARNING] 403 Forbidden: No token in request parameters
[ERROR] 500 Server Error: int is not subscriptable
[INFO] 200 OK: Login Successful
[ERROR] 500 Server Error: int is not subscriptable
[ERROR] 500 Server Error: int is not subscriptable
[INFO] 200 OK: User sent a message
[ERROR] 500 Server Error: int is not subscriptable
[WARNING] 403 Forbidden: No token in request parameters
[INFO] 200 OK: Login Successful
[INFO] 200 OK: User sent a message
[INFO] 200 OK: Login Successful
[INFO] 200 OK: User sent a message
[INFO] 200 OK: Login Successful
[INFO] 200 OK: User sent a message
[ERROR] 500 Server Error: int is not subscriptable
[INFO] 200 OK: Login Successful
[INFO] 200 OK: User sent a message
[ERROR] 500 Server Error: int is not subscriptable
"""

# Target structure for the sample data, nested as
# log level -> status code -> status text -> message -> number of occurrences.
output = {
    'WARNING': {'403': {'Forbidden': {'No token in request parameters': 3}}},
    'ERROR': {'500': {'Server Error': {'int is not subscriptable': 8}}},
    'INFO': {'200': {'OK': {'Login Successful': 6, 'User sent a message': 6}}},
}

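## For comparison: a result in which the level keys keep their brackets and the messages keep a leading space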
# {'[ERROR]': {'500': {'Server Error': {' int is not subscriptable': 8}}},
# '[INFO]': {'200': {'OK': {' Login Successful': 6, ' User sent a message': 6}}},
#  '[WARNING]': {'403': {'Forbidden': {' No token in request parameters': 3}}}}

In [None]:
import pprint
# Short alias for pprint.pprint
pp = pprint.pprint 
# Pretty-print the raw string to inspect its exact contents, newline characters included
pp(raw_logs)

('\n'
 '[ERROR] 500 Server Error: int is not subscriptable\n'
 '[INFO] 200 OK: Login Successful\n'
 '[INFO] 200 OK: User sent a message\n'
 '[ERROR] 500 Server Error: int is not subscriptable\n'
 '[ERROR] 500 Server Error: int is not subscriptable\n'
 '[INFO] 200 OK: Login Successful\n'
 '[ERROR] 500 Server Error: int is not subscriptable\n'
 '[ERROR] 500 Server Error: int is not subscriptable\n'
 '[INFO] 200 OK: User sent a message\n'
 '[ERROR] 500 Server Error: int is not subscriptable\n'
 '[INFO] 200 OK: Login Successful\n'
 '[INFO] 200 OK: User sent a message\n'
 '[INFO] 200 OK: Login Successful\n'
 '[INFO] 200 OK: User sent a message\n'
 '[INFO] 200 OK: Login Successful\n'
 '[INFO] 200 OK: User sent a message\n'
 '[ERROR] 500 Server Error: int is not subscriptable\n'
 '[INFO] 200 OK: Login Successful\n'
 '[INFO] 200 OK: User sent a message\n'
 '[ERROR] 500 Server Error: int is not subscriptable\n')


In [None]:
# Break the sample into individual lines. Because raw_logs starts and ends with a
# newline, the resulting list has an empty string as its first and last element.
raw_logs.split("\n")

['',
 '[ERROR] 500 Server Error: int is not subscriptable',
 '[INFO] 200 OK: Login Successful',
 '[INFO] 200 OK: User sent a message',
 '[ERROR] 500 Server Error: int is not subscriptable',
 '[ERROR] 500 Server Error: int is not subscriptable',
 '[INFO] 200 OK: Login Successful',
 '[ERROR] 500 Server Error: int is not subscriptable',
 '[ERROR] 500 Server Error: int is not subscriptable',
 '[INFO] 200 OK: User sent a message',
 '[ERROR] 500 Server Error: int is not subscriptable',
 '[INFO] 200 OK: Login Successful',
 '[INFO] 200 OK: User sent a message',
 '[INFO] 200 OK: Login Successful',
 '[INFO] 200 OK: User sent a message',
 '[INFO] 200 OK: Login Successful',
 '[INFO] 200 OK: User sent a message',
 '[ERROR] 500 Server Error: int is not subscriptable',
 '[INFO] 200 OK: Login Successful',
 '[INFO] 200 OK: User sent a message',
 '[ERROR] 500 Server Error: int is not subscriptable',
 '']

In [None]:
def analyze_logs(logs: str) -> dict:
    """
    Count how often each distinct message occurs in a block of HTTP log text.

    You are a Site Reliability Engineer, and you have a giant pile of logs to
    look through. We need to know
        what the most frequent error is,
        what kinds of errors there are,
        and under which HTTP response code they fall.

    Every non-blank line is expected to have the form

        [LEVEL] <status code> <status message>: <message body>

    for example ``[ERROR] 500 Server Error: int is not subscriptable``.
    The occurrences are grouped by:
        log level (ERROR, INFO, WARNING)
        HTTP status code (200, 403, 500, ...)
        HTTP status message (OK, Server Error, Forbidden)
        message body (human readable information message)

    Blank lines are skipped, surrounding whitespace on a line is ignored, and
    both "\\n" and "\\r\\n" line endings are accepted. Only the first ":" on a
    line separates the status message from the message body, so a body that
    itself contains colons is kept whole.

    :param logs: the log text, one entry per line
    :return: nested dictionary with the logging statistics, shaped as
        ``{log_level: {status_code: {status_message: {message: count}}}}``.
        Pass it to ``pprint.pprint`` for a more readable display.
    :raises IndexError: if a non-blank line has no ":" separator, or has fewer
        than three whitespace-separated fields in front of it

    >>> sample = '''
    ... [WARNING] 403 Forbidden: No token in request parameters
    ... [INFO] 200 OK: Login Successful
    ... [INFO] 200 OK: Login Successful
    ... '''
    >>> result = analyze_logs(sample)
    >>> result['INFO']
    {'200': {'OK': {'Login Successful': 2}}}
    >>> result['WARNING']
    {'403': {'Forbidden': {'No token in request parameters': 1}}}
    >>> analyze_logs('')
    {}
    """
    output = {}

    # Iterate over the *argument* (not a notebook global); splitlines() copes
    # with "\n" and "\r\n" endings alike.
    for raw_line in logs.splitlines():
        line = raw_line.strip()
        if not line:
            # blank line - nothing to count
            continue

        # Split on the first colon only so the message body stays intact even
        # when it contains further colons.
        head_and_body = line.split(":", 1)

        # "[LEVEL] CODE STATUS MESSAGE" -> three fields; the status message may
        # contain spaces, hence maxsplit=2.
        fields = head_and_body[0].split(maxsplit=2)
        log_level = fields[0].strip("[]")
        status_code = fields[1]
        status_message = fields[2].strip()
        message = head_and_body[1].strip()

        # Walk down the nested dictionaries, creating each level on first use.
        message_counts = (
            output.setdefault(log_level, {})
            .setdefault(status_code, {})
            .setdefault(status_message, {})
        )
        # Counter for how many times this message occurs in the logs
        message_counts[message] = message_counts.get(message, 0) + 1

    return output


# Run the analysis on the sample log data defined above
analyze_logs(raw_logs)

{'ERROR': {'500': {'Server Error': {'int is not subscriptable': 8}}},
 'INFO': {'200': {'OK': {'Login Successful': 6, 'User sent a message': 6}}},


In [None]:
# Run the tests when you think analyze_logs is working.
# May have to use a file with the raw log info inside it.
# run_docstring_examples executes the ">>>" examples found in the docstring of
# analyze_logs, evaluating them against this notebook's globals; verbose=True
# reports every example that is tried.
import doctest
doctest.run_docstring_examples(analyze_logs, globals(), verbose=True, name="analyze_logs")

Finding tests in analyze_logs
Trying:
    raw_logs = "
Expecting:
      [ERROR] 500 Server Error: int is not subscriptable
      [INFO] 200 OK: Login Successful
      [INFO] 200 OK: User sent a message
      [ERROR] 500 Server Error: int is not subscriptable
      [ERROR] 500 Server Error: int is not subscriptable
      [INFO] 200 OK: Login Successful
      [ERROR] 500 Server Error: int is not subscriptable
      [ERROR] 500 Server Error: int is not subscriptable
      [INFO] 200 OK: User sent a message
      [ERROR] 500 Server Error: int is not subscriptable
      [INFO] 200 OK: Login Successful
      [INFO] 200 OK: User sent a message
      [INFO] 200 OK: Login Successful
      [INFO] 200 OK: User sent a message
    "
**********************************************************************
File "__main__", line 31, in analyze_logs
Failed example:
    raw_logs = "
Exception raised:
    Traceback (most recent call last):
      File "/usr/lib/python3.7/doctest.py", line 1337, in __run
   