In [11]:
from debugpy.common.sockets import serve

MAX_LINES_TO_PRINT = 5
with open('sample_log_file.log', 'r') as f:
    # f is an iterator
    for i, line in enumerate(f):
        if i == MAX_LINES_TO_PRINT: # stop
            break
        # the extra empty line is because the line has "\n" at the end
        # print(repr(line)) can be used to see that
        # or line.strip() will strip ' ', '\t', '\n' and '\r\n'
        print(line.strip())


2024-12-24 23:18:49 250.141.129.28 ERROR Session expired
2025-01-19 05:07:49 140.111.39.206 DEBUG Session expired
2025-01-04 04:03:49 68.155.22.150 DEBUG Permission denied
2025-01-20 23:56:49 253.114.152.234 ERROR Permission denied


In [8]:
with open('sample_log_file.log', 'r') as f:
    line = f.readline()
    count = 0
    while line != '':
        count += 1
        line = f.readline()
    print(f'number of lines: {count}')
    print('hello world')

number of lines: 100
hello world


String Processing

In [24]:
MAX_LINES_TO_PRINT = 5
dates = set()
with open('sample_log_file.log', 'r') as f:
    # f is an iterator
    for i, line in enumerate(f):
        print(i, line.strip())
        if i == MAX_LINES_TO_PRINT: # stop
            break
        # the extra empty line is because the line has "\n" at the end
        # print(repr(line)) can be used to see that
        # or line.strip() will strip ' ', '\t', '\n' and '\r\n'
        cols = line.strip().split(' ')
        date = cols[0]
        dates.add(date)
    print(dates)

0 2024-12-24 23:18:49 250.141.129.28 ERROR Session expired
1 2025-01-19 05:07:49 140.111.39.206 DEBUG Session expired
2 2025-01-04 04:03:49 68.155.22.150 DEBUG Permission denied
4 2025-01-20 23:56:49 253.114.152.234 ERROR Permission denied
5 2025-01-13 03:00:49 130.22.186.207 INFO Session expired
{'2025-01-19', '2025-01-04', '2024-12-24', '2025-01-20', '2025-01-16'}


In [27]:
MAX_LINES_TO_PRINT = 5
dates = []
with open('sample_log_file.log', 'r') as f:
    # f is an iterator
    for i, line in enumerate(f):
        # print(i, line.strip())
        if i == MAX_LINES_TO_PRINT: # stop
            break
        # the extra empty line is because the line has "\n" at the end
        # print(repr(line)) can be used to see that
        # or line.strip() will strip ' ', '\t', '\n' and '\r\n'
        cols = line.strip().split(' ')
        print(date)
        date = cols[0]
        if not dates or dates[-1] < date:
            dates.append(date)
    print(dates)

2025-01-20
2024-12-24
2025-01-19
2025-01-04
2025-01-16
['2024-12-24', '2025-01-19', '2025-01-20']


In [32]:
# dates in the YYYY-MM-DD format are easy to compare. (simply > < == )
# however other format comparison needs to use the datetime library

from datetime import datetime
date1 = '01-20-2025'
date2 = '12-24-2024'

compare = 'later' if date1 > date2 else 'earlier'

print(f'{date1} is {compare} than {date2} -- wrong')

date1_dt = datetime.strptime(date1, '%m-%d-%Y') # strptime = str parse time
date2_dt = datetime.strptime(date2, '%m-%d-%Y')

compare = 'later' if date1_dt > date2_dt else 'earlier'
print(f'{date1} is {compare} than {date2} -- correct')


01-20-2025 is earlier than 12-24-2024 -- wrong
01-20-2025 is later than 12-24-2024 -- correct


In [43]:
# extra note
# strptime use default values to fill in the unspecified units
# Y - 1900 | m - 01 | d - 01 | H - 00 | M - 00 | S - 00
with open('sample_log_file.log', 'r') as f:
    line = f.readline()
    cols = line.strip().split(' ')
    date = cols[0]
    time = cols[1]
date_dt = datetime.strptime(date, '%Y-%m-%d')
time_dt = datetime.strptime(time, '%H:%M:%S')
print(date_dt)
print(time_dt)
datetime_dt = datetime.strptime(f'{date} {time}', '%Y-%m-%d %H:%M:%S')
print(datetime_dt)

# if the format doesn't match, error will be thrown
# date_dt = datetime.strptime(f'{date}', '%m-%d-%Y')
# > ValueError: time data '2024-12-24' does not match format '%m-%d-%Y'

2024-12-24 00:00:00
1900-01-01 23:18:49
2024-12-24 23:18:49


Writing files

In [68]:
from collections import defaultdict
log_counter = defaultdict(int)
with open('sample_log_file.log', 'r') as f:
    for i, line in enumerate(f):
        cols = f.readline().strip().split(' ')
        date = cols[0]
        log_counter[date] += 1

# the 'w' mode will either create a new file
# or override the existing file
with open('sample_log_file_processed.log', 'w') as f:
    # f.close() # closing the file right-away will just leave the file empty
    # there is no good way to modify a single line
    # a workaround is to read the whole file into memory, modify and write the whole file.
    for date, count in log_counter.items():
        f.write(f'{date} {count}\n')


Regular Expressions

In [67]:
import re
with open('sample_log_file.log', 'r') as f:
    #  f.read() reads the whole file into a single string
    text = f.read()
    print(text)

all_ = re.findall(r'ERROR', text)
print(all_)

# "." here match a single character except for ' ', '/t', '/n', etc.
all_ = re.findall(r'2024-..-..', text)
print(all_)

# re is greedy by default so ".+" is matched with as many characters as possible
all_ = re.findall(r'2024-.+\d', text)
print(all_)

search_ =  re.search(r'2024-.+\d', text)
print(search_.group())
print(search_.span())
print(search_.start())
print(search_.end())

2024-12-24 23:18:49 250.141.129.28 ERROR Session expired
2025-01-19 05:07:49 140.111.39.206 DEBUG Session expired
2025-01-04 04:03:49 68.155.22.150 DEBUG Permission denied
2025-01-20 23:56:49 253.114.152.234 ERROR Permission denied
2025-01-13 03:00:49 130.22.186.207 INFO Session expired
2025-01-02 23:11:49 102.80.177.18 DEBUG Permission denied
2025-01-14 21:58:49 47.180.233.238 DEBUG Session expired
2025-01-15 06:20:49 234.15.205.196 ERROR Session expired
2025-01-11 11:52:49 59.234.42.39 ERROR Permission denied
2024-12-24 15:51:49 194.153.23.221 CRITICAL Permission denied
2025-01-23 12:26:49 85.77.138.186 CRITICAL Cache cleared
2025-01-04 04:21:49 34.126.219.233 DEBUG Permission denied
2024-12-25 14:27:49 39.78.3.195 ERROR File not found
2025-01-14 17:25:49 118.19.250.122 INFO Unexpected error occurred
2025-01-10 01:35:49 97.229.152.99 CRITICAL Session expired
2025-01-18 03:30:49 220.142.157.133 CRITICAL Unexpected error occurred
2025-01-18 18:45:49 231.214.208.163 CRITICAL Configurati