In [9]:
import gzip
import re
from datetime import datetime
from typing import List, Dict
import pandas as pd

LOG_PATTERN = re.compile(
    r'(?P<host>\S+) \S+ \S+ \[(?P<timestamp>[^\]]+)\] "(?:GET|POST|HEAD)? ?(?P<filename>\S+)? ?HTTP/[\d.]+" (?P<status>\d{3}) (?P<bytes>\S+)'
)

def parse_log_line(line: str) -> dict | None:
    match = LOG_PATTERN.match(line)
    if not match:
        return None

    try:
        ts_str = match.group("timestamp")
        timestamp = datetime.strptime(ts_str, "%d/%b/%Y:%H:%M:%S %z")

        bytes_str = match.group("bytes")
        bytes_val = int(bytes_str) if bytes_str.isdigit() else None

        filename = match.group("filename")
        file_ext = filename.split(".")[-1] if "." in filename else None

        return {
            "host": match.group("host"),
            "timestamp": timestamp,
            "filename": filename,
            "file_ext": file_ext,
            "status": int(match.group("status")),
            "bytes": bytes_val,
        }
    except Exception:
        return None

def load_logs(filepath: str) -> List[dict]:
    logs = []
    with gzip.open(filepath, "rt", encoding="utf-8", errors="replace") as f:
        for line in f:
            parsed = parse_log_line(line)
            if parsed:
                logs.append(parsed)
    return logs

def load_logs_to_dataframe(filepath: str) -> pd.DataFrame:
    records = []
    with gzip.open(filepath, "rt", encoding="utf-8", errors="replace") as file:
        for line in file:
            parsed = parse_log_line(line)
            if parsed:
                records.append(parsed)
    df = pd.DataFrame(records)
    return df

# Example usage
if __name__ == "__main__":
    filepath = "calgary_access_log.gz"  # Ensure this file is present in your directory
    df_logs = load_logs_to_dataframe(filepath)
    print(f"✅ Loaded {len(df_logs)} valid log entries.")
    print(df_logs.head())
    print(df_logs.dtypes)


✅ Loaded 722147 valid log entries.
    host                  timestamp    filename file_ext  status    bytes
0  local  1994-10-24 13:41:41-06:00  index.html     html     200    150.0
1  local  1994-10-24 13:41:41-06:00       1.gif      gif     200   1210.0
2  local  1994-10-24 13:43:13-06:00  index.html     html     200   3185.0
3  local  1994-10-24 13:43:14-06:00       2.gif      gif     200   2555.0
4  local  1994-10-24 13:43:15-06:00       3.gif      gif     200  36403.0
host          object
timestamp     object
filename      object
file_ext      object
status         int64
bytes        float64
dtype: object


In [28]:
def q3_datewise_unique_filenames(df: pd.DataFrame) -> dict[str, int]:
    df = df.copy()
    df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
    df_valid = df.dropna(subset=['filename', 'timestamp'])
    df_valid['date_str'] = df_valid['timestamp'].dt.strftime('%d-%b-%Y')
    result = df_valid.groupby('date_str')['filename'].nunique()
    return result.to_dict()

In [36]:
def q1_total_log_records(df: pd.DataFrame) -> int:
    return len(df)

In [11]:
def q2_unique_hosts(df: pd.DataFrame) -> int:
    return df['host'].nunique()

In [37]:
if __name__ == "__main__":
    filepath = "calgary_access_log.gz"
    df_logs = load_logs_to_dataframe(filepath)

    print("Q1: Total log records =", q1_total_log_records(df_logs))
    print("Q2: Unique hosts =", q2_unique_hosts(df_logs))
    print("Q3: Unique filenames per day:")
    for date, count in q3_datewise_unique_filenames(df_logs).items():
        print(f"{date}: {count}")

Q1: Total log records = 722147
Q2: Unique hosts = 2
Q3: Unique filenames per day:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_valid['date_str'] = df_valid['timestamp'].dt.strftime('%d-%b-%Y')


01-Aug-1995: 669
01-Jul-1995: 387
01-Jun-1995: 590
01-May-1995: 467
01-Oct-1995: 552
01-Sep-1995: 328
02-Apr-1995: 438
02-Aug-1995: 855
02-Jul-1995: 397
02-Jun-1995: 513
02-May-1995: 701
02-Oct-1995: 871
02-Sep-1995: 349
03-Apr-1995: 795
03-Aug-1995: 582
03-Jul-1995: 433
03-Jun-1995: 398
03-May-1995: 589
03-Oct-1995: 843
03-Sep-1995: 212
04-Apr-1995: 821
04-Aug-1995: 715
04-Jul-1995: 610
04-Jun-1995: 353
04-May-1995: 684
04-Oct-1995: 888
04-Sep-1995: 340
05-Apr-1995: 890
05-Aug-1995: 507
05-Jul-1995: 607
05-Jun-1995: 494
05-May-1995: 609
05-Oct-1995: 846
05-Sep-1995: 411
06-Apr-1995: 677
06-Aug-1995: 448
06-Jul-1995: 522
06-Jun-1995: 662
06-May-1995: 517
06-Oct-1995: 868
06-Sep-1995: 549
07-Apr-1995: 775
07-Aug-1995: 608
07-Jul-1995: 428
07-Jun-1995: 486
07-May-1995: 725
07-Oct-1995: 468
07-Sep-1995: 590
08-Apr-1995: 542
08-Aug-1995: 654
08-Jul-1995: 277
08-Jun-1995: 642
08-May-1995: 656
08-Oct-1995: 514
08-Sep-1995: 754
09-Apr-1995: 626
09-Aug-1995: 698
09-Jul-1995: 233
09-Jun-1995: 4

In [16]:
def q4_count_404_responses(df: pd.DataFrame) -> int:
    return df[df['status'] == 404].shape[0]

In [17]:
def q5_top_15_404_filenames(df: pd.DataFrame) -> list[tuple[str, int]]:
    df_404 = df[(df['status'] == 404) & df['filename'].notna()]
    top_404 = df_404['filename'].value_counts().nlargest(15)
    return list(top_404.items())

In [18]:
def q6_top_15_404_extensions(df: pd.DataFrame) -> list[tuple[str, int]]:
    df_404 = df[(df['status'] == 404) & df['file_ext'].notna()]
    top_exts = df_404['file_ext'].value_counts().nlargest(15)
    return list(top_exts.items())

In [19]:
if __name__ == "__main__":
    filepath = "calgary_access_log.gz"
    df_logs = load_logs_to_dataframe(filepath)

    print("Q4: Number of 404 responses =", q4_count_404_responses(df_logs))

    print("\nQ5: Top 15 filenames with 404 responses:")
    for filename, count in q5_top_15_404_filenames(df_logs):
        print(f"{filename}: {count}")

    print("\nQ6: Top 15 file extensions with 404 responses:")
    for ext, count in q6_top_15_404_extensions(df_logs):
        print(f".{ext}: {count}")

Q4: Number of 404 responses = 23430

Q5: Top 15 filenames with 404 responses:
index.html: 4691
4115.html: 900
1611.html: 649
5698.xbm: 585
710.txt: 408
2002.html: 257
2177.gif: 193
10695.ps: 161
6555.html: 153
487.gif: 152
151.html: 149
3414.gif: 148
488.gif: 148
40.html: 148
9678.gif: 142

Q6: Top 15 file extensions with 404 responses:
.html: 12135
.gif: 7202
.xbm: 824
.ps: 754
.jpg: 520
.txt: 496
.GIF: 135
.htm: 107
.cgi: 77
.com: 45
.gif": 45
.Z: 41
.dvi: 40
.com/: 37
.ca: 36


In [31]:
def q7_bandwidth_per_day_july_1995(df: pd.DataFrame) -> dict[str, int]:
    df = df.copy()
    
    # Convert 'timestamp' column to datetime if it's not already
    df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')

    # Remove rows where bytes are missing or timestamp is invalid
    df_valid = df[df['bytes'].notna() & df['timestamp'].notna()]

    # Filter only July 1995
    df_july = df_valid[
        (df_valid['timestamp'].dt.month == 7) & 
        (df_valid['timestamp'].dt.year == 1995)
    ]

    # Create formatted date string
    df_july['date_str'] = df_july['timestamp'].dt.strftime('%d-%b-%Y')

    # Sum bytes by date
    result = df_july.groupby('date_str')['bytes'].sum()

    return result.to_dict()

In [33]:
def q8_hourly_request_distribution(df: pd.DataFrame) -> dict[int, int]:
    df = df.copy()
    
    # Ensure 'timestamp' is datetime
    df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
    df = df[df['timestamp'].notna()]  # remove rows with invalid timestamps

    # Extract hour
    df['hour'] = df['timestamp'].dt.hour

    # Count requests per hour
    result = df['hour'].value_counts().sort_index()
    return result.to_dict()

In [22]:
def q9_top_10_requested_filenames(df: pd.DataFrame) -> list[tuple[str, int]]:
    df_valid = df[df['filename'].notna()]
    top_files = df_valid['filename'].value_counts().nlargest(10)
    return list(top_files.items())

In [23]:
def q10_http_status_distribution(df: pd.DataFrame) -> dict[int, int]:
    result = df['status'].value_counts().sort_index()
    return result.to_dict()

In [34]:
if __name__ == "__main__":
    filepath = "calgary_access_log.gz"
    df_logs = load_logs_to_dataframe(filepath)

    # Q7
    print("Q7: Bandwidth per day (July 1995):")
    for date, total_bytes in q7_bandwidth_per_day_july_1995(df_logs).items():
        print(f"{date}: {total_bytes} bytes")

    # Q8
    print("\nQ8: Hourly request distribution:")
    for hour, count in q8_hourly_request_distribution(df_logs).items():
        print(f"{hour:02d}:00 - {count} requests")

    # Q9
    print("\nQ9: Top 10 most requested filenames:")
    for filename, count in q9_top_10_requested_filenames(df_logs):
        print(f"{filename}: {count}")

    # Q10
    print("\nQ10: HTTP response code distribution:")
    for code, count in q10_http_status_distribution(df_logs).items():
        print(f"{code}: {count} responses")
        

Q7: Bandwidth per day (July 1995):


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_july['date_str'] = df_july['timestamp'].dt.strftime('%d-%b-%Y')


01-Jul-1995: 11333976.0 bytes
02-Jul-1995: 8653986.0 bytes
03-Jul-1995: 13508529.0 bytes
04-Jul-1995: 26565884.0 bytes
05-Jul-1995: 19541225.0 bytes
06-Jul-1995: 19752989.0 bytes
07-Jul-1995: 9427822.0 bytes
08-Jul-1995: 5403491.0 bytes
09-Jul-1995: 4660556.0 bytes
10-Jul-1995: 14912796.0 bytes
11-Jul-1995: 22503471.0 bytes
12-Jul-1995: 17365039.0 bytes
13-Jul-1995: 15986302.0 bytes
14-Jul-1995: 19184404.0 bytes
15-Jul-1995: 15769181.0 bytes
16-Jul-1995: 9005564.0 bytes
17-Jul-1995: 19596435.0 bytes
18-Jul-1995: 17096829.0 bytes
19-Jul-1995: 17847673.0 bytes
20-Jul-1995: 20751717.0 bytes
21-Jul-1995: 25455607.0 bytes
22-Jul-1995: 8059932.0 bytes
23-Jul-1995: 9577795.0 bytes
24-Jul-1995: 22298075.0 bytes
25-Jul-1995: 24472760.0 bytes
26-Jul-1995: 24564950.0 bytes
27-Jul-1995: 25967969.0 bytes
28-Jul-1995: 36456855.0 bytes
29-Jul-1995: 11684209.0 bytes
30-Jul-1995: 23158170.0 bytes
31-Jul-1995: 30715614.0 bytes

Q8: Hourly request distribution:
00:00 - 11510 requests
01:00 - 9821 request