In [2]:
import re
from collections import defaultdict

def analyze_log_file(log_file):
    # Patterns to extract data from the log
    ip_pattern = r'^(\d+\.\d+\.\d+\.\d+)'  # Pattern to extract IP address
    endpoint_pattern = r'\"(GET|POST) (/[\w/]*)'
    status_code_pattern = r'\" (\d{3}) '
    size_pattern = r' (\d+)$'
    
    # Dictionaries to store the results
    ip_count = defaultdict(int)
    endpoint_count = defaultdict(int)
    status_code_count = defaultdict(int)
    total_data_transferred = 0
    
    try:
        # Open the log file for reading
        with open(log_file, 'r') as file:
            for line in file:
                # Extract IP address
                ip_match = re.search(ip_pattern, line)
                if ip_match:
                    ip = ip_match.group(1)
                    ip_count[ip] += 1

                # Extract endpoint
                endpoint_match = re.search(endpoint_pattern, line)
                if endpoint_match:
                    endpoint = endpoint_match.group(2)
                    endpoint_count[endpoint] += 1

                # Extract status code
                status_code_match = re.search(status_code_pattern, line)
                if status_code_match:
                    status_code = status_code_match.group(1)
                    status_code_count[status_code] += 1

                # Extract data size
                size_match = re.search(size_pattern, line)
                if size_match:
                    size = int(size_match.group(1))
                    total_data_transferred += size
    except FileNotFoundError:
        print(f"Error: The file '{log_file}' does not exist.")
        return

    # Print the results
    print("IP Address Counts:")
    for ip, count in ip_count.items():
        print(f"{ip}: {count}")
    
    print("\nEndpoint Access Counts:")
    for endpoint, count in endpoint_count.items():
        print(f"{endpoint}: {count}")
    
    print("\nHTTP Status Code Counts:")
    for status_code, count in status_code_count.items():
        print(f"{status_code}: {count}")
    
    print(f"\nTotal Data Transferred: {total_data_transferred} bytes")

# Provide the absolute path to the log file
log_file = r"C:\Users\Admin\Downloads\sample.log"

# Call the function
analyze_log_file(log_file)


IP Address Counts:
192.168.1.1: 7
203.0.113.5: 8
10.0.0.2: 6
198.51.100.23: 8
192.168.1.100: 5

Endpoint Access Counts:
/home: 5
/login: 13
/about: 5
/contact: 2
/register: 2
/dashboard: 3
/profile: 2
/feedback: 2

HTTP Status Code Counts:
200: 21
401: 13

Total Data Transferred: 9784 bytes


In [11]:
pip install pandas openpyxl



Collecting openpyxl
  Downloading openpyxl-3.1.3-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-1.1.0-py3-none-any.whl.metadata (1.8 kB)
Downloading openpyxl-3.1.3-py2.py3-none-any.whl (251 kB)
   ---------------------------------------- 251.3/251.3 kB 5.2 MB/s eta 0:00:00
Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.1.3


In [4]:
import re
from collections import defaultdict

def detect_suspicious_activity(log_file, threshold=10):
    """
    Detects IP addresses with failed login attempts exceeding the specified threshold.
    
    Parameters:
        log_file (str): Path to the log file.
        threshold (int): Number of failed login attempts to flag as suspicious.
    
    Returns:
        None
    """
    # Patterns to extract data
    ip_pattern = r'^(\d+\.\d+\.\d+\.\d+)'  # Pattern to extract IP address
    failed_login_pattern = r'401.*Invalid credentials'  # Pattern to identify failed login attempts

    # Dictionary to count failed login attempts by IP address
    failed_login_counts = defaultdict(int)

    try:
        # Open the log file
        with open(log_file, 'r') as file:
            for line in file:
                # Check if the log line indicates a failed login attempt
                if re.search(failed_login_pattern, line):
                    # Extract the IP address
                    ip_match = re.search(ip_pattern, line)
                    if ip_match:
                        ip = ip_match.group(1)
                        failed_login_counts[ip] += 1
    except FileNotFoundError:
        print(f"Error: The file '{log_file}' does not exist.")
        return

    # Filter IP addresses that exceed the threshold
    suspicious_ips = {ip: count for ip, count in failed_login_counts.items() if count > threshold}

    # Display results
    print("\nSuspicious Activity Detected:")
    print(f"{'IP Address':<20}{'Failed Login Attempts'}")
    print("-" * 40)
    for ip, count in suspicious_ips.items():
        print(f"{ip:<20}{count}")

# Provide the absolute path to the log file
log_file = r"C:\Users\Admin\Downloads\sample.log"

# Call the function with the log file and threshold
detect_suspicious_activity(log_file, threshold=10)



Suspicious Activity Detected:
IP Address          Failed Login Attempts
----------------------------------------


In [7]:
import re
from collections import defaultdict

def count_requests_per_ip(log_file):
    """
    Counts the number of requests made by each IP address in the log file.
    
    Parameters:
        log_file (str): Path to the log file.
    
    Returns:
        None
    """
    # Pattern to extract IP addresses
    ip_pattern = r'^(\d+\.\d+\.\d+\.\d+)'

    # Dictionary to store request counts per IP
    ip_request_counts = defaultdict(int)

    try:
        # Open the log file for reading
        with open(log_file, 'r') as file:
            for line in file:
                # Extract the IP address
                ip_match = re.search(ip_pattern, line)
                if ip_match:
                    ip = ip_match.group(1)
                    ip_request_counts[ip] += 1
    except FileNotFoundError:
        print(f"Error: The file '{log_file}' does not exist.")
        return

    # Sort the results by request counts in descending order
    sorted_requests = sorted(ip_request_counts.items(), key=lambda x: x[1], reverse=True)

    # Display results in the desired format
    print("IP Address           Request Count")
    for ip, count in sorted_requests:
        print(f"{ip:<20}{count}")

# Provide the absolute path to the log file
log_file = r"C:\Users\Admin\Downloads\sample.log"

# Call the function with the log file
count_requests_per_ip(log_file)


IP Address           Request Count
203.0.113.5         8
198.51.100.23       8
192.168.1.1         7
10.0.0.2            6
192.168.1.100       5


In [8]:
import re
from collections import defaultdict

def most_frequently_accessed_endpoint(log_file):
    """
    Identifies the most frequently accessed endpoint from the log file.

    Parameters:
        log_file (str): Path to the log file.

    Returns:
        None
    """
    # Pattern to extract endpoints (e.g., resource paths)
    endpoint_pattern = r'\"(?:GET|POST|PUT|DELETE) (/[\w./-]*)'

    # Dictionary to store access counts for each endpoint
    endpoint_access_counts = defaultdict(int)

    try:
        # Open the log file for reading
        with open(log_file, 'r') as file:
            for line in file:
                # Extract the endpoint
                endpoint_match = re.search(endpoint_pattern, line)
                if endpoint_match:
                    endpoint = endpoint_match.group(1)
                    endpoint_access_counts[endpoint] += 1
    except FileNotFoundError:
        print(f"Error: The file '{log_file}' does not exist.")
        return

    # Identify the most frequently accessed endpoint
    if endpoint_access_counts:
        most_accessed_endpoint = max(endpoint_access_counts.items(), key=lambda x: x[1])
        endpoint_name, access_count = most_accessed_endpoint
        print(f"Most Frequently Accessed Endpoint:\n{endpoint_name} (Accessed {access_count} times)")
    else:
        print("No endpoints found in the log file.")

# Provide the absolute path to the log file
log_file = r"C:\Users\Admin\Downloads\sample.log"

# Call the function with the log file
most_frequently_accessed_endpoint(log_file)


Most Frequently Accessed Endpoint:
/login (Accessed 13 times)
