In [1]:
import re
import csv
from pathlib import Path
import pandas as pd

print("üîß FINAL CORRECTED RESPONSE TIME EXTRACTION")
print("=" * 55)


üîß FINAL CORRECTED RESPONSE TIME EXTRACTION


In [2]:
def extract_log_data_final(log_line):
    """Final corrected extraction function for Apache logs with response times"""
    
    # Extract response time from the end (format: **seconds/microseconds**)
    time_match = re.search(r'\*\*(\d+)/(\d+)\*\*$', log_line)
    
    if not time_match:
        return None
    
    time_seconds = float(time_match.group(1))
    time_microseconds = float(time_match.group(2))
    
    # Convert to total milliseconds: (seconds * 1000) + (microseconds / 1000)
    total_response_time_ms = (time_seconds * 1000) + (time_microseconds / 1000)
    
    # Extract IP (first field)
    ip_match = re.match(r'^(\S+)', log_line)
    ip = ip_match.group(1) if ip_match else ''
    
    # Extract timestamp
    timestamp_match = re.search(r'\[([^\]]+)\]', log_line)
    timestamp = timestamp_match.group(1) if timestamp_match else ''
    
    # Extract HTTP method, URL, and protocol from the quoted request
    request_match = re.search(r'"(\w+)\s+([^\s"]+)\s+([^"]+)"', log_line)
    if request_match:
        method = request_match.group(1)
        url = request_match.group(2)
        protocol = request_match.group(3)
    else:
        method = ''
        url = ''
        protocol = ''
    
    # Extract status code and response size
    status_size_match = re.search(r'" (\d+) (\d+) "', log_line)
    if status_size_match:
        status = status_size_match.group(1)
        response_size = status_size_match.group(2)
    else:
        status = ''
        response_size = ''
    
    # Clean URL (remove query parameters)
    if url and '?' in url:
        url = url.split('?')[0]
    
    return {
        'timestamp': timestamp,
        'ip': ip,
        'method': method,
        'url': url,
        'status': status,
        'response_size': response_size,
        'response_time_seconds': time_seconds,
        'response_time_microseconds': time_microseconds,
        'total_response_time_ms': total_response_time_ms
    }

print("‚úÖ Extraction function defined")


‚úÖ Extraction function defined


In [3]:
# Test the extraction function first
print("üß™ TESTING FINAL EXTRACTION FUNCTION")
print("=" * 45)

# Test with actual log line format
test_line = '10.103.14.12 - - [27/Aug/2024:06:26:14 +0700] "POST /trx_rajal/order_pmr/getDokter/ HTTP/2.0" 200 4245 "https://simrs.rsmoewardi.com/home" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36" **0/16667**'

result = extract_log_data_final(test_line)
if result:
    print("‚úÖ Test successful!")
    print(f"  IP: {result['ip']}")
    print(f"  Method: {result['method']}")
    print(f"  URL: {result['url']}")
    print(f"  Status: {result['status']}")
    print(f"  Response Size: {result['response_size']}")
    print(f"  Response Time: {result['total_response_time_ms']:.3f} ms")
else:
    print("‚ùå Test failed!")
    raise Exception("Extraction test failed")


üß™ TESTING FINAL EXTRACTION FUNCTION
‚úÖ Test successful!
  IP: 10.103.14.12
  Method: POST
  URL: /trx_rajal/order_pmr/getDokter/
  Status: 200
  Response Size: 4245
  Response Time: 16.667 ms


In [4]:
def process_log_file_final(input_file, max_records=None):
    """Process log file with final corrected extraction"""
    extracted_data = []
    error_count = 0
    
    print(f"  Processing: {input_file.name}")
    
    with open(input_file, 'r', encoding='utf-8', errors='ignore') as f:
        for line_num, line in enumerate(f, 1):
            # Stop if we've reached the max records limit
            if max_records and len(extracted_data) >= max_records:
                print(f"    -> Reached max records limit: {max_records:,}")
                break
                
            line = line.strip()
            if not line:
                continue
                
            try:
                log_data = extract_log_data_final(line)
                if log_data and log_data['ip'] and log_data['url']:
                    extracted_data.append([
                        log_data['timestamp'],
                        log_data['ip'],
                        log_data['method'],
                        log_data['url'],
                        log_data['status'],
                        log_data['response_size'],
                        log_data['response_time_seconds'],
                        log_data['response_time_microseconds'],
                        log_data['total_response_time_ms']
                    ])
                else:
                    error_count += 1
            except Exception as e:
                error_count += 1
                if error_count <= 3:  # Only print first few errors
                    print(f"    Error on line {line_num}: {e}")
                continue
            
            # Progress update every 100k records
            if len(extracted_data) % 100000 == 0:
                print(f"    -> Progress: {len(extracted_data):,} records extracted")
    
    print(f"    -> Final: {len(extracted_data):,} records, Errors: {error_count:,}")
    return extracted_data

print("‚úÖ Processing function defined")


‚úÖ Processing function defined


In [5]:
# Check available log files
logs_folder = Path('../Data')
log_files = list(logs_folder.glob('response_time-simrs*'))

print(f"üìÇ Found {len(log_files)} log files:")
for i, log_file in enumerate(log_files, 1):
    file_size_mb = log_file.stat().st_size / (1024 * 1024)
    print(f"  {i:2d}. {log_file.name} ({file_size_mb:.1f} MB)")

print(f"\nüìä Total size: {sum(f.stat().st_size for f in log_files) / (1024 * 1024 * 1024):.2f} GB")


üìÇ Found 15 log files:
   1. response_time-simrs.log.10 (1014.7 MB)
   2. response_time-simrs.log.11 (1070.6 MB)
   3. response_time-simrs.log.9 (1048.5 MB)
   4. response_time-simrs.log.7 (921.3 MB)
   5. response_time-simrs.log.1 (1049.0 MB)
   6. response_time-simrs.log.6 (511.5 MB)
   7. response_time-simrs.log.8 (1045.0 MB)
   8. response_time-simrs.log.13 (549.2 MB)
   9. response_time-simrs.log.14 (990.9 MB)
  10. response_time-simrs.log.12 (566.3 MB)
  11. response_time-simrs.log (623.3 MB)
  12. response_time-simrs.log.3 (1167.4 MB)
  13. response_time-simrs.log.4 (1044.2 MB)
  14. response_time-simrs.log.5 (526.1 MB)
  15. response_time-simrs.log.2 (1109.6 MB)

üìä Total size: 12.93 GB


In [6]:
# OPTION 1: Process just one file first (for testing)
print("üß™ OPTION 1: Process single file (recommended for testing)")
print("=" * 60)

# Choose the smallest file for testing
smallest_file = min(log_files, key=lambda f: f.stat().st_size)
print(f"Processing smallest file: {smallest_file.name}")

# Process with a limit to test
sample_data = process_log_file_final(smallest_file, max_records=10000)

if sample_data:
    print(f"\n‚úÖ Sample extraction successful!")
    print(f"üìä Extracted {len(sample_data):,} records")
    
    # Save sample to CSV
    sample_output = 'sample_data_with_time.csv'
    with open(sample_output, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow([
            'Time', 'IP', 'Method', 'URL', 'Status', 'Response_Size',
            'Response_Time_Seconds', 'Response_Time_Microseconds', 'Total_Response_Time_MS'
        ])
        writer.writerows(sample_data)
    
    print(f"üíæ Sample saved to: {sample_output}")
    
    # Quick analysis
    df_sample = pd.DataFrame(sample_data, columns=[
        'Time', 'IP', 'Method', 'URL', 'Status', 'Response_Size',
        'Response_Time_Seconds', 'Response_Time_Microseconds', 'Total_Response_Time_MS'
    ])
    
    print(f"\nüìà Quick Analysis:")
    print(f"  ‚Ä¢ Average response time: {df_sample['Total_Response_Time_MS'].mean():.2f} ms")
    print(f"  ‚Ä¢ Max response time: {df_sample['Total_Response_Time_MS'].max():.2f} ms")
    print(f"  ‚Ä¢ Min response time: {df_sample['Total_Response_Time_MS'].min():.2f} ms")
    print(f"  ‚Ä¢ Records with > 0ms: {(df_sample['Total_Response_Time_MS'] > 0).sum():,}")
    
else:
    print("‚ùå Sample extraction failed!")


üß™ OPTION 1: Process single file (recommended for testing)
Processing smallest file: response_time-simrs.log.6
  Processing: response_time-simrs.log.6
    -> Reached max records limit: 10,000
    -> Final: 10,000 records, Errors: 0

‚úÖ Sample extraction successful!
üìä Extracted 10,000 records
üíæ Sample saved to: sample_data_with_time.csv

üìà Quick Analysis:
  ‚Ä¢ Average response time: 813.08 ms
  ‚Ä¢ Max response time: 50287.38 ms
  ‚Ä¢ Min response time: 0.16 ms
  ‚Ä¢ Records with > 0ms: 10,000


In [None]:
# OPTION 2: Process all files - READY TO RUN! 
print("üöÄ OPTION 2: Process ALL files - EXTRACTION VERIFIED!")
print("=" * 70)
print("‚úÖ Option 1 test successful! Response times working perfectly.")
print("üìä Expected: ~51 million records with response times")
print("‚è±Ô∏è  Estimated time: 30-60 minutes")
print("üíæ Required: ~8GB RAM, ~5GB disk space")
print("üéØ Output: extracted_data_with_time_final.csv")

# READY TO PROCESS - Option 1 test was successful!
process_all = True  # ENABLED - test was successful!

if process_all:
    all_extracted_data = []
    
    for i, log_file in enumerate(log_files, 1):
        print(f"\nProcessing file {i}/{len(log_files)}: {log_file.name}")
        try:
            extracted_data = process_log_file_final(log_file)
            all_extracted_data.extend(extracted_data)
            print(f"  ‚úÖ Total extracted so far: {len(all_extracted_data):,}")
        except Exception as e:
            print(f"  ‚ùå Error processing {log_file}: {e}")
    
    # Save to CSV
    output_file = 'extracted_data_with_time_final.csv'
    print(f"\nüíæ Writing {len(all_extracted_data):,} records to {output_file}")
    
    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow([
            'Time', 'IP', 'Method', 'URL', 'Status', 'Response_Size',
            'Response_Time_Seconds', 'Response_Time_Microseconds', 'Total_Response_Time_MS'
        ])
        writer.writerows(all_extracted_data)
    
    print(f"‚úÖ All files processed! Output: {output_file}")
else:
    print("üîÑ Set process_all = True to run full extraction")
    print("üìù Recommended: Test with Option 1 first")


üöÄ OPTION 2: Process ALL files - EXTRACTION VERIFIED!
‚úÖ Option 1 test successful! Response times working perfectly.
üìä Expected: ~51 million records with response times
‚è±Ô∏è  Estimated time: 30-60 minutes
üíæ Required: ~8GB RAM, ~5GB disk space
üéØ Output: extracted_data_with_time_final.csv

Processing file 1/15: response_time-simrs.log.10
  Processing: response_time-simrs.log.10
    -> Progress: 100,000 records extracted
    -> Progress: 200,000 records extracted
    -> Progress: 300,000 records extracted
    -> Progress: 400,000 records extracted
    -> Progress: 500,000 records extracted
    -> Progress: 600,000 records extracted
    -> Progress: 700,000 records extracted
    -> Progress: 800,000 records extracted
    -> Progress: 900,000 records extracted
    -> Progress: 1,000,000 records extracted
    -> Progress: 1,100,000 records extracted
    -> Progress: 1,200,000 records extracted
    -> Progress: 1,300,000 records extracted
    -> Progress: 1,400,000 records extra

: 

In [None]:
# OPTION 3: Process files in batches (SAFER alternative)
print("üì¶ OPTION 3: Process files in BATCHES (SAFER OPTION)")
print("=" * 55)
print("‚úÖ This approach is SAFER for large datasets")
print("üìÅ Processes files one by one and saves each separately")
print("üîÑ Prevents memory issues and allows you to stop/resume")
print("üí° Recommended if you have limited RAM (<8GB)")

batch_process = False  # Change to True if you prefer this safer approach
batch_size_limit = None  # Process full files (remove limit)

if batch_process:
    for i, log_file in enumerate(log_files, 1):
        output_file = f'extracted_data_batch_{i:02d}_{log_file.stem}.csv'
        
        print(f"\nüìÇ Processing batch {i}/{len(log_files)}: {log_file.name}")
        print(f"üíæ Output will be: {output_file}")
        
        try:
            # Process this file
            extracted_data = process_log_file_final(log_file, max_records=batch_size_limit)
            
            if extracted_data:
                # Save immediately
                with open(output_file, 'w', newline='', encoding='utf-8') as f:
                    writer = csv.writer(f)
                    writer.writerow([
                        'Time', 'IP', 'Method', 'URL', 'Status', 'Response_Size',
                        'Response_Time_Seconds', 'Response_Time_Microseconds', 'Total_Response_Time_MS'
                    ])
                    writer.writerows(extracted_data)
                
                print(f"  ‚úÖ Saved {len(extracted_data):,} records to {output_file}")
                
                # Quick stats
                total_response_times = [row[8] for row in extracted_data]  # Total_Response_Time_MS
                avg_time = sum(total_response_times) / len(total_response_times)
                max_time = max(total_response_times)
                print(f"  üìä Avg: {avg_time:.2f}ms, Max: {max_time:.2f}ms")
            else:
                print(f"  ‚ùå No data extracted from {log_file.name}")
                
        except Exception as e:
            print(f"  ‚ùå Error processing {log_file}: {e}")
            
        print("-" * 60)
    
    print("‚úÖ Batch processing completed!")
    print("üìÅ Each file has been processed into a separate CSV")
    print("üí° You can now analyze individual batches or combine them later")
    
else:
    print("üîÑ Set batch_process = True to run batch extraction")
    print("üìù This is the safest option for large datasets")


üì¶ OPTION 3: Process files in BATCHES (recommended)
This approach processes files one by one and saves each separately
This prevents memory issues and allows you to stop/resume
üîÑ Set batch_process = True to run batch extraction
üìù This is the safest option for large datasets
