# Nginx错误日志分析

## 加载、解析数据

In [1]:
# encoding: utf8

import os
import re
import json
import logging
import datetime
from typing import Dict, List

logging.basicConfig(format=logging.BASIC_FORMAT, level=logging.INFO)

logger = logging.getLogger()

# nginx error 格式：YYYY/MM/DD HH:MM:SS [LEVEL] PID#TID: *CID MESSAGE
# PID: 进程ID
# TID: 线程ID
# CID: 连接connection标识
pattern = re.compile('(\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2})\ \[(.+)\] (\d+)#(\d+): \*(\d+) (.*)')


def parse_error_log_line(line: str) -> Dict:
    if not pattern.match(line):
        logger.warning(f'log: {line} parse error!')
        return

    items = pattern.findall(line)[0]
    record = dict(zip(('time', 'level', 'pid', 'tid', 'cid', 'message'), items))
    message = record['message']
    message = list(map(str.strip, message.split(',')))
    record['message'] = message[0]
    for item in message[1:]:
        k, *v = list(item.split(':'))
        record[k] = ''.join(v).strip()

    # tm_str = record['time']
    # record['time'] = datetime.datetime.strptime(tm_str, '%Y/%m/%d %H:%M:%S')

    return record


def parse_error_log_file(file_path: str) -> List[Dict]:
    records = []

    with open(file_path) as f:
        for line in f.readlines():
            records.append(parse_error_log_line(line))

    return records


def parse_error_log_dir(file_path_dir: str) -> List[Dict]:
    records = []

    for root, dirs, files in os.walk(file_path_dir):
        for path in files:
            file_path = os.path.join(root, path)
            records.extend(parse_error_log_file(file_path))

    return sorted(records, key=lambda x: x['time'])


In [2]:
nginx_error_log_path = '/Users/hotbaby/hengchang/nginx-logs'

In [3]:
records = parse_error_log_dir(nginx_error_log_path)

分析数据的时间范围

In [4]:
print(f"start time: {records[0]['time']}, end time: {records[-1]['time']}")

start time: 2022/01/22 03:08:03, end time: 2022/01/28 03:21:00


数据示例

In [5]:
print(json.dumps(records[0], indent=4, ensure_ascii=False))

{
    "time": "2022/01/22 03:08:03",
    "level": "error",
    "pid": "15691",
    "tid": "0",
    "cid": "567380300",
    "message": "upstream timed out (110: Connection timed out) while reading response header from upstream",
    "client": "10.100.251.243",
    "server": "default_server",
    "request": "\"POST /api/v1/sdc/zcdf/query HTTP/1.1\"",
    "upstream": "\"http//10.100.31.24131080/api/v1/sdc/zcdf/query\"",
    "host": "\"api.bdp.credithc.com\""
}


## 分析、统计数据

### 数据示例

In [6]:
import pandas as pd

In [7]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_colwidth', 100)

In [8]:
df = pd.DataFrame.from_records(records)

数据预处理

In [9]:
df['tm_str'] = df['time']
df['datetime'] = pd.to_datetime(df['time'])
df['date'] = df['datetime'].dt.date
df['time'] = df['datetime'].dt.time

数据示例

In [10]:
columns = ['datetime', 'level', 'pid', 'tid', 'cid', 'message', 'client', 'server', 'request', 'upstream', 'host']

In [11]:
df[columns].head(1).T

Unnamed: 0,0
datetime,2022-01-22 03:08:03
level,error
pid,15691
tid,0
cid,567380300
message,upstream timed out (110: Connection timed out) while reading response header from upstream
client,10.100.251.243
server,default_server
request,"""POST /api/v1/sdc/zcdf/query HTTP/1.1"""
upstream,"""http//10.100.31.24131080/api/v1/sdc/zcdf/query"""


### 每天错误统计

In [12]:
tmp_df = df[['date', 'pid']].groupby('date').count()
tmp_df = tmp_df.reset_index()
tmp_df.columns = ['日期', '错误数量']
day_error_stats = tmp_df

In [13]:
day_error_stats

Unnamed: 0,日期,错误数量
0,2022-01-22,1630
1,2022-01-23,1797
2,2022-01-24,1901
3,2022-01-25,1873
4,2022-01-26,1869
5,2022-01-27,1538
6,2022-01-28,269


### 错误域名统计

In [14]:
tmp_df = df[['host', 'pid']].groupby(['host']).count().reset_index()
tmp_df.columns = ['域名', '错误数量']
tmp_df = tmp_df.sort_values(by=['错误数量'], ascending=False).reset_index(drop=True)
domain_error_stats = tmp_df

In [15]:
domain_error_stats[domain_error_stats.错误数量 > 10]

Unnamed: 0,域名,错误数量
0,"""streamsets-3.bdp.credithc.com""",8710
1,"""api.bdp.credithc.com""",2033
2,"""monitor.bdp.credithc.com80""",54
3,"""monitor.bdp.credithc.com""",41
4,"""10.100.251.42""",32


In [16]:
tmp_df = df[['date', 'host', 'pid']].groupby(['date', 'host']).count()
tmp_df.columns = ['错误数量']
date_domain_error_stats = tmp_df

In [17]:
date_domain_error_stats[date_domain_error_stats.错误数量 > 10]

Unnamed: 0_level_0,Unnamed: 1_level_0,错误数量
date,host,Unnamed: 2_level_1
2022-01-22,"""10.100.251.42""",12
2022-01-22,"""api.bdp.credithc.com""",297
2022-01-22,"""monitor.bdp.credithc.com""",16
2022-01-22,"""monitor.bdp.credithc.com80""",19
2022-01-22,"""streamsets-3.bdp.credithc.com""",1286
2022-01-23,"""api.bdp.credithc.com""",349
2022-01-23,"""streamsets-3.bdp.credithc.com""",1434
2022-01-24,"""api.bdp.credithc.com""",429
2022-01-24,"""monitor.bdp.credithc.com""",13
2022-01-24,"""streamsets-3.bdp.credithc.com""",1438


### 错误路由统计

In [18]:
tmp_df = df[['host', 'request', 'pid']].groupby(by=['host', 'request']).count()
tmp_df.columns = ['错误数量']
path_error_stats = tmp_df

In [19]:
path_error_stats[path_error_stats.错误数量 > 10]

Unnamed: 0_level_0,Unnamed: 1_level_0,错误数量
host,request,Unnamed: 2_level_1
"""10.100.251.42""","""POST /sdc/api1/balance HTTP/1.1""",32
"""api.bdp.credithc.com""","""POST /api/v1/sdc/zcdf/query HTTP/1.1""",400
"""api.bdp.credithc.com""","""POST /bdp/baihang/pushReport HTTP/1.1""",182
"""api.bdp.credithc.com""","""POST /bdp/bairong HTTP/1.1""",16
"""api.bdp.credithc.com""","""POST /bdp/position HTTP/1.1""",48
"""api.bdp.credithc.com""","""POST /bdp/spider/data HTTP/1.1""",13
"""api.bdp.credithc.com""","""POST /sdc/mdsp/el2/v1/staffInfoSpecified HTTP/1.1""",12
"""api.bdp.credithc.com""","""POST /sdc/sqlquery/dhsj/fqzk HTTP/1.1""",1202
"""monitor.bdp.credithc.com""","""POST /instances HTTP/1.1""",41
"""monitor.bdp.credithc.com80""","""POST /instances HTTP/1.1""",54
