In [1]:
import pandas as pd
import numpy as np

# %matplotlib inline

In [2]:
# Apache "combined" LogFormat of the access log parsed below:
# %h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-agent}i\"

from datetime import datetime
import pytz

def parse_str(x):
    """Return ``x`` without its first and last character.

    Used as a ``read_csv`` converter for fields that arrive wrapped in a
    one-character delimiter on each side (e.g. ``"GET / HTTP/1.1"``).
    Strings shorter than two characters yield an empty string.
    """
    inner = slice(1, -1)  # skip the opening and the closing delimiter
    return x[inner]

def parse_datetime(x):
    """Parse a bracketed access-log timestamp into a timezone-aware datetime.

    Parameters
    ----------
    x : str
        Timestamp field including its surrounding brackets and a numeric
        ``+HHMM`` / ``-HHMM`` UTC offset, e.g. ``[29/Jul/2017:23:52:46 +0900]``.

    Returns
    -------
    datetime.datetime
        The parsed wall-clock time carrying a ``pytz.FixedOffset`` tzinfo.

    Raises
    ------
    ValueError
        If the date part, the offset sign, or the offset digits are malformed.

    Notes
    -----
    ``%b`` (abbreviated month name) is matched according to the current
    locale by ``strptime``.
    """
    # x[1:-7] drops the leading '[' and the trailing ' +HHMM]'.
    dt = datetime.strptime(x[1:-7], '%d/%b/%Y:%H:%M:%S')

    sign_char = x[-6]
    if sign_char not in ('+', '-'):
        raise ValueError('invalid UTC offset sign in timestamp: %r' % (x,))
    sign = -1 if sign_char == '-' else 1

    # The sign must apply to hours AND minutes together: '-0530' is
    # -(5*60 + 30) = -330 minutes, not (-5*60) + 30 = -270.
    offset_minutes = sign * (int(x[-5:-3]) * 60 + int(x[-3:-1]))
    return dt.replace(tzinfo=pytz.FixedOffset(offset_minutes))

In [3]:
import re
import pandas as pd

# Load the access log into a DataFrame.
#
# The separator regex splits on a whitespace character only when it is
#   * followed by an even number of double quotes up to the end of the line
#     (so whitespace inside a "quoted field" is kept), and
#   * not followed by a ']' without a '[' in between
#     (so the space inside the [timestamp +zone] field is kept).
# A regex separator needs the Python parsing engine.
#
# Fields 1 and 2 of each line are skipped via `usecols`; the seven
# remaining fields get the names listed in `names`.
data = pd.read_csv(
    'data/access.log',
    header=None,
    engine='python',
    sep=r'\s(?=(?:[^"]*"[^"]*")*[^"]*$)(?![^\[]*\])',
    usecols=[0, 3, 4, 5, 6, 7, 8],
    names=['ip', 'time', 'request', 'status', 'size', 'referer', 'user_agent'],
    na_values='-',
    converters=dict(
        time=parse_datetime,    # bracketed timestamp -> tz-aware datetime
        request=parse_str,      # strip the surrounding quotes
        status=int,
        size=int,
        referer=parse_str,      # strip the surrounding quotes
        user_agent=parse_str,   # strip the surrounding quotes
    ),
)

In [4]:
# Preview the first five parsed log rows.
data.head(n=5)

Unnamed: 0,ip,time,request,status,size,referer,user_agent
0,68.177.95.36,2017-07-29 23:52:46+09:00,GET /category/games HTTP/1.1,200,62,,Mozilla/5.0 (Windows NT 6.1; WOW64; rv:10.0.1)...
1,84.48.39.212,2017-07-29 23:52:46+09:00,GET /item/office/1495 HTTP/1.1,200,82,/search/?c=Garden+Sports,Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ...
2,160.33.168.76,2017-07-29 23:52:46+09:00,GET /category/books HTTP/1.1,200,125,/category/jewelry,Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ...
3,24.168.36.225,2017-07-29 23:52:46+09:00,GET /item/books/2489 HTTP/1.1,200,65,/category/office,Mozilla/5.0 (Windows NT 6.0; rv:10.0.1) Gecko/...
4,196.30.56.81,2017-07-29 23:52:46+09:00,GET /category/electronics HTTP/1.1,200,116,,Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ...


## IP出現数をカウント

In [20]:
# Number of log rows per client IP; value_counts() orders the result
# from most to least frequent.
counted_ip = data['ip'].value_counts()
counted_ip.head(n=5)

188.171.28.96     111
136.108.227.33    109
80.90.157.181     108
124.105.86.24     108
68.60.78.40       108
Name: ip, dtype: int64

## IPソート

In [18]:
import socket as sock

def sorted_ips(raw_ips):
    """Return the given IPv4 address strings in ascending numeric order.

    Each address is packed into its 4-byte binary form, the packed values
    are sorted (byte-wise order equals numeric order for IPv4), and the
    result is converted back to dotted-quad text. Duplicates are kept.

    Parameters
    ----------
    raw_ips : iterable of str
        IPv4 addresses in dotted-quad notation.

    Returns
    -------
    list of str
        The addresses, sorted.

    Raises
    ------
    OSError
        Raised by ``inet_pton`` when an entry is not a valid IPv4 address.
    """
    packed_sorted = sorted(
        sock.inet_pton(sock.AF_INET, address) for address in raw_ips
    )
    return list(
        map(lambda packed: sock.inet_ntop(sock.AF_INET, packed), packed_sorted)
    )

# The keys of the counts mapping are the distinct client IPs.
unique_ips = counted_ip.to_dict().keys()
# Show the ten numerically smallest addresses.
sorted_ips(unique_ips)[:10]

['20.24.67.42',
 '20.27.42.192',
 '20.30.177.143',
 '20.45.205.115',
 '20.48.21.137',
 '20.57.173.220',
 '20.78.111.60',
 '20.84.149.116',
 '20.90.188.172',
 '20.90.222.119']