In [15]:
import pandas as pd
import numpy as np
import itertools as it
%matplotlib inline

In [16]:
# %h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-agent}i\"

from datetime import datetime
import pytz

def parse_str(x):
    """
    Returns the string delimited by two characters.

    Example:
        `>>> parse_str('[my string]')`
        `'my string'`
    """
    return x[1:-1]

def parse_datetime(x):
    '''
    Parses datetime with timezone formatted as:
        `[day/month/year:hour:minute:second zone]`

    Example:
        `>>> parse_datetime('13/Nov/2015:11:45:42 +0000')`
        `datetime.datetime(2015, 11, 3, 11, 45, 4, tzinfo=<UTC>)`

    Due to problems parsing the timezone (`%z`) with `datetime.strptime`, the 
    timezone will be obtained using the `pytz` library.
    '''    
    dt = datetime.strptime(x[1:-7], '%d/%b/%Y:%H:%M:%S')
    dt_tz = int(x[-6:-3])*60+int(x[-3:-1])    
    return dt.replace(tzinfo=pytz.FixedOffset(dt_tz))

In [17]:
import re
import pandas as pd

data = pd.read_csv(
    'data/access.log', 
    sep=r'\s(?=(?:[^"]*"[^"]*")*[^"]*$)(?![^\[]*\])', 
    engine='python', 
    na_values='-', 
    header=None,
    usecols=[0, 3, 4, 5, 6, 7, 8],
    names=['ip', 'time', 'request', 'status', 'size', 'referer', 'user_agent'],
    converters={'time': parse_datetime,
                'request': parse_str,
                'status': int,
                'size': int,
                'referer': parse_str,
                'user_agent': parse_str})

In [18]:
data.head()

Unnamed: 0,ip,time,request,status,size,referer,user_agent
0,68.177.95.36,2017-07-29 23:52:46+09:00,GET /category/games HTTP/1.1,200,62,,Mozilla/5.0 (Windows NT 6.1; WOW64; rv:10.0.1)...
1,84.48.39.212,2017-07-29 23:52:46+09:00,GET /item/office/1495 HTTP/1.1,200,82,/search/?c=Garden+Sports,Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ...
2,160.33.168.76,2017-07-29 23:52:46+09:00,GET /category/books HTTP/1.1,200,125,/category/jewelry,Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ...
3,24.168.36.225,2017-07-29 23:52:46+09:00,GET /item/books/2489 HTTP/1.1,200,65,/category/office,Mozilla/5.0 (Windows NT 6.0; rv:10.0.1) Gecko/...
4,196.30.56.81,2017-07-29 23:52:46+09:00,GET /category/electronics HTTP/1.1,200,116,,Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ...


## IP出現数をカウント

In [34]:
counted_ip = data['ip'].value_counts()
counted_ip.head()

188.171.28.96     111
136.108.227.33    109
80.90.157.181     108
124.105.86.24     108
68.60.78.40       108
Name: ip, dtype: int64

## IPソート

In [48]:
import socket as sock

def sorted_ips(raw_ips):
    ips = [sock.inet_pton(sock.AF_INET, ip) for ip in raw_ips]
    ips.sort()
    return [sock.inet_ntop(sock.AF_INET, ip) for ip in ips]

dic = counted_ip.to_dict()
keys = dic.keys()
sorted_ips(keys)

['20.24.67.42',
 '20.27.42.192',
 '20.30.177.143',
 '20.45.205.115',
 '20.48.21.137',
 '20.57.173.220',
 '20.78.111.60',
 '20.84.149.116',
 '20.90.188.172',
 '20.90.222.119',
 '20.96.160.119',
 '20.102.43.140',
 '20.111.164.105',
 '20.126.60.107',
 '20.144.138.182',
 '20.183.84.164',
 '20.186.93.31',
 '20.186.206.125',
 '20.198.86.193',
 '20.201.43.108',
 '20.201.150.145',
 '20.207.155.228',
 '20.213.47.167',
 '24.27.92.22',
 '24.33.101.107',
 '24.39.133.77',
 '24.42.149.68',
 '24.48.69.103',
 '24.57.73.69',
 '24.60.46.139',
 '24.60.70.37',
 '24.66.223.98',
 '24.75.125.187',
 '24.93.89.70',
 '24.99.159.56',
 '24.99.222.197',
 '24.102.63.71',
 '24.117.200.192',
 '24.126.131.126',
 '24.129.104.31',
 '24.135.200.109',
 '24.138.36.160',
 '24.141.126.135',
 '24.168.36.225',
 '24.168.51.145',
 '24.174.85.104',
 '24.174.185.48',
 '24.180.55.33',
 '24.183.121.49',
 '24.210.195.93',
 '24.222.129.135',
 '28.18.125.132',
 '28.24.142.55',
 '28.33.61.199',
 '28.78.83.88',
 '28.99.81.228',
 '28.114.

In [9]:
data.sort_values(by=["ip"], ascending=False)

Unnamed: 0,ip,time,request,status,size,referer,user_agent
53097,96.93.130.70,2017-07-29 23:52:47+09:00,GET /category/electronics HTTP/1.1,200,111,,Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ...
64309,96.93.130.70,2017-07-29 23:52:47+09:00,GET /category/health HTTP/1.1,200,62,/category/electronics,Mozilla/5.0 (compatible; MSIE 9.0; Windows NT ...
95218,96.93.130.70,2017-07-29 23:52:48+09:00,GET /category/books HTTP/1.1,200,72,,Mozilla/5.0 (compatible; MSIE 9.0; Windows NT ...
41120,96.93.130.70,2017-07-29 23:52:47+09:00,GET /category/electronics HTTP/1.1,200,125,/category/electronics,Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ...
40970,96.93.130.70,2017-07-29 23:52:47+09:00,GET /category/software HTTP/1.1,200,52,/search/?c=Sports+Books,Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ...
9690,96.93.130.70,2017-07-29 23:52:46+09:00,GET /item/software/4717 HTTP/1.1,200,84,/category/books,Mozilla/5.0 (compatible; MSIE 9.0; Windows NT ...
95849,96.93.130.70,2017-07-29 23:52:48+09:00,GET /item/books/3303 HTTP/1.1,200,107,/category/software,Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ...
62189,96.93.130.70,2017-07-29 23:52:47+09:00,GET /category/games HTTP/1.1,200,69,,Mozilla/5.0 (compatible; MSIE 9.0; Windows NT ...
62338,96.93.130.70,2017-07-29 23:52:47+09:00,GET /category/giftcards HTTP/1.1,200,101,/item/music/51,Mozilla/5.0 (compatible; MSIE 9.0; Windows NT ...
38740,96.93.130.70,2017-07-29 23:52:47+09:00,GET /category/cameras HTTP/1.1,200,51,,Mozilla/4.0 (compatible; MSIE 8.0; Windows NT ...
