# Wrangle Data

## Prepare Environment

In [1]:
import numpy as np
import pandas as pd
import math
from sklearn import metrics

from scipy.stats import entropy

import warnings
# NOTE(review): "ignore" with no category silences every warning for the rest
# of the session, including pandas deprecation and chained-assignment warnings
# that the cells below may trigger. Consider narrowing this to specific categories.
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import matplotlib.dates as mdates #to format dates on our plots
%matplotlib inline
import seaborn as sns

# Register pandas' date/time converters with matplotlib so Timestamp values can
# be plotted. Without this, plotting can fail with:
# "TypeError: float() argument must be a string or a number, not 'Timestamp' matplotlib"
pd.plotting.register_matplotlib_converters()

## Acquire

In [2]:
# Labels for the seven fields kept from each access-log line.
colnames = [
    'ip',
    'timestamp',
    'request_method',
    'status',
    'size',
    'destination',
    'request_agent',
]

# The separator matches a whitespace character only when it is NOT inside a
# double-quoted string (an even number of quotes remains up to end of line)
# and NOT inside [...] brackets, so the quoted request/agent strings and the
# bracketed timestamp each stay a single field.
df = pd.read_csv(
    'https://python.zach.lol/access.log',
    sep=r'\s(?=(?:[^"]*"[^"]*")*[^"]*$)(?![^\[]*\])',
    engine='python',                  # regex separators need the python parser
    header=None,                      # the log has no header row
    names=colnames,
    usecols=[0, 3, 4, 5, 6, 7, 8],    # positions 1 and 2 are not loaded
    index_col=False,
    na_values='"-"',                  # a quoted dash is read as missing
)

In [3]:
# Preview the first rows to check that each log field landed in its own column.
df.head()

Unnamed: 0,ip,timestamp,request_method,status,size,destination,request_agent
0,97.105.19.58,[16/Apr/2019:19:34:42 +0000],"""GET /api/v1/sales?page=81 HTTP/1.1""",200,512495,,"""python-requests/2.21.0"""
1,97.105.19.58,[16/Apr/2019:19:34:42 +0000],"""GET /api/v1/items HTTP/1.1""",200,3561,,"""python-requests/2.21.0"""
2,97.105.19.58,[16/Apr/2019:19:34:44 +0000],"""GET /api/v1/sales?page=82 HTTP/1.1""",200,510103,,"""python-requests/2.21.0"""
3,97.105.19.58,[16/Apr/2019:19:34:46 +0000],"""GET /api/v1/sales?page=83 HTTP/1.1""",200,510003,,"""python-requests/2.21.0"""
4,97.105.19.58,[16/Apr/2019:19:34:48 +0000],"""GET /api/v1/sales?page=84 HTTP/1.1""",200,511963,,"""python-requests/2.21.0"""


In [4]:
# Column dtypes and non-null counts for the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13974 entries, 0 to 13973
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   ip              13974 non-null  object
 1   timestamp       13974 non-null  object
 2   request_method  13974 non-null  object
 3   status          13974 non-null  int64 
 4   size            13974 non-null  int64 
 5   destination     25 non-null     object
 6   request_agent   13974 non-null  object
dtypes: int64(2), object(5)
memory usage: 764.3+ KB


In [5]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,status,size
count,13974.0,13974.0
mean,200.34235,449900.797338
std,10.112012,160742.535606
min,200.0,0.0
25%,200.0,500637.0
50%,200.0,510138.0
75%,200.0,511291.0
max,499.0,513736.0


In [6]:
# (rows, columns) of the loaded frame.
df.shape

(13974, 7)

In [7]:
# Number of missing values in each column.
df.isna().sum()

ip                    0
timestamp             0
request_method        0
status                0
size                  0
destination       13949
request_agent         0
dtype: int64

In [8]:
# Keep only the columns this analysis uses: ip, timestamp and size.
# .copy() makes the result an independent frame. Later cells assign back into
# `df.timestamp`; without the copy pandas treats that as writing to a slice of
# the original frame and emits SettingWithCopyWarning (currently hidden by the
# global warnings filter).
df = df[['ip', 'timestamp', 'size']].copy()

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13974 entries, 0 to 13973
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ip         13974 non-null  object
 1   timestamp  13974 non-null  object
 2   size       13974 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 327.6+ KB


In [9]:
# Confirm the three retained columns.
df.head(2)

Unnamed: 0,ip,timestamp,size
0,97.105.19.58,[16/Apr/2019:19:34:42 +0000],512495
1,97.105.19.58,[16/Apr/2019:19:34:42 +0000],3561


## Explore IP Address

In [10]:
# Count the log events per IP address and lay the result out as a two-column
# frame: the address and how many times it appeared.
ip_counts = (
    df['ip']
    .value_counts()
    .rename_axis('ip')
    .reset_index(name='event_count')
)

# Frequency of frequencies: for each event count, how many distinct IP
# addresses were seen exactly that many times.
ip_counts.groupby('event_count').count()

Unnamed: 0_level_0,ip
event_count,Unnamed: 1_level_1
1,10
2,3
21,2
246,1
613,1
1059,1
11998,1


In [11]:
# Addresses that show up exactly once in the log.
seen_once = ip_counts['event_count'].eq(1)
ip_counts.loc[seen_once]

Unnamed: 0,ip,event_count
9,34.229.70.250,1
10,52.90.165.200,1
11,54.145.52.184,1
12,52.91.30.150,1
13,54.172.14.223,1
14,3.88.129.158,1
15,3.92.201.136,1
16,45.23.250.16,1
17,35.174.209.2,1
18,34.207.64.242,1


In [12]:
# Shannon entropy of the per-IP event counts.
# `entropy` is already imported from scipy.stats in the notebook's first cell,
# so it is not re-imported here. scipy normalises the counts so they sum to 1
# and uses the natural logarithm by default, so the result is in nats.
entropy(ip_counts.event_count)

0.5648495339393655

## Prepare Data to Explore Size

In [13]:
# Starting point for the size analysis: timestamp is still a bracketed string.
df.head(2)

Unnamed: 0,ip,timestamp,size
0,97.105.19.58,[16/Apr/2019:19:34:42 +0000],512495
1,97.105.19.58,[16/Apr/2019:19:34:42 +0000],3561


In [14]:
# Drop the surrounding square brackets and the literal "+0000" offset from
# every timestamp string by substituting the empty string for each match.
bracket_or_offset = r'(\[|\]|\+0{4})'
df['timestamp'] = df['timestamp'].str.replace(bracket_or_offset, '', regex=True)
df.head(1)

Unnamed: 0,ip,timestamp,size
0,97.105.19.58,16/Apr/2019:19:34:42,512495


In [15]:
# Convert the cleaned strings (e.g. "16/Apr/2019:19:34:42 ") to datetimes.
# An explicit format replaces the earlier "swap the first colon for a space and
# let pandas guess" approach: it does not depend on the positional `n` argument
# or the version-dependent `regex` default of str.replace, and it avoids
# per-value format inference on a day-first string.
# strip() removes the space left behind where the "+0000" offset was deleted,
# which would otherwise not match the format.
df.timestamp = pd.to_datetime(
    df.timestamp.str.strip(),
    format='%d/%b/%Y:%H:%M:%S',
)

df.head(1)

Unnamed: 0,ip,timestamp,size
0,97.105.19.58,2019-04-16 19:34:42,512495


In [16]:
# Move the parsed timestamps into the index so the frame can be resampled by time.
# NOTE(review): not re-runnable as written — after the first run 'timestamp' is
# the index rather than a column, so a second run has no such column to set.
df = df.set_index('timestamp')
df.head(1)

Unnamed: 0_level_0,ip,size
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-04-16 19:34:42,97.105.19.58,512495


In [17]:
# Aggregate to one row per calendar day holding the sum of `size` for that day.
# '1D' is the upper-case day alias; the lower-case 'd' spelling is deprecated
# in recent pandas releases, while 'D' works across versions.
# Note: this rebinds `df` to the daily totals, so the `ip` column is no longer
# available from `df` after this cell.
df = df[['size']].resample('1D').sum()
df.head(1)

Unnamed: 0_level_0,size
timestamp,Unnamed: 1_level_1
2019-04-16,5480479212


In [18]:
# Replace any missing daily totals with zero.
# NOTE(review): presumably a no-op here, since the preceding sum() should
# already give 0 for days with no rows — confirm before relying on it.
df = df.fillna(0)

# Descriptive statistics of the daily size totals.
df.describe()

Unnamed: 0,size
count,2.0
mean,3143457000.0
std,3305049000.0
min,806434500.0
25%,1974946000.0
50%,3143457000.0
75%,4311968000.0
max,5480479000.0
