In [1]:
import pandas as pd
import numpy as np

This exercise uses data from: 
 Zaker, Farzin, 2019, "Online Shopping Store - Web Server Logs", https://doi.org/10.7910/DVN/3QBYB5, Harvard Dataverse, V1 
 
The files `server_log_00.txt`, `server_log_01.txt`, `server_log_02.txt`, `server_log_03.txt`, `server_log_04.txt`, `server_log_05.txt`, `server_log_06.txt`, `server_log_07.txt`, `server_log_08.txt` and `server_log_09.txt` contain logs of an online shopping store web server.

Each file contains 100 lines, for a total of 1000 transaction logs.

# Part 1

Let us focus on one file. The code below reads the first file.

In [2]:
# Every line of the log file is one record. Recent pandas releases reject
# `sep='\n'` in read_csv (ValueError: the line terminator cannot be used as a
# separator), so read the lines directly and wrap them in a one-column frame
# whose column label is 0, exactly as read_csv(header=None) would name it.
with open('data/server_log_00.txt', encoding='utf-8') as log_file:
    # Blank lines are dropped, mirroring read_csv's default skip_blank_lines=True.
    log_lines = [line.rstrip('\r\n') for line in log_file if line.strip()]
df_00 = pd.DataFrame({0: log_lines})
df_00.head()

Unnamed: 0,0
0,54.36.149.41 - - [22/Jan/2019:03:56:14 +0330] ...
1,"31.56.96.51 - - [22/Jan/2019:03:56:16 +0330] ""..."
2,"31.56.96.51 - - [22/Jan/2019:03:56:16 +0330] ""..."
3,40.77.167.129 - - [22/Jan/2019:03:56:17 +0330]...
4,"91.99.72.15 - - [22/Jan/2019:03:56:17 +0330] ""..."


Each row is one big string containing the server log. The [log contains multiple fields](https://docs.nginx.com/nginx/admin-guide/monitoring/logging/).
In this exercise we are interested in the IP address (the first field) and the timestamp. For example, the first row

In [3]:
# Display the complete raw log string held in the first row, first column.
df_00.iloc[0, 0]

'54.36.149.41 - - [22/Jan/2019:03:56:14 +0330] "GET /filter/27|13%20%D9%85%DA%AF%D8%A7%D9%BE%DB%8C%DA%A9%D8%B3%D9%84,27|%DA%A9%D9%85%D8%AA%D8%B1%20%D8%A7%D8%B2%205%20%D9%85%DA%AF%D8%A7%D9%BE%DB%8C%DA%A9%D8%B3%D9%84,p53 HTTP/1.1" 200 30577 "-" "Mozilla/5.0 (compatible; AhrefsBot/6.1; +http://ahrefs.com/robot/)" "-"'

is a request coming from `54.36.149.41` on `22/Jan/2019:03:56:14 +0330`. 

1. Extract the IP and timestamp into two separate columns. Discard the original column `0`.

In [4]:
import re

# IPv4 address anchored at the very start of the log line (the first field).
exp1 = re.compile(r'^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')
# Timestamp enclosed in square brackets, located between "- " and the opening
# quote of the request. `[^\]]+` stops at the first closing bracket; a greedy
# `.+` would instead run on to the LAST `] "` in the line and swallow part of
# the request / user-agent whenever those fields contain brackets.
exp2 = re.compile(r'- \[([^\]]+)\] "')

In [5]:
# Try the IP pattern on the first raw log line.
exp1.findall(df_00.iloc[0, 0])

['54.36.149.41']

In [6]:
# Try the timestamp pattern on the first raw log line.
exp2.findall(df_00.iloc[0, 0])

['22/Jan/2019:03:56:14 +0330']

In [7]:
# Derive both fields from the raw log line in column 0 (first regex match of
# each pattern), then keep only the two derived columns.
df_00 = df_00.assign(
    ip=lambda frame: frame[0].apply(lambda line: exp1.findall(line)[0]),
    timestamp=lambda frame: frame[0].apply(lambda line: exp2.findall(line)[0]),
)[['ip', 'timestamp']]

The data frame should look like

In [8]:
# Preview the first rows of df_00.
df_00.head()

Unnamed: 0,ip,timestamp
0,54.36.149.41,22/Jan/2019:03:56:14 +0330
1,31.56.96.51,22/Jan/2019:03:56:16 +0330
2,31.56.96.51,22/Jan/2019:03:56:16 +0330
3,40.77.167.129,22/Jan/2019:03:56:17 +0330
4,91.99.72.15,22/Jan/2019:03:56:17 +0330


# Part 2
2. Repeat the same operations for all of the files. In total you should have ten data frames.
3. Combine the ten data frames into one master data frame.

In [9]:
def process_file(file):
    """Parse one web-server log file into a frame of IPs and timestamps.

    The file is read line by line instead of through
    ``pd.read_csv(..., sep='\\n')``, which recent pandas releases reject with
    a ValueError because the line terminator cannot be used as a separator.

    Parameters
    ----------
    file : str or path-like
        Path to a text file holding one log entry per line.

    Returns
    -------
    pandas.DataFrame
        One row per non-blank line with two columns: ``ip`` (first match of
        the module-level pattern ``exp1``) and ``timestamp`` (first match of
        the module-level pattern ``exp2``).

    Raises
    ------
    IndexError
        If a line contains no match for ``exp1`` or ``exp2``.
    """
    with open(file, encoding='utf-8') as log_file:
        # Blank lines are dropped, mirroring read_csv's default
        # skip_blank_lines=True.
        log_lines = [line.rstrip('\r\n') for line in log_file if line.strip()]
    # Column label 0 matches what read_csv(header=None) would have produced.
    df = pd.DataFrame({0: log_lines})
    df['ip'] = df[0].apply(lambda x: exp1.findall(x)[0])
    df['timestamp'] = df[0].apply(lambda x: exp2.findall(x)[0])
    # Discard the raw line; only the extracted fields are returned.
    return df[['ip', 'timestamp']]

In [10]:
# Paths of the ten log files: data/server_log_00.txt ... data/server_log_09.txt.
files = list(map(lambda i: f'data/server_log_0{i}.txt', range(10)))

In [11]:
# Parse every log file, in order, into its own ip/timestamp data frame.
data_frames = list(map(process_file, files))

In [12]:
# Stack the per-file frames into one. ignore_index=True gives the result a
# fresh 0..N-1 index; without it each frame keeps its own 0-based labels, so
# the same label repeats once per file and label lookups such as
# master.loc[0] return several rows instead of one.
master = pd.concat(data_frames, ignore_index=True)
master.shape

(1000, 2)

In [13]:
# Preview the first rows of the combined frame.
master.head()

Unnamed: 0,ip,timestamp
0,54.36.149.41,22/Jan/2019:03:56:14 +0330
1,31.56.96.51,22/Jan/2019:03:56:16 +0330
2,31.56.96.51,22/Jan/2019:03:56:16 +0330
3,40.77.167.129,22/Jan/2019:03:56:17 +0330
4,91.99.72.15,22/Jan/2019:03:56:17 +0330
