In [1]:
import sys
import os
import bz2
from bz2 import BZ2File
import io
import csv
import glob
import pandas as pd
from datetime import datetime
import pytz
import numpy as np
import re
from urllib import parse
from furl import furl
from itertools import tee
pd.options.mode.chained_assignment = None

In [2]:
# Where the compressed access logs live, and the naming pattern that identifies them.
path = "pangaea_usage"                      # directory containing the log archives
filename_prefix = "doi.pangaea.de-access."  # every log file name starts with this
file_type = ".bz2"                          # ...and ends with this extension

In [3]:
def parse_str(x):
    """
    Strip the first and the last character from a delimited string.

    Falsy input (e.g. ``''`` or ``None``) is returned unchanged.

    Example:
        >>> parse_str('[my string]')
        'my string'
    """
    return x[1:-1] if x else x

In [None]:
#def clean_logs(df):
    

In [None]:
# Load every matching access log in `path` into one DataFrame.
# Optional: pass converters={'request': parse_str, 'referer': parse_str, 'user_agent': parse_str}
# to read_csv to strip the delimiters around those fields while loading.

# Field separator: a whitespace character that is neither inside a "quoted"
# field nor inside a [bracketed] field.
log_separator = r'\s(?=(?:[^"]*"[^"]*")*[^"]*$)(?![^\[]*\])'
log_columns = ['ip', 'time', 'request', 'status', 'referer', 'user_agent']

dfs = []
# os.listdir() returns names in arbitrary order; sort so the row order of the
# concatenated frame is reproducible (later cells look at consecutive rows).
for file in sorted(os.listdir(path)):
    if file.startswith(filename_prefix) and file.endswith(file_type):
        filepath = os.path.join(path, file)
        print(filepath)
        # header=None: the access logs have no header line. With header=0 the
        # first request of every file was treated as a header and discarded.
        data = pd.read_csv(filepath, compression='bz2', encoding='ISO-8859-1',
                           sep=log_separator, engine='python', header=None,
                           usecols=[0, 3, 4, 5, 7, 8], names=log_columns)
        dfs.append(data)

if not dfs:
    # pd.concat([]) would raise a ValueError as well, just with a far less helpful message.
    raise ValueError("no files matching '%s*%s' found in '%s'" % (filename_prefix, file_type, path))

# Concatenate all data into one DataFrame
dfmain = pd.concat(dfs, ignore_index=True)

pangaea_usage\doi.pangaea.de-access.201707020000.bz2
pangaea_usage\doi.pangaea.de-access.201707090000.bz2
pangaea_usage\doi.pangaea.de-access.201707160000.bz2
pangaea_usage\doi.pangaea.de-access.201707230000.bz2


In [None]:
# (rows, columns) of the concatenated access-log frame.
dfmain.shape

In [None]:
# Split the request line into its whitespace-separated tokens (method, URI, protocol).
# The request field may still be wrapped in the log's double quotes (a regex
# separator does not strip quoting), which would turn the first token into
# '"GET'. Strip them first; this is a no-op when the quotes are already gone.
request = dfmain.request.str.strip('"').str.split()
# Resource URI = second token of the request line
dfmain['resource'] = request.str[1]
# Filter out non GET and non 200 requests
dfmain = dfmain[(request.str[0] == 'GET') & (dfmain.status == 200)]
dfmain.shape

In [None]:
# Undesired resources (static assets, admin pages, robots.txt, favicon).
# The pattern is anchored at the start of the path, so it has to be matched
# against the 'resource' column: 'request' begins with the HTTP method and
# could therefore never match. Literal dots are escaped, and na=False keeps
# rows whose resource is missing from breaking the boolean mask.
undesired_resources = r'^/media|^/static|^/admin|^/robots\.txt$|^/favicon\.ico$'
dfmain = dfmain[~dfmain['resource'].str.match(undesired_resources, na=False)]
# Filter crawlers by User-Agent (case-insensitive substring search; a missing
# User-Agent is not treated as a crawler).
crawler_markers = r'bot|spider|crawler|slurp'
dfmain = dfmain[~dfmain['user_agent'].str.contains(crawler_markers, case=False, na=False)]
dfmain.shape

In [None]:
# Discard rows that have no referer at all.
dfmain = dfmain[dfmain['referer'].notna()]
# Only keep rows whose referer mentions a PANGAEA host or a search URL.
domains = ['doi.pangaea.de', 'www.pangaea.de', '/search?']
domains_joins = '|'.join(re.escape(domain) for domain in domains)
print(domains_joins)
referred_internally = dfmain.referer.str.contains(domains_joins)
dfmain = dfmain[referred_internally]
dfmain.head()

In [None]:
#dfmain = dfmain[(dfmain.referer.str.contains('google') != True)]  
#dfmain.shape

In [None]:
# Scratch check: try the dataset-id regex on a few hand-written request strings.
sample_requests = [
    'GET /doi%3A10.1594/PANGAEA.134142?format=events_kml',
    '/10.1594/PANGAEA.55907?format=events_kml&',
    '/10.1594/PANGAEA.809526',
    '/',
    '/10.1594/PANGAEA.864108 HTTP/1.1',
]
m = pd.DataFrame({'Page URL': sample_requests})
# Capture everything after "PANGAEA." up to the next '?', space or newline.
m['SubDomain'] = m['Page URL'].str.extract(r'PANGAEA.\s*([^\n? ]+)', expand=False)
m.head()

In [None]:
# Store the HTTP status code as an integer column.
dfmain['status'] = dfmain.status.astype(int)

In [None]:
# Sanity check: list the distinct status codes other than 200 that are still present.
not_ok = dfmain['status'] != 200
dfmain.loc[not_ok, 'status'].unique()

In [None]:
# Dataset id: whatever follows "PANGAEA." in the request, up to the next '?', space or newline.
dataset_id = dfmain['request'].str.extract(r'PANGAEA.\s*([^\n? ]+)', expand=False)
dfmain['_id'] = dataset_id
# Rows whose request does not reference a dataset are dropped.
dfmain = dfmain[dfmain['_id'].notna()]
dfmain.shape

In [None]:
# Preview the frame; the disabled line below would coerce '_id' to a numeric dtype.
#dfmain['_id'] = pd.to_numeric(dfmain['_id'], errors='coerce')
dfmain.head()

In [None]:
#dfmain = dfmain[~dfmain['ip'].str.startswith('X.X.X.')] 

In [None]:
def get_query(url):
    """
    Collect the search terms carried in the query string of a URL.

    The values of all query parameters whose name starts with ``f.``, ``q``
    or ``t`` are concatenated in order of appearance, each followed by a
    single space (the trailing space acts as separator when the results of
    several rows are concatenated later on).

    Parameters
    ----------
    url : str
        URL to inspect, e.g. a referer. Surrounding double quotes, as found
        in raw access-log fields, are ignored.

    Returns
    -------
    str
        The collected values, or ``""`` when the URL has no matching
        parameter, is not a string (e.g. NaN) or cannot be parsed.
    """
    if not isinstance(url, str):
        return ""
    try:
        query = parse.urlsplit(url.strip('"')).query
    except ValueError:
        # urlsplit rejects some malformed URLs; one bad referer must not abort the whole map().
        return ""
    # Iterate over the (key, value) pairs directly instead of going through a
    # dict, so repeated parameters (e.g. several filters of one kind) are all kept.
    # NOTE(review): re.match only anchors at the start, so any key *beginning*
    # with 'q' or 't' is accepted - confirm that this is intended.
    return "".join(value + " "
                   for key, value in parse.parse_qsl(query)
                   if re.match(r'f[.]|q|t', key))

In [None]:
#first degree queries
# First-degree query: the search terms found in the referer of each request.
dfmain['query_1'] = dfmain['referer'].apply(get_query)

In [None]:
"" in dfmain.query_1.unique(), " " in dfmain.query_1.unique()

In [None]:
# Turn the raw log timestamp into a datetime column: remove the surrounding
# brackets, cut off the last six characters, then parse what is left.
bare_timestamp = dfmain['time'].str.strip('[]').str[:-6]
dfmain['time'] = bare_timestamp
dfmain['time'] = pd.to_datetime(dfmain['time'], format='%d/%b/%Y:%H:%M:%S')

In [None]:
# Column dtypes and non-null counts after the timestamp conversion.
dfmain.info()

In [None]:
# Calendar day of every request (used together with the ip as grouping key further down).
request_day = dfmain.time.dt.date
dfmain['time_normalize'] = request_day
# Alternative granularity, currently disabled: round to 12-hour buckets
#dfmain['time_normalize']=dfmain['time'].dt.round('720min')
dfmain.head()

In [None]:
# Spot-check: referers of all requests for dataset id '875146'.
# (.loc replaces the .ix indexer, which was removed in pandas 1.0.)
dfmain.loc[dfmain['_id'] == '875146', 'referer']

In [None]:
# Add an (initially empty) second-degree query column and keep only the
# columns needed from here on.
kept_columns = ['ip', 'time', '_id', 'query_1', 'query_2', 'time_normalize']
dfmain = dfmain.assign(query_2="")[kept_columns]

In [None]:
# (rows, columns) after reducing the frame to the six analysis columns.
dfmain.shape

In [None]:
# Preview the first 15 rows of the reduced frame.
dfmain.head(15)

In [None]:
# Keep only the (ip, day) groups that contain at least one row with a non-empty query_1.
first = dfmain.groupby(by=['ip', 'time_normalize'])
first_filtered = first.filter(lambda group: bool((group['query_1'] != "").any()))
first_filtered.head(20)

In [None]:
# Of those, keep only the (ip, day) groups that also contain at least one row with an empty query_1.
second = first_filtered.groupby(by=['ip', 'time_normalize'])
filtered = second.filter(lambda group: bool((group['query_1'] == "").any()))
filtered.head(20)

In [None]:
#second_filters = filtered_notnull.groupby(['ip','time_normalize'])['query_1'].filter(lambda x: len(x == "") > 0)
#filtered = filtered_notnull[filtered_notnull['query_1'].isin(second_filters)]
#filtered.head(20)

In [None]:
#groups_final = filtered_notnull.groupby(by=['ip','time_normalize'])
#filtered = groups_final.filter(lambda x: len(x[x['query_1'] == None]) >0)
#filtered.head(20)  

In [None]:
def pairwise(iterable):
    """Return an iterator over overlapping neighbours: s -> (s0, s1), (s1, s2), (s2, s3), ..."""
    current, following = tee(iterable, 2)
    # Advance the second copy by one item so that zip pairs every element with its successor.
    next(following, None)
    return zip(current, following)

In [None]:
# Second-degree query: a row without a query of its own that directly follows
# a row with a query inherits that query as its query_2.
# Implemented with shift() instead of an iterrows() loop: same result, no
# per-row Python overhead, and no dependency on DataFrame.set_value, which
# was removed in pandas 1.0.
previous_query = filtered['query_1'].shift()
inherits_query = (
    (filtered['query_1'] == "")
    & previous_query.notna()        # the very first row has no predecessor
    & (previous_query != "")
)
filtered.loc[inherits_query, 'query_2'] = previous_query[inherits_query]
# NOTE(review): "previous row" is positional over the whole frame, so a row can
# inherit the query of a different ip/day - confirm that this is intended.

In [None]:
# Preview the first 20 rows after query_2 has been filled in.
filtered.head(20)

In [None]:
# (rows, columns) before rows without any query are removed.
filtered.shape

In [None]:
# Keep a row only if it carries a first-degree or a second-degree query.
has_any_query = (filtered.query_1 != "") | (filtered.query_2 != "")
filtered = filtered[has_any_query]

In [None]:
# (rows, columns) after rows without any query have been removed.
filtered.shape

In [None]:
# Preview the first 10 remaining rows.
filtered.head(10)

In [None]:
# Number of distinct dataset ids among the remaining rows.
filtered._id.nunique()

In [None]:
# Concatenate all query strings per dataset id (summing strings joins them;
# every query already ends with a space, which acts as separator).
# The columns are selected with a list: tuple-style selection on a groupby
# (['query_1','query_2'] without the inner brackets) was removed in pandas 2.0.
dfgroup = filtered.groupby('_id')[['query_1', 'query_2']].apply(lambda x: x.sum())
dfgroup.head()

In [None]:
# Spot-check: all remaining rows for dataset id '100217'.
filtered[filtered._id=='100217']

In [None]:
#strip white spaces
dfgroup['query_1'] = dfgroup['query_1'].str.strip()
dfgroup['query_2'] = dfgroup['query_2'].str.strip()
dfgroup.head()

In [None]:
# Replace empty query strings with None so they are exported as null.
# (.loc replaces the .ix indexer, which was removed in pandas 1.0.)
dfgroup.loc[dfgroup.query_2 == "", 'query_2'] = None
dfgroup.loc[dfgroup.query_1 == "", 'query_1'] = None

In [None]:
# Export one JSON record per dataset id, with '_id' turned back into a regular column.
query_records = dfgroup.reset_index()
query_records.to_json(path+'/query_data_rel.json', orient='records')