In [1]:
import sys
import bz2
from bz2 import BZ2File
import io
import csv
import glob
import pandas as pd
from datetime import datetime
import pytz
import numpy as np
import re
from urllib import parse
from furl import furl
from itertools import tee
pd.options.mode.chained_assignment = None

In [2]:
path = "usage_source_test"
filename_prefix = "doi.pangaea.de-access."
file_type=".bz2"

In [3]:
def parse_str(x):
    """
    Returns the string delimited by two characters.
    Example:`>>> parse_str('[my string]')``'my string'`"""
    if x:
        return x[1:-1]
    else:
        return x

In [4]:
import os
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(os.path.realpath('__file__')))) # /home/kristina/desire-directory
source_dir = os.path.abspath(os.path.join(parent_dir, path))
source_dir

'C:\\Users\\asd\\python-workspace\\pangaea-recsys\\usage_analysis\\usage_source_test'

In [5]:
def parse_datetime(x):
    dtime = datetime.strptime(x[1:-7],'%d/%b/%Y:%H:%M:%S')
    dtime = dtime.date()
    return dtime

In [6]:
#converters={'request': parse_str,'referer': parse_str,'user_agent': parse_str
dfs = []
for file in os.listdir(source_dir):
    if file.startswith(filename_prefix) and file.endswith(file_type):
        filepath = os.path.join(source_dir, file)
        #print(filepath)
        data = pd.read_csv(filepath, compression='bz2',encoding = 'ISO-8859-1',
                           sep=r'\s(?=(?:[^"]*"[^"]*")*[^"]*$)(?![^\[]*\])', engine='python', header=0,
                           usecols=[0, 3, 4, 5, 7, 8],names=['ip', 'time', 'request', 'status', 'referer', 'user_agent'],
                          converters={"request": parse_str})
        dfs.append(data)

# Concatenate all data into one DataFrame
dfmain = pd.concat(dfs, ignore_index=True)

In [7]:
dfmain.shape

(272408, 6)

In [8]:
dfmain.head()

Unnamed: 0,ip,time,request,status,referer,user_agent
0,134.1.2.141,[11/Jun/2012:08:34:47 +0000],HEAD /10.1594/PANGAEA.475810 HTTP/1.1,200,"""-""","""Mozilla/5.0 (Windows; U; Windows NT 6.1; de; ..."
1,134.1.2.141,[11/Jun/2012:08:34:47 +0000],HEAD /10.1594/PANGAEA.475888 HTTP/1.1,200,"""-""","""Mozilla/5.0 (Windows; U; Windows NT 6.1; de; ..."
2,134.1.2.141,[11/Jun/2012:08:34:47 +0000],HEAD /10.1594/PANGAEA.475811 HTTP/1.1,200,"""-""","""Mozilla/5.0 (Windows; U; Windows NT 6.1; de; ..."
3,134.1.2.141,[11/Jun/2012:08:34:47 +0000],HEAD /10.1594/PANGAEA.475889 HTTP/1.1,200,"""-""","""Mozilla/5.0 (Windows; U; Windows NT 6.1; de; ..."
4,134.1.2.141,[11/Jun/2012:08:34:47 +0000],HEAD /10.1594/PANGAEA.475812 HTTP/1.1,200,"""-""","""Mozilla/5.0 (Windows; U; Windows NT 6.1; de; ..."


In [9]:
dfmain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 272408 entries, 0 to 272407
Data columns (total 6 columns):
ip            272408 non-null object
time          272408 non-null object
request       272408 non-null object
status        272408 non-null int64
referer       272408 non-null object
user_agent    272408 non-null object
dtypes: int64(1), object(5)
memory usage: 12.5+ MB


In [10]:
#Get resource URI
request = dfmain.request.str.split()
#dfmain['resource'] = request.str[1]

In [11]:
#Filter out non GET and non 200 requests
dfmain = dfmain[(request.str[0] == 'GET') & (dfmain.status == 200)]
dfmain.shape

(237004, 6)

In [12]:
#undesired resources
dfmain = dfmain[~dfmain['request'].str.match(r'^/media|^/static|^/admin|^/robots.txt$|^/favicon.ico$')]
print(dfmain.shape)
#filter crawlers by User-Agent
dfmain = dfmain[~dfmain['user_agent'].str.match(r'.*?bot|.*?spider|.*?crawler|.*?slurp', flags=re.I).fillna(False)]
print(dfmain.shape)

(237004, 6)
(214982, 6)


In [13]:
#remove referer == NaN
#dfmain = dfmain.dropna(subset=['referer'])   
#print(dfmain.shape)
# only include referer from pangaease
#domains = ['pangaea.de', '/search?']
domains = ['doi.pangaea.de', 'www.pangaea.de', '/search?']
domains_joins = '|'.join(map(re.escape, domains))
print(domains_joins)
dfmain = dfmain[(dfmain.referer.str.contains(domains_joins))]
dfmain.head()

doi\.pangaea\.de|www\.pangaea\.de|\/search\?


Unnamed: 0,ip,time,request,status,referer,user_agent
4832,134.1.2.141,[11/Jun/2012:08:38:00 +0000],GET /10.1594/PANGAEA.715006 HTTP/1.1,200,"""http://search.yahoo.co.jp/search?p=Bassinot+o...","""Mozilla/5.0 (Windows NT 5.1; rv:12.0) Gecko/2..."
4857,134.1.2.141,[11/Jun/2012:08:38:33 +0000],GET /10.1594/PANGAEA.676969 HTTP/1.1,200,"""http://www.pangaea.de/search?q=project:BSRN+%...","""Mozilla/5.0 (Windows NT 5.1; rv:12.0) Gecko/2..."
4915,134.1.2.141,[11/Jun/2012:08:39:49 +0000],GET /10.1594/PANGAEA.715006?format=zip&charset...,200,"""http://doi.pangaea.de/10.1594/PANGAEA.715006""","""Mozilla/5.0 (Windows NT 5.1; rv:12.0) Gecko/2..."
5462,134.1.2.141,[11/Jun/2012:08:44:32 +0000],GET /10.1594/PANGAEA.673228 HTTP/1.1,200,"""http://www.pangaea.de/search?q=project:BSRN+%...","""Mozilla/5.0 (Windows NT 5.1; rv:10.0.4) Gecko..."
5526,134.1.2.141,[11/Jun/2012:08:53:08 +0000],GET /10.1594/PANGAEA.132301 HTTP/1.1,200,"""http://www.pangaea.de/search?count=10&q=prima...","""Mozilla/5.0 (Windows NT 6.0; rv:13.0) Gecko/2..."


In [14]:
'-' in dfmain.referer

False

In [15]:
dfmain.referer.isnull().values.any(), dfmain.request.isnull().values.any()

(False, False)

In [16]:
#dfmain = dfmain[(dfmain.referer.str.contains('google') != True)]  
dfmain.shape

(5239, 6)

In [17]:
#test only
m = pd.DataFrame({'Page URL':['GET /doi%3A10.1594/PANGAEA.134142?format=events_kml', '/10.1594/PANGAEA.55907?format=events_kml&',
                                   '/10.1594/PANGAEA.809526','/','/10.1594/PANGAEA.864108 HTTP/1.1']})
m['SubDomain'] = m['Page URL'].str.extract(r'PANGAEA.\s*([^\n? ]+)',expand=False)
m.head()

Unnamed: 0,Page URL,SubDomain
0,GET /doi%3A10.1594/PANGAEA.134142?format=event...,134142.0
1,/10.1594/PANGAEA.55907?format=events_kml&,55907.0
2,/10.1594/PANGAEA.809526,809526.0
3,/,
4,/10.1594/PANGAEA.864108 HTTP/1.1,864108.0


In [18]:
#check non-200 status
dfmain.loc[dfmain['status'] != 200]['status'].unique()

array([], dtype=int64)

In [19]:
#get resource uri
dfmain['_id'] = dfmain['request'].str.extract(r'PANGAEA.\s*([^\n? ]+)',expand=False)
print(dfmain.shape)
#remove rows if dataset is NaN
dfmain = dfmain.dropna(subset=['_id'], how='all')
print(dfmain.shape)

(5239, 7)
(5150, 7)


In [20]:
#dfmain['_id'] = pd.to_numeric(dfmain['_id'], errors='coerce')
dfmain.head()

Unnamed: 0,ip,time,request,status,referer,user_agent,_id
4832,134.1.2.141,[11/Jun/2012:08:38:00 +0000],GET /10.1594/PANGAEA.715006 HTTP/1.1,200,"""http://search.yahoo.co.jp/search?p=Bassinot+o...","""Mozilla/5.0 (Windows NT 5.1; rv:12.0) Gecko/2...",715006
4857,134.1.2.141,[11/Jun/2012:08:38:33 +0000],GET /10.1594/PANGAEA.676969 HTTP/1.1,200,"""http://www.pangaea.de/search?q=project:BSRN+%...","""Mozilla/5.0 (Windows NT 5.1; rv:12.0) Gecko/2...",676969
4915,134.1.2.141,[11/Jun/2012:08:39:49 +0000],GET /10.1594/PANGAEA.715006?format=zip&charset...,200,"""http://doi.pangaea.de/10.1594/PANGAEA.715006""","""Mozilla/5.0 (Windows NT 5.1; rv:12.0) Gecko/2...",715006
5462,134.1.2.141,[11/Jun/2012:08:44:32 +0000],GET /10.1594/PANGAEA.673228 HTTP/1.1,200,"""http://www.pangaea.de/search?q=project:BSRN+%...","""Mozilla/5.0 (Windows NT 5.1; rv:10.0.4) Gecko...",673228
5526,134.1.2.141,[11/Jun/2012:08:53:08 +0000],GET /10.1594/PANGAEA.132301 HTTP/1.1,200,"""http://www.pangaea.de/search?count=10&q=prima...","""Mozilla/5.0 (Windows NT 6.0; rv:13.0) Gecko/2...",132301


In [21]:
def get_query(url):
    qparams = dict(parse.parse_qsl(parse.urlsplit(url).query))
    query_string = ""
    if len(qparams) > 0:
        for key in qparams:
            if re.match(r'f[.]|q|t|p', key):
                query_string += qparams[key] + " "
    return query_string

In [22]:
testUrl = dfmain.iloc[10]['referer']
testUrl, get_query(testUrl)

('"http://www.pangaea.de/search?count=10&minlat=&minlon=&maxlat=&maxlon=&mindate=&maxdate=&env=All&q=ANT-XXVII%2F4+-radiosonde+"',
 'ANT-XXVII/4 -radiosonde " ')

In [23]:
#first degree queries
dfmain['query_1']=dfmain['referer'].map(get_query)

In [24]:
"" in dfmain.query_1.unique(), " " in dfmain.query_1.unique()

(True, False)

In [25]:
dfmain.time.min(),dfmain.time.max()

('[11/Jun/2012:08:38:00 +0000]', '[16/Jun/2012:23:53:39 +0000]')

In [26]:
dfsummary = dfmain.copy()
dfsummary.shape

(5150, 8)

In [27]:
#remove rows with "" queries
dfsummary= dfsummary[dfsummary.query_1 != ""]
dfmain.shape, dfsummary.shape

((5150, 8), (2639, 8))

In [28]:
#Bassinot oxygen isotope
dfsummary = dfsummary[['_id','query_1']]
dfsummary.head()

Unnamed: 0,_id,query_1
4832,715006,Bassinot oxygen isotope
4857,676969,project:BSRN +event:BAR +Ozone total
5462,673228,"project:BSRN +event:TOR +""Basic and other"""
5526,132301,primary productivity
5530,679305,primary productivity


In [29]:
df_qid= dfsummary.groupby(['query_1'])['_id'].apply(list).reset_index(name='datasets')
df_qid['Length'] = df_qid['datasets'].str.len()
df_qid.head()

Unnamed: 0,query_1,datasets,Length
0,"G�mez Izquierdo""","[773374, 730506, 773378, 773387, 773364]",5
1,Namsaraev Kara Sea methane,[746796],1
2,Population dynamics of the surf clams Donax h...,[690503],1
3,"Schlo&#946;teich kaliningrad ""","[772314, 775178, 772250]",3
4,"cortese, giuseppe; abelmann, andrea (2002): r...",[706557],1


In [33]:
df_qid.shape

(1177, 3)

In [30]:
#dftest = dfsummary.groupby(['query_1']).size().reset_index(name='count')

In [31]:
source_dir

'C:\\Users\\asd\\python-workspace\\pangaea-recsys\\usage_analysis\\usage_source_test'

In [32]:
df_qid.to_csv(os.path.abspath(os.path.join(parent_dir, 'results'))+'/query_data_frequency.csv', sep='\t', encoding='utf-8')