In [7]:
# Mount Google Drive at /content/drive (Colab only); the log file read later
# in this notebook lives under /content/drive/MyDrive.
# NOTE(review): this cell's execution count is higher than the cells below it,
# so it was run out of order -- it must run before the data-loading cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import pandas as pd
from datetime import datetime
import re
import pytz

In [2]:
from operator import itemgetter
from urllib.parse import urlparse

#### Helper functions to parse the server logs

In [3]:
# Parse string and return the string delimited by 2 characters
def parseString(s):
    """Strip the one-character delimiter from each end of a log field.

    For example ``'"GET / HTTP/1.1"'`` becomes ``'GET / HTTP/1.1'``.

    Parameters:
        s: the raw field text, or ``None`` when the field is missing.

    Returns:
        The field without its first and last character, or ``None`` when
        ``s`` is ``None``.
    """
    # The guard used to be inverted: it sliced only when `s` was None (which
    # raises TypeError) and silently returned None for every real string.
    if s is None:
        return None
    return s[1:-1]

# Parse an integer field, returning 0 when the value is missing or not a valid integer
def parseInt(s):
    """Convert a log field to ``int``, falling back to 0.

    Returns 0 when ``s`` is ``None`` or when ``int(s)`` raises ``ValueError``
    (for example the ``'-'`` placeholder); otherwise returns ``int(s)``.
    """
    try:
        return 0 if s is None else int(s)
    except ValueError:
        return 0
    
def parseDatetime(s):
    """Convert a bracketed access-log timestamp into a timezone-aware datetime.

    Expects text shaped like ``[19/Dec/2020:13:57:26 +0100]``: one delimiter
    character on each side, the local time, a space, and a signed ``HHMM``
    UTC offset.

    Parameters:
        s: the raw timestamp field.

    Returns:
        A ``datetime`` carrying a fixed-offset ``tzinfo``, or the placeholder
        string ``'-'`` when ``s`` is missing or cannot be parsed.
    """
    # Imported locally: the notebook's import cell only brings in the
    # `datetime` class, and the standard library can build the fixed offset.
    from datetime import timedelta, timezone

    try:
        date = datetime.strptime(s[1:-7], '%d/%b/%Y:%H:%M:%S')
        hours = int(s[-6:-3])    # signed, e.g. '+01' or '-05'
        minutes = int(s[-3:-1])  # always parsed as a non-negative number
        # The sign belongs to the whole offset, so it must be applied to the
        # minutes as well; otherwise '-0530' would become -270 instead of -330.
        if s[-6] == '-':
            minutes = -minutes
        offset = timedelta(minutes=hours * 60 + minutes)
        return date.replace(tzinfo=timezone(offset))
    except (ValueError, TypeError):
        # ValueError: malformed text or out-of-range offset.
        # TypeError: `s` is None or otherwise not sliceable text.
        return '-'

In [4]:
# Per-field conversion functions handed to `pd.read_csv(converters=...)`.
# NOTE(review): these keys ('time', 'request', ...) differ from the column
# names given to read_csv ('Timestamp', 'Requested_Page', ...), so pandas
# presumably never applies them -- the preview of the loaded frame still shows
# the raw brackets and quotes. Confirm before relying on converted values.
data_converters = dict(
    time=parseDatetime,
    request=parseString,
    status=parseInt,
    size=parseInt,
    referer=parseString,
    user_agent=parseString,
)

In [8]:
# Load the access log into a DataFrame.
#
# * `sep` splits on whitespace that is neither inside a double-quoted field
#   (the first lookahead requires the quotes remaining on the line to pair up)
#   nor inside a [...] timestamp (the negative lookahead rejects a position
#   that is followed by ']' with no '[' in between).  A regex separator needs
#   the python engine.
# * Fields 1 and 2 of each line are not loaded (see `usecols`).
# * `on_bad_lines='warn'` replaces `error_bad_lines=False`, which was
#   deprecated in pandas 1.3 and removed in 2.0; malformed lines are still
#   skipped with a warning, as before.
server_logs = pd.read_csv(
    '/content/drive/MyDrive/ML_Datasets/Web Server Logs/web-server-logs.txt',
    sep=r'\s(?=(?:[^"]*"[^"]*")*[^"]*$)(?![^\[]*\])',
    engine='python',
    header=None,
    usecols=[0, 3, 4, 5, 6, 7, 8],
    names=['Users_IP', 'Timestamp', 'Requested_Page', 'HTTP_Code',
           'Page_Size', 'Referrer_Page', 'User_Agent'],
    na_values='-',
    converters=data_converters,
    on_bad_lines='warn',
)

In [9]:
# Preview the first rows of the loaded log table.
server_logs.head()

Unnamed: 0,Users_IP,Timestamp,Requested_Page,HTTP_Code,Page_Size,Referrer_Page,User_Agent
0,13.66.139.0,[19/Dec/2020:13:57:26 +0100],"""GET /index.php?option=com_phocagallery&view=c...",200.0,32653.0,"""-""","""Mozilla/5.0 (compatible; bingbot/2.0; +http:/..."
1,157.48.153.185,[19/Dec/2020:14:08:06 +0100],"""GET /apache-log/access.log HTTP/1.1""",200.0,233.0,"""-""","""Mozilla/5.0 (Windows NT 6.3; Win64; x64) Appl..."
2,157.48.153.185,[19/Dec/2020:14:08:08 +0100],"""GET /favicon.ico HTTP/1.1""",404.0,217.0,"""http://www.almhuette-raith.at/apache-log/acce...","""Mozilla/5.0 (Windows NT 6.3; Win64; x64) Appl..."
3,216.244.66.230,[19/Dec/2020:14:14:26 +0100],"""GET /robots.txt HTTP/1.1""",200.0,304.0,"""-""","""Mozilla/5.0 (compatible; DotBot/1.1; http://w..."
4,54.36.148.92,[19/Dec/2020:14:16:44 +0100],"""GET /index.php?option=com_phocagallery&view=c...",200.0,30662.0,"""-""","""Mozilla/5.0 (compatible; AhrefsBot/7.0; +http..."


### Q1

#### Find 10 people who visited the site frequently, show the information that identifies them, and state why you identify these people as frequent visitors

In [10]:
# Count the distinct client IPs (printed) and the distinct User-Agent strings
# (shown as the cell result).
user_IP_addresses = {ip for ip in server_logs['Users_IP']}
print(len(user_IP_addresses))
user_devices = {agent for agent in server_logs['User_Agent']}
len(user_devices)

6298


2520

In [11]:
# Count requests per (IP, User-Agent) pair; a pair stands in for one visitor
# on one device.  Keys keep the order in which each pair first appears in the
# log, because later cells slice `list(user_device)`.
#
# Walking the two columns with zip() avoids building a Series for every row
# (DataFrame.iterrows) and constructing the same key tuple three times.
user_device = {}
for ip_and_agent in zip(server_logs['Users_IP'], server_logs['User_Agent']):
    user_device[ip_and_agent] = user_device.get(ip_and_agent, 0) + 1

In [12]:
# Keep the ten (IP, User-Agent) pairs with the highest request counts.
frequent_users = dict(
    sorted(
        user_device.items(),
        key=itemgetter(1),  # sort on the request count
        reverse=True,
    )[:10]
)

In [13]:
# Show the ten most active (IP, User-Agent) pairs with their request counts.
frequent_users

{('173.255.176.5',
  '"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:82.0) Gecko/20100101 Firefox/82.0"'): 2663,
 ('173.255.176.5',
  '"Mozilla/5.0 (compatible; 008/0.83; http://www.80legs.com/webcrawler.html) Gecko/2008032620"'): 1678,
 ('178.44.47.170', '"python-requests/2.25.1"'): 2774,
 ('193.106.31.130',
  '"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)"'): 136543,
 ('193.9.114.182',
  '"Mozilla/5.0 (Windows; U; Windows NT 5.1; ru; rv:1.9.2.3) Gecko/20100401 Firefox/3.6.3"'): 1986,
 ('197.52.128.37',
  '"Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko"'): 40771,
 ('45.15.143.155',
  '"Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:28.0) Gecko/20100101 Firefox/28.0"'): 1284,
 ('45.15.143.155', '"python-requests/2.25.0"'): 642,
 ('51.210.183.78', '"curl/7.68.0"'): 2684,
 ('62.138.3.52',
  '"Mozilla/5.0 (X11; Linux x86_64; rv:81.0) Gecko/20100101 Firefox/81.0"'): 634}

### Q2

#### Show at least five sessions and the page views for each session.

In [14]:
# For the first 200 (IP, User-Agent) pairs in `user_device`, gather every
# request they made.  The three lists stay aligned index-by-index and are
# ordered user by user, then by position in the log.
#
# One pass over the log buckets the rows per user; rescanning the whole frame
# with iterrows() once per user made this cell O(200 * rows).
selected_users = list(user_device)[:200]
requests_by_user = {user: [] for user in selected_users}

for ip, agent, seen_at, page in zip(server_logs['Users_IP'],
                                    server_logs['User_Agent'],
                                    server_logs['Timestamp'],
                                    server_logs['Requested_Page']):
    bucket = requests_by_user.get((ip, agent))
    if bucket is not None:  # None means this user is not among the selected 200
        bucket.append((seen_at, page))

timestamp = []
pages_requested = []
users_details = []

for user in selected_users:
    for seen_at, page in requests_by_user[user]:
        timestamp.append(seen_at)
        pages_requested.append(page)
        users_details.append(user)

In [15]:
# Peek at the first three collected timestamps.
timestamp[:3]

['[19/Dec/2020:13:57:26 +0100]',
 '[22/Jan/2021:01:57:53 +0100]',
 '[07/Feb/2021:18:59:27 +0100]']

In [16]:
# Peek at the first three collected request lines.
pages_requested[:3]

['"GET /index.php?option=com_phocagallery&view=category&id=1:almhuette-raith&Itemid=53 HTTP/1.1"',
 '"GET /index.php?option=com_phocagallery&view=category&id=1:almhuette-raith&Itemid=53 HTTP/1.1"',
 '"GET /index.php?option=com_content&view=article&id=46&Itemid=54 HTTP/1.1"']

In [17]:
# Peek at the (IP, User-Agent) pair recorded for the first three requests.
users_details[:3]

[('13.66.139.0',
  '"Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)"'),
 ('13.66.139.0',
  '"Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)"'),
 ('13.66.139.0',
  '"Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)"')]

In [18]:
# Assemble the three aligned lists into one table: one row per request.
sessions = pd.DataFrame(
    {
        'Session_Time': timestamp,
        'Pages_Requested': pages_requested,
        'User_Details': users_details,
    }
)

In [19]:
# Preview the first rows of the per-request table.
sessions.head()

Unnamed: 0,Session_Time,Pages_Requested,User_Details
0,[19/Dec/2020:13:57:26 +0100],"""GET /index.php?option=com_phocagallery&view=c...","(13.66.139.0, ""Mozilla/5.0 (compatible; bingbo..."
1,[22/Jan/2021:01:57:53 +0100],"""GET /index.php?option=com_phocagallery&view=c...","(13.66.139.0, ""Mozilla/5.0 (compatible; bingbo..."
2,[07/Feb/2021:18:59:27 +0100],"""GET /index.php?option=com_content&view=articl...","(13.66.139.0, ""Mozilla/5.0 (compatible; bingbo..."
3,[11/Mar/2021:07:37:22 +0100],"""GET /index.php?option=com_content&view=articl...","(13.66.139.0, ""Mozilla/5.0 (compatible; bingbo..."
4,[19/Dec/2020:14:08:06 +0100],"""GET /apache-log/access.log HTTP/1.1""","(157.48.153.185, ""Mozilla/5.0 (Windows NT 6.3;..."


### Q3

#### Show five frequent pages which the users visit before visiting this particular web site

In [22]:
# Reduce every non-missing referrer to its origin, "<scheme>://<host>/".
# A referrer with neither scheme nor host collapses to ':///'.
# NOTE(review): `how` is documented as unused for Series.dropna -- confirm it
# can be dropped.
referrer_pages = server_logs['Referrer_Page'].dropna(how='all')

parsed_referrer_pages = [
    '{uri.scheme}://{uri.netloc}/'.format(uri=urlparse(raw_referrer.replace('"', '')))
    for raw_referrer in referrer_pages
]

In [23]:
# Tally how many times each referrer origin occurs.
word_counter = {}

for url in parsed_referrer_pages:
    word_counter[url] = word_counter.get(url, 0) + 1

In [25]:
# Rank referrer origins from most to least frequent.
frequent_pages = sorted(word_counter, key=word_counter.get, reverse=True)

# ':///' is what an empty referrer collapses to, so it is not a real page.
# Filter it out *before* taking the top five: the previous version sliced six
# entries and removed the placeholder while iterating over the same list,
# which skips elements and yields six results whenever the placeholder is not
# among the first six.
top_5_pages = [page for page in frequent_pages if page != ':///'][:5]

for rank, page in enumerate(top_5_pages, start=1):
    print(rank, "", page)

1  http://www.almhuette-raith.at/
2  http://almhuette-raith.at/
3  https://www.google.com/
4  http://simplesite.com/
5  http://www.google.com/


### Q4

#### Find at least ten frequent sequential patterns or navigational patterns which the users follow using the GSP algorithm, state your own support value and maximum length of item_set.   

In [26]:
# Flat list of every request line in `sessions`.
requested_paged = list(sessions['Pages_Requested'])

# Group the request lines by client, keeping log order within each client.
# zip() over the two columns replaces the much slower DataFrame.iterrows().
accessed_pages = {}
for client, page in zip(sessions['User_Details'], sessions['Pages_Requested']):
    accessed_pages.setdefault(client, []).append(page)

for client, pages in accessed_pages.items():
    print('\n Client {} accessed these pages \n {} \n'.format(client, pages))

# One page sequence per client, in the same order as `accessed_pages`
# (presumably the input for the sequential-pattern mining step -- confirm).
all_pages = list(accessed_pages.values())


 Client ('13.66.139.0', '"Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)"') accessed these pages 
 ['"GET /index.php?option=com_phocagallery&view=category&id=1:almhuette-raith&Itemid=53 HTTP/1.1"', '"GET /index.php?option=com_phocagallery&view=category&id=1:almhuette-raith&Itemid=53 HTTP/1.1"', '"GET /index.php?option=com_content&view=article&id=46&Itemid=54 HTTP/1.1"', '"GET /index.php?option=com_content&view=article&id=46&Itemid=54 HTTP/1.1"'] 


 Client ('157.48.153.185', '"Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"') accessed these pages 
 ['"GET /apache-log/access.log HTTP/1.1"', '"GET /favicon.ico HTTP/1.1"'] 


 Client ('216.244.66.230', '"Mozilla/5.0 (compatible; DotBot/1.1; http://www.opensiteexplorer.org/dotbot, help@moz.com)"') accessed these pages 
 ['"GET /robots.txt HTTP/1.1"', '"GET /robots.txt HTTP/1.1"', '"GET /index.php?option=com_content&view=article&id=50&Itemid=56 HTTP