## Task 4 - Web Mining



In [1]:
import pandas as pd

df = pd.read_csv('datasets/Weblog.csv')

#df.head()

## Data Cleaning

In [2]:
df.rename(columns= {'Staus': 'Status', 'IP address': 'IP'}, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15789 entries, 0 to 15788
Data columns (total 4 columns):
IP           15789 non-null object
Timestamp    15789 non-null object
Request      15789 non-null object
Status       15789 non-null int64
dtypes: int64(1), object(3)
memory usage: 493.5+ KB


In [3]:
# correct date datatype
df['Datetime'] = pd.to_datetime( df.Timestamp, format='[%d/%b/%Y:%H:%M:%S')
df.drop(['Timestamp'], axis=1, inplace=True)

In [4]:
# remove all HTTP1.1 values and GET/POST values
df.Request.replace(regex=True, inplace=True, to_replace=r' HTTP/1\.1', value=r'')
df.Request.replace(regex=True, inplace=True, to_replace=r'GET ', value=r'')
df.Request.replace(regex=True, inplace=True, to_replace=r'POST ', value=r'')

In [5]:
# mask to filter all .js, .css, fonts and .png files files
mask = (df.Request.str.endswith('.js') | df.Request.str.endswith('.css') \
        | df.Request.str.startswith('/fonts') | df.Request.str.startswith('/img'))
print ("# Rows before: ", len(df))

# keep records not in mask
df = df[~mask]

print("Rows after: ", len(df))

# filter !OK all requests
df = df[df['Status'] == 200]
print("Rows after success filter: ", len(df))

# Rows before:  15789
Rows after:  10607
Rows after success filter:  6865


## User ID

In [6]:
from collections import defaultdict
import datetime

df2 = df.copy() # backup dataset

# sort rows by descending datetime
df.sort_values(by='Datetime', inplace=True)

# iniate sessionID and userID to 0
session_id = 0
user_id = 0

# create dicts to hold last access information
last_access = defaultdict(lambda:datetime.datetime.utcfromtimestamp(0))

# dict to find previous session, user ID and steps assigned to specific date/ip
session_dict = defaultdict(lambda:1)
user_id_dict = defaultdict(lambda:1)
session_steps = defaultdict(lambda:1)

# function to be applies row wise
# for each row, produce session, user ID and path traversal
def get_log_user_info(row):
    # access global variables shared between all rows
    global session_id, user_id, session_dict, user_id_dict, session_steps, last_access
    
    session_key = str(row['Datetime'].date()) + '_' + row['IP'] # date + IP key for find
    user_key = str(row['Datetime'].date()) + '_' + row['IP'] #+ '_' + row['Agent'] # date
    time_diff_session = row['Datetime'] - last_access[session_key] # session time diff
    time_diff_user = row['Datetime'] - last_access[user_key] # user time diffme diff
    
    # if the time diff from previous session is > 30 mins, assign new session ID
    if time_diff_session.total_seconds() > 1800:
        session_id += 1
        session_dict[session_key] = session_id
    # if the time diff from previous session is > 60 mins, assign new user ID
    if time_diff_user.total_seconds() > 3600:
        user_id += 1
        user_id_dict[user_key] = user_id
    # update last access for session and user
    last_access[session_key] = row['Datetime']
    last_access[user_key] = row['Datetime']
    
    # assign extracted info from the row
    row['SessionId'] = session_dict[session_key]
    row['Step'] = session_steps[row['SessionId']]
    row['UserId'] = user_id_dict[user_key]
    session_steps[row['SessionId']] += 1
    
    return row

# appy function above to get a new df with added info
df = df.apply(get_log_user_info, axis=1)
#df.head()

## Apply Association Mining

In [8]:
#remove unwanted columns
df2 = df.drop(columns=['IP', 'Datetime', 'UserId', 'Status', 'Step'])
sessions = df2.groupby(['SessionId'])['Request'].apply(list)

In [9]:
from apyori import apriori
#type cast transacts from pd into reg list and run apriori
session_list = list(sessions)
results = list(apriori(session_list, min_support=0.05))

#print(results[:2])

In [10]:
def convert_apriori_results_to_pandas_df(results):
    rules = []
    
    for rule_set in results:
        for rule in rule_set.ordered_statistics:
            # items_base = left side of rules, items_add = right side
            # support, confidence and lift for respective rules
            rules.append([','.join(rule.items_base), ','.join(rule.items_add),
                         rule_set.support, rule.confidence, rule.lift]) 
    
    # typecast it to pandas df
    return pd.DataFrame(rules, columns=['Left_side', 'Right_side', 'Support', 'Confidence', 'Lift']) 

result_df = convert_apriori_results_to_pandas_df(results)

#print(result_df.head())

In [11]:
# sort all acquired rules descending by lift
result_df = result_df[result_df['Left_side'] != '']
result_df = result_df.sort_values(by='Lift', ascending=False)
result_df.head(1000)

Unnamed: 0,Left_side,Right_side,Support,Confidence,Lift
11,/allsubmission.php,/archive.php,0.059719,0.784615,5.447655
12,/archive.php,/allsubmission.php,0.059719,0.414634,5.447655
15,/archive.php,/contest.php,0.062061,0.430894,4.043777
16,/contest.php,/archive.php,0.062061,0.582418,4.043777
38,"/login.php,/home.php",/contest.php,0.051522,0.34375,3.225962
37,"/login.php,/contest.php",/home.php,0.051522,0.745763,3.06193
14,/home.php,/allsubmission.php,0.055035,0.225962,2.968787
13,/allsubmission.php,/home.php,0.055035,0.723077,2.968787
35,"/login.php,/home.php",/archive.php,0.063232,0.421875,2.929116
34,"/login.php,/archive.php",/home.php,0.063232,0.701299,2.879371
