# Task 1


In [1]:
# Create a simple time series forecast with fbprophet
# Import libraries
from pathlib import Path
from fbprophet import Prophet
import pandas as pd
import re
import time

FILENAME = "input/2015_07_22_mktplace_shop_web_log_sample.log.gz"

# Column names for the space-separated log, in file order. They are passed
# through `names`, so the first line of the file is read as data.
schema_columns = ['create_time', 'elb', 'client_host_port',
                  'backend_host_port', 'request_processing_time',
                  'backend_processing_time', 'response_processing_time',
                  'elb_status_code', 'backend_status_code',
                  'received_bytes', 'sent_bytes',
                  'request', 'user_agent',
                  'ssl_cipher', 'ssl_protocol']

# load the dataset into dataframe
weblog_file = Path(FILENAME)
if not weblog_file.exists():
    # Stop here with the real cause; otherwise every later cell would fail
    # with an unrelated NameError because `df` was never created.
    raise FileNotFoundError(f"{FILENAME} does not exist.")

# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 in favour of
# `on_bad_lines='skip'`; switch once the environment's pandas supports it.
df = pd.read_csv(weblog_file, encoding='utf-8', sep=' ',
                 error_bad_lines=False, names=schema_columns,
                 parse_dates=['create_time'])
print("Loaded file")

Importing plotly failed. Interactive plots will not work.


Loaded file


In [2]:
# Show the first rows of the dataset to sanity-check the parsed columns
df.head()

Unnamed: 0,create_time,elb,client_host_port,backend_host_port,request_processing_time,backend_processing_time,response_processing_time,elb_status_code,backend_status_code,received_bytes,sent_bytes,request,user_agent,ssl_cipher,ssl_protocol
0,2015-07-22 09:00:28.019143+00:00,marketpalce-shop,123.242.248.130:54635,10.0.6.158:80,2.2e-05,0.026109,2e-05,200,200,0,699,GET https://paytm.com:443/shop/authresponse?co...,Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...,ECDHE-RSA-AES128-GCM-SHA256,TLSv1.2
1,2015-07-22 09:00:27.894580+00:00,marketpalce-shop,203.91.211.44:51402,10.0.4.150:80,2.4e-05,0.15334,2.6e-05,200,200,0,1497,GET https://paytm.com:443/shop/wallet/txnhisto...,Mozilla/5.0 (Windows NT 6.1; rv:39.0) Gecko/20...,ECDHE-RSA-AES128-GCM-SHA256,TLSv1.2
2,2015-07-22 09:00:27.885745+00:00,marketpalce-shop,1.39.32.179:56419,10.0.4.244:80,2.4e-05,0.164958,1.7e-05,200,200,0,157,GET https://paytm.com:443/shop/wallet/txnhisto...,Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.3...,ECDHE-RSA-AES128-GCM-SHA256,TLSv1.2
3,2015-07-22 09:00:28.048369+00:00,marketpalce-shop,180.179.213.94:48725,10.0.6.108:80,2e-05,0.002333,2.1e-05,200,200,0,35734,GET https://paytm.com:443/shop/p/micromax-yu-y...,-,ECDHE-RSA-AES128-GCM-SHA256,TLSv1.2
4,2015-07-22 09:00:28.036251+00:00,marketpalce-shop,120.59.192.208:13527,10.0.4.217:80,2.4e-05,0.015091,1.6e-05,200,200,68,640,POST https://paytm.com:443/papi/v1/expresscart...,Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...,ECDHE-RSA-AES128-GCM-SHA256,TLSv1.2


In [3]:
# Check the data types for all columns (confirms create_time was parsed as a datetime)
df.dtypes

create_time                 datetime64[ns, UTC]
elb                                      object
client_host_port                         object
backend_host_port                        object
request_processing_time                 float64
backend_processing_time                 float64
response_processing_time                float64
elb_status_code                           int64
backend_status_code                       int64
received_bytes                            int64
sent_bytes                                int64
request                                  object
user_agent                               object
ssl_cipher                               object
ssl_protocol                             object
dtype: object

In [4]:
# Report how many distinct values each object-dtype (string) column holds
object_columns = [name for name, dtype in df.dtypes.items()
                  if 'object' in dtype.name]
for name in object_columns:
    print(f'{name}: {df[name].nunique()}')

elb: 1
client_host_port: 404391
backend_host_port: 25
request: 221839
user_agent: 13482
ssl_cipher: 9
ssl_protocol: 5


In [5]:
# Drop the feature that will not contribute much: `elb` holds a single unique value.
# errors='ignore' keeps this cell safe to re-run once the column is already gone.
df = df.drop(columns=['elb'], errors='ignore')

In [6]:
# Derive the client host (address without the port) from client_host_port and
# the URL (second space-separated token) from request.
# The .str accessors propagate NaN for missing or malformed values instead of
# raising from inside a per-row lambda.
df['client_host'] = df['client_host_port'].str.split(':').str[0]
df['url'] = df['request'].str.split(' ').str[1]

In [7]:
# Re-inspect the frame to confirm the new client_host and url columns
df.head()

Unnamed: 0,create_time,client_host_port,backend_host_port,request_processing_time,backend_processing_time,response_processing_time,elb_status_code,backend_status_code,received_bytes,sent_bytes,request,user_agent,ssl_cipher,ssl_protocol,client_host,url
0,2015-07-22 09:00:28.019143+00:00,123.242.248.130:54635,10.0.6.158:80,2.2e-05,0.026109,2e-05,200,200,0,699,GET https://paytm.com:443/shop/authresponse?co...,Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...,ECDHE-RSA-AES128-GCM-SHA256,TLSv1.2,123.242.248.130,https://paytm.com:443/shop/authresponse?code=f...
1,2015-07-22 09:00:27.894580+00:00,203.91.211.44:51402,10.0.4.150:80,2.4e-05,0.15334,2.6e-05,200,200,0,1497,GET https://paytm.com:443/shop/wallet/txnhisto...,Mozilla/5.0 (Windows NT 6.1; rv:39.0) Gecko/20...,ECDHE-RSA-AES128-GCM-SHA256,TLSv1.2,203.91.211.44,https://paytm.com:443/shop/wallet/txnhistory?p...
2,2015-07-22 09:00:27.885745+00:00,1.39.32.179:56419,10.0.4.244:80,2.4e-05,0.164958,1.7e-05,200,200,0,157,GET https://paytm.com:443/shop/wallet/txnhisto...,Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.3...,ECDHE-RSA-AES128-GCM-SHA256,TLSv1.2,1.39.32.179,https://paytm.com:443/shop/wallet/txnhistory?p...
3,2015-07-22 09:00:28.048369+00:00,180.179.213.94:48725,10.0.6.108:80,2e-05,0.002333,2.1e-05,200,200,0,35734,GET https://paytm.com:443/shop/p/micromax-yu-y...,-,ECDHE-RSA-AES128-GCM-SHA256,TLSv1.2,180.179.213.94,https://paytm.com:443/shop/p/micromax-yu-yurek...
4,2015-07-22 09:00:28.036251+00:00,120.59.192.208:13527,10.0.4.217:80,2.4e-05,0.015091,1.6e-05,200,200,68,640,POST https://paytm.com:443/papi/v1/expresscart...,Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...,ECDHE-RSA-AES128-GCM-SHA256,TLSv1.2,120.59.192.208,https://paytm.com:443/papi/v1/expresscart/verify


In [8]:
# Bucket every request into the one-second window that contains it, so that
# hits can later be counted per second.
temp = df.assign(
    low=df['create_time'].dt.floor('1s'),
    high=df['create_time'].dt.ceil('1s'),
)
# Build all intervals in one vectorised call instead of constructing a
# pd.Interval per row with DataFrame.apply(axis=1), which is very slow on a
# log of this size. closed='right' matches pd.Interval's default.
df['interval'] = pd.arrays.IntervalArray.from_arrays(
    temp['low'], temp['high'], closed='right'
)

In [9]:
# Keep only the left edge (start) of each window as the bucket label
df['interval_first'] = df['interval'].map(lambda window: window.left)

In [10]:
# Count the non-null entries of every column within each window start,
# then turn the group key back into a regular column
hits_per_bucket = df.groupby("interval_first").count()
grouped_df = hits_per_bucket.reset_index()

In [11]:
# Keep the window start and the request count, renamed to the `ds` / `y`
# columns that the model is fitted on
prophet_columns = {'interval_first': 'ds', 'request': 'y'}
series_df = grouped_df[list(prophet_columns)].rename(columns=prophet_columns)

In [12]:
# Make `ds` timezone-naive: tz_convert(None) converts to UTC and drops the tz info
series_df = series_df.assign(ds=series_df['ds'].dt.tz_convert(None))

In [13]:
# Build a simple Prophet model. No seasonality is configured explicitly; with
# this per-second series Prophet's own log shows it disabling the yearly,
# weekly and daily components.
model = Prophet(changepoint_prior_scale=2.5)

# perf_counter is a monotonic high-resolution clock, so the measured duration
# cannot be distorted by system clock adjustments the way time.time() can.
start = time.perf_counter()
model.fit(series_df)
print(f"Fitting duration : {time.perf_counter() - start:.3f}s")

INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.


Fitting duration : 8.038s


In [14]:
# Extend the fitted timeline by 60 one-second steps, i.e. the next minute
forecast_horizon = 60
future_dates = model.make_future_dataframe(periods=forecast_horizon, freq='s')

In [15]:
# Run the fitted model over every timestamp in future_dates; the point
# forecast is read from the `yhat` column in the following cell
forecast = model.predict(future_dates)

In [16]:
# Expected load (requests per second) for each of the 60 seconds of the next minute
forecast['yhat'].tail(60)

4269    65.595974
4270    65.577678
4271    65.559382
4272    65.541086
4273    65.522791
4274    65.504495
4275    65.486199
4276    65.467903
4277    65.449608
4278    65.431312
4279    65.413016
4280    65.394720
4281    65.376424
4282    65.358129
4283    65.339833
4284    65.321537
4285    65.303241
4286    65.284946
4287    65.266650
4288    65.248354
4289    65.230058
4290    65.211762
4291    65.193467
4292    65.175171
4293    65.156875
4294    65.138579
4295    65.120283
4296    65.101988
4297    65.083692
4298    65.065396
4299    65.047100
4300    65.028805
4301    65.010509
4302    64.992213
4303    64.973917
4304    64.955621
4305    64.937326
4306    64.919030
4307    64.900734
4308    64.882438
4309    64.864142
4310    64.845847
4311    64.827551
4312    64.809255
4313    64.790959
4314    64.772664
4315    64.754368
4316    64.736072
4317    64.717776
4318    64.699480
4319    64.681185
4320    64.662889
4321    64.644593
4322    64.626297
4323    64.608001
4324    64

# Task 2

In [17]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split

from xgboost import XGBRegressor

In [18]:
def compiler(X_train):
    """Build an unfitted preprocessing transformer for ``X_train``.

    Columns are routed by dtype: ``int64``/``float64`` columns go through a
    constant-fill imputer, ``object`` columns go through most-frequent
    imputation followed by one-hot encoding that ignores categories not seen
    during fit. Columns of any other dtype are listed in neither group.

    Args:
        X_train: DataFrame whose column dtypes decide the routing.

    Returns:
        preprocessor: the configured (not yet fitted) ColumnTransformer.
    """
    numeric_dtypes = ['int64', 'float64']
    numerical_cols = []
    categorical_cols = []
    for cname in X_train.columns:
        col_dtype = X_train[cname].dtype
        if col_dtype in numeric_dtypes:
            numerical_cols.append(cname)
        if col_dtype == "object":
            categorical_cols.append(cname)

    # Numeric branch: a single imputation step
    fill_numeric = SimpleImputer(strategy='constant')

    # Categorical branch: impute first, then encode
    encode_categorical = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    branches = [
        ('num', fill_numeric, numerical_cols),
        ('cat', encode_categorical, categorical_cols),
    ]
    return ColumnTransformer(transformers=branches)

In [19]:
# Assign every request to a 15-minute session window (low, high].
# The window is anchored on the ceiling so that a timestamp lying exactly on a
# 15-minute boundary lands in the window that ends there; flooring and ceiling
# independently would give such a row a zero-width interval of its own.
session_window = pd.Timedelta('15min')
window_end = df['create_time'].dt.ceil('15min')
temp_15 = df.assign(
    low=window_end - session_window,
    high=window_end,
)
# One vectorised constructor call replaces the per-row pd.Interval creation
# through DataFrame.apply(axis=1), which is very slow on a log of this size.
df['interval_15'] = pd.arrays.IntervalArray.from_arrays(
    temp_15['low'], temp_15['high'], closed='right'
)

In [20]:
# First and last hit time of every (15-minute window, client host) session.
# Aggregations are named by string: passing the Python builtins min/max to
# agg is deprecated in recent pandas, while 'min'/'max' work on every version
# and produce the same column labels.
sessioned_temp_df = (
    df.groupby(["interval_15", "client_host"])
      .agg({'create_time': ['min', 'max']})
      .reset_index()
)

# rename multiindex columns to standard columns
sessioned_temp_df.columns = ['interval_15', 'client_host', 'beginning_hit_time', 'ending_hit_time']

In [21]:
# Session duration for each (15-minute window, client host) row: last hit time
# minus first hit time. This is a per-session value, not yet an average per IP.
sessioned_temp_df['session_duration'] = sessioned_temp_df['ending_hit_time'] - sessioned_temp_df['beginning_hit_time']

In [22]:
# Convert the session duration from a timedelta to a float number of seconds.
# The vectorised .dt accessor replaces a per-element Python lambda.
sessioned_temp_df['session_duration'] = sessioned_temp_df['session_duration'].dt.total_seconds()

In [23]:
# Keep only the columns used for training: the host and its session duration
training_columns = ['client_host', 'session_duration']
sessioned_length_df = sessioned_temp_df.loc[:, training_columns]

In [24]:
# Feature matrix (client host only) and target (session duration in seconds);
# both are kept as single-column DataFrames
X = sessioned_length_df.loc[:, ['client_host']]
y = sessioned_length_df.loc[:, ['session_duration']]

In [25]:
# Split the data to train and test (fixed seed for a reproducible split)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=1)

# Dtype-driven preprocessing built from the training features
preprocessor = compiler(X_train)

xgb_model = XGBRegressor(n_estimators=10, learning_rate=0.02, random_state=0)

xgb_clf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('xgb_model', xgb_model)])

# Fit exactly once. The pipeline used to be fitted twice in a row, which only
# repeated the same training and discarded the first result.
# NOTE(review): the only feature is client_host, one-hot encoded with unknown
# categories ignored, so hosts absent from the training split carry no signal
# at prediction time - confirm this is the intended feature set.
xgb_clf.fit(X_train, y_train, xgb_model__verbose=False)

xgb_preds = xgb_clf.predict(X_valid)

In [26]:
# Display the validation predictions.
# NOTE(review): the recorded output shows the same value in every displayed
# position - worth checking whether the model learns anything from client_host.
xgb_preds

array([11.065971, 11.065971, 11.065971, ..., 11.065971, 11.065971,
       11.065971], dtype=float32)

# Task 3

In [27]:
# Count the log rows for every (15-minute window, client host, url) combination.
# NOTE(review): this yields hits per individual url, not the number of distinct
# urls per host and window - confirm which one the task needs.
unique_url_keys = ["interval_15", 'client_host', 'url']
grouped_unique_url = df.groupby(unique_url_keys).count().reset_index()

In [28]:
# Preview the grouped counts; every non-key column holds the per-group row count
grouped_unique_url.head()

Unnamed: 0,interval_15,client_host,url,create_time,client_host_port,backend_host_port,request_processing_time,backend_processing_time,response_processing_time,elb_status_code,backend_status_code,received_bytes,sent_bytes,request,user_agent,ssl_cipher,ssl_protocol,interval,interval_first
0,"(2015-07-22 02:30:00, 2015-07-22 02:45:00]",1.186.247.60,https://paytm.com:443/favicon.ico,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
1,"(2015-07-22 02:30:00, 2015-07-22 02:45:00]",1.186.247.60,https://paytm.com:443/shop/cart?channel=web&ve...,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
2,"(2015-07-22 02:30:00, 2015-07-22 02:45:00]",1.186.247.60,https://paytm.com:443/shop/h/electronics?utm_t...,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
3,"(2015-07-22 02:30:00, 2015-07-22 02:45:00]",1.186.247.60,https://paytm.com:443/shop/log,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
4,"(2015-07-22 02:30:00, 2015-07-22 02:45:00]",1.186.41.10,https://paytm.com:443/https://paytm.com/,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


In [29]:
# Feature matrix (client host only) and target (the per-group `request` count);
# both are kept as single-column DataFrames
X_unique = grouped_unique_url.loc[:, ['client_host']]
y_unique = grouped_unique_url.loc[:, ['request']]

In [30]:
# Hold out part of the data for validation (fixed seed for a reproducible split)
(X_train_unique, X_valid_unique,
 y_train_unique, y_valid_unique) = train_test_split(X_unique, y_unique, random_state=1)

# Build the dtype-driven preprocessing transformer from the training features
preprocessor_unique = compiler(X_train_unique)

In [None]:
# Gradient-boosted regressor with the same settings as the Task 2 model
xgb_model_unique = XGBRegressor(n_estimators=10, learning_rate=0.02, random_state=0)

xgb_clf_unique = Pipeline(steps=[('preprocessor', preprocessor_unique),
                                 ('xgb_model', xgb_model_unique)])

# Fit exactly once. The pipeline used to be fitted twice in a row, which
# doubled the training time without changing the resulting model.
xgb_clf_unique.fit(X_train_unique, y_train_unique)

xgb_preds_unique = xgb_clf_unique.predict(X_valid_unique)

In [None]:
# Display the validation predictions of the Task 3 model
xgb_preds_unique