In [1]:
import pandas as pd
import numpy as np

#### Loading our datasets

In [2]:
clickstream = pd.read_csv('clickstream.tsv',sep = '|',header = 0)
regusers = pd.read_csv('regusers.tsv',sep = '\t',header = 0)
products = pd.read_csv('products.tsv',sep='\t',header = 0)
products['url'] = 'http://www.RL.com'+products['url']

In [3]:
clickstream

Unnamed: 0,clickstream_id,timestamp,IP address,url,is_purchased?,is_page_errored?,user_session_id,city,state,country
0,1330588819,2012-03-01 00:00:19,147.222.227.200,http://www.RL.com/,0,0.000000,AF8A0FDF-B1F8-474C-8CD7-8CA06A8E435B,spokane,wa,usa
1,1330588825,2012-03-01 00:00:25,99.49.96.163,http://www.RL.com/product/4004,0,1.000000,C9183A22-6E1D-4147-BDC9-D634FC957098,detroit,mi,usa
2,1330588827,2012-03-01 00:00:27,147.222.227.200,http://www.RL.com/,0,0.000000,AF8A0FDF-B1F8-474C-8CD7-8CA06A8E435B,spokane,wa,usa
3,1330588857,2012-03-01 00:00:57,69.114.3.205,http://www.RL.com/product/4004,0,0.000000,F761B842-9DDA-42CC-9F28-A6359B6C7219,east northport,ny,usa
4,1330588859,2012-03-01 00:00:59,71.217.29.209,http://www.RL.com/review/3004,0,0.595960,6FE1CB72-95C9-47F9-A1CB-7295C927F916,tacoma,wa,usa
...,...,...,...,...,...,...,...,...,...,...
421261,1331881141,2012-03-15 23:59:01,74.240.132.6,http://www.RL.com/product/4004,0,0.000000,632959ED-DE61-43E4-A9BC-0F8E1AA6B070,slidell,la,usa
421262,1331881141,2012-03-15 23:59:01,74.240.132.6,http://www.RL.com/reco/2001,0,0.414141,632959ED-DE61-43E4-A9BC-0F8E1AA6B070,slidell,la,usa
421263,1331881165,2012-03-15 23:59:25,50.39.205.175,http://www.RL.com/,0,0.000000,E3E83074-A1DF-4745-87BA-97E3C64ECA00,portland,or,usa
421264,1331881171,2012-03-15 23:59:31,50.39.205.175,http://www.RL.com/reco/2001,0,0.717172,E3E83074-A1DF-4745-87BA-97E3C64ECA00,portland,or,usa


In [4]:
products.head()

Unnamed: 0,url,category,id
0,http://www.RL.com/,home page,1000
1,http://www.RL.com/video/1001,video review,1001
2,http://www.RL.com/video/1002,video review,1002
3,http://www.RL.com/video/1003,video review,1003
4,http://www.RL.com/reco/2001,celebrity recommendation,2001


In [5]:
regusers.head()

Unnamed: 0,SWID,BIRTH_DT,GENDER_CD
0,0001BDD9-EABF-4D0D-81BD-D9EABFCD0D7D,8-Apr-84,F
1,00071AA7-86D2-4EB9-871A-A786D27EB9BA,7-Feb-88,F
2,00071B7D-31AF-4D85-871B-7D31AFFD852E,22-Oct-64,F
3,0007967E-F188-4598-9C7C-E64390482CFB,1-Jun-66,M
4,000B90B2-92DC-4A7A-8B90-B292DC9A7A71,13-Jun-84,M


#### Question A:
Are celebrity reviews significantly more influential than customer reviews? And how do they both fare against video reviews? Is there a way to capture the “bump”, in terms of revenue, of celebrity and video reviews vs. customer reviews?

#### Answer

Let's first look into how many sessions have led to a purchase

In [6]:
data = clickstream.merge(products, left_on='url', right_on='url')
data.head()

Unnamed: 0,clickstream_id,timestamp,IP address,url,is_purchased?,is_page_errored?,user_session_id,city,state,country,category,id
0,1330588819,2012-03-01 00:00:19,147.222.227.200,http://www.RL.com/,0,0.0,AF8A0FDF-B1F8-474C-8CD7-8CA06A8E435B,spokane,wa,usa,home page,1000
1,1330588827,2012-03-01 00:00:27,147.222.227.200,http://www.RL.com/,0,0.0,AF8A0FDF-B1F8-474C-8CD7-8CA06A8E435B,spokane,wa,usa,home page,1000
2,1330588881,2012-03-01 00:01:21,147.222.227.200,http://www.RL.com/,0,0.0,AF8A0FDF-B1F8-474C-8CD7-8CA06A8E435B,spokane,wa,usa,home page,1000
3,1330588890,2012-03-01 00:01:30,68.5.184.133,http://www.RL.com/,0,0.0,60C8049D-C1A2-41C2-B503-6C1200424C49,mission viejo,ca,usa,home page,1000
4,1330588905,2012-03-01 00:01:45,69.114.3.205,http://www.RL.com/,0,0.0,F761B842-9DDA-42CC-9F28-A6359B6C7219,east northport,ny,usa,home page,1000


In [7]:
sessions_grouped = data.groupby(['user_session_id', 'category'])['is_purchased?'].agg(['sum', 'count'])
sessions_grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,sum,count
user_session_id,category,Unnamed: 2_level_1,Unnamed: 3_level_1
0001BDD9-EABF-4D0D-81BD-D9EABFCD0D7D,celebrity recommendation,0,1
0001BDD9-EABF-4D0D-81BD-D9EABFCD0D7D,customer review,0,2
0001BDD9-EABF-4D0D-81BD-D9EABFCD0D7D,home page,0,4
0001BDD9-EABF-4D0D-81BD-D9EABFCD0D7D,product,1,4
000B90B2-92DC-4A7A-8B90-B292DC9A7A71,celebrity recommendation,0,1
...,...,...,...
FFFB1C5E-37B6-453A-83FB-86C580D18AE8,celebrity recommendation,0,2
FFFB1C5E-37B6-453A-83FB-86C580D18AE8,customer review,0,8
FFFB1C5E-37B6-453A-83FB-86C580D18AE8,home page,0,23
FFFB1C5E-37B6-453A-83FB-86C580D18AE8,product,3,17


In [8]:
sessions_count= sessions_grouped.pivot_table('count', ['user_session_id'], 'category')
sessions_count=sessions_count.drop(columns=['product','home page'])
sessions_count

category,celebrity recommendation,customer review,video review
user_session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0001BDD9-EABF-4D0D-81BD-D9EABFCD0D7D,1.0,2.0,
000B90B2-92DC-4A7A-8B90-B292DC9A7A71,1.0,,
000C47AD-EBFC-CDB8-CF70-DC4C2ED5051B,1.0,2.0,2.0
000E15BA-EB3E-14A6-4921-0E24C052821D,,5.0,1.0
000EE247-7758-E6B7-68D3-ACE8445BCD43,,2.0,1.0
...,...,...,...
FFF1BD4A-82B8-F6A6-C6D7-07914E229B02,,4.0,
FFF3EEA6-1A7A-4BF6-8C8E-84BBC530AD34,2.0,6.0,3.0
FFF47F68-28C7-4088-ADAE-BD09C695A448,1.0,1.0,3.0
FFF9E6CB-D3A2-455F-B5CF-6B8EC4E80ABE,1.0,1.0,2.0


In [9]:
sessions_product = sessions_grouped.pivot_table('sum', ['user_session_id'], 'category')
sessions_product = sessions_product.drop(columns=['celebrity recommendation','customer review','home page','video review'])
sessions_product

category,product
user_session_id,Unnamed: 1_level_1
0001BDD9-EABF-4D0D-81BD-D9EABFCD0D7D,1.0
000B90B2-92DC-4A7A-8B90-B292DC9A7A71,0.0
000C47AD-EBFC-CDB8-CF70-DC4C2ED5051B,3.0
000E15BA-EB3E-14A6-4921-0E24C052821D,1.0
000EE247-7758-E6B7-68D3-ACE8445BCD43,1.0
...,...
FFF1BD4A-82B8-F6A6-C6D7-07914E229B02,0.0
FFF3EEA6-1A7A-4BF6-8C8E-84BBC530AD34,0.0
FFF47F68-28C7-4088-ADAE-BD09C695A448,0.0
FFF9E6CB-D3A2-455F-B5CF-6B8EC4E80ABE,0.0


In [10]:
sessions = sessions_count.merge(sessions_product, left_on='user_session_id', right_on='user_session_id')
sessions = sessions.fillna(0)
sessions = sessions.rename(columns={"product": "purchases"})
sessions

category,celebrity recommendation,customer review,video review,purchases
user_session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0001BDD9-EABF-4D0D-81BD-D9EABFCD0D7D,1.0,2.0,0.0,1.0
000B90B2-92DC-4A7A-8B90-B292DC9A7A71,1.0,0.0,0.0,0.0
000C47AD-EBFC-CDB8-CF70-DC4C2ED5051B,1.0,2.0,2.0,3.0
000E15BA-EB3E-14A6-4921-0E24C052821D,0.0,5.0,1.0,1.0
000EE247-7758-E6B7-68D3-ACE8445BCD43,0.0,2.0,1.0,1.0
...,...,...,...,...
FFF1BD4A-82B8-F6A6-C6D7-07914E229B02,0.0,4.0,0.0,0.0
FFF3EEA6-1A7A-4BF6-8C8E-84BBC530AD34,2.0,6.0,3.0,0.0
FFF47F68-28C7-4088-ADAE-BD09C695A448,1.0,1.0,3.0,0.0
FFF9E6CB-D3A2-455F-B5CF-6B8EC4E80ABE,1.0,1.0,2.0,0.0


So we have a dataset where each row is a unique session. Then, we have created columns for with the purchases of each sessions, as well as the number of celebrity reccomendations, the number of customer reviews and the number of video reviews. We can then create a regression model which predicts the numbers of purchases based on the number of celebrity reccomendations, customer reviews and video reviews.

In [11]:
inputs = sessions.drop(columns =["purchases"])
target = sessions["purchases"]

In [12]:
from sklearn.linear_model import LinearRegression
from sklearn import  linear_model
from sklearn.metrics import mean_squared_error, r2_score
regr = linear_model.LinearRegression()
regr.fit(inputs, target)
predictions = regr.predict(inputs)
# The coefficients
print('Coefficients: \n', regr.coef_)
# The mean squared error
print('Mean squared error: %.2f'% mean_squared_error(target, predictions))

Coefficients: 
 [0.12348858 0.12958619 0.06496646]
Mean squared error: 1.25


In [13]:
from statsmodels.api import OLS
OLS(target,inputs).fit().summary()

0,1,2,3
Dep. Variable:,purchases,R-squared (uncentered):,0.622
Model:,OLS,Adj. R-squared (uncentered):,0.622
Method:,Least Squares,F-statistic:,8855.0
Date:,"Tue, 26 May 2020",Prob (F-statistic):,0.0
Time:,15:59:20,Log-Likelihood:,-24702.0
No. Observations:,16123,AIC:,49410.0
Df Residuals:,16120,BIC:,49430.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
celebrity recommendation,0.1308,0.006,22.166,0.000,0.119,0.142
customer review,0.1366,0.003,47.346,0.000,0.131,0.142
video review,0.0710,0.004,18.615,0.000,0.063,0.078

0,1,2,3
Omnibus:,3862.346,Durbin-Watson:,1.97
Prob(Omnibus):,0.0,Jarque-Bera (JB):,16201.959
Skew:,1.126,Prob(JB):,0.0
Kurtosis:,7.365,Cond. No.,5.39


We can see that celebrity recommendations are not significantly more influential than customer reviews, but their coefficient is almost the same, with customer reviews being a little higher. 
In addition, we can see that video reviews are not that effective compared to celebrity and customer reviews, since their coefficient is much lower.

Regarding the increase on the revenue from celebrity reccomendations and video reviews compared to customer reviews, we can see that the combination of them has 1.5 times the effect of customer reviews.

### Question B
Is there a simple way to capture and show the customer journeys between the various elements (reviews, products)?

#### Answer
We thought of creating a graph, with each node representing a page and each edge the probability this node goes to an another node

In [14]:
data_orig = data
#data = data.sort_values(by=['user_session_id','timestamp'])
data = data.sort_values(by=['user_session_id','timestamp']).reset_index()
data.head()

Unnamed: 0,index,clickstream_id,timestamp,IP address,url,is_purchased?,is_page_errored?,user_session_id,city,state,country,category,id
0,213186,1331610150,2012-03-12 20:42:30,76.166.167.172,http://www.RL.com/reco/2002,0,0.505051,0001BDD9-EABF-4D0D-81BD-D9EABFCD0D7D,oxnard,ca,usa,celebrity recommendation,2002
1,417927,1331610161,2012-03-12 20:42:41,76.166.167.172,http://www.RL.com/review/3005,0,0.616162,0001BDD9-EABF-4D0D-81BD-D9EABFCD0D7D,oxnard,ca,usa,customer review,3005
2,168914,1331610177,2012-03-12 20:42:57,76.166.167.172,http://www.RL.com/product/4004,1,0.0,0001BDD9-EABF-4D0D-81BD-D9EABFCD0D7D,oxnard,ca,usa,product,4004
3,321810,1331610177,2012-03-12 20:42:57,76.166.167.172,http://www.RL.com/review/3003,0,0.777778,0001BDD9-EABF-4D0D-81BD-D9EABFCD0D7D,oxnard,ca,usa,customer review,3003
4,250863,1331610261,2012-03-12 20:44:21,76.166.167.172,http://www.RL.com/product/4005,0,0.0,0001BDD9-EABF-4D0D-81BD-D9EABFCD0D7D,oxnard,ca,usa,product,4005


In [15]:
length = len(data)
session_id_from = []
session_id_to = []
from_page = []
to_page = []

for i in range(1,length):
    if data["user_session_id"][i-1]==data["user_session_id"][i]:
        from_page.append(data["category"][i-1])
        to_page.append(data["category"][i])
        session_id_from.append(data["user_session_id"][i-1])
        session_id_to.append(data["user_session_id"][i])
    else:
        from_page.append(data["category"][i-1])
        to_page.append("abandon")
        session_id_from.append(data["user_session_id"][i-1])
        session_id_to.append(data["user_session_id"][i])
        
            

In [16]:
data_new = {'session_id_from':session_id_from,'session_id_to':session_id_to, 'from_page':from_page,'to_page':to_page}
df = pd.DataFrame(data_new) 
df = df.sort_values(by=['session_id_from'])
df

Unnamed: 0,session_id_from,session_id_to,from_page,to_page
0,0001BDD9-EABF-4D0D-81BD-D9EABFCD0D7D,0001BDD9-EABF-4D0D-81BD-D9EABFCD0D7D,celebrity recommendation,customer review
1,0001BDD9-EABF-4D0D-81BD-D9EABFCD0D7D,0001BDD9-EABF-4D0D-81BD-D9EABFCD0D7D,customer review,product
2,0001BDD9-EABF-4D0D-81BD-D9EABFCD0D7D,0001BDD9-EABF-4D0D-81BD-D9EABFCD0D7D,product,customer review
3,0001BDD9-EABF-4D0D-81BD-D9EABFCD0D7D,0001BDD9-EABF-4D0D-81BD-D9EABFCD0D7D,customer review,product
4,0001BDD9-EABF-4D0D-81BD-D9EABFCD0D7D,0001BDD9-EABF-4D0D-81BD-D9EABFCD0D7D,product,product
...,...,...,...,...
421234,FFFB1C5E-37B6-453A-83FB-86C580D18AE8,FFFB1C5E-37B6-453A-83FB-86C580D18AE8,home page,product
421235,FFFB1C5E-37B6-453A-83FB-86C580D18AE8,FFFB1C5E-37B6-453A-83FB-86C580D18AE8,product,product
421236,FFFB1C5E-37B6-453A-83FB-86C580D18AE8,FFFB1C5E-37B6-453A-83FB-86C580D18AE8,product,home page
421238,FFFB1C5E-37B6-453A-83FB-86C580D18AE8,FFFB1C5E-37B6-453A-83FB-86C580D18AE8,customer review,home page


Now we want to create an edgelist from every page to every other page, where we have calculated the weights of the edges as the probability to go from one page to the other.
This probability of going from one webpage to the other.

In [21]:
df_grouped = df.groupby(['from_page', 'to_page']).agg('count').reset_index()
df_grouped = df_grouped.drop(columns = 'session_id_to')
df_grouped_count = df_grouped.rename(columns={"session_id_from": "count"})
df_grouped_sum = df_grouped_count.groupby(['from_page'])['count'].agg(['sum']).reset_index()

In [24]:
df_grouped_count

Unnamed: 0,from_page,to_page,count
0,celebrity recommendation,abandon,1058
1,celebrity recommendation,celebrity recommendation,2433
2,celebrity recommendation,customer review,4318
3,celebrity recommendation,home page,8324
4,celebrity recommendation,product,8750
5,celebrity recommendation,video review,2709
6,customer review,abandon,2895
7,customer review,celebrity recommendation,4100
8,customer review,customer review,11612
9,customer review,home page,23228


In [25]:
df_grouped_sum

Unnamed: 0,from_page,sum
0,celebrity recommendation,27592
1,customer review,68074
2,home page,148594
3,product,135032
4,video review,41973
