In [9]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import math
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
from scipy.sparse import csr_matrix
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import TruncatedSVD

In [None]:
df_cat_tree=pd.read_csv(r"D:\GeakMinds Internship\real world projects\datasets\retail rocket\category_tree.csv")
df_events=pd.read_csv(r"D:\GeakMinds Internship\real world projects\datasets\retail rocket\events.csv")
df_item1=pd.read_csv(r"D:\GeakMinds Internship\real world projects\datasets\retail rocket\item_properties_part1.csv")
df_item2=pd.read_csv(r"D:\GeakMinds Internship\real world projects\datasets\retail rocket\item_properties_part2.csv")

The dataset consists of three files: a file with behaviour data (events.csv), a file with item properties (item_properties.сsv) and a file, which describes category tree (category_tree.сsv). The data has been collected from a real-world ecommerce website we will use file with behaviour data (events.csv) because we will performe collaborative filtering so there is only columns need is user and items and there interaction value which available in events data iteslf, so we will ignore rest file

In [4]:
df_cat_tree.head()

Unnamed: 0,categoryid,parentid
0,1016,213.0
1,809,169.0
2,570,9.0
3,1691,885.0
4,536,1691.0


In [5]:
df_events.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,view,355908,
1,1433224214164,992329,view,248676,
2,1433221999827,111016,view,318965,
3,1433221955914,483717,view,253185,
4,1433221337106,951259,view,367447,


In [7]:
df_item1.head()

Unnamed: 0,timestamp,itemid,property,value
0,1435460400000,460429,categoryid,1338
1,1441508400000,206783,888,1116713 960601 n277.200
2,1439089200000,395014,400,n552.000 639502 n720.000 424566
3,1431226800000,59481,790,n15360.000
4,1431831600000,156781,917,828513


In [8]:
df_item2.head()

Unnamed: 0,timestamp,itemid,property,value
0,1433041200000,183478,561,769062
1,1439694000000,132256,976,n26.400 1135780
2,1435460400000,420307,921,1149317 1257525
3,1431831600000,403324,917,1204143
4,1435460400000,230701,521,769062


In [10]:
df_cat_tree.shape

(1669, 2)

In [11]:
df_events.shape

(2756101, 5)

In [12]:
df_item1.shape

(10999999, 4)

In [13]:
df_item2.shape

(9275903, 4)

In [14]:
df_events=df_events.sort_values(by=['visitorid', 'timestamp']).reset_index(drop=True)
df_events.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1442004589439,0,view,285930,
1,1442004759591,0,view,357564,
2,1442004917175,0,view,67045,
3,1439487966444,1,view,72028,
4,1438969904567,2,view,325215,


In [16]:
df_events['timestamp'] = pd.to_datetime(df_events['timestamp'], unit='ms')

df_events = df_events.sort_values(by=['visitorid', 'timestamp'])

df_events['event'].value_counts()

event
view           2664312
addtocart        69332
transaction      22457
Name: count, dtype: int64

In [17]:
df_events['session_diff'] = df_events.groupby('visitorid')['timestamp'].diff()

df_events['new_session'] = (df_events['session_diff'] > pd.Timedelta(minutes=30)).fillna(True)

df_events['session_id'] = df_events['new_session'].cumsum()

In [18]:
def map_event_to_step(event):
    if event == 'view':
        return 'view'
    elif event == 'addtocart':
        return 'cart'
    elif event == 'transaction':
        return 'checkout'
    else:
        return None

df_events['funnel_step'] = df_events['event'].apply(map_event_to_step)

In [None]:
session_steps = df_events.groupby('session_id')['funnel_step'].apply(lambda x: list(x)).reset_index()


def has_step(steps, target):
    return target in steps

session_steps['homepage'] = session_steps['funnel_step'].apply(lambda x: has_step(x, 'view'))
session_steps['product_page'] = session_steps['funnel_step'].apply(lambda x: has_step(x, 'view')) 
session_steps['cart'] = session_steps['funnel_step'].apply(lambda x: has_step(x, 'cart'))
session_steps['checkout'] = session_steps['funnel_step'].apply(lambda x: has_step(x, 'checkout'))


In [20]:
def step_path(row):
    path = []
    if row['homepage']:
        path.append('Homepage')
    if row['product_page']:
        path.append('Product Page')
    if row['cart']:
        path.append('Cart')
    if row['checkout']:
        path.append('Checkout')
    return path

session_steps['path'] = session_steps.apply(step_path, axis=1)

homepage_sessions = session_steps[session_steps['homepage']].shape[0]
product_page_sessions = session_steps[session_steps['product_page']].shape[0]
cart_sessions = session_steps[session_steps['cart']].shape[0]
checkout_sessions = session_steps[session_steps['checkout']].shape[0]

print("Homepage:", homepage_sessions)
print("Product Page:", product_page_sessions)
print("Cart:", cart_sessions)
print("Checkout:", checkout_sessions)


Homepage: 353186
Product Page: 353186
Cart: 39834
Checkout: 13990


In [21]:
def dropoff(prev, current):
    return round(100 * (1 - current / prev), 2)

print("\nDrop-off Rates:")
print("Homepage → Product Page:", dropoff(homepage_sessions, product_page_sessions), "%")
print("Product Page → Cart:", dropoff(product_page_sessions, cart_sessions), "%")
print("Cart → Checkout:", dropoff(cart_sessions, checkout_sessions), "%")



Drop-off Rates:
Homepage → Product Page: 0.0 %
Product Page → Cart: 88.72 %
Cart → Checkout: 64.88 %


In [None]:
import plotly.express as px

funnel_data = {
    'Step': ['Homepage', 'Product Page', 'Cart', 'Checkout'],
    'Users': [homepage_sessions, product_page_sessions, cart_sessions, checkout_sessions]
}

funnel_df = pd.DataFrame(funnel_data)
fig = px.funnel(funnel_df, x='Users', y='Step', title="Website Funnel Conversion")
fig.show()

: 