# 📌 Cell 1. Environment setup

In [3]:
import os
import pandas as pd
import numpy as np

# Notebook-wide display settings:
#   - max_columns = None -> never elide columns when a frame is rendered
#   - float_format       -> render floats with two decimal places
pd.options.display.max_columns = None
pd.options.display.float_format = "{:.2f}".format


# 📌 Cell 2. File check

In [4]:
# Path to the raw event-log CSV, relative to the notebook's working directory.
file_path = "2019-Oct.csv"

# Sanity check before loading: displays True when something exists at file_path.
os.path.exists(file_path)


True

In [5]:
# On-disk size of the CSV, converted from bytes to MiB (1 MiB = 1024 * 1024 bytes).
os.stat(file_path).st_size / 1024**2


5406.010489463806

# 📌 Cell 3. Load data (sample)

In [6]:
# Read only the first 100,000 data rows instead of the whole file.
# NOTE: nrows takes rows from the top of the file, so this is a head slice,
# not a random sample.
df = pd.read_csv(file_path, nrows=100_000)

# 📌 Cell 4. Check data shape

In [7]:
# (row count, column count) of the loaded sample.
df.shape


(100000, 9)

# 📌 Cell 5. Check schema (columns)

In [8]:
# Column labels as read from the CSV header row.
df.columns

Index(['event_time', 'event_type', 'product_id', 'category_id',
       'category_code', 'brand', 'price', 'user_id', 'user_session'],
      dtype='str')

The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


# 📌 Cell 6. Preview sample data

In [9]:
# First five rows of the sample.
df.head(n=5)

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2019-10-01 00:00:00 UTC,view,44600062,2103807459595387724,,shiseido,35.79,541312140,72d76fde-8bb3-4e00-8c23-a032dfed738c
1,2019-10-01 00:00:00 UTC,view,3900821,2053013552326770905,appliances.environment.water_heater,aqua,33.2,554748717,9333dfbd-b87a-4708-9857-6336556b0fcc
2,2019-10-01 00:00:01 UTC,view,17200506,2053013559792632471,furniture.living_room.sofa,,543.1,519107250,566511c2-e2e3-422b-b695-cf8e6e792ca8
3,2019-10-01 00:00:01 UTC,view,1307067,2053013558920217191,computers.notebook,lenovo,251.74,550050854,7c90fc70-0e80-4590-96f3-13c02c18c713
4,2019-10-01 00:00:04 UTC,view,1004237,2053013555631882655,electronics.smartphone,apple,1081.98,535871217,c6bd7419-2748-4c56-95b4-8cec9ff8b80d


In [10]:
# Last five rows of the sample; together with head() this brackets the
# time span the sample covers.
df.tail(n=5)

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
99995,2019-10-01 04:28:27 UTC,view,17300136,2053013553853497655,,montale,102.96,515593818,96d3640f-9d00-4b69-a4d8-ca581c1677d3
99996,2019-10-01 04:28:27 UTC,view,2601810,2053013563970159485,,artel,155.29,530821477,0db770f5-225c-4e8d-9814-f045e4bfae87
99997,2019-10-01 04:28:27 UTC,view,1306100,2053013558920217191,computers.notebook,xiaomi,1029.6,521314335,405566fc-f660-4a7a-baed-1ba95feccbbf
99998,2019-10-01 04:28:27 UTC,view,3200549,2053013555321504139,appliances.kitchen.meat_grinder,dauscher,25.71,555131253,fca1ed2e-c91b-40e1-a95c-b4bfcd65b295
99999,2019-10-01 04:28:27 UTC,view,6200949,2053013552293216471,appliances.environment.air_heater,,10.17,523211916,8d176f3e-a90e-48a1-ac13-760bc81a4583


# 📌 Cell 7. Check data types

In [11]:
# Per-column dtype and non-null count, plus the frame's memory usage.
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   event_time     100000 non-null  str    
 1   event_type     100000 non-null  str    
 2   product_id     100000 non-null  int64  
 3   category_id    100000 non-null  int64  
 4   category_code  67413 non-null   str    
 5   brand          85607 non-null   str    
 6   price          100000 non-null  float64
 7   user_id        100000 non-null  int64  
 8   user_session   100000 non-null  str    
dtypes: float64(1), int64(3), str(5)
memory usage: 6.9 MB


# 📌 Cell 8. Check missing values

In [12]:
# Number of missing values per column, most-affected columns first.
(
    df.isna()
    .sum()
    .sort_values(ascending=False)
)

category_code    32587
brand            14393
event_time           0
product_id           0
event_type           0
category_id          0
price                0
user_id              0
user_session         0
dtype: int64

# 📌 Cell 9. Basic stats / outlier check

In [13]:
# Summary statistics for the numeric columns.
# NOTE(review): product_id, category_id and user_id look like identifiers, so
# their mean/std/quantiles are presumably not meaningful; price is the column
# to read for outliers here. Confirm against the dataset documentation.
df.describe()

Unnamed: 0,product_id,category_id,price,user_id
count,100000.0,100000.0,100000.0,100000.0
mean,10298188.21,2.0560937114866673e+18,286.83,531090895.65
std,11135458.34,1.4909815920515066e+16,360.3,16620297.33
min,1001588.0,2.053013552226108e+18,0.0,306441847.0
25%,1005124.0,2.0530135552879496e+18,61.01,515636023.0
50%,5100579.5,2.0530135556318828e+18,154.38,526964945.0
75%,15901690.5,2.0530135626112049e+18,357.11,547718922.0
max,53100015.0,2.175419595093968e+18,2574.07,555483540.0


In [17]:
# Summary of the string-typed columns: non-null count, number of distinct
# values, most frequent value (top) and how often it occurs (freq).
df.describe(include="string")

Unnamed: 0,event_time,event_type,category_code,brand,user_session
count,100000,100000,67413,85607,100000
unique,8753,3,123,1506,24382
top,2019-10-01 04:05:18 UTC,view,electronics.smartphone,samsung,dab9fcec-5ecd-428a-914d-d60f3f73d32c
freq,34,97130,26738,11683,113


# 📌 Cell 10. Check duplicates

In [15]:
# Count rows that exactly repeat an earlier row across all columns
# (the first occurrence of each row is not counted as a duplicate).
df.duplicated(keep="first").sum()

np.int64(17)

# 📌 Cell 11. Unique values by column (meaning check)

In [16]:
# Distinct non-null values per column, lowest cardinality first.
(
    df.nunique(dropna=True)
    .sort_values(ascending=True)
)

event_type           3
category_code      123
category_id        509
brand             1506
price             8014
event_time        8753
user_id          20384
product_id       20621
user_session     24382
dtype: int64

# 📌 Cell 12. Extract summary (Markdown)

### Extract EDA Summary

- Data: e-commerce event log; the sample is the first 100,000 rows of `2019-Oct.csv` (2019-10-01 00:00:00–04:28:27 UTC), not a random sample
- Columns: 9
- event_time: string → needs datetime conversion
- category_code, brand: missing values present (32,587 and 14,393 rows respectively; business-specific)
- price: includes zeros → needs handling policy
- event_type: view share is very high (97,130 of 100,000 rows)
- Duplicates: very few (17 rows)
- Next steps: type casting, missing-value policy, feature separation
