In [2]:
pip install opendatasets



In [3]:
import opendatasets as od

In [4]:
# Fetch the Instacart basket-analysis dataset from Kaggle into the working directory.
DATASET_URL = "https://www.kaggle.com/datasets/yasserh/instacart-online-grocery-basket-analysis-dataset"
od.download(DATASET_URL)


Skipping, found downloaded files in "./instacart-online-grocery-basket-analysis-dataset" (use force=True to force download)


In [5]:
# Mount Google Drive at /content/drive (only works inside Google Colab).
# NOTE(review): no cell visible in this part of the notebook reads from
# /content/drive -- confirm the mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### libraries

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### datasets loading

In [7]:
# Folder created by od.download() in the working directory. Keeping the location
# in one place (and relative, like the download itself) avoids repeating a
# Colab-only absolute path in every read_csv call.
DATA_DIR = "./instacart-online-grocery-basket-analysis-dataset"

orders = pd.read_csv(f"{DATA_DIR}/orders.csv")
products = pd.read_csv(f"{DATA_DIR}/products.csv")
order_products__prior = pd.read_csv(f"{DATA_DIR}/order_products__prior.csv")
order_products__train = pd.read_csv(f"{DATA_DIR}/order_products__train.csv")

### datasets understanding
- [ ] orders
- [ ] products
- [ ] order_products__prior

In [8]:
# Overview of the orders table: row count, column names, dtypes and memory use.
orders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421083 entries, 0 to 3421082
Data columns (total 7 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                int64  
 1   user_id                 int64  
 2   eval_set                object 
 3   order_number            int64  
 4   order_dow               int64  
 5   order_hour_of_day       int64  
 6   days_since_prior_order  float64
dtypes: float64(1), int64(5), object(1)
memory usage: 182.7+ MB


 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                int64  
 1   user_id                 int64  
 2   eval_set                object
 3   order_number            int64  
 4   order_dow               int64  
 5   order_hour_of_day       int64  
 6   days_since_prior_order  float64
dtypes: float64(1), int64(5), object(1)

In [9]:
# Element-wise missing-value mask for every column of `orders`.
# NOTE(review): the display is truncated for a frame this size, so it cannot show
# where the NaNs are; the per-column count from `orders.isnull().sum()` is the useful check.
orders.isna()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,False,False,False,False,False,False,True
1,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...
3421078,False,False,False,False,False,False,False
3421079,False,False,False,False,False,False,False
3421080,False,False,False,False,False,False,False
3421081,False,False,False,False,False,False,False


In [10]:
# Non-null count for the single float column (the frame-level info() output
# above does not list per-column counts).
orders["days_since_prior_order"].info()

<class 'pandas.core.series.Series'>
RangeIndex: 3421083 entries, 0 to 3421082
Series name: days_since_prior_order
Non-Null Count    Dtype  
--------------    -----  
3214874 non-null  float64
dtypes: float64(1)
memory usage: 26.1 MB


In [11]:
# Number of missing values in each column of `orders`.
orders.isnull().sum()

Unnamed: 0,0
order_id,0
user_id,0
eval_set,0
order_number,0
order_dow,0
order_hour_of_day,0
days_since_prior_order,206209


In [12]:
# Row-wise duplicate flag (True where a row repeats an earlier one).
# NOTE(review): the truncated display cannot confirm that no duplicates exist;
# `orders.duplicated().sum()` would reduce this to a single count.
orders.duplicated()

Unnamed: 0,0
0,False
1,False
2,False
3,False
4,False
...,...
3421078,False
3421079,False
3421080,False
3421081,False


In [13]:
# Remove the rows whose days_since_prior_order is NaN -- the only column with
# missing values according to the null count above (206209 rows). Reassigning
# instead of inplace=True and naming the column makes the intent explicit.
# NOTE(review): these look like each user's first order (the first surviving row
# for user 1 is order_number 2). Any later inner join on `orders` will also lose
# the order lines of the dropped orders -- confirm that is intended.
orders = orders.dropna(subset=["days_since_prior_order"])

In [14]:
# Missing-value mask of `orders` after the dropna step.
# NOTE(review): the truncated mask is hard to read; `orders.isnull().sum()`
# would confirm that no NaNs remain.
orders.isnull()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
1,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...
3421078,False,False,False,False,False,False,False
3421079,False,False,False,False,False,False,False
3421080,False,False,False,False,False,False,False
3421081,False,False,False,False,False,False,False


## orders dataset is clean (rows with a missing `days_since_prior_order` were dropped)

In [15]:
# Column names, non-null counts and dtypes of the products table.
products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49688 entries, 0 to 49687
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   product_id     49688 non-null  int64 
 1   product_name   49688 non-null  object
 2   aisle_id       49688 non-null  int64 
 3   department_id  49688 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 1.5+ MB


In [16]:
# Row-wise duplicate flag for `products`.
# NOTE(review): only the first and last rows are displayed; `products.duplicated().sum()`
# would give a definitive count.
products.duplicated()

Unnamed: 0,0
0,False
1,False
2,False
3,False
4,False
...,...
49683,False
49684,False
49685,False
49686,False


In [17]:
# Element-wise missing-value mask for `products` (info() above already reports
# 49688 non-null values in every column).
products.isnull()

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
...,...,...,...,...
49683,False,False,False,False
49684,False,False,False,False
49685,False,False,False,False
49686,False,False,False,False


In [18]:
# Drop rows with any missing value. info() above reports no nulls in any column,
# so this is a safeguard rather than an actual change. Reassigning avoids
# mutating the frame in place.
products = products.dropna()

In [19]:
# Preview the first ten products.
products.head(10)


Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13
5,6,Dry Nose Oil,11,11
6,7,Pure Coconut Water With Orange,98,7
7,8,Cut Russet Potatoes Steam N' Mash,116,1
8,9,Light Strawberry Blueberry Yogurt,120,16
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7


In [20]:
# First three products.
# NOTE(review): these rows are already shown by the head(10) preview.
products.head(3)


Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7


In [21]:
# First three remaining orders; the index starts at 1 because row 0 was removed
# by the dropna step.
orders.head(3)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0


In [22]:

# Missing values per column. Only a cell's last expression is rendered
# automatically, so without display() this result would be silently discarded.
display(order_products__prior.isnull().sum())

# Row-wise duplicate flag for the prior order lines.
# NOTE(review): `.duplicated().sum()` would turn this truncated mask into a count.
order_products__prior.duplicated()

Unnamed: 0,0
0,False
1,False
2,False
3,False
4,False
...,...
32434484,False
32434485,False
32434486,False
32434487,False


In [34]:
# Attach order-level attributes to every prior order line. The key and join type
# are spelled out (they match merge()'s defaults here: the only shared column is
# order_id) so an extra shared column name cannot silently change the join.
# NOTE(review): this is an inner join, so order lines whose order_id is no longer
# in `orders` (rows removed by the earlier dropna) are discarded -- confirm intended.
df_order_products__prior = order_products__prior.merge(orders, on="order_id", how="inner")

# Add product name, aisle and department for each line (shared column: product_id).
df_final = df_order_products__prior.merge(products, on="product_id", how="inner")


In [35]:
# Schema, row count and memory footprint of the merged order-line table.
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30356421 entries, 0 to 30356420
Data columns (total 13 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                int64  
 1   product_id              int64  
 2   add_to_cart_order       int64  
 3   reordered               int64  
 4   user_id                 int64  
 5   eval_set                object 
 6   order_number            int64  
 7   order_dow               int64  
 8   order_hour_of_day       int64  
 9   days_since_prior_order  float64
 10  product_name            object 
 11  aisle_id                int64  
 12  department_id           int64  
dtypes: float64(1), int64(10), object(2)
memory usage: 2.9+ GB
