In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## Loading and Merging the Data

In [2]:
# Read the "prior" order/product line items from disk; the preview below shows
# the columns order_id, product_id, add_to_cart_order and reordered.
order_products_prior = pd.read_csv('order_products__prior.csv')
print("Rows and columns:", order_products_prior.shape)
# Preview the first five rows (bare last expression -> rich display).
order_products_prior.head()

Rows and columns: (32434489, 4)


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [3]:
# Read the "train" order/product line items; the preview shows the same four
# columns as the prior file (order_id, product_id, add_to_cart_order, reordered).
order_products_train = pd.read_csv('order_products__train.csv')
print("Rows and columns:", order_products_train.shape)
# Preview the first five rows.
order_products_train.head()

Rows and columns: (1384617, 4)


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,1,49302,1,1
1,1,11109,2,1
2,1,10246,3,0
3,1,49683,4,0
4,1,43633,5,1


Concatenate the prior and train order-product datasets into a single table.

In [4]:
# Stack the prior and train line items into one table.
# ignore_index=True gives the result a fresh 0..n-1 index; without it the rows
# coming from the train frame would reuse index labels that already exist in
# the prior frame, leaving duplicate labels that make label-based lookups
# (.loc, index alignment) ambiguous.
order_product = pd.concat(
    [order_products_prior, order_products_train],
    ignore_index=True,
)
print("Rows and columns:", order_product.shape)
# Preview the first five rows of the combined table.
order_product.head()

Rows and columns: (33819106, 4)


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


Merge all the other datasets (products, aisles, departments, and the user of each order) into the main one.

In [5]:
# Read the product catalogue; the preview shows product_id, product_name,
# aisle_id and department_id.
products = pd.read_csv('products.csv')
print("Rows and columns:", products.shape)
# Preview the first five rows.
products.head()

Rows and columns: (49688, 4)


Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [6]:
# Read the aisle lookup table (aisle_id -> aisle name, per the preview below).
aisles = pd.read_csv('aisles.csv')
print("Rows and columns:", aisles.shape)
# Preview the first five rows.
aisles.head()

Rows and columns: (134, 2)


Unnamed: 0,aisle_id,aisle
0,1,prepared soups salads
1,2,specialty cheeses
2,3,energy granola bars
3,4,instant foods
4,5,marinades meat preparation


In [7]:
# Read the department lookup table (department_id -> department name, per the
# preview below).
departments = pd.read_csv('departments.csv')
print("Rows and columns:", departments.shape)
# Preview the first five rows.
departments.head()

Rows and columns: (21, 2)


Unnamed: 0,department_id,department
0,1,frozen
1,2,other
2,3,bakery
3,4,produce
4,5,alcohol


In [8]:
# Left-join the aisle lookup onto the product catalogue via aisle_id, so every
# product row is kept and gains an `aisle` column.
products_all = products.merge(aisles, on='aisle_id', how='left')

In [9]:
# Left-join the department lookup via department_id, adding a `department`
# column next to the aisle column.
# Note: this rebinds products_all, so the cell expects the aisle merge to have
# run immediately before it.
products_all = products_all.merge(departments, on='department_id', how='left')
print("Rows and columns:", products_all.shape)
# Preview the first five enriched product rows.
products_all.head()

Rows and columns: (49688, 6)


Unnamed: 0,product_id,product_name,aisle_id,department_id,aisle,department
0,1,Chocolate Sandwich Cookies,61,19,cookies cakes,snacks
1,2,All-Seasons Salt,104,13,spices seasonings,pantry
2,3,Robust Golden Unsweetened Oolong Tea,94,7,tea,beverages
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,frozen meals,frozen
4,5,Green Chile Anytime Sauce,5,13,marinades meat preparation,pantry


In [10]:
# Attach product name, aisle and department to every order line by left-joining
# the enriched product table on product_id.
order_products_all = order_product.merge(products_all, on='product_id', how='left')
# The printed row count should equal order_product's; a larger number would
# mean product_id is duplicated in products_all.
print("Rows and columns:", order_products_all.shape)
order_products_all.head()

Rows and columns: (33819106, 9)


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,aisle,department
0,2,33120,1,1,Organic Egg Whites,86,16,eggs,dairy eggs
1,2,28985,2,1,Michigan Organic Kale,83,4,fresh vegetables,produce
2,2,9327,3,0,Garlic Powder,104,13,spices seasonings,pantry
3,2,45918,4,1,Coconut Butter,19,13,oils vinegars,pantry
4,2,30035,5,0,Natural Sweetener,17,13,baking ingredients,pantry


In [11]:
# Read the orders table; the preview shows order_id, user_id, eval_set,
# order_number, order_dow, order_hour_of_day and days_since_prior_order
# (which is empty/NaN on the first displayed row).
orders = pd.read_csv('orders.csv')
print("Rows and columns:", orders.shape)
# Preview the first five rows.
orders.head()

Rows and columns: (3421083, 7)


Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [13]:
# Keep only the order -> user mapping; these two columns are all that is
# needed to tag each order line with its user.
orders_users = orders[['order_id', 'user_id']]
print("Rows and columns:", orders_users.shape)
# Preview the first five rows of the mapping.
orders_users.head()

Rows and columns: (3421083, 2)


Unnamed: 0,order_id,user_id
0,2539329,1
1,2398795,1
2,473747,1
3,2254736,1
4,431534,1


In [14]:
# Tag every order line with the user who placed the order (left join on
# order_id).
# Note: this rebinds order_products_all; running the cell a second time would
# merge user_id in again, so execute it exactly once after the product merge.
order_products_all = order_products_all.merge(orders_users, on='order_id', how='left')
print("Rows and columns:", order_products_all.shape)
# Preview the first five rows, now including user_id.
order_products_all.head()

Rows and columns: (33819106, 10)


Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,aisle,department,user_id
0,2,33120,1,1,Organic Egg Whites,86,16,eggs,dairy eggs,202279
1,2,28985,2,1,Michigan Organic Kale,83,4,fresh vegetables,produce,202279
2,2,9327,3,0,Garlic Powder,104,13,spices seasonings,pantry,202279
3,2,45918,4,1,Coconut Butter,19,13,oils vinegars,pantry,202279
4,2,30035,5,0,Natural Sweetener,17,13,baking ingredients,pantry,202279


In [25]:
# Persist the fully merged order-line table as a comma-separated file.
# The DataFrame index is written too (pandas default), so it appears as an
# unnamed first column in the output file.
order_products_all.to_csv(path_or_buf='order_products_all.csv', sep=',')

## Creating the features for each user

### A) Order Patterns

In [15]:
# Group the orders by user; as_index=False keeps user_id as a regular column
# in aggregated results.
group = orders.groupby(['user_id'], as_index=False)
# Smallest days_since_prior_order per user.
days_gap = group['days_since_prior_order']
min_days = pd.DataFrame(days_gap.min())
# Preview the first five users.
min_days.head()

Unnamed: 0,user_id,days_since_prior_order
0,1,0.0
1,2,3.0
2,3,7.0
3,4,0.0
4,5,6.0


In [16]:
# Scratch check: a tiny array with one NaN, used to probe how different
# aggregation functions treat missing values.
test = np.array([1, np.nan, 3, 4])
# Python's built-in min(); the recorded result for this array is 1.0.
# NOTE(review): built-in min/max rely on element comparisons, and comparisons
# against NaN are always False, so the result can depend on where the NaN
# sits in the array. np.nanmin(test) ignores NaN regardless of position.
min(test)

1.0

In [17]:
# Python's built-in max() on the NaN-containing array; recorded result is 4.0.
# NOTE(review): the result of built-in max can depend on the NaN's position;
# np.nanmax(test) ignores NaN regardless of order.
max(test)

4.0

In [18]:
# Python's built-in sum() propagates the NaN (recorded result: nan);
# np.nansum(test) would add up only the non-NaN entries.
sum(test)

nan

In [20]:
# `med` is neither a Python builtin nor a name imported in this notebook, so
# the original call raised NameError. np.nanmedian computes the median while
# skipping NaN entries, which is the behaviour needed for columns that contain
# missing values.
np.nanmedian(test)

3.0

In [22]:
# `avg` is neither a Python builtin nor a name imported in this notebook, so
# the original call raised NameError. np.nanmean averages only the non-NaN
# entries.
np.nanmean(test)

2.6666666666666665

In [23]:
# np.mean propagates NaN (recorded result: nan); np.nanmean(test) is the
# NaN-skipping counterpart.
np.mean(test)

nan

In [24]:
# np.median also returns nan for an array containing NaN.
# NOTE(review): the stray `r = func(a, **kwargs)` line in the recorded output
# looks like the tail of a NumPy warning — confirm on re-run.
# np.nanmedian(test) is the NaN-skipping counterpart.
np.median(test)

  r = func(a, **kwargs)


nan