In [1]:
# Imports and useful definitions
import random
import numpy as np
import pandas as pd
from IPython.display import display
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Seed every random number generator the notebook relies on:
# - `random` drives the row sampling used when loading train.csv
# - NumPy's global generator is what scikit-learn's KMeans falls back to when it is
#   created without an explicit random_state, so it must be seeded as well for the
#   clustering results to be reproducible after a kernel restart.
random.seed(156)
np.random.seed(156)

# Render floats with three decimals in every DataFrame display
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [2]:
# Load train data
# TEMP : Only read a sample to avoid memory errors and speed up further processings
# Count the data rows (header excluded); the context manager guarantees the file
# handle is closed once the count is done.
with open("raw_data/train.csv") as train_file:
    nrows = sum(1 for _ in train_file) - 1
sample_size = int(nrows * 0.8)
# File line numbers to skip. Numbering starts at 1 so the header (line 0) is always kept.
skip = sorted(random.sample(range(1, nrows + 1), nrows - sample_size))
# NOTE: dtype keys must match the CSV header exactly, otherwise the downcast is not applied.
# The store id key used to be spelled 'Cliente-ID' (hyphen); it is assumed the header
# follows the same underscore convention as the sibling *_ID columns.
df_train = pd.read_csv("raw_data/train.csv",
                       dtype = {'Semana' : 'int8', 'Agencia_ID' :'int16', 'Canal_ID' : 'int8', 'Ruta_SAK' : 'int16',
                                'Cliente_ID' : 'int32', 'Producto_ID':'int32', 'Venta_uni_hoy':'int16', 'Venta_hoy': 'float32', 
                                'Dev_uni_proxima':'int32', 'Dev_proxima':'float32', 'Demanda_uni_equil':'int16'}, 
                       skiprows = skip)
df_train.columns = ["week", "depot_id", "channel_id", "route_id", "store_id", "product_id", "sales_nb", "sales_pesos", 
                    "returns_nb", "returns_pesos", "adj_demand"]

# Split data into train-validation sets
# Since it's a time series, validation set will be week 9 data
# .copy() detaches x_train from df_train so that columns can be added to it later
# without pandas raising SettingWithCopyWarning.
x_train = df_train[df_train.week != 9].copy()
y_train = x_train["adj_demand"]
# The sales / returns columns are dropped from the validation features.
# A non-inplace drop returns a fresh frame, which avoids mutating a slice of df_train.
x_valid = df_train[df_train.week == 9].drop(["sales_nb", "sales_pesos", "returns_nb", "returns_pesos"], axis = 1)
y_valid = x_valid["adj_demand"]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [3]:
# First preprocess Products data
df_products = pd.read_csv("raw_data/producto_tabla.csv")
df_products.columns = ["product_id", "product_name"]

# Augment the product DF using the data from product_name.
# The patterns are raw strings so that \D, \d and \s reach the regex engine untouched
# instead of being parsed as (invalid) string escape sequences.
product_names = df_products.product_name
df_products["short_name"] = product_names.str.extract(r'^(\D*)', expand = False)
df_products["pieces"] = product_names.str.extract(r'(\d+)p ', expand = False).astype('float')
# Weight is captured as (amount, unit) and converted to grams
weight_parts = product_names.str.extract(r'(\d+)(kg|Kg|g) ', expand = True)
df_products["weight"] = weight_parts[0].astype('float') * weight_parts[1].map({'Kg':1000, 'kg':1000, 'g':1})
df_products["volume"] = product_names.str.extract(r'(\d+)ml ', expand = False).astype('float')
df_products["inches"] = product_names.str.extract(r'(\d+)in ', expand = False).astype('float')
df_products["brand"] = product_names.str.extract(r'^.+\s(\D+) \d+$', expand = False)

# TODO : handle digits in product name "Tostado Int 0pct Grasa Azuc" and packaging "NM MTA"

# Remove NO IDENTIFICADO entries
df_products = df_products[df_products.short_name != "NO IDENTIFICADO "]

# Infer several features like average price from trains.csv
x_train["product_price"] = x_train.sales_pesos / x_train.sales_nb
sales_by_product = x_train.groupby("product_id")

# Average unit price per product
avg_prices = sales_by_product["product_price"].mean().to_frame()
avg_prices = avg_prices.rename(columns = {"product_price" : "avg_price"})
df_products = pd.merge(left = df_products, right = avg_prices, how = "left", left_on = "product_id", right_index = True)

# Total units sold / returned / demanded per product, computed in a single pass
# over the training rows instead of one groupby + merge per column
product_totals = sales_by_product[["sales_nb", "returns_nb", "adj_demand"]].sum()
product_totals = product_totals.rename(columns = {"sales_nb" : "total_sales_nb",
                                                  "returns_nb" : "total_returns_nb",
                                                  "adj_demand" : "total_adj_demand"})
df_products = pd.merge(left = df_products, right = product_totals, how = "left", left_on = "product_id", right_index = True)

# Remove products that never appear in the training rows (the left merge left their
# totals as NaN), since they provide no information
df_products = df_products.dropna(axis = 0, how = "any", subset = ["total_adj_demand"])

# Use product_id as the index
df_products = df_products.set_index("product_id")

display("Dimensions of Products table are now : " + str(df_products.shape))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


'Dimensions of Products table are now : (1741, 11)'

In [4]:
# Now preprocess Stores data
df_stores = pd.read_csv("raw_data/cliente_tabla.csv")
df_stores.columns = ["store_id", "store_name"]

# Handle duplicates (only spacing differences in names, just keep one occurrence of each)
df_stores = df_stores.drop_duplicates(subset = ["store_id"])

# Keep the "Sin Nombre" and "No Identificado" stores, those might be valid stores with no name entered in the DB
# We will see later if those stores are actually active in terms of sells and returns

# Infer several features like total sales in pesos from trains.csv
# All five totals are computed with one groupby pass over the training rows and attached
# with one merge, instead of one groupby + merge + rename per column.
store_totals = x_train.groupby("store_id")[["sales_nb", "sales_pesos", "returns_nb", "returns_pesos",
                                            "adj_demand"]].sum()
store_totals = store_totals.rename(columns = {"sales_nb" : "total_sales_nb",
                                              "sales_pesos" : "total_sales_pesos",
                                              "returns_nb" : "total_returns_nb",
                                              "returns_pesos" : "total_returns_pesos",
                                              "adj_demand" : "total_adj_demand"})
df_stores = pd.merge(left = df_stores, right = store_totals, how = "left", left_on = "store_id", right_index = True)

# Remove rows with NAs (stores absent from the training rows get NaN totals from the left merge)
df_stores = df_stores.dropna(axis = 0, how = "any")

# Use store_id as the index
df_stores = df_stores.set_index("store_id")

display("Dimensions of Stores table are now : " + str(df_stores.shape))

'Dimensions of Stores table are now : (867665, 6)'

In [5]:
# Now preprocess Depots data
df_depots = pd.read_csv("raw_data/town_state.csv")
df_depots.columns = ["depot_id", "city", "state"]

# Infer several features like total sales in pesos from trains.csv
# All five totals are computed with one groupby pass over the training rows and attached
# with one merge, instead of one groupby + merge + rename per column.
depot_totals = x_train.groupby("depot_id")[["sales_nb", "sales_pesos", "returns_nb", "returns_pesos",
                                            "adj_demand"]].sum()
depot_totals = depot_totals.rename(columns = {"sales_nb" : "total_sales_nb",
                                              "sales_pesos" : "total_sales_pesos",
                                              "returns_nb" : "total_returns_nb",
                                              "returns_pesos" : "total_returns_pesos",
                                              "adj_demand" : "total_adj_demand"})
df_depots = pd.merge(left = df_depots, right = depot_totals, how = "left", left_on = "depot_id", right_index = True)

# Remove rows with NAs (depots absent from the training rows get NaN totals from the left merge)
df_depots = df_depots.dropna(axis = 0, how = "any")

# Use depot_id as the index
df_depots = df_depots.set_index("depot_id")

display("Dimensions of Depots table are now : " + str(df_depots.shape))

'Dimensions of Depots table are now : (552, 7)'

In [6]:
# Cluster products based on price and weight
# Select the two features explicitly (weight first, then avg_price) rather than dropping
# every other column: the feature set no longer changes when df_products gains a column,
# e.g. the prod_cluster column added at the end of this very cell.
prod_features = df_products[["weight", "avg_price"]].dropna(axis = 0, how = "any")
# A fixed random_state makes the centroids, and therefore the labels, reproducible
prod_estimator = KMeans(n_clusters = 6, random_state = 156)
prod_estimator.fit(prod_features)
# One-column lookup: product_id -> cluster label (only products with complete features)
temp_products = pd.DataFrame({"prod_cluster" : prod_estimator.labels_}, index = prod_features.index)
# Drop a cluster column left by a previous run so that re-running the cell does not
# produce prod_cluster_x / prod_cluster_y duplicates
df_products = df_products.drop("prod_cluster", axis = 1, errors = "ignore")
# Left merge: products with a missing weight or price keep a NaN cluster
df_products = pd.merge(left = df_products, right = temp_products, how = "left", left_index = True, right_index = True)
display(df_products.head())

Unnamed: 0_level_0,product_name,short_name,pieces,weight,volume,inches,brand,avg_price,total_sales_nb,total_returns_nb,total_adj_demand,prod_cluster
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
41,Bimbollos Ext sAjonjoli 6p 480g BIM 41,Bimbollos Ext sAjonjoli,6.0,480.0,,,BIM,18.012,15564.0,183.0,15531.0,4.0
53,Burritos Sincro 170g CU LON 53,Burritos Sincro,,170.0,,,LON,14.3,23148.0,1.0,23148.0,0.0
72,Div Tira Mini Doradita 4p 45g TR 72,Div Tira Mini Doradita,4.0,45.0,,,TR,3.697,491818.0,8078.0,486969.0,0.0
73,Pan Multigrano Linaza 540g BIM 73,Pan Multigrano Linaza,,540.0,,,BIM,21.361,489100.0,12122.0,477841.0,4.0
100,Super Pan Bco Ajonjoli 680g SP WON 100,Super Pan Bco Ajonjoli,,680.0,,,WON,20.181,812.0,690.0,781.0,4.0


In [7]:
# Cluster stores based on sales, returns and demand
# Select the features explicitly rather than dropping store_name: the feature set no longer
# changes when df_stores gains a column, e.g. the store_cluster column added below.
store_features = df_stores[["total_sales_nb", "total_sales_pesos", "total_returns_nb", "total_returns_pesos",
                            "total_adj_demand"]].dropna(axis = 0, how = "any")
# A fixed random_state makes the centroids, and therefore the labels, reproducible
store_estimator = KMeans(n_clusters = 6, random_state = 156)
store_estimator.fit(store_features)
# One-column lookup: store_id -> cluster label
temp_stores = pd.DataFrame({"store_cluster" : store_estimator.labels_}, index = store_features.index)
# Drop a cluster column left by a previous run so that re-running the cell does not
# produce store_cluster_x / store_cluster_y duplicates
df_stores = df_stores.drop("store_cluster", axis = 1, errors = "ignore")
df_stores = pd.merge(left = df_stores, right = temp_stores, how = "left", left_index = True, right_index = True)
display(df_stores.head())

Unnamed: 0_level_0,store_name,total_sales_nb,total_sales_pesos,total_returns_nb,total_returns_pesos,total_adj_demand,store_cluster
store_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
26,BODEGA COMERCIAL MEXICANA TOLUCA,4385.0,82960.203,324.0,4498.8,4091.0,2
60,SAMS CLUB TOLUCA,30702.0,1413790.125,0.0,0.0,30702.0,5
65,WAL MART METEPEC,50345.0,805253.875,0.0,0.0,50345.0,5
101,WAL MART TOLUCA,746.0,16916.58,0.0,0.0,746.0,0
105,SUPER KOMPRAS SAN BUENAVENTURA,12788.0,223084.875,0.0,0.0,12788.0,2


In [8]:
# Cluster depots based on sales, returns and demand
# Select the features explicitly rather than dropping city / state: the feature set no longer
# changes when df_depots gains a column, e.g. the depot_cluster column added below.
depot_features = df_depots[["total_sales_nb", "total_sales_pesos", "total_returns_nb", "total_returns_pesos",
                            "total_adj_demand"]].dropna(axis = 0, how = "any")
# A fixed random_state makes the centroids, and therefore the labels, reproducible
depot_estimator = KMeans(n_clusters = 6, random_state = 156)
depot_estimator.fit(depot_features)
# One-column lookup: depot_id -> cluster label
temp_depots = pd.DataFrame({"depot_cluster" : depot_estimator.labels_}, index = depot_features.index)
# Drop a cluster column left by a previous run so that re-running the cell does not
# produce depot_cluster_x / depot_cluster_y duplicates
df_depots = df_depots.drop("depot_cluster", axis = 1, errors = "ignore")
df_depots = pd.merge(left = df_depots, right = temp_depots, how = "left", left_index = True, right_index = True)
display(df_depots.head())

Unnamed: 0_level_0,city,state,total_sales_nb,total_sales_pesos,total_returns_nb,total_returns_pesos,total_adj_demand,depot_cluster
depot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1110,2008 AG. LAGO FILT,"MÉXICO, D.F.",606047.0,6312734.0,25818.0,146704.25,603830.0,4
1111,2002 AG. AZCAPOTZALCO,"MÉXICO, D.F.",1871487.0,16545370.0,17482.0,185998.797,1858370.0,3
1112,2004 AG. CUAUTITLAN,ESTADO DE MÉXICO,1344550.0,11408405.0,16320.0,158888.047,1332715.0,1
1113,2008 AG. LAGO FILT,"MÉXICO, D.F.",986674.0,8267645.0,8244.0,82740.461,980734.0,4
1114,2029 AG.IZTAPALAPA 2,"MÉXICO, D.F.",2417170.0,43159804.0,102162.0,1681747.0,2325064.0,2


In [9]:
# Attach the product / store / depot cluster labels to the training rows
# Each lookup is a one-column frame indexed by the entity id
temp_products = df_products[["prod_cluster"]]
temp_stores = df_stores[["store_cluster"]]
temp_depots = df_depots[["depot_cluster"]]

cluster_lookups = (("product_id", temp_products),
                   ("store_id", temp_stores),
                   ("depot_id", temp_depots))
for key_column, cluster_lookup in cluster_lookups:
    # Left join: every training row is kept, rows whose id has no cluster get NaN
    x_train = x_train.merge(cluster_lookup, how = "left", left_on = key_column, right_index = True)

display(x_train.head())

Unnamed: 0,week,depot_id,channel_id,route_id,store_id,product_id,sales_nb,sales_pesos,returns_nb,returns_pesos,adj_demand,product_price,prod_cluster,store_cluster,depot_cluster
0,3,1110,7,3301,15766,1212,3,25.14,0,0.0,3,8.38,0.0,0,4
1,3,1110,7,3301,15766,1216,4,33.52,0,0.0,4,8.38,0.0,0,4
2,3,1110,7,3301,15766,1238,4,39.32,0,0.0,4,9.83,0.0,0,4
3,3,1110,7,3301,15766,1240,4,33.52,0,0.0,4,8.38,0.0,0,4
4,3,1110,7,3301,15766,1242,3,22.92,0,0.0,3,7.64,0.0,0,4


In [None]:
# Assign a cluster to every validation row by running the fitted estimators on the
# product / store / depot feature tables, then joining the labels on the matching id

# Products: only rows with both weight and average price can be labelled
product_features = df_products[["weight", "avg_price"]].dropna(axis = 0, how = "any")
temp_products = pd.DataFrame({"prod_cluster" : prod_estimator.predict(product_features)},
                             index = product_features.index)
x_valid = x_valid.merge(temp_products, how = "left", left_on = "product_id", right_index = True)

# Stores and depots are described by the same five totals
totals_columns = ["total_sales_nb", "total_sales_pesos", "total_returns_nb", "total_returns_pesos",
                  "total_adj_demand"]

# Stores
store_features = df_stores[totals_columns].dropna(axis = 0, how = "any")
temp_stores = pd.DataFrame({"store_cluster" : store_estimator.predict(store_features)},
                           index = store_features.index)
x_valid = x_valid.merge(temp_stores, how = "left", left_on = "store_id", right_index = True)

# Depots
depot_features = df_depots[totals_columns].dropna(axis = 0, how = "any")
temp_depots = pd.DataFrame({"depot_cluster" : depot_estimator.predict(depot_features)},
                           index = depot_features.index)
x_valid = x_valid.merge(temp_depots, how = "left", left_on = "depot_id", right_index = True)

display(x_valid.head())

Unnamed: 0,week,depot_id,channel_id,route_id,store_id,product_id,adj_demand,prod_cluster,store_cluster,depot_cluster
51018561,9,1110,7,3301,15766,1212,1,0.0,0.0,4
51018562,9,1110,7,3301,15766,1240,2,0.0,0.0,4
51018563,9,1110,7,3301,15766,1242,1,0.0,0.0,4
51018564,9,1110,7,3301,15766,1250,10,0.0,0.0,4
51018565,9,1110,7,3301,15766,1309,3,0.0,0.0,4


In [None]:
# Save augmented data sets on disk
# The three lookup tables carry their identifier (product_id / store_id / depot_id) in the
# index, so the index has to be written: with index = False the saved files would contain
# the features and cluster labels but not the id they belong to.
df_products.to_csv("temp_data/df_products80_6.csv", index = True, encoding = "utf-8")
df_stores.to_csv("temp_data/df_stores80_6.csv", index = True, encoding = "utf-8")
df_depots.to_csv("temp_data/df_depots80_6.csv", index = True, encoding = "utf-8")
# Train / validation rows hold all their ids as regular columns; their row index is not saved
x_train.to_csv("temp_data/x_train80_6.csv", index = False, encoding = "utf-8")
x_valid.to_csv("temp_data/x_valid80_6.csv", index = False, encoding = "utf-8")