In [None]:
# Imports and useful definitions
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

%matplotlib inline
random.seed(156)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [None]:
# Load the raw product table (path is relative to the notebook's working directory)
products_csv_path = "raw_data/producto_tabla.csv"
df_products = pd.read_csv(products_csv_path)

In [None]:
# First look at the Products table: its dimensions, then the leading rows
products_shape = str(df_products.shape)
display("Dimensions of Products table : " + products_shape)
display("DF head : ")
products_preview = df_products.head()
display(products_preview)

In [None]:
# Give the two columns English names (assigned positionally) for easier later use
english_column_names = ["product_id", "product_name"]
df_products.columns = english_column_names
display(df_products.head())

In [None]:
# Augment the product DF with features parsed out of product_name.
# Every pattern is a raw string so that regex escapes (\d, \D, \s) reach the regex
# engine untouched instead of being parsed as invalid string escape sequences.

# Everything before the first digit; trailing whitespace is kept as captured
df_products["short_name"] = df_products.product_name.str.extract(r'^(\D*)', expand = False)
# Number directly followed by "p " (piece count)
df_products["pieces"] =  df_products.product_name.str.extract(r'(\d+)p ', expand = False).astype('float')
# Number followed by a kg/Kg/g unit; column 0 is the amount, column 1 the unit
weight = df_products.product_name.str.extract(r'(\d+)(kg|Kg|g) ', expand = True)
# Normalise to grams: kilogram amounts are multiplied by 1000
df_products["weight"] = weight[0].astype('float') * weight[1].map({'Kg':1000, 'kg':1000, 'g':1})
# Number directly followed by "ml "
df_products["volume"] =  df_products.product_name.str.extract(r'(\d+)ml ', expand = False).astype('float')
# Number directly followed by "in "
df_products["inches"] =  df_products.product_name.str.extract(r'(\d+)in ', expand = False).astype('float')
# Last digit-free token group sitting just before the trailing number of the name
df_products["brand"] = df_products.product_name.str.extract(r'^.+\s(\D+) \d+$', expand = False)
display(df_products.head())

# TODO : handle digits in product name "Tostado Int 0pct Grasa Azuc" and packaging "NM MTA"

In [None]:
# Filter out the NO IDENTIFICADO entries, showing the row count before and after
display("Nb of products : " + str(len(df_products)))
is_identified = df_products["short_name"] != "NO IDENTIFICADO "
df_products = df_products[is_identified]
display("Nb of products : " + str(len(df_products)))
display(df_products.head(n = 10))

In [None]:
# Frequency of each short name, counting missing values as their own bucket
df_products["short_name"].value_counts(dropna=False)

In [None]:
# Infer per-product features (average unit price and summed totals) from train.csv
# NOTE(review): the client key was spelled 'Cliente-ID', unlike every other
# underscore-separated column name; 'Cliente_ID' is assumed to be the real header —
# confirm against the CSV.
df_train = pd.read_csv("raw_data/train.csv",
                       dtype = {'Semana' : 'int8', 'Agencia_ID' :'int32', 'Canal_ID' : 'int32', 'Ruta_SAK' : 'int32',
                                'Cliente_ID' : 'int32', 'Producto_ID':'int32', 'Venta_hoy':'float32', 'Venta_uni_hoy': 'int32',
                                'Dev_uni_proxima':'int32', 'Dev_proxima':'float32', 'Demanda_uni_equil':'int32'})
# NOTE(review): columns are renamed positionally, so this relies on the CSV column
# order matching the list below — confirm against the file header.
df_train.columns = ["week", "depot_id", "channel_id", "route_id", "client_id", "product_id", "sales_nb", "sales_pesos",
                    "returns_nb", "returns_pesos", "adj_demand"]

# Unit price per row. Rows with sales_nb == 0 divide by zero: 0/0 gives NaN (skipped
# by mean) but x/0 gives +/-inf, which would turn a product's average into inf.
# Treat those as missing as well.
unit_price = df_train.sales_pesos / df_train.sales_nb
df_train["product_price"] = unit_price.replace([np.inf, -np.inf], np.nan)

# Per-product aggregates, all indexed by product_id
by_product = df_train.groupby("product_id")
avg_prices = by_product["product_price"].mean().to_frame()
total_sales_nb = by_product["sales_nb"].sum().to_frame()
total_return_nbs = by_product["returns_nb"].sum().to_frame()
total_adj_demand = by_product["adj_demand"].sum().to_frame()

# Assemble the aggregates side by side under their final column names
product_stats = pd.concat([avg_prices.rename(columns = {"product_price" : "avg_price"}),
                           total_sales_nb.rename(columns = {"sales_nb" : "total_sales_nb"}),
                           total_return_nbs.rename(columns = {"returns_nb" : "total_returns_nb"}),
                           total_adj_demand.rename(columns = {"adj_demand" : "total_adj_demand"})],
                          axis = 1)

# Drop any copies left by a previous run of this cell so that re-running it does
# not create suffixed duplicates (avg_price_x / avg_price_y, ...)
df_products = df_products.drop(list(product_stats.columns), axis = 1, errors = "ignore")
# Left merge: products that never appear in train.csv keep NaN in the new columns
df_products = pd.merge(left = df_products, right = product_stats, how = "left", left_on = "product_id", right_index = True)

In [None]:
# Rank products by total adjusted demand once, then show both ends of the ranking
products_by_demand = df_products.sort_values("total_adj_demand", ascending=False)
display(products_by_demand.head(10))
display(products_by_demand.tail(10))
# Summary statistics of the numeric columns
display(df_products.describe())

In [None]:
# Discard products whose total_adj_demand is missing (never sold or returned once,
# thus providing no information)
required_columns = ["total_adj_demand"]
df_products.dropna(subset=required_columns, how="any", axis=0, inplace=True)
display("Dimensions of Products table are now : " + str(df_products.shape))
lowest_demand = df_products.sort_values("total_adj_demand", ascending=False).tail(10)
display(lowest_demand)

In [None]:
# Histogram of total_adj_demand, restricted to the 0 - 50000 range
target = df_products["total_adj_demand"].tolist()
fig, ax = plt.subplots()
ax.hist(target, bins=100, range=(0, 50000), color="red")
ax.set_title("Distribution of target variable under 50 000")
ax.set_xlabel("Adjusted demand")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Promote product_id from a regular column to the DataFrame index
df_products = df_products.set_index("product_id")
display(df_products.head())

In [None]:
# Keep only the columns relevant for clustering by dropping everything else
non_clustering_columns = [
    "short_name", "product_name", "brand", "pieces", "volume", "inches",
    "total_adj_demand", "total_returns_nb", "total_sales_nb",
]
temp_products = df_products.drop(non_clustering_columns, axis=1)
display(temp_products.describe())

# Discard every row that still has a missing value
temp_products.dropna(how="any", axis=0, inplace=True)
display(temp_products.describe())

In [None]:
# Separate products into 12 groups using K-Means on the columns kept in temp_products.
# random_state fixes the centroid initialisation so labels are identical across runs;
# n_init is set explicitly because its default differs between scikit-learn versions.
# NOTE(review): the features are not standardised before clustering; K-Means relies on
# Euclidean distance, so the column with the larger numeric range will dominate —
# confirm this is intended.
estimator = KMeans(n_clusters = 12, n_init = 10, random_state = 156)
estimator.fit(temp_products)

display(estimator.cluster_centers_)

In [None]:
# Visual sanity check of the clustering: project the clustered frame onto its first
# two principal components and colour each point by its K-Means label
labels = estimator.labels_
pca_2 = PCA(n_components=2)
plot_columns = pca_2.fit_transform(temp_products)
first_component = plot_columns[:, 0]
second_component = plot_columns[:, 1]
plt.scatter(first_component, second_component, c=labels)
plt.show()

In [None]:
# Tag every clustered product with its label, then summarise each cluster
temp_products["cluster"] = labels
per_cluster = temp_products.groupby("cluster")
display(per_cluster.describe())

In [None]:
temp_products = temp_products.drop(["avg_price", "weight"], axis = 1)
df_products = pd.merge(left = df_products, right = temp_products, how = "left", left_index = True, right_index = True)

In [None]:
# Last rows of the product table when ordered by descending cluster label
products_by_cluster = df_products.sort_values("cluster", ascending=False)
display(products_by_cluster.tail())