In [None]:
# Imports and useful definitions
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

%matplotlib inline
# Single seed shared by every random number generator used in this notebook
SEED = 156
random.seed(SEED)
# scikit-learn estimators created without random_state draw from NumPy's global
# generator, which random.seed() does not touch, so seed it as well to make
# Restart & Run All reproducible
np.random.seed(SEED)
# Show floats with three decimals in DataFrame output
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [None]:
# Load the raw client table (called the Stores table in the rest of this notebook)
df_stores = pd.read_csv(filepath_or_buffer = "raw_data/cliente_tabla.csv")

In [None]:
# First inspection of the Stores table: its (rows, columns) shape, then its first rows
display("Dimensions of Stores table : {}".format(df_stores.shape))
display("DF head : ")
display(df_stores.head())

In [None]:
# Give the columns English names so the following cells are easier to read.
# The assignment is positional, so the table must have exactly two columns here.
df_stores.columns = pd.Index(["store_id", "store_name"])
display(df_stores.head())

In [None]:
# Check for missing values: count the rows holding at least one null
# (counting null cells instead would overstate the figure when a row has several)
rows_with_na = int(df_stores.isnull().any(axis = 1).sum())
display("There are " + str(rows_with_na) + " rows with missing values")

# Handle duplicates: flag every row whose store_id occurs more than once
boolDupli = df_stores["store_id"].duplicated(keep = False)
display(df_stores[boolDupli].head(6))
# Count the distinct duplicated IDs. Halving the number of flagged rows is only
# right when each ID appears exactly twice, and "/" would also print a float.
nb_dupli_ids = df_stores.loc[boolDupli, "store_id"].nunique()
display("There are " + str(nb_dupli_ids) + " duplicate Store IDs")

# Only spacing differences in name, just keep one occurrence of each
# (drop_duplicates keeps the first row found for every store_id)
df_stores = df_stores.drop_duplicates(subset = ["store_id"])
display("New dimensions of Stores table : " + str(df_stores.shape))

# Keep the "Sin Nombre" and "No Identificado" stores, those might be valid stores with no name entered in the DB
# We will see later if those stores are actually active in terms of sales and returns

In [None]:
# Frequency of each store name; missing names are counted as their own entry
df_stores["store_name"].value_counts(dropna = False)

In [None]:
# We will infer several per-store features (total sales, returns and adjusted demand) from train.csv
df_train = pd.read_csv("raw_data/train.csv",
                       dtype = {'Semana' : 'int8', 'Agencia_ID' :'int32', 'Canal_ID' : 'int32', 'Ruta_SAK' : 'int32',
                                # This key used to be spelled with a hyphen, unlike every other column name
                                # here; read_csv ignores dtype keys that match no column, so the ID column
                                # was left at its inferred dtype.
                                # NOTE(review): confirm the header spelling and that all IDs fit in int32
                                'Cliente_ID' : 'int32', 'Producto_ID':'int32', 'Venta_hoy':'float32', 'Venta_uni_hoy': 'int32',
                                'Dev_uni_proxima':'int32', 'Dev_proxima':'float32', 'Demanda_uni_equil':'int32'})
# Positional rename to English names.
# NOTE(review): this relies on the CSV column order (units before pesos for both
# sales and returns) - confirm against the file header
df_train.columns = ["week", "depot_id", "channel_id", "route_id", "store_id", "product_id", "sales_nb", "sales_pesos",
                    "returns_nb", "returns_pesos", "adj_demand"]

# Sum the five measures per store in a single groupby pass over the training data,
# prefix the resulting columns with "total_", and attach them to the Stores table
# with one left merge so that stores absent from train.csv are kept (with NaN totals)
measure_cols = ["sales_nb", "sales_pesos", "returns_nb", "returns_pesos", "adj_demand"]
store_totals = df_train.groupby("store_id")[measure_cols].sum().add_prefix("total_")
df_stores = pd.merge(left = df_stores, right = store_totals, how = "left", left_on = "store_id", right_index = True)

In [None]:
# Rank the stores by total sales value once, then show both ends of the ranking
stores_by_sales = df_stores.sort_values(by = "total_sales_pesos", ascending = False)
display(stores_by_sales.head(10))
display(stores_by_sales.tail(10))
# Summary statistics of the table
display(df_stores.describe())

In [None]:
# Discard, in place, every row that has at least one missing value
# (axis = 0 and how = "any" are the dropna defaults)
df_stores.dropna(inplace = True)
display("Dimensions of Stores table are now : {}".format(df_stores.shape))
display(df_stores.head())

In [None]:
# Histogram of the per-store total adjusted demand, restricted to the 0-5000 range
target = df_stores["total_adj_demand"].tolist()
fig, ax = plt.subplots()
ax.hist(target, bins = 100, range = (0, 5000), color = "red")
ax.set_title("Distribution of target variable under 5000")
ax.set_xlabel("Adjusted demand")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Drop the store name in place: it is the table's non-numeric column
df_stores.drop(columns = ["store_name"], inplace = True)
display(df_stores.head())

In [None]:
# Move store_id out of the columns and make it the row index (in place)
df_stores.set_index(keys = "store_id", inplace = True)
display(df_stores.head())

In [None]:
# Separate stores in groups using K-Means
# Fit on the feature columns only: a later cell adds a "cluster" column to
# df_stores, and re-running this cell afterwards would otherwise feed the
# previous labels back in as a feature. On a fresh run nothing is dropped.
cluster_features = df_stores.drop(columns = ["cluster"], errors = "ignore")
# random_state pins the centroid initialisation (random.seed() has no effect on
# scikit-learn); n_init is explicit because its default differs between
# scikit-learn versions
estimator = KMeans(n_clusters = 10, n_init = 10, random_state = 156)
estimator.fit(cluster_features)
# NOTE(review): the features are not scaled, so columns with the largest
# magnitudes dominate the distances - consider standardising before clustering

# One row per cluster centre, one column per feature
display(estimator.cluster_centers_)

In [None]:
# Quick plotting to check if the clustering went right: project the features
# onto their first two principal components and colour each store by cluster
labels = estimator.labels_
# Leave out the "cluster" column if a later cell has already added it, so that
# re-running this cell projects the same features the clustering was fitted on
pca_features = df_stores.drop(columns = ["cluster"], errors = "ignore")
# random_state keeps the projection reproducible when PCA picks a randomised solver
pca_2 = PCA(2, random_state = 156)
plot_columns = pca_2.fit_transform(pca_features)
plt.scatter(x = plot_columns[:,0], y = plot_columns[:,1], c = labels)
plt.title("Stores coloured by K-Means cluster (PCA projection)")
plt.xlabel("First principal component")
plt.ylabel("Second principal component")
plt.show()

In [None]:
# Attach each store's K-Means label, then summarise every column per cluster
df_stores["cluster"] = labels
cluster_summary = df_stores.groupby(by = "cluster").describe()
display(cluster_summary)

In [None]:
# Rank the stores by total sales value once and show the top and bottom ten,
# now with their cluster label
stores_by_sales = df_stores.sort_values(by = "total_sales_pesos", ascending = False)
display(stores_by_sales.head(10))
display(stores_by_sales.tail(10))