# Profiling and Analysis Questions

- Do users purchase different products at different times of day?
- Which clusters of aisle purchases stand out?
- Do users who reorder behave differently?
- Does behaviour change with the number of orders (e.g. min vs. median, etc.)?
- Do users purchase differently depending on the days or weeks since their last purchase?
- Do users purchase differently on weekends vs. weekdays?
- What is the top product/aisle purchased first or last per order?
- `reordered` is recorded per product: which products are most reordered by users?
- How many orders are there per user_id?

## Aggregations
- days/weeks since last order
- number of orders (better to consider order number instead of count of order_id due to incomplete customer history)
- number of users who reordered
- orders per time of day (numeric or discrete count)


# File and libraries

In [45]:
import pandas as pd
import numpy as np


# Location of the pre-processed Instacart file, relative to the notebook.
# Forward slashes resolve on Windows, Linux and macOS alike; a backslash
# separator is only understood on Windows.
filepath = 'data/instacart_pre_proc.csv'

# Label identifying this dataset.
file_tag = "Instacart Market Basket"


## DSLabs

In [46]:
%run "scripts/dslabs_functions.py"

# data functions

In [47]:
%run "scripts/data_functions.py"


data_functions lodaded


# Load

In [48]:
# Toggle between a quick exploratory run (True) and a full-data run (False).
test_data = True
# test_data=False

# Both modes start from the complete pre-processed file, so read it once.
data = pd.read_csv(filepath)

# NOTE: the samples below are drawn without a random_state, so every run of
# this cell selects different rows.
if test_data:
    # 1% sample of the full file, to keep every cell fast
    data = data.sample(frac=0.01, replace=False)

    # sampled data to use in heavy analysis scripts like sparsity
    # (25% of the 1% sample above)
    sampled_data = data.sample(frac=0.25, replace=False)
else:
    # sampled data to use in heavy analysis scripts like sparsity
    # (1% of the full file; `data` itself stays complete)
    sampled_data = data.sample(frac=0.01, replace=False)


# data=enrich_instacart_df(data)


data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32430 entries, 891580 to 2807757
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   order_id                 32430 non-null  int64  
 1   product_id               32430 non-null  int64  
 2   add_to_cart_order        32430 non-null  int64  
 3   reordered                32430 non-null  int64  
 4   user_id                  32430 non-null  int64  
 5   order_number             32430 non-null  int64  
 6   order_dow                32430 non-null  int64  
 7   order_hour_of_day        32430 non-null  int64  
 8   days_since_prior_order   30327 non-null  float64
 9   order_time_of_day        32430 non-null  object 
 10  is_weekend               32430 non-null  int64  
 11  weeks_since_prior_order  30327 non-null  float64
dtypes: float64(2), int64(9), object(1)
memory usage: 3.2+ MB


## variable type definition

In [49]:
from pandas import Series, to_numeric, to_datetime


id_cols=['order_id', 'product_id', 'user_id','aisle_id','department_id']
# data=data.drop(columns=id_cols, axis=1)


def get_variable_types(df: DataFrame, id_columns: "list[str] | None" = None) -> dict[str, list]:
    """Classify the columns of ``df`` by variable type.

    Every column lands in exactly one bucket, checked in this order:

    * ``"id"`` - the column name is listed in ``id_columns``;
    * ``"binary"`` - the column has exactly two distinct non-null values;
    * ``"numeric"`` - ``to_numeric`` accepts every value;
    * ``"date"`` - ``to_datetime`` accepts every value;
    * ``"symbolic"`` - anything else.

    Identifiers are checked before the binary rule so that an id column which
    happens to hold only two distinct values (e.g. in a small sample) is not
    reported as a binary feature.

    Args:
        df: Frame to inspect. Columns recognised as dates are converted to
            datetime in place; no other column is modified.
        id_columns: Column names to treat as identifiers. When omitted, the
            notebook-level ``id_cols`` list is used.

    Returns:
        Mapping from bucket name to the list of column names in that bucket,
        in the order the columns appear in ``df``.
    """
    identifiers = id_cols if id_columns is None else id_columns
    variable_types: dict[str, list] = {
        "numeric": [],
        "binary": [],
        "date": [],
        "symbolic": [],
        "id": [],
    }

    nr_values: Series = df.nunique(axis=0, dropna=True)
    for c in df.columns:
        if c in identifiers:
            variable_types["id"].append(c)
            continue
        if nr_values[c] == 2:
            variable_types["binary"].append(c)
            continue

        # Numeric probe: the conversion result is not needed, only whether it succeeds.
        try:
            to_numeric(df[c], errors="raise")
        except (ValueError, TypeError):
            pass
        else:
            variable_types["numeric"].append(c)
            continue

        # Date probe: on success the parsed values replace the column in df.
        try:
            parsed = to_datetime(df[c], errors="raise")
        except (ValueError, TypeError):
            variable_types["symbolic"].append(c)
        else:
            df[c] = parsed
            variable_types["date"].append(c)

    return variable_types


variable_types: dict[str, list] = get_variable_types(data)
print(variable_types)

{'numeric': ['add_to_cart_order', 'order_number', 'order_dow', 'order_hour_of_day', 'days_since_prior_order', 'weeks_since_prior_order'], 'binary': ['reordered', 'is_weekend'], 'date': [], 'symbolic': ['order_time_of_day'], 'id': ['order_id', 'product_id', 'user_id']}


## drop nulls

In [50]:
# drop any null values
data=data.dropna()

# K means clustering
https://web.ist.utl.pt/rmch/dash/guides/Clustering%20in%20Python.html

In [51]:
# Pull the column groups produced by get_variable_types.
numeric: list[str] = variable_types["numeric"]
# NOTE: this rebinds the notebook-level id_cols (the configured candidate list)
# to only the id columns that were actually found in the data.
id_cols: list[str] = variable_types["id"]
binary: list[str] = variable_types["binary"]

# select columns that are numeric from list "numeric"

X = data[numeric]
# Reference labels, compared against the cluster assignments further down.
y = data['reordered']
X.describe()


Unnamed: 0,add_to_cart_order,order_number,order_dow,order_hour_of_day,days_since_prior_order,weeks_since_prior_order
count,30327.0,30327.0,30327.0,30327.0,30327.0,30327.0
mean,8.386191,18.274508,2.724767,13.4404,11.170541,1.568932
std,7.217965,17.721837,2.085171,4.261176,8.80778,1.229537
min,1.0,2.0,0.0,0.0,0.0,0.0
25%,3.0,5.0,1.0,10.0,5.0,1.0
50%,6.0,12.0,3.0,13.0,8.0,1.0
75%,12.0,25.0,5.0,16.0,15.0,2.0
max,95.0,99.0,6.0,23.0,30.0,4.0


In [52]:
from sklearn import cluster, mixture
from sklearn.impute import SimpleImputer

# Impute missing values with the mean of the column
# NOTE(review): X is taken from `data` after data.dropna(), so there should be
# nothing left to impute here; the step mainly turns X into a NumPy array - confirm.
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# Fit k-means with 3 clusters and a fixed random_state.
# NOTE(review): no scaling step is visible before this fit, so columns with a
# wide range (e.g. order_number) presumably dominate the distances - confirm.
kmeans_algo = cluster.KMeans(n_clusters=3, random_state=1)
kmeans_model = kmeans_algo.fit(X_imputed)

In [53]:
# describe centroids
centroids = kmeans_model.cluster_centers_
pd.DataFrame(data = centroids, columns = X.columns)

Unnamed: 0,add_to_cart_order,order_number,order_dow,order_hour_of_day,days_since_prior_order,weeks_since_prior_order
0,8.267939,12.622019,2.725176,13.490373,7.487406,1.076234
1,8.248506,48.837786,2.806327,13.179262,5.797012,0.821968
2,8.813912,7.505391,2.655147,13.527987,25.404519,3.496677


In [54]:
kmeans_model.labels_


array([0, 0, 2, ..., 2, 0, 0])

In [55]:
y #compare against ground truth


891580     1
613197     0
685332     1
58499      0
2733383    0
          ..
2466178    0
1476529    0
926796     0
2408940    1
2807757    0
Name: reordered, Length: 30327, dtype: int64

In [56]:
from sklearn.metrics import pairwise_distances

def mydistance(x1, x2, weights=(1, 2, 3, 1)):
    """Return a weighted Manhattan distance between two feature vectors.

    Position ``j`` contributes ``weights[j] * abs(x1[j] - x2[j])``. Only the
    first ``len(weights)`` positions take part; any further positions of
    ``x1`` / ``x2`` are ignored. The sum starts from a 0.0001 offset, so two
    identical vectors are 0.0001 apart rather than 0.

    Args:
        x1: First vector, indexable by position.
        x2: Second vector, indexable by position.
        weights: One weight per compared position. The default keeps the
            original four-feature weighting.

    Returns:
        The offset plus the weighted sum of absolute differences.

    Raises:
        IndexError: If either vector is shorter than ``weights``.
    """
    res = 0.0001
    for j, weight in enumerate(weights):
        res += weight * abs(x1[j] - x2[j])
    return res

def sim_affinity(X):
    """Return the pairwise distance matrix of ``X`` computed with ``mydistance``."""
    distance_matrix = pairwise_distances(X, metric=mydistance)
    return distance_matrix

In [57]:
# Average-linkage agglomerative clustering into 3 groups, with sim_affinity
# supplying the distances.
# NOTE(review): mydistance compares only the first four positions of each row,
# while X holds six numeric columns - confirm the last two are meant to be ignored.
# NOTE(review): the distance is a Python callable, presumably evaluated once per
# pair of rows, so cost grows quadratically with len(X); `sampled_data` may be
# the intended input for this step - confirm.
hier_algo = cluster.AgglomerativeClustering(n_clusters=3, metric=sim_affinity, linkage='average')
hier_model = hier_algo.fit(X)
hier_model.labels_

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [58]:
from sklearn import metrics
# Label-free quality measures for the k-means result: the overall silhouette,
# the first five per-row silhouette values, and the model's inertia.
y_pred = kmeans_model.labels_
print("Silhouette:",metrics.silhouette_score(X, y_pred))
print("Silhouette per instance:\n",metrics.silhouette_samples(X, y_pred)[:5],"...")
print("Sum of squared distances:",kmeans_model.inertia_)

Silhouette: 0.334848199376882
Silhouette per instance:
 [0.43518857 0.18688657 0.15389771 0.47108309 0.497626  ] ...
Sum of squared distances: 5700639.289983627


In [62]:
def purity_score(y_true, y_pred):
    """Return the purity of a clustering against reference labels.

    Builds the contingency matrix of ``y_true`` versus ``y_pred``, takes the
    largest count along axis 0 for every column, and divides the sum of those
    maxima by the total number of counted samples.
    """
    contingency = metrics.cluster.contingency_matrix(y_true, y_pred)
    dominant_counts = contingency.max(axis=0)
    total = np.sum(contingency)
    return np.sum(dominant_counts) / total

print("Purity:",purity_score(y, y_pred))
print("Adjusted rand index:",metrics.adjusted_rand_score(y, y_pred))

Purity: 0.6295710093316187
Adjusted rand index: 0.011105327450333097


In [None]:
import matplotlib.pyplot as plt
# make_blobs lives in sklearn.datasets; `data` in this notebook is the
# Instacart DataFrame and has no such method.
from sklearn import datasets

# Four side-by-side panels, each showing a case where k-means behaves poorly
# on synthetic blob data.
plt.figure(figsize=(20, 4))
n_samples = 1500
random_state = 170

# Incorrect number of clusters
X_blobs, y_blobs = datasets.make_blobs(n_samples=n_samples, random_state=random_state)
y_pred_blobs = cluster.KMeans(n_clusters=2, random_state=random_state).fit_predict(X_blobs)
plt.subplot(141)
plt.scatter(X_blobs[:, 0], X_blobs[:, 1], c=y_pred_blobs)
plt.title("Incorrect Number of Blobs")

# Anisotropicly distributed data
transformation = [[0.60834549, -0.63667341], [-0.40887718, 0.85253229]]
X_aniso, y_aniso = np.dot(X_blobs, transformation), y_blobs
y_pred_aniso = cluster.KMeans(n_clusters=3, random_state=random_state).fit_predict(X_aniso)
plt.subplot(142)
plt.scatter(X_aniso[:, 0], X_aniso[:, 1], c=y_pred_aniso)
plt.title("Anisotropicly Distributed Blobs")

# Different variance
X_varied, y_varied = datasets.make_blobs(n_samples=n_samples,cluster_std=[1.0, 2.5, 0.5],random_state=random_state)
y_pred_varied = cluster.KMeans(n_clusters=3, random_state=random_state).fit_predict(X_varied)
plt.subplot(143)
plt.scatter(X_varied[:, 0], X_varied[:, 1], c=y_pred_varied)
plt.title("Unequal Variance")

# Unevenly sized blobs
# Subsample the synthetic blobs (not the Instacart X / y) to 500 / 100 / 10
# points for blob labels 0 / 1 / 2.
X_filtered = np.vstack((
    X_blobs[y_blobs == 0][:500],
    X_blobs[y_blobs == 1][:100],
    X_blobs[y_blobs == 2][:10],
))
y_pred_filtered = cluster.KMeans(n_clusters=3,random_state=random_state).fit_predict(X_filtered)
plt.subplot(144)
plt.scatter(X_filtered[:, 0], X_filtered[:, 1], c=y_pred_filtered)
plt.title("Unevenly Sized Blobs")
plt.show()