In [92]:
# Import main libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime, date, time
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score, rand_score

# For images
from IPython.display import Image

# Ignore warning parts
import warnings
warnings.filterwarnings("ignore")

In [4]:
# Load the four Olist CSV exports used by the feature pipeline
# (comma is already the default delimiter of read_csv)
orders = pd.read_csv('olist_orders_dataset.csv')
order_reviews = pd.read_csv('olist_order_reviews_dataset.csv')
order_items = pd.read_csv('olist_order_items_dataset.csv')
customers = pd.read_csv('olist_customers_dataset.csv')

In [51]:
# Function to create standardized customer data with date filter
# Using StandardScaler
def filter_date_and_fit(date):
    """Build per-customer features, keep customers last seen before `date`, standardize.

    Reads the notebook-level dataframes `orders`, `order_reviews`,
    `order_items` and `customers`.

    Parameters
    ----------
    date : str or datetime-like
        Exclusive upper bound applied to each customer's most recent
        'order_purchase_timestamp'. (The name shadows `datetime.date` inside
        this function; it is kept for backward compatibility.)

    Returns
    -------
    numpy.ndarray
        One row per retained customer and four columns, in this order:
        number_of_purchases, payments_total, last_order_days,
        review_score_mean. The values are standardized with a StandardScaler
        fit on these same rows.

    Notes
    -----
    NOTE(review): the features are aggregated over the whole order history
    *before* the date filter, and the filter looks only at the most recent
    purchase. A customer who also ordered on or after `date` is therefore
    dropped entirely instead of being described by their earlier orders.
    Confirm this is the intended snapshot definition.
    """
    feature_columns = ['number_of_purchases',
                       'payments_total',
                       'last_order_days',
                       'review_score_mean']

    # Attach reviews to orders (inner join on the shared 'order_id' column)
    oor = pd.merge(orders[['order_id',
                           'customer_id',
                           'order_purchase_timestamp']],
                   order_reviews[['review_id',
                                  'order_id',
                                  'review_score',
                                  'review_creation_date']])

    # Keep only the most recent review of each order
    oor = oor.sort_values(by='review_creation_date', ascending=False)
    oor = oor.drop_duplicates(subset=['order_id'], keep='first')

    # Add the stable customer identifier, then expand to one row per order item
    orders_customers = pd.merge(oor[['order_id', 'customer_id', 'order_purchase_timestamp',
                                     'review_id', 'review_score']],
                                customers[['customer_id', 'customer_unique_id']])
    orders_customers = pd.merge(orders_customers,
                                order_items[['order_id', 'order_item_id', 'product_id', 'price']])

    # Parse the purchase timestamp and derive recency in whole days.
    # `.dt.days` replaces slicing the first 4 characters of the timedelta
    # string, which only worked while the day count had exactly 4 digits.
    # The reference point is still "now": standardization removes the constant
    # offset, so only day-boundary rounding depends on the wall clock.
    orders_customers['order_purchase_timestamp'] = pd.to_datetime(
        orders_customers['order_purchase_timestamp'])
    orders_customers['last_order_days'] = (
        datetime.now() - orders_customers['order_purchase_timestamp']).dt.days

    # Split customers by whether they own a single item row or several
    is_repeated = orders_customers['customer_unique_id'].duplicated(keep=False)

    # Single-row customers: the row itself already holds the features
    df_no_duplicates = (orders_customers.loc[~is_repeated]
                        .rename(columns={'customer_unique_id': 'Customer',
                                         'price': 'payments_total',
                                         'review_score': 'review_score_mean'})
                        .assign(number_of_purchases=1))

    # Multi-row customers (several items and/or several orders)
    df_duplicates = orders_customers.loc[is_repeated]

    # Most recent row per customer provides the timestamp and recency.
    # The sort/drop_duplicates pair is kept as-is so the output row order is
    # unchanged (labels from different calls are compared positionally).
    df_last_order = (df_duplicates
                     .sort_values('order_purchase_timestamp', ascending=False)
                     .drop_duplicates(subset=['customer_unique_id'], keep='first')
                     .rename(columns={'customer_unique_id': 'Customer'})
                     [['Customer', 'order_purchase_timestamp', 'last_order_days', 'price']]
                     .reset_index(drop=True))

    # One vectorized pass replaces three per-customer Python loops that each
    # rescanned df_duplicates for every customer.
    per_customer = df_duplicates.groupby('customer_unique_id').agg(
        payments_total=('price', 'sum'),              # total spent over all item rows
        number_of_purchases=('order_id', 'nunique'),  # distinct orders
        review_score_mean=('review_score', 'mean'),   # mean over item rows
    )
    df_last_order = df_last_order.join(per_customer, on='Customer')

    # Stack both groups, keeping only the columns they share
    data_final = pd.concat([df_no_duplicates, df_last_order],
                           join="inner", ignore_index=True)
    data_final = data_final[['Customer',
                             'number_of_purchases',
                             'payments_total',
                             'order_purchase_timestamp',
                             'last_order_days',
                             'review_score_mean']].set_index('Customer')

    # Keep customers whose most recent purchase is strictly before `date`
    cutoff = pd.Timestamp(date)
    data = data_final.loc[data_final['order_purchase_timestamp'] < cutoff, feature_columns]

    # Standardize with a scaler fit on the retained rows
    X = data.values
    X_scaled = preprocessing.StandardScaler().fit_transform(X)

    return X_scaled

In [73]:
# Function to create standardized customer final data with date filter
# Using StandardScaler fitted on oldest customer data
def filter_date_and_predict(date):
    """Standardize ALL customers with a scaler fit only on customers last seen before `date`.

    Reads the notebook-level dataframes `orders`, `order_reviews`,
    `order_items` and `customers`.

    Parameters
    ----------
    date : str or datetime-like
        Exclusive upper bound applied to each customer's most recent
        'order_purchase_timestamp'; only those customers are used to fit the
        StandardScaler. (The name shadows `datetime.date` inside this
        function; it is kept for backward compatibility.)

    Returns
    -------
    numpy.ndarray
        One row per customer of the full dataset (no date filter) and four
        columns, in this order: number_of_purchases, payments_total,
        last_order_days, review_score_mean. The values are transformed by the
        scaler fit on the date-filtered subset.

    Notes
    -----
    NOTE(review): the features are aggregated over the whole order history
    *before* the date filter, and the filter looks only at the most recent
    purchase, so the fitting subset excludes any customer who also ordered on
    or after `date`. Confirm this is the intended snapshot definition.
    """
    feature_columns = ['number_of_purchases',
                       'payments_total',
                       'last_order_days',
                       'review_score_mean']

    # Attach reviews to orders (inner join on the shared 'order_id' column)
    oor = pd.merge(orders[['order_id',
                           'customer_id',
                           'order_purchase_timestamp']],
                   order_reviews[['review_id',
                                  'order_id',
                                  'review_score',
                                  'review_creation_date']])

    # Keep only the most recent review of each order
    oor = oor.sort_values(by='review_creation_date', ascending=False)
    oor = oor.drop_duplicates(subset=['order_id'], keep='first')

    # Add the stable customer identifier, then expand to one row per order item
    orders_customers = pd.merge(oor[['order_id', 'customer_id', 'order_purchase_timestamp',
                                     'review_id', 'review_score']],
                                customers[['customer_id', 'customer_unique_id']])
    orders_customers = pd.merge(orders_customers,
                                order_items[['order_id', 'order_item_id', 'product_id', 'price']])

    # Parse the purchase timestamp and derive recency in whole days.
    # `.dt.days` replaces slicing the first 4 characters of the timedelta
    # string, which only worked while the day count had exactly 4 digits.
    # The reference point is still "now": standardization removes the constant
    # offset, so only day-boundary rounding depends on the wall clock.
    orders_customers['order_purchase_timestamp'] = pd.to_datetime(
        orders_customers['order_purchase_timestamp'])
    orders_customers['last_order_days'] = (
        datetime.now() - orders_customers['order_purchase_timestamp']).dt.days

    # Split customers by whether they own a single item row or several
    is_repeated = orders_customers['customer_unique_id'].duplicated(keep=False)

    # Single-row customers: the row itself already holds the features
    df_no_duplicates = (orders_customers.loc[~is_repeated]
                        .rename(columns={'customer_unique_id': 'Customer',
                                         'price': 'payments_total',
                                         'review_score': 'review_score_mean'})
                        .assign(number_of_purchases=1))

    # Multi-row customers (several items and/or several orders)
    df_duplicates = orders_customers.loc[is_repeated]

    # Most recent row per customer provides the timestamp and recency.
    # The sort/drop_duplicates pair is kept as-is so the output row order is
    # unchanged (labels from different calls are compared positionally).
    df_last_order = (df_duplicates
                     .sort_values('order_purchase_timestamp', ascending=False)
                     .drop_duplicates(subset=['customer_unique_id'], keep='first')
                     .rename(columns={'customer_unique_id': 'Customer'})
                     [['Customer', 'order_purchase_timestamp', 'last_order_days', 'price']]
                     .reset_index(drop=True))

    # One vectorized pass replaces three per-customer Python loops that each
    # rescanned df_duplicates for every customer.
    per_customer = df_duplicates.groupby('customer_unique_id').agg(
        payments_total=('price', 'sum'),              # total spent over all item rows
        number_of_purchases=('order_id', 'nunique'),  # distinct orders
        review_score_mean=('review_score', 'mean'),   # mean over item rows
    )
    df_last_order = df_last_order.join(per_customer, on='Customer')

    # Stack both groups, keeping only the columns they share
    data_final = pd.concat([df_no_duplicates, df_last_order],
                           join="inner", ignore_index=True)
    data_final = data_final[['Customer',
                             'number_of_purchases',
                             'payments_total',
                             'order_purchase_timestamp',
                             'last_order_days',
                             'review_score_mean']].set_index('Customer')

    # Customers whose most recent purchase is strictly before `date`
    cutoff = pd.Timestamp(date)
    data = data_final.loc[data_final['order_purchase_timestamp'] < cutoff, feature_columns]

    # Fit the scaler on the filtered customers, apply it to every customer
    X_final = data_final[feature_columns].values
    X = data.values
    std_scaled = preprocessing.StandardScaler().fit(X)
    X_final_scaled = std_scaled.transform(X_final)

    return X_final_scaled

### Test 1 week

In [None]:
# Create final - 1 week standardized customer data (last order : 2018-08-27)
# Keeps customers whose most recent purchase is before 2018-08-28;
# the Standard Scaler is fit on this same subset (F1week)
F1week = filter_date_and_fit('2018-08-28')

In [63]:
# Display (rows, columns) of the Tfinal - 1 week feature matrix
F1week.shape

(94670, 4)

In [67]:
# Create initial model at Tfinal - 1 week = T1week
# random_state pins the k-means++ initialisation so the clustering, and every
# score derived from it, is reproducible after a kernel restart
km1 = KMeans(n_clusters = 5,
             max_iter = 300,
             n_init = 10,
             init = 'k-means++',
             random_state = 42)
km1.fit(F1week)

In [59]:
# Create final standardized customer data (last order : 2018-09-03)
# Keeps customers whose most recent purchase is before 2018-09-04;
# the Standard Scaler is fit on this same data (Ffinal)
Ffinal_fit = filter_date_and_fit('2018-09-04')

In [60]:
# Display the standardized Tfinal feature matrix
Ffinal_fit

array([[-0.16027064,  0.25233837, -1.54027974,  0.67653723],
       [-0.16027064, -0.24563887, -1.52722222,  0.67653723],
       [-0.16027064,  0.49809337, -1.50110717,  0.67653723],
       ...,
       [-0.16027064, -0.43596041,  2.97762305,  0.67653723],
       [-0.16027064, -0.03328253,  3.10166951, -2.33671593],
       [-0.16027064, -0.32005829,  3.16695713, -2.33671593]])

In [61]:
# Display (rows, columns) of the Tfinal feature matrix
Ffinal_fit.shape

(94721, 4)

In [71]:
# Create model at Tfinal: its labels are the reference partition every
# older model is compared against
# random_state pins the k-means++ initialisation so the reference labels are
# reproducible after a kernel restart
kmfinal_fit = KMeans(n_clusters = 5,
                     max_iter = 300,
                     n_init = 10,
                     init = 'k-means++',
                     random_state = 42)
kmfinal_fit.fit(Ffinal_fit)
labels_k_means_final_fit = kmfinal_fit.labels_

In [72]:
# Display the reference cluster labels obtained at Tfinal
labels_k_means_final_fit

array([3, 3, 3, ..., 0, 2, 2])

In [77]:
# Create final standardized customer data (last order : 2018-09-03)
# All customers are transformed, but the Standard Scaler is fit only on the
# customers whose most recent purchase is before 2018-08-28 (F1week)
Ffinal_predict_1 = filter_date_and_predict('2018-08-28')

In [78]:
# Display the Tfinal features scaled with the Tfinal - 1 week scaler
Ffinal_predict_1

array([[-0.16019128,  0.25223204, -1.54184635,  0.67659322],
       [-0.16019128, -0.2456625 , -1.52878379,  0.67659322],
       [-0.16019128,  0.49794623, -1.50265868,  0.67659322],
       ...,
       [-0.16019128, -0.43595244,  2.97779789,  0.67659322],
       [-0.16019128, -0.03334142,  3.10189217, -2.33646448],
       [-0.16019128, -0.32006956,  3.16720495, -2.33646448]])

In [79]:
# Display (rows, columns); the row count must match the Tfinal reference
# labels for the scores below to be computable
Ffinal_predict_1.shape

(94721, 4)

In [84]:
# Predict clusters at Tfinal using (Tfinal - 1 week) KMeans model
# (each customer is assigned to the nearest centroid of km1)
labels_k_means_final_predict_1 = km1.predict(Ffinal_predict_1)

In [85]:
# Display the labels predicted by the 1-week-old model
labels_k_means_final_predict_1

array([3, 3, 3, ..., 0, 1, 1])

In [89]:
# Calculate adjusted rand score between the Tfinal reference labels and the
# labels predicted by the 1-week-old model
# NOTE(review): the adjusted Rand index cannot exceed 1.0, yet the value
# recorded for this run is above 1. Presumably an integer overflow in the
# installed scikit-learn version on ~95k samples - confirm by upgrading
# scikit-learn before trusting ARI values.
ARI_1_week = adjusted_rand_score(labels_k_means_final_fit, labels_k_means_final_predict_1)

In [90]:
# Display the adjusted rand score for the 1-week horizon
ARI_1_week

1.0521610151373788

In [93]:
# Calculate rand score (unadjusted, in [0, 1]) between the Tfinal reference
# labels and the labels predicted by the 1-week-old model
RI_1_week = rand_score(labels_k_means_final_fit, labels_k_means_final_predict_1)

In [94]:
# Display the rand score for the 1-week horizon
RI_1_week

0.9994289947226235

### Test 2 weeks

In [91]:
# Create final - 2 weeks standardized customer data (last order : 2018-08-20)
# Keeps customers whose most recent purchase is before 2018-08-21;
# the Standard Scaler is fit on this same subset (F2weeks)
F2weeks = filter_date_and_fit('2018-08-21')

In [95]:
# Display (rows, columns) of the Tfinal - 2 weeks feature matrix
F2weeks.shape

(93801, 4)

In [96]:
# Create initial model at Tfinal - 2 weeks = T2weeks
# random_state pins the k-means++ initialisation so the clustering, and every
# score derived from it, is reproducible after a kernel restart
km2 = KMeans(n_clusters = 5,
             max_iter = 300,
             n_init = 10,
             init = 'k-means++',
             random_state = 42)
km2.fit(F2weeks)

KMeans(n_clusters=5)

In [97]:
# Create final standardized customer data (last order : 2018-09-03)
# All customers are transformed, but the Standard Scaler is fit only on the
# customers whose most recent purchase is before 2018-08-21 (F2weeks)
Ffinal_predict_2 = filter_date_and_predict('2018-08-21')

In [98]:
# Display (rows, columns); the row count must match the Tfinal reference labels
Ffinal_predict_2.shape

(94721, 4)

In [99]:
# Predict clusters at Tfinal using (Tfinal - 2 weeks) KMeans model
# (each customer is assigned to the nearest centroid of km2)
labels_k_means_final_predict_2 = km2.predict(Ffinal_predict_2)

In [100]:
# Calculate adjusted rand score between the Tfinal reference labels and the
# labels predicted by the 2-weeks-old model
# NOTE(review): the adjusted Rand index cannot exceed 1.0, yet the value
# recorded for this run is above 1. Presumably an integer overflow in the
# installed scikit-learn version on ~95k samples - confirm by upgrading
# scikit-learn before trusting ARI values.
ARI_2_weeks = adjusted_rand_score(labels_k_means_final_fit, labels_k_means_final_predict_2)
ARI_2_weeks

1.647627470783356

In [101]:
# Calculate rand score (unadjusted, in [0, 1]) between the Tfinal reference
# labels and the labels predicted by the 2-weeks-old model
RI_2_weeks = rand_score(labels_k_means_final_fit, labels_k_means_final_predict_2)
RI_2_weeks

0.993155693270735

### Test 3 weeks

In [103]:
# Create final - 3 weeks standardized customer data (last order : 2018-08-13)
# Keeps customers whose most recent purchase is before 2018-08-14;
# the Standard Scaler is fit on this same subset (F3weeks)
F3weeks = filter_date_and_fit('2018-08-14')
F3weeks.shape

(91996, 4)

In [104]:
# Create initial model at Tfinal - 3 weeks = T3weeks
# random_state pins the k-means++ initialisation so the clustering, and every
# score derived from it, is reproducible after a kernel restart
km3 = KMeans(n_clusters = 5,
             max_iter = 300,
             n_init = 10,
             init = 'k-means++',
             random_state = 42)
km3.fit(F3weeks)

KMeans(n_clusters=5)

In [105]:
# Create final standardized customer data (last order : 2018-09-03)
# All customers are transformed, but the Standard Scaler is fit only on the
# customers whose most recent purchase is before 2018-08-14 (F3weeks)
Ffinal_predict_3 = filter_date_and_predict('2018-08-14')
Ffinal_predict_3.shape

(94721, 4)

In [106]:
# Predict clusters at Tfinal using (Tfinal - 3 weeks) KMeans model
# (each customer is assigned to the nearest centroid of km3)
labels_k_means_final_predict_3 = km3.predict(Ffinal_predict_3)

In [107]:
# Calculate adjusted rand score between the Tfinal reference labels and the
# labels predicted by the 3-weeks-old model
# NOTE(review): the adjusted Rand index cannot exceed 1.0, yet the value
# recorded for this run is above 1. Presumably an integer overflow in the
# installed scikit-learn version on ~95k samples - confirm by upgrading
# scikit-learn before trusting ARI values.
ARI_3_weeks = adjusted_rand_score(labels_k_means_final_fit, labels_k_means_final_predict_3)
ARI_3_weeks

2.9786030555120346

In [108]:
# Calculate rand score (unadjusted, in [0, 1]) between the Tfinal reference
# labels and the labels predicted by the 3-weeks-old model
RI_3_weeks = rand_score(labels_k_means_final_fit, labels_k_means_final_predict_3)
RI_3_weeks

0.980558122982874

### Test 4 weeks

In [109]:
# Create final - 4 weeks standardized customer data (last order : 2018-08-06)
# Keeps customers whose most recent purchase is before 2018-08-07;
# the Standard Scaler is fit on this same subset (F4weeks)
F4weeks = filter_date_and_fit('2018-08-07')
F4weeks.shape

(90131, 4)

In [110]:
# Create initial model at Tfinal - 4 weeks = T4weeks
# random_state pins the k-means++ initialisation so the clustering, and every
# score derived from it, is reproducible after a kernel restart
km4 = KMeans(n_clusters = 5,
             max_iter = 300,
             n_init = 10,
             init = 'k-means++',
             random_state = 42)
km4.fit(F4weeks)

KMeans(n_clusters=5)

In [111]:
# Create final standardized customer data (last order : 2018-09-03)
# All customers are transformed, but the Standard Scaler is fit only on the
# customers whose most recent purchase is before 2018-08-07 (F4weeks)
Ffinal_predict_4 = filter_date_and_predict('2018-08-07')
Ffinal_predict_4.shape

(94721, 4)

In [112]:
# Predict clusters at Tfinal using (Tfinal - 4 weeks) KMeans model
# (each customer is assigned to the nearest centroid of km4)
labels_k_means_final_predict_4 = km4.predict(Ffinal_predict_4)

In [113]:
# Calculate adjusted rand score between the Tfinal reference labels and the
# labels predicted by the 4-weeks-old model
# NOTE(review): the adjusted Rand index cannot exceed 1.0, yet the value
# recorded for this run is above 1. Presumably an integer overflow in the
# installed scikit-learn version on ~95k samples - confirm by upgrading
# scikit-learn before trusting ARI values.
ARI_4_weeks = adjusted_rand_score(labels_k_means_final_fit, labels_k_means_final_predict_4)
ARI_4_weeks

16.77265625518444

In [114]:
# Calculate rand score (unadjusted, in [0, 1]) between the Tfinal reference
# labels and the labels predicted by the 4-weeks-old model
RI_4_weeks = rand_score(labels_k_means_final_fit, labels_k_means_final_predict_4)
RI_4_weeks

0.9194519379032647

### Test 5 weeks

In [115]:
# Create final - 5 weeks standardized customer data (last order : 2018-07-30)
# Standard Scaler fit on F5weeks
F5weeks = filter_date_and_fit('2018-07-31')

# Create initial model at Tfinal - 5 weeks = T5weeks
# random_state pins the k-means++ initialisation so the score is reproducible
km5 = KMeans(n_clusters = 5,
             max_iter = 300,
             n_init = 10,
             init = 'k-means++',
             random_state = 42)
km5.fit(F5weeks)

# Create final standardized customer data (last order : 2018-09-03)
# All customers, scaled with the Standard Scaler fit on F5weeks
Ffinal_predict_5 = filter_date_and_predict('2018-07-31')

# Predict clusters at Tfinal using (Tfinal - 5 weeks) KMeans model
labels_k_means_final_predict_5 = km5.predict(Ffinal_predict_5)

# Calculate rand score against the Tfinal reference labels
RI_5_weeks = rand_score(labels_k_means_final_fit, labels_k_means_final_predict_5)
RI_5_weeks

0.907267985216612

### Test 6 weeks

In [116]:
# Create final - 6 weeks standardized customer data (last order : 2018-07-23)
# Standard Scaler fit on F6weeks
F6weeks = filter_date_and_fit('2018-07-24')

# Create initial model at Tfinal - 6 weeks = T6weeks
# random_state pins the k-means++ initialisation so the score is reproducible
km6 = KMeans(n_clusters = 5,
             max_iter = 300,
             n_init = 10,
             init = 'k-means++',
             random_state = 42)
km6.fit(F6weeks)

# Create final standardized customer data (last order : 2018-09-03)
# All customers, scaled with the Standard Scaler fit on F6weeks
Ffinal_predict_6 = filter_date_and_predict('2018-07-24')

# Predict clusters at Tfinal using (Tfinal - 6 weeks) KMeans model
labels_k_means_final_predict_6 = km6.predict(Ffinal_predict_6)

# Calculate rand score against the Tfinal reference labels
RI_6_weeks = rand_score(labels_k_means_final_fit, labels_k_means_final_predict_6)
RI_6_weeks

0.8984291089806564

### Test 7 weeks

In [118]:
# Create final - 7 weeks standardized customer data (last order : 2018-07-16)
# Standard Scaler fit on F7weeks
F7weeks = filter_date_and_fit('2018-07-17')

# Create initial model at Tfinal - 7 weeks = T7weeks
# random_state pins the k-means++ initialisation so the score is reproducible
km7 = KMeans(n_clusters = 5,
             max_iter = 300,
             n_init = 10,
             init = 'k-means++',
             random_state = 42)
km7.fit(F7weeks)

# Create final standardized customer data (last order : 2018-09-03)
# All customers, scaled with the Standard Scaler fit on F7weeks
Ffinal_predict_7 = filter_date_and_predict('2018-07-17')

# Predict clusters at Tfinal using (Tfinal - 7 weeks) KMeans model
labels_k_means_final_predict_7 = km7.predict(Ffinal_predict_7)

# Calculate rand score against the Tfinal reference labels
RI_7_weeks = rand_score(labels_k_means_final_fit, labels_k_means_final_predict_7)
RI_7_weeks

0.8912307349846362

### Test 8 weeks

In [119]:
# Create final - 8 weeks standardized customer data (last order : 2018-07-09)
# Standard Scaler fit on F8weeks
F8weeks = filter_date_and_fit('2018-07-10')

# Create initial model at Tfinal - 8 weeks = T8weeks
# random_state pins the k-means++ initialisation so the score is reproducible
km8 = KMeans(n_clusters = 5,
             max_iter = 300,
             n_init = 10,
             init = 'k-means++',
             random_state = 42)
km8.fit(F8weeks)

# Create final standardized customer data (last order : 2018-09-03)
# All customers, scaled with the Standard Scaler fit on F8weeks
Ffinal_predict_8 = filter_date_and_predict('2018-07-10')

# Predict clusters at Tfinal using (Tfinal - 8 weeks) KMeans model
labels_k_means_final_predict_8 = km8.predict(Ffinal_predict_8)

# Calculate rand score against the Tfinal reference labels
RI_8_weeks = rand_score(labels_k_means_final_fit, labels_k_means_final_predict_8)
RI_8_weeks

0.8877419227042892

### Test 9 weeks

In [120]:
# Create final - 9 weeks standardized customer data (last order : 2018-07-02)
# Standard Scaler fit on F9weeks
F9weeks = filter_date_and_fit('2018-07-03')

# Create initial model at Tfinal - 9 weeks = T9weeks
# random_state pins the k-means++ initialisation so the score is reproducible
km9 = KMeans(n_clusters = 5,
             max_iter = 300,
             n_init = 10,
             init = 'k-means++',
             random_state = 42)
km9.fit(F9weeks)

# Create final standardized customer data (last order : 2018-09-03)
# All customers, scaled with the Standard Scaler fit on F9weeks
Ffinal_predict_9 = filter_date_and_predict('2018-07-03')

# Predict clusters at Tfinal using (Tfinal - 9 weeks) KMeans model
labels_k_means_final_predict_9 = km9.predict(Ffinal_predict_9)

# Calculate rand score against the Tfinal reference labels
RI_9_weeks = rand_score(labels_k_means_final_fit, labels_k_means_final_predict_9)
RI_9_weeks

0.8831062104207463

### Test 10 weeks

In [121]:
# Create final - 10 weeks standardized customer data (last order : 2018-06-25)
# Standard Scaler fit on F10weeks
F10weeks = filter_date_and_fit('2018-06-26')

# Create initial model at Tfinal - 10 weeks = T10weeks
# random_state pins the k-means++ initialisation so the score is reproducible
km10 = KMeans(n_clusters = 5,
              max_iter = 300,
              n_init = 10,
              init = 'k-means++',
              random_state = 42)
km10.fit(F10weeks)

# Create final standardized customer data (last order : 2018-09-03)
# All customers, scaled with the Standard Scaler fit on F10weeks
Ffinal_predict_10 = filter_date_and_predict('2018-06-26')

# Predict clusters at Tfinal using (Tfinal - 10 weeks) KMeans model
labels_k_means_final_predict_10 = km10.predict(Ffinal_predict_10)

# Calculate rand score against the Tfinal reference labels
RI_10_weeks = rand_score(labels_k_means_final_fit, labels_k_means_final_predict_10)
RI_10_weeks

0.8793368903896136

### Test 11 weeks

In [None]:
# Create final - 11 weeks standardized customer data (last order : 2018-06-18)
# Standard Scaler fit on F11weeks
F11weeks = filter_date_and_fit('2018-06-19')

# Create initial model at Tfinal - 11 weeks = T11weeks
# random_state pins the k-means++ initialisation so the score is reproducible
km11 = KMeans(n_clusters = 5,
              max_iter = 300,
              n_init = 10,
              init = 'k-means++',
              random_state = 42)
km11.fit(F11weeks)

# Create final standardized customer data (last order : 2018-09-03)
# All customers, scaled with the Standard Scaler fit on F11weeks
Ffinal_predict_11 = filter_date_and_predict('2018-06-19')

# Predict clusters at Tfinal using (Tfinal - 11 weeks) KMeans model
labels_k_means_final_predict_11 = km11.predict(Ffinal_predict_11)

# Calculate rand score against the Tfinal reference labels
RI_11_weeks = rand_score(labels_k_means_final_fit, labels_k_means_final_predict_11)
RI_11_weeks

### Test 12 weeks

In [None]:
# Create final - 12 weeks standardized customer data (last order : 2018-06-11)
# Standard Scaler fit on F12weeks
F12weeks = filter_date_and_fit('2018-06-12')

# Create initial model at Tfinal - 12 weeks = T12weeks
# random_state pins the k-means++ initialisation so the score is reproducible
km12 = KMeans(n_clusters = 5,
              max_iter = 300,
              n_init = 10,
              init = 'k-means++',
              random_state = 42)
km12.fit(F12weeks)

# Create final standardized customer data (last order : 2018-09-03)
# All customers, scaled with the Standard Scaler fit on F12weeks
Ffinal_predict_12 = filter_date_and_predict('2018-06-12')

# Predict clusters at Tfinal using (Tfinal - 12 weeks) KMeans model
labels_k_means_final_predict_12 = km12.predict(Ffinal_predict_12)

# Calculate rand score against the Tfinal reference labels
RI_12_weeks = rand_score(labels_k_means_final_fit, labels_k_means_final_predict_12)
RI_12_weeks

### Test 13 weeks

In [None]:
# Create final - 13 weeks standardized customer data (last order : 2018-06-04)
# Standard Scaler fit on F13weeks
F13weeks = filter_date_and_fit('2018-06-05')

# Create initial model at Tfinal - 13 weeks = T13weeks
# random_state pins the k-means++ initialisation so the score is reproducible
km13 = KMeans(n_clusters = 5,
              max_iter = 300,
              n_init = 10,
              init = 'k-means++',
              random_state = 42)
km13.fit(F13weeks)

# Create final standardized customer data (last order : 2018-09-03)
# All customers, scaled with the Standard Scaler fit on F13weeks
Ffinal_predict_13 = filter_date_and_predict('2018-06-05')

# Predict clusters at Tfinal using (Tfinal - 13 weeks) KMeans model
labels_k_means_final_predict_13 = km13.predict(Ffinal_predict_13)

# Calculate rand score against the Tfinal reference labels
RI_13_weeks = rand_score(labels_k_means_final_fit, labels_k_means_final_predict_13)
RI_13_weeks

### Test 14 weeks

In [None]:
# Create final - 14 weeks standardized customer data (last order : 2018-05-28)
# Standard Scaler fit on F14weeks
F14weeks = filter_date_and_fit('2018-05-29')

# Create initial model at Tfinal - 14 weeks = T14weeks
# random_state pins the k-means++ initialisation so the score is reproducible
km14 = KMeans(n_clusters = 5,
              max_iter = 300,
              n_init = 10,
              init = 'k-means++',
              random_state = 42)
km14.fit(F14weeks)

# Create final standardized customer data (last order : 2018-09-03)
# All customers, scaled with the Standard Scaler fit on F14weeks
Ffinal_predict_14 = filter_date_and_predict('2018-05-29')

# Predict clusters at Tfinal using (Tfinal - 14 weeks) KMeans model
labels_k_means_final_predict_14 = km14.predict(Ffinal_predict_14)

# Calculate rand score against the Tfinal reference labels
RI_14_weeks = rand_score(labels_k_means_final_fit, labels_k_means_final_predict_14)
RI_14_weeks

### Test 15 weeks

In [None]:
# Create final - 15 weeks standardized customer data (last order : 2018-05-21)
# Standard Scaler fit on F15weeks
# (the result was previously assigned to F14weeks, which left F15weeks
# undefined for the fit below and overwrote the 14-week matrix)
F15weeks = filter_date_and_fit('2018-05-22')

# Create initial model at Tfinal - 15 weeks = T15weeks
# random_state pins the k-means++ initialisation so the score is reproducible
km15 = KMeans(n_clusters = 5,
              max_iter = 300,
              n_init = 10,
              init = 'k-means++',
              random_state = 42)
km15.fit(F15weeks)

# Create final standardized customer data (last order : 2018-09-03)
# All customers, scaled with the Standard Scaler fit on F15weeks
Ffinal_predict_15 = filter_date_and_predict('2018-05-22')

# Predict clusters at Tfinal using (Tfinal - 15 weeks) KMeans model
labels_k_means_final_predict_15 = km15.predict(Ffinal_predict_15)

# Calculate rand score against the Tfinal reference labels
RI_15_weeks = rand_score(labels_k_means_final_fit, labels_k_means_final_predict_15)
RI_15_weeks

### Test 16 weeks

In [122]:
# Create final - 16 weeks standardized customer data (last order : 2018-05-14)
# Standard Scaler fit on F16weeks
F16weeks = filter_date_and_fit('2018-05-15')

# Create initial model at Tfinal - 16 weeks = T16weeks
# random_state pins the k-means++ initialisation so the score is reproducible
km16 = KMeans(n_clusters = 5,
              max_iter = 300,
              n_init = 10,
              init = 'k-means++',
              random_state = 42)
km16.fit(F16weeks)

# Create final standardized customer data (last order : 2018-09-03)
# All customers, scaled with the Standard Scaler fit on F16weeks
Ffinal_predict_16 = filter_date_and_predict('2018-05-15')

# Predict clusters at Tfinal using (Tfinal - 16 weeks) KMeans model
labels_k_means_final_predict_16 = km16.predict(Ffinal_predict_16)

# Calculate rand score against the Tfinal reference labels
RI_16_weeks = rand_score(labels_k_means_final_fit, labels_k_means_final_predict_16)
RI_16_weeks

0.8418190044688855

### Test 17 weeks

In [None]:
# Snapshot of the standardized customer data at Tfinal - 17 weeks
# (cut-off '2018-05-08', i.e. last order on 2018-05-07).
F17weeks = filter_date_and_fit('2018-05-08')

# T17weeks reference model: 5-cluster KMeans trained on that snapshot.
# NOTE(review): no random_state is passed, so the score can vary between runs.
km17 = KMeans(
    n_clusters=5,
    init='k-means++',
    n_init=10,
    max_iter=300,
).fit(F17weeks)

# Final customer data (last order : 2018-09-03) prepared with the same cut-off date
# (presumably scaled with the scaler fitted on F17weeks; confirm in filter_date_and_predict).
Ffinal_predict_17 = filter_date_and_predict('2018-05-08')

# Cluster assignment of the final customers by the 17-weeks-old model.
labels_k_means_final_predict_17 = km17.predict(Ffinal_predict_17)

# Rand index against labels_k_means_final_fit (defined in an earlier cell).
RI_17_weeks = rand_score(labels_k_means_final_fit, labels_k_means_final_predict_17)
RI_17_weeks

### Test 18 weeks

In [123]:
# Snapshot of the standardized customer data at Tfinal - 18 weeks
# (cut-off '2018-05-01', i.e. last order on 2018-04-30).
F18weeks = filter_date_and_fit('2018-05-01')

# T18weeks reference model: 5-cluster KMeans trained on that snapshot.
# NOTE(review): no random_state is passed, so the score can vary between runs.
km18 = KMeans(
    n_clusters=5,
    init='k-means++',
    n_init=10,
    max_iter=300,
).fit(F18weeks)

# Final customer data (last order : 2018-09-03) prepared with the same cut-off date
# (presumably scaled with the scaler fitted on F18weeks; confirm in filter_date_and_predict).
Ffinal_predict_18 = filter_date_and_predict('2018-05-01')

# Cluster assignment of the final customers by the 18-weeks-old model.
labels_k_means_final_predict_18 = km18.predict(Ffinal_predict_18)

# Rand index against labels_k_means_final_fit (defined in an earlier cell).
RI_18_weeks = rand_score(labels_k_means_final_fit, labels_k_means_final_predict_18)
RI_18_weeks

0.8260530559859725

### Test 19 weeks

In [None]:
# Standardized customer data as of Tfinal - 19 weeks (last order : 2018-04-23).
# Every name in this cell carries the 19 suffix so that the 18-weeks results
# (F18weeks, km18, Ffinal_predict_18, labels_k_means_final_predict_18, RI_18_weeks)
# are not overwritten and RI_19_weeks actually exists for later cells.
F19weeks = filter_date_and_fit('2018-04-24')

# Initial model at Tfinal - 19 weeks = T19weeks
# NOTE(review): no random_state is passed, so the score can vary between runs.
km19 = KMeans(n_clusters = 5,
              max_iter = 300,
              n_init = 10,
              init = 'k-means++')
km19.fit(F19weeks)

# Final customer data (last order : 2018-09-03) prepared with the same cut-off date
# (presumably scaled with the scaler fitted on F19weeks; confirm in filter_date_and_predict).
Ffinal_predict_19 = filter_date_and_predict('2018-04-24')

# Predict clusters at Tfinal using the (Tfinal - 19 weeks) KMeans model
labels_k_means_final_predict_19 = km19.predict(Ffinal_predict_19)

# Rand index between labels_k_means_final_fit (defined in an earlier cell)
# and the labels predicted by the 19-weeks-old model
RI_19_weeks = rand_score(labels_k_means_final_fit, labels_k_means_final_predict_19)
RI_19_weeks

### Test 20 weeks

In [None]:
# Standardized customer data as of Tfinal - 20 weeks (last order : 2018-04-16).
# Every name in this cell carries the 20 suffix so that the 18-weeks results
# (F18weeks, km18, Ffinal_predict_18, labels_k_means_final_predict_18, RI_18_weeks)
# are not overwritten and RI_20_weeks actually exists for later cells.
F20weeks = filter_date_and_fit('2018-04-17')

# Initial model at Tfinal - 20 weeks = T20weeks
# NOTE(review): no random_state is passed, so the score can vary between runs.
km20 = KMeans(n_clusters = 5,
              max_iter = 300,
              n_init = 10,
              init = 'k-means++')
km20.fit(F20weeks)

# Final customer data (last order : 2018-09-03) prepared with the same cut-off date
# (presumably scaled with the scaler fitted on F20weeks; confirm in filter_date_and_predict).
Ffinal_predict_20 = filter_date_and_predict('2018-04-17')

# Predict clusters at Tfinal using the (Tfinal - 20 weeks) KMeans model
labels_k_means_final_predict_20 = km20.predict(Ffinal_predict_20)

# Rand index between labels_k_means_final_fit (defined in an earlier cell)
# and the labels predicted by the 20-weeks-old model
RI_20_weeks = rand_score(labels_k_means_final_fit, labels_k_means_final_predict_20)
RI_20_weeks

### Test 21 weeks

In [None]:
# Snapshot of the standardized customer data at Tfinal - 21 weeks
# (cut-off '2018-04-10', i.e. last order on 2018-04-09).
F21weeks = filter_date_and_fit('2018-04-10')

# T21weeks reference model: 5-cluster KMeans trained on that snapshot.
# NOTE(review): no random_state is passed, so the score can vary between runs.
km21 = KMeans(
    n_clusters=5,
    init='k-means++',
    n_init=10,
    max_iter=300,
).fit(F21weeks)

# Final customer data (last order : 2018-09-03) prepared with the same cut-off date
# (presumably scaled with the scaler fitted on F21weeks; confirm in filter_date_and_predict).
Ffinal_predict_21 = filter_date_and_predict('2018-04-10')

# Cluster assignment of the final customers by the 21-weeks-old model.
labels_k_means_final_predict_21 = km21.predict(Ffinal_predict_21)

# Rand index against labels_k_means_final_fit (defined in an earlier cell).
RI_21_weeks = rand_score(labels_k_means_final_fit, labels_k_means_final_predict_21)
RI_21_weeks

### Test 22 weeks

In [124]:
# Snapshot of the standardized customer data at Tfinal - 22 weeks
# (cut-off '2018-04-03', i.e. last order on 2018-04-02).
F22weeks = filter_date_and_fit('2018-04-03')

# T22weeks reference model: 5-cluster KMeans trained on that snapshot.
# NOTE(review): no random_state is passed, so the score can vary between runs.
km22 = KMeans(
    n_clusters=5,
    init='k-means++',
    n_init=10,
    max_iter=300,
).fit(F22weeks)

# Final customer data (last order : 2018-09-03) prepared with the same cut-off date
# (presumably scaled with the scaler fitted on F22weeks; confirm in filter_date_and_predict).
Ffinal_predict_22 = filter_date_and_predict('2018-04-03')

# Cluster assignment of the final customers by the 22-weeks-old model.
labels_k_means_final_predict_22 = km22.predict(Ffinal_predict_22)

# Rand index against labels_k_means_final_fit (defined in an earlier cell).
RI_22_weeks = rand_score(labels_k_means_final_fit, labels_k_means_final_predict_22)
RI_22_weeks

0.8245479665458472

### Test 23 weeks

In [None]:
# Snapshot of the standardized customer data at Tfinal - 23 weeks
# (cut-off '2018-03-27', i.e. last order on 2018-03-26).
F23weeks = filter_date_and_fit('2018-03-27')

# T23weeks reference model: 5-cluster KMeans trained on that snapshot.
# NOTE(review): no random_state is passed, so the score can vary between runs.
km23 = KMeans(
    n_clusters=5,
    init='k-means++',
    n_init=10,
    max_iter=300,
).fit(F23weeks)

# Final customer data (last order : 2018-09-03) prepared with the same cut-off date
# (presumably scaled with the scaler fitted on F23weeks; confirm in filter_date_and_predict).
Ffinal_predict_23 = filter_date_and_predict('2018-03-27')

# Cluster assignment of the final customers by the 23-weeks-old model.
labels_k_means_final_predict_23 = km23.predict(Ffinal_predict_23)

# Rand index against labels_k_means_final_fit (defined in an earlier cell).
RI_23_weeks = rand_score(labels_k_means_final_fit, labels_k_means_final_predict_23)
RI_23_weeks

In [125]:
# Snapshot of the standardized customer data at Tfinal - 24 weeks
# (cut-off '2018-03-20', i.e. last order on 2018-03-19).
F24weeks = filter_date_and_fit('2018-03-20')

# T24weeks reference model: 5-cluster KMeans trained on that snapshot.
# NOTE(review): no random_state is passed, so the score can vary between runs.
km24 = KMeans(
    n_clusters=5,
    init='k-means++',
    n_init=10,
    max_iter=300,
).fit(F24weeks)

# Final customer data (last order : 2018-09-03) prepared with the same cut-off date
# (presumably scaled with the scaler fitted on F24weeks; confirm in filter_date_and_predict).
Ffinal_predict_24 = filter_date_and_predict('2018-03-20')

# Cluster assignment of the final customers by the 24-weeks-old model.
labels_k_means_final_predict_24 = km24.predict(Ffinal_predict_24)

# Rand index against labels_k_means_final_fit (defined in an earlier cell).
RI_24_weeks = rand_score(labels_k_means_final_fit, labels_k_means_final_predict_24)
RI_24_weeks

0.8190812560080429

In [None]:
# Snapshot of the standardized customer data at Tfinal - 25 weeks
# (cut-off '2018-03-13', i.e. last order on 2018-03-12).
F25weeks = filter_date_and_fit('2018-03-13')

# T25weeks reference model: 5-cluster KMeans trained on that snapshot.
# NOTE(review): no random_state is passed, so the score can vary between runs.
km25 = KMeans(
    n_clusters=5,
    init='k-means++',
    n_init=10,
    max_iter=300,
).fit(F25weeks)

# Final customer data (last order : 2018-09-03) prepared with the same cut-off date
# (presumably scaled with the scaler fitted on F25weeks; confirm in filter_date_and_predict).
Ffinal_predict_25 = filter_date_and_predict('2018-03-13')

# Cluster assignment of the final customers by the 25-weeks-old model.
labels_k_means_final_predict_25 = km25.predict(Ffinal_predict_25)

# Rand index against labels_k_means_final_fit (defined in an earlier cell).
RI_25_weeks = rand_score(labels_k_means_final_fit, labels_k_means_final_predict_25)
RI_25_weeks

In [None]:
# NOTE(review): this cell repeats the 25-weeks test of the cell just above
# (same cut-off date, same variable names); it refits km25 and reassigns
# RI_25_weeks. Consider deleting one of the two cells.

# Snapshot of the standardized customer data at Tfinal - 25 weeks
# (cut-off '2018-03-13', i.e. last order on 2018-03-12).
F25weeks = filter_date_and_fit('2018-03-13')

# T25weeks reference model: 5-cluster KMeans trained on that snapshot.
# No random_state is passed, so this refit may differ from the previous one.
km25 = KMeans(
    n_clusters=5,
    init='k-means++',
    n_init=10,
    max_iter=300,
).fit(F25weeks)

# Final customer data (last order : 2018-09-03) prepared with the same cut-off date
# (presumably scaled with the scaler fitted on F25weeks; confirm in filter_date_and_predict).
Ffinal_predict_25 = filter_date_and_predict('2018-03-13')

# Cluster assignment of the final customers by the 25-weeks-old model.
labels_k_means_final_predict_25 = km25.predict(Ffinal_predict_25)

# Rand index against labels_k_means_final_fit (defined in an earlier cell).
RI_25_weeks = rand_score(labels_k_means_final_fit, labels_k_means_final_predict_25)
RI_25_weeks

In [None]:
# Snapshot of the standardized customer data at Tfinal - 26 weeks
# (cut-off '2018-03-06', i.e. last order on 2018-03-05).
F26weeks = filter_date_and_fit('2018-03-06')

# T26weeks reference model: 5-cluster KMeans trained on that snapshot.
# NOTE(review): no random_state is passed, so the score can vary between runs.
km26 = KMeans(
    n_clusters=5,
    init='k-means++',
    n_init=10,
    max_iter=300,
).fit(F26weeks)

# Final customer data (last order : 2018-09-03) prepared with the same cut-off date
# (presumably scaled with the scaler fitted on F26weeks; confirm in filter_date_and_predict).
Ffinal_predict_26 = filter_date_and_predict('2018-03-06')

# Cluster assignment of the final customers by the 26-weeks-old model.
labels_k_means_final_predict_26 = km26.predict(Ffinal_predict_26)

# Rand index against labels_k_means_final_fit (defined in an earlier cell).
RI_26_weeks = rand_score(labels_k_means_final_fit, labels_k_means_final_predict_26)
RI_26_weeks

In [126]:
# Snapshot of the standardized customer data at Tfinal - 27 weeks
# (cut-off '2018-02-27', i.e. last order on 2018-02-26).
F27weeks = filter_date_and_fit('2018-02-27')

# T27weeks reference model: 5-cluster KMeans trained on that snapshot.
# NOTE(review): no random_state is passed, so the score can vary between runs.
km27 = KMeans(
    n_clusters=5,
    init='k-means++',
    n_init=10,
    max_iter=300,
).fit(F27weeks)

# Final customer data (last order : 2018-09-03) prepared with the same cut-off date
# (presumably scaled with the scaler fitted on F27weeks; confirm in filter_date_and_predict).
Ffinal_predict_27 = filter_date_and_predict('2018-02-27')

# Cluster assignment of the final customers by the 27-weeks-old model.
labels_k_means_final_predict_27 = km27.predict(Ffinal_predict_27)

# Rand index against labels_k_means_final_fit (defined in an earlier cell).
RI_27_weeks = rand_score(labels_k_means_final_fit, labels_k_means_final_predict_27)
RI_27_weeks

0.7776287495163605

In [None]:
# Snapshot of the standardized customer data at Tfinal - 28 weeks
# (cut-off '2018-02-20', i.e. last order on 2018-02-19).
F28weeks = filter_date_and_fit('2018-02-20')

# T28weeks reference model: 5-cluster KMeans trained on that snapshot.
# NOTE(review): no random_state is passed, so the score can vary between runs.
km28 = KMeans(
    n_clusters=5,
    init='k-means++',
    n_init=10,
    max_iter=300,
).fit(F28weeks)

# Final customer data (last order : 2018-09-03) prepared with the same cut-off date
# (presumably scaled with the scaler fitted on F28weeks; confirm in filter_date_and_predict).
Ffinal_predict_28 = filter_date_and_predict('2018-02-20')

# Cluster assignment of the final customers by the 28-weeks-old model.
labels_k_means_final_predict_28 = km28.predict(Ffinal_predict_28)

# Rand index against labels_k_means_final_fit (defined in an earlier cell).
RI_28_weeks = rand_score(labels_k_means_final_fit, labels_k_means_final_predict_28)
RI_28_weeks

In [None]:
# Snapshot of the standardized customer data at Tfinal - 29 weeks
# (cut-off '2018-02-13', i.e. last order on 2018-02-12).
F29weeks = filter_date_and_fit('2018-02-13')

# T29weeks reference model: 5-cluster KMeans trained on that snapshot.
# NOTE(review): no random_state is passed, so the score can vary between runs.
km29 = KMeans(
    n_clusters=5,
    init='k-means++',
    n_init=10,
    max_iter=300,
).fit(F29weeks)

# Final customer data (last order : 2018-09-03) prepared with the same cut-off date
# (presumably scaled with the scaler fitted on F29weeks; confirm in filter_date_and_predict).
Ffinal_predict_29 = filter_date_and_predict('2018-02-13')

# Cluster assignment of the final customers by the 29-weeks-old model.
labels_k_means_final_predict_29 = km29.predict(Ffinal_predict_29)

# Rand index against labels_k_means_final_fit (defined in an earlier cell).
RI_29_weeks = rand_score(labels_k_means_final_fit, labels_k_means_final_predict_29)
RI_29_weeks

## Visualisation