## Import the raw data

This workbook imports the weekly transaction files, concatenates them into one large table for analysis and runs some simple exploratory data analysis (EDA).

### Set code parameters and import libraries

In [1]:
import pandas as pd
import boto3
from sagemaker import get_execution_role

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns

%matplotlib inline

from import_data import *

In [2]:
# Let wide/long frames display in full (up to 500 rows and 500 columns)
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500

In [3]:
# S3 bucket that holds the raw capstone CSV files
bucket = "udacity-machine-learning-capstone-data"

# AWS context for this notebook: the default boto3 session's region and the
# SageMaker execution role
region = boto3.Session().region_name
role = get_execution_role()

### Import the data

In [None]:
# Import all customer transaction data and the calendar data.
# import_data comes from the local import_data module; it is given the bucket
# name and returns two objects, unpacked here as the calendar table
# (time_data) and the combined transaction table (all_trans).
time_data, all_trans = import_data(bucket)

Importing data for udacity_capstone_data/time.csv
s3://udacity-machine-learning-capstone-data/udacity_capstone_data/time.csv
udacity_capstone_data/time.csv
Importing data for udacity_capstone_data/transactions_200607.csv
s3://udacity-machine-learning-capstone-data/udacity_capstone_data/transactions_200607.csv
udacity_capstone_data/transactions_200607.csv
Importing data for udacity_capstone_data/transactions_200608.csv
s3://udacity-machine-learning-capstone-data/udacity_capstone_data/transactions_200608.csv
udacity_capstone_data/transactions_200608.csv
Importing data for udacity_capstone_data/transactions_200609.csv
s3://udacity-machine-learning-capstone-data/udacity_capstone_data/transactions_200609.csv
udacity_capstone_data/transactions_200609.csv
Importing data for udacity_capstone_data/transactions_200610.csv
s3://udacity-machine-learning-capstone-data/udacity_capstone_data/transactions_200610.csv
udacity_capstone_data/transactions_200610.csv
Importing data for udacity_capstone_data

Importing data for udacity_capstone_data/transactions_200647.csv
s3://udacity-machine-learning-capstone-data/udacity_capstone_data/transactions_200647.csv
udacity_capstone_data/transactions_200647.csv
Importing data for udacity_capstone_data/transactions_200648.csv
s3://udacity-machine-learning-capstone-data/udacity_capstone_data/transactions_200648.csv
udacity_capstone_data/transactions_200648.csv
Importing data for udacity_capstone_data/transactions_200649.csv
s3://udacity-machine-learning-capstone-data/udacity_capstone_data/transactions_200649.csv
udacity_capstone_data/transactions_200649.csv
Importing data for udacity_capstone_data/transactions_200650.csv
s3://udacity-machine-learning-capstone-data/udacity_capstone_data/transactions_200650.csv
udacity_capstone_data/transactions_200650.csv
Importing data for udacity_capstone_data/transactions_200651.csv
s3://udacity-machine-learning-capstone-data/udacity_capstone_data/transactions_200651.csv
udacity_capstone_data/transactions_200651

### EDA raw transaction files

In [None]:
# Preview the first rows of the combined transaction table
all_trans.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
all_trans.describe()

In [None]:
# Note that there are some potential outlier values in spend and quantity

In [None]:
# Any missing data in the transaction table?
# Counts the null values in each column.
all_trans.isnull().sum()

In [None]:
# Note missing data in CUST_CODE, CUST_PRICE_SENSITIVITY and CUST_LIFESTAGE

In [None]:
# Frequency of every value in the four time-related columns
# (SHOP_WEEK, SHOP_DATE, SHOP_WEEKDAY and SHOP_HOUR), one result column each
time_cols = ["SHOP_WEEK", "SHOP_DATE", "SHOP_WEEKDAY", "SHOP_HOUR"]
time_vars = all_trans.loc[:, time_cols]
time_vars.apply(lambda col: col.value_counts())

In [None]:
# Frequency of every value in the customer segmentation columns
# (CUST_PRICE_SENSITIVITY and CUST_LIFESTAGE), one result column each
cust_cols = ["CUST_PRICE_SENSITIVITY", "CUST_LIFESTAGE"]
cust_vars = all_trans.loc[:, cust_cols]
cust_vars.apply(lambda col: col.value_counts())

In [None]:
# Frequency of every value in the basket segmentation columns
# (BASKET_SIZE, BASKET_PRICE_SENSITIVITY, BASKET_TYPE and
# BASKET_DOMINANT_MISSION), one result column each
basket_cols = [
    "BASKET_SIZE",
    "BASKET_PRICE_SENSITIVITY",
    "BASKET_TYPE",
    "BASKET_DOMINANT_MISSION",
]
basket_vars = all_trans.loc[:, basket_cols]
basket_vars.apply(lambda col: col.value_counts())

In [None]:
# Frequency of every value in the store attribute columns
# (STORE_FORMAT and STORE_REGION), one result column each
store_cols = ["STORE_FORMAT", "STORE_REGION"]
store_vars = all_trans.loc[:, store_cols]
store_vars.apply(lambda col: col.value_counts())

In [None]:
# Frequency of every value in the four PROD_CODE_10..PROD_CODE_40 columns,
# one result column each
prod_cols = ["PROD_CODE_10", "PROD_CODE_20", "PROD_CODE_30", "PROD_CODE_40"]
prod_vars = all_trans.loc[:, prod_cols]
prod_vars.apply(lambda col: col.value_counts())

In [None]:
# Number of distinct customers.
# nunique() ignores nulls, so rows with a missing CUST_CODE are not counted.
all_trans["CUST_CODE"].nunique()

In [None]:
# Number of distinct items (unique non-null PROD_CODE values)
all_trans["PROD_CODE"].nunique()

In [None]:
# Number of unique stores.
# The cell previously evaluated the whole `all_trans` frame instead of a count.
# NOTE(review): assumes the store identifier column is named STORE_CODE —
# confirm against all_trans.columns before relying on this figure.
all_trans["STORE_CODE"].nunique()

In [None]:
# Column dtypes, non-null counts and memory footprint of the transaction table
all_trans.info()

In [None]:
# Preview the first rows of the calendar table
time_data.head()

In [None]:
# Summary statistics of the calendar table's numeric columns
time_data.describe()

In [None]:
# Count of null values in each calendar column
time_data.isnull().sum()

In [None]:
# Column dtypes, non-null counts and memory footprint of the calendar table
time_data.info()

### Create EDA Plots

In [None]:
# Kernel density estimate of the SPEND column across all transaction rows
plt.figure(figsize=(20, 8))
plt.rcParams.update({"font.size": 18})

# kdeplot draws on the current figure's axes and hands that Axes back
spend_ax = sns.kdeplot(all_trans["SPEND"], label="Item Spend", linewidth=2)
spend_ax.set_xlabel("Item Spend")
spend_ax.set_ylabel("Density")
spend_ax.set_title("Item Spend Density Plot")

# Show x-axis ticks as whole currency amounts, e.g. "$5"
spend_ax.xaxis.set_major_formatter(ticker.FormatStrFormatter("$%1.0f"))


In [None]:
# Kernel density estimate of the QUANTITY column across all transaction rows
plt.figure(figsize=(20, 8))
plt.rcParams.update({"font.size": 18})

# kdeplot draws on the current figure's axes and hands that Axes back
quantity_ax = sns.kdeplot(all_trans["QUANTITY"], label="Item Quantity", linewidth=2)
quantity_ax.set_xlabel("Item Quantity")
quantity_ax.set_ylabel("Density")
quantity_ax.set_title("Item Quantity Density Plot")

In [None]:
# Basket segmentation variables to profile; defined once so the selection and
# the plotting loop cannot drift apart.
basket_segment_cols = [
    "BASKET_SIZE",
    "BASKET_PRICE_SENSITIVITY",
    "BASKET_TYPE",
    "BASKET_DOMINANT_MISSION",
]

# Distinct combinations of customer and basket segment values; CUST_CODE is
# only part of the de-duplication key and is not plotted itself.
bask_segments = all_trans.loc[
    :, ["CUST_CODE"] + basket_segment_cols
].drop_duplicates()

fig, axs = plt.subplots(1, len(basket_segment_cols), figsize=(24, 6))

# One bar chart per basket segmentation variable, showing how many
# de-duplicated rows fall into each category
for ax, var in zip(axs, basket_segment_cols):
    bask_segments[var].value_counts().plot.bar(
        color="blue", title="Profile of {}".format(var), ax=ax
    )
    # Thousands separators on the count axis (e.g. 12,000)
    ax.get_yaxis().set_major_formatter(
        ticker.FuncFormatter(lambda x, p: format(int(x), ","))
    )

plt.tight_layout()

In [None]:
# Customer segmentation variables to profile; defined once so the selection
# and the plotting loop cannot drift apart.
cust_segment_cols = ["CUST_PRICE_SENSITIVITY", "CUST_LIFESTAGE"]

# Distinct combinations of customer code and customer segment values, so a
# customer is not counted once per transaction row.
cust_segments = all_trans.loc[
    :, ["CUST_CODE"] + cust_segment_cols
].drop_duplicates()

fig, axs = plt.subplots(1, len(cust_segment_cols), figsize=(24, 6))

# One bar chart per customer segmentation variable, showing how many
# de-duplicated rows fall into each category
for ax, var in zip(axs, cust_segment_cols):
    cust_segments[var].value_counts().plot.bar(
        color="blue", title="Profile of {}".format(var), ax=ax
    )
    # Thousands separators on the count axis (e.g. 12,000)
    ax.get_yaxis().set_major_formatter(
        ticker.FuncFormatter(lambda x, p: format(int(x), ","))
    )

plt.tight_layout()

### Pickle DataFrames and upload to S3

In [None]:
# Transactions: write the combined frame to a local pickle, then upload that
# file to the bucket under `key`
trans_pickle_path = "./all_trans.pkl"
all_trans.to_pickle(trans_pickle_path)

key = "udacity_capstone_data/all_trans.pkl"
s3_bucket = boto3.Session().resource("s3").Bucket(bucket)
s3_bucket.Object(key).upload_file(trans_pickle_path)

In [None]:
# Calendar: write the calendar frame to a local pickle, then upload that file
# to the bucket under `key`
calendar_pickle_path = "./time_data.pkl"
time_data.to_pickle(calendar_pickle_path)

key = "udacity_capstone_data/time_data.pkl"
s3_bucket = boto3.Session().resource("s3").Bucket(bucket)
s3_bucket.Object(key).upload_file(calendar_pickle_path)