In [3]:
# import the packages
import pandas as pd
import numpy as np

import datetime

# 1. Import Data

In [4]:
# read every semicolon-separated source file into its own pandas DataFrame
# visit results (referred to as "transactions" further down)
hvc_hvvisitresult = pd.read_csv(
    filepath_or_buffer="../final_data/HVC_HVVISITRESULT.csv",
    sep=";",
)
# customers
hvc_so0 = pd.read_csv(
    filepath_or_buffer="../final_data/HVC_SO0.csv",
    sep=";",
)
# employees
hvc_am0 = pd.read_csv(
    filepath_or_buffer="../final_data/HVC_AM0.csv",
    sep=";",
)
# products
hvc_ar0 = pd.read_csv(
    filepath_or_buffer="../final_data/HVC_AR0.csv",
    sep=";",
)
# detail lines of the visit results (joined on HVVISITRESULT_NRID later on)
hvc_hvvisitresultdetails = pd.read_csv(
    filepath_or_buffer="../final_data/HVC_VISITRESULTDETAILS.csv",
    sep=";",
)

FileNotFoundError: [Errno 2] No such file or directory: '../final_data/HVC_HVVISITRESULT.csv'

# 2. Explore the data

Now that we have imported all the datasets into pandas DataFrames,
we can do some basic exploration of these datasets.
This will give us more insight and will help us do some analytics on this data.

## 2.1. Transactions

In [None]:
# peek at the top of the transactions table (head() returns 5 rows by default)
hvc_hvvisitresult.head()

In [None]:
# peek at the bottom of the transactions table (tail() returns 5 rows by default)
hvc_hvvisitresult.tail()

In [None]:
# list the dtype pandas inferred for every column of the transactions table
hvc_hvvisitresult.dtypes

In [None]:
# number of transactions = number of rows in the table
hvc_hvvisitresult.shape[0]

In [None]:
# summary statistics (count, mean, quartiles, ...) of the transaction amount
hvc_hvvisitresult.loc[:, "AMOUNT"].describe()

In [None]:
# summary statistics (count, mean, quartiles, ...) of the payment term
hvc_hvvisitresult.loc[:, "PAYMENTTERM"].describe()

In [None]:
# count the transactions whose amount is missing
# (vectorized sum of the null mask; int() keeps the result a plain Python int)
int(hvc_hvvisitresult["AMOUNT"].isnull().sum())

In [None]:
# count the transactions whose payment term is missing
# (vectorized sum of the null mask; int() keeps the result a plain Python int)
int(hvc_hvvisitresult["PAYMENTTERM"].isnull().sum())

In [None]:
# share of transactions without a payment term: missing count divided by row count
int(hvc_hvvisitresult["PAYMENTTERM"].isnull().sum()) / hvc_hvvisitresult.shape[0]

## 2.2. Customers

In [None]:
# peek at the top of the customers table (head() returns 5 rows by default)
hvc_so0.head()

In [None]:
# number of distinct customer ids (dropna=False: a missing id counts as one value,
# exactly like len(.unique()) does)
hvc_so0["SO0_NRID"].nunique(dropna=False)

In [None]:
# summary statistics for the customers' spoken language and season type
hvc_so0.loc[:, ["LANGUAGE", "SEASON_TYPE"]].describe()

In [None]:
# how many customers fall into each season type
hvc_so0.loc[:, "SEASON_TYPE"].value_counts()

In [None]:
# how many customers speak each language
hvc_so0.loc[:, "LANGUAGE"].value_counts()

## 2.3. Employees

In [None]:
# peek at the top of the employees table (head() returns 5 rows by default)
hvc_am0.head()

In [None]:
# number of distinct employee ids (dropna=False: a missing id counts as one value,
# exactly like len(.unique()) does)
hvc_am0["AM0_NRID"].nunique(dropna=False)

## 2.4. Products

In [None]:
# peek at the top of the products table (head() returns 5 rows by default)
hvc_ar0.head()

In [None]:
# number of distinct product ids (dropna=False: a missing id counts as one value,
# exactly like len(.unique()) does)
hvc_ar0["AR0_NRID"].nunique(dropna=False)

In [None]:
# relative frequency of each product family (normalize=True turns counts into shares)
hvc_ar0.loc[:, "FAMILY"].value_counts(normalize=True)

In [None]:
# summary statistics (count, mean, quartiles, ...) of the product price
hvc_ar0.loc[:, "PRICE"].describe()

In [None]:
# show the product rows whose price is below zero
hvc_ar0.loc[hvc_ar0["PRICE"].lt(0)]

## 2.5. Data Conversion

Before we start analyzing the data, we should first do the required data conversions.

In [None]:
# convert the DATEONLY column from a string variable to a datetime variable,
# overwriting the column of the transactions table
# NOTE(review): no explicit format is passed, so pandas infers it -- confirm the
# day/month order against the raw CSV
hvc_hvvisitresult["DATEONLY"] = pd.to_datetime(hvc_hvvisitresult["DATEONLY"])

# 3. Analytics

Now that we have explored the data a bit, we can do some more advanced analytics.

## 3.1 Customer 


### 3.1.1 CLV

Let's have a look at the CLV (customer lifetime value), i.e. the total sales generated by a specific customer.

In [None]:
# customer to analyse
customer_id = 721110.0

# keep this customer's transactions that ended with a positive outcome (code 2)
customer_transactions = hvc_hvvisitresult.loc[
    (hvc_hvvisitresult["SO0_NRID"] == customer_id)
    & (hvc_hvvisitresult["HVOUTCOME_NRID"] == 2)
]

# total amount purchased by the customer
customer_clv = customer_transactions["AMOUNT"].sum()

# show the result
print(customer_clv)

Now let's define a function so that we can extract the total sales for any specified customer.

In [None]:
# define a function that accepts a customer id and the transactions data as parameters
def get_clv(cust_id, transactions):
    """Return the total sales amount (CLV) generated by one customer.

    Only transactions with a positive outcome (``HVOUTCOME_NRID == 2``) are
    counted. The previous version built that subset but then summed all of
    the customer's transactions, including those without a positive outcome.

    Parameters
    ----------
    cust_id
        Value compared against the ``SO0_NRID`` column.
    transactions : pandas.DataFrame
        Must contain the columns ``SO0_NRID``, ``HVOUTCOME_NRID`` and ``AMOUNT``.

    Returns
    -------
    Sum of ``AMOUNT`` over the customer's positive-outcome transactions
    (0 when the customer has none).
    """
    # rows that belong to the customer
    is_customer = transactions["SO0_NRID"] == cust_id
    # rows with a positive outcome
    is_positive = transactions["HVOUTCOME_NRID"] == 2
    # sum the amount over the rows satisfying both conditions
    return transactions.loc[is_customer & is_positive, "AMOUNT"].sum()

In [None]:
# sanity check: compute and print the CLV of the example customer
clv = get_clv(721110.0, hvc_hvvisitresult)
print(clv)

### 3.1.2 Customer Frequency

On how many different days did a customer purchase from the company?

In [None]:
# define customer
customer_id = 721110.0

# keep the customer's transactions that come from successful visits (outcome code 2)
transactions_cust = hvc_hvvisitresult[
    (hvc_hvvisitresult["SO0_NRID"] == customer_id)
    & (hvc_hvvisitresult["HVOUTCOME_NRID"] == 2)
]

# get total number of unique days on which customer made a purchase:
# several successful visits on the same DATEONLY count once (the previous
# len(...) counted every row instead); rows with a missing date are ignored
frequency = transactions_cust["DATEONLY"].nunique()

In [None]:
# display the frequency computed in the previous cell
frequency

Now let's define a function that accepts the customer id and the transactions dataset and returns the frequency.

In [None]:
def get_frequency(cust_id, transactions):
    """Return the number of unique days on which a customer made a purchase.

    Only transactions from successful visits (``HVOUTCOME_NRID == 2``) count.

    Parameters
    ----------
    cust_id
        Value compared against the ``SO0_NRID`` column.
    transactions : pandas.DataFrame
        Must contain the columns ``SO0_NRID``, ``HVOUTCOME_NRID`` and
        ``DATEONLY``.

    Returns
    -------
    int
        Number of distinct ``DATEONLY`` values among the customer's successful
        transactions (0 when there are none).
    """
    # build both masks from the `transactions` argument; the previous version
    # indexed with the global `hvc_hvvisitresult`, which breaks (or silently
    # mis-filters) whenever another frame is passed in
    is_customer = transactions["SO0_NRID"] == cust_id
    is_successful = transactions["HVOUTCOME_NRID"] == 2

    # dates of the customer's successful transactions
    purchase_days = transactions.loc[is_customer & is_successful, "DATEONLY"]

    # count each day once, as the description asks for unique days
    # (the previous len(...) counted every transaction row); missing dates are ignored
    return purchase_days.nunique()

In [None]:
# sanity check: frequency of the example customer
get_frequency(cust_id=721110.0, transactions=hvc_hvvisitresult)

### 3.1.3 Customer Products

Let's also investigate which products a customer bought.

In [None]:
# customer whose purchased products are inspected in the following cells
# (the id is written as a float; presumably SO0_NRID is read as a float column -- TODO confirm)
customer_id = 721110.0

In [None]:
# enrich every transaction with its detail lines (inner join on HVVISITRESULT_NRID)
# and then with the product master data (inner join on AR0_NRID)
transactions_products = pd.merge(
    pd.merge(
        hvc_hvvisitresult,
        hvc_hvvisitresultdetails,
        on="HVVISITRESULT_NRID",
        how="inner",
    ),
    hvc_ar0,
    on="AR0_NRID",
    how="inner",
)

In [None]:
# look at the first three joined rows
transactions_products.iloc[:3]

In [None]:
# keep the joined rows of this customer that come from visits with a positive outcome (code 2)
transactions_products_customer = transactions_products.loc[
    (transactions_products["SO0_NRID"] == customer_id)
    & (transactions_products["HVOUTCOME_NRID"] == 2)
]

In [None]:
# total quantity of products bought by the customer
transactions_products_customer.loc[:, "QUANTITY"].sum()

In [None]:
# list every distinct product description bought by the customer, in order of first appearance
transactions_products_customer["DESCRIPTION"].drop_duplicates().tolist()

Again, we will create a function, this time returning the total quantity and the number of unique products a customer bought.

In [None]:
# define a function that accepts a customer id, the transactions data, transactions details data and product data
def get_products(cust_id, transactions, transaction_details, products):
    """Summarise what a customer bought during successful visits.

    Parameters
    ----------
    cust_id
        Value compared against the ``SO0_NRID`` column.
    transactions : pandas.DataFrame
        Joined to ``transaction_details`` on ``HVVISITRESULT_NRID``.
    transaction_details : pandas.DataFrame
        Joined to ``products`` on ``AR0_NRID``.
    products : pandas.DataFrame
        Product data; the joined result must expose ``DESCRIPTION``.

    Returns
    -------
    tuple
        ``(total_products, total_unique_products)``: the summed ``QUANTITY``
        and the number of distinct ``DESCRIPTION`` values over the customer's
        rows with ``HVOUTCOME_NRID == 2``.
    """
    # transactions -> detail lines -> product master data (inner joins)
    with_details = transactions.merge(transaction_details, on="HVVISITRESULT_NRID", how="inner")
    with_products = with_details.merge(products, on="AR0_NRID", how="inner")

    # rows of this customer coming from successful visits
    wanted = (with_products["SO0_NRID"] == cust_id) & (with_products["HVOUTCOME_NRID"] == 2)
    purchases = with_products.loc[wanted]

    # summed quantity, and distinct descriptions (a missing description counts as one value)
    quantity_sum = purchases["QUANTITY"].sum()
    distinct_descriptions = purchases["DESCRIPTION"].nunique(dropna=False)

    return quantity_sum, distinct_descriptions

In [None]:
# sanity check: product totals of the example customer
get_products(
    cust_id=721110.0,
    transactions=hvc_hvvisitresult,
    transaction_details=hvc_hvvisitresultdetails,
    products=hvc_ar0,
)

### 3.1.4 Combine everything

Let's now combine the created functions to get a descriptive summary of a certain customer.
More specifically, we are going to build a function that uses the previously defined functions and returns:
        
        - The CLV of a customer
        - The Frequency of a customer
        - The total number of products bought by the customer
        - The total number of unique products bought by the customer        

In [None]:
# build the per-customer summary from the helper functions defined earlier
def get_customer_report(cust_id, transactions, transaction_details, products):
    """Return a four-value summary for one customer.

    Parameters
    ----------
    cust_id
        Customer id forwarded to every helper.
    transactions : pandas.DataFrame
        Forwarded to ``get_clv``, ``get_frequency`` and ``get_products``.
    transaction_details : pandas.DataFrame
        Forwarded to ``get_products``.
    products : pandas.DataFrame
        Forwarded to ``get_products``.

    Returns
    -------
    tuple
        ``(clv, freq, total_products, total_unique_products)`` as produced by
        ``get_clv``, ``get_frequency`` and ``get_products`` respectively.
    """
    customer_value = get_clv(cust_id, transactions)
    purchase_frequency = get_frequency(cust_id, transactions)

    # get_products yields a pair: total quantity and number of unique products
    quantity_total, unique_total = get_products(
        cust_id, transactions, transaction_details, products
    )

    return customer_value, purchase_frequency, quantity_total, unique_total

In [None]:
# full report (CLV, frequency, total products, unique products) for the example customer
get_customer_report(
    cust_id=721110.0,
    transactions=hvc_hvvisitresult,
    transaction_details=hvc_hvvisitresultdetails,
    products=hvc_ar0,
)

# 4. Next Steps

- Analyze the other datasets as well (employees, customers, products, routes, ...)
- How many customers did each employee visit?
- Are there customers that stopped buying products?
- Who are the best customers in terms of CLV?
- Who are the top performing employees?
- What are the best selling products?
- What are the worst selling products?
- ...