# 1. Import the transactions data

Let us first import the transactions data. This can be done by looping over every line of the CSV file, splitting each line by the separator, and storing the resulting list of values (one observation) in a list.

In [28]:
# the csv module takes care of splitting the fields and of the line endings
import csv

# initialize empty container to store the observations
container = []

# open file; newline="" lets the csv reader handle "\n" and "\r\n" itself
with open("../final_data/HVC_HVVISITRESULT.csv", "r", newline="") as f:

    # the fields of an observation are separated by ";"
    reader = csv.reader(f, delimiter=";")

    # loop through the parsed lines, each one is a list of string values
    for obs in reader:

        # skip blank lines: they hold no values and would otherwise
        # misalign the columns when the data is converted later on
        if not obs:
            continue

        # add observation to container
        container.append(obs)
        

In [29]:
# check first observation (at this point this is still the header row
# holding the column names)
container[0]

['HVVISITRESULT_NRID',
 'SO0_NRID',
 'AM0_NRID',
 'HVOUTCOME_NRID',
 'VISITDATE',
 'DATEONLY',
 'AMOUNT',
 'PAYMENTTERM']

In [30]:
# check second observation (the first row that holds actual values)
container[1]

['55681456',
 '719952',
 '23186288605042',
 '2',
 '12:20:01,000000000',
 '2019-05-06',
 '16.9',
 'cash']

In [31]:
# store column names in variable (they are the first row read from the file)
column_names = container[0]

In [32]:
# remove column names from container, but only while they are still the
# first row: re-running this cell must not drop an actual observation
header = container.pop(0) if container and container[0] == column_names else None
# show the removed row (nothing is displayed if the header was already gone)
header

['HVVISITRESULT_NRID',
 'SO0_NRID',
 'AM0_NRID',
 'HVOUTCOME_NRID',
 'VISITDATE',
 'DATEONLY',
 'AMOUNT',
 'PAYMENTTERM']

In [33]:
# check: the first element should now be an observation, not the column names
container[0]

['55681456',
 '719952',
 '23186288605042',
 '2',
 '12:20:01,000000000',
 '2019-05-06',
 '16.9',
 'cash']

# 2. Explore the data

In [34]:
# check number of observations (the row with the column names was removed before)
print(len(container))

240415


In [35]:
# inspect the number of features (one per column name)
print(len(column_names))

8


In [36]:
# inspect the first observation (a list of string values, one per column)
container[0]

['55681456',
 '719952',
 '23186288605042',
 '2',
 '12:20:01,000000000',
 '2019-05-06',
 '16.9',
 'cash']

In [37]:
# inspect the last observation (negative index counts from the end of the list)
container[-1]

['73465634',
 '875410',
 '19946584592834',
 '1',
 '14:02:19,000000000',
 '2020-12-13',
 '0.0',
 '']

# 3. Convert the data

Let's convert the list of observations to a dictionary, with the keys being the column names and the values being the lists of column values. This will make it easier to do analytics on the different features.

In [38]:
# first, create the dictionary: every column name is a key and starts out
# with its own empty list of values
data_dict = {name: [] for name in column_names}

In [39]:
# check: every column name should map to an empty list
data_dict

{'HVVISITRESULT_NRID': [],
 'SO0_NRID': [],
 'AM0_NRID': [],
 'HVOUTCOME_NRID': [],
 'VISITDATE': [],
 'DATEONLY': [],
 'AMOUNT': [],
 'PAYMENTTERM': []}

In [46]:
# start from empty columns, so that re-running this cell does not append
# every observation a second time
data_dict = {name: [] for name in column_names}

# loop through the observations
for obs in container:
    # an observation needs exactly one value per column, otherwise the
    # columns would silently get out of sync with each other
    if len(obs) != len(column_names):
        raise ValueError(
            "expected %s values but got %s: %r" % (len(column_names), len(obs), obs)
        )
    # pair every feature name with its value and add the value to that column
    for column_name, column_value in zip(column_names, obs):
        data_dict[column_name].append(column_value)

In [47]:
# check the values of the AMOUNT column
# NOTE(review): this displays the entire column (one entry per observation);
# a slice would keep the output short
data_dict["AMOUNT"]

['16.9',
 '7.4',
 '18.9',
 '0.0',
 '34.9',
 '0.0',
 '0.0',
 '0.0',
 '0.0',
 '0.0',
 '0.0',
 '0.0',
 '0.0',
 '0.0',
 '0.0',
 '0.0',
 '0.0',
 '8.5',
 '0.0',
 '9.7',
 '0.0',
 '10.1',
 '11.1',
 '7.4',
 '8.9',
 '0.0',
 '0.0',
 '3.7',
 '7.4',
 '12.7',
 '0.0',
 '8.9',
 '0.0',
 '22.6',
 '0.0',
 '0.0',
 '25.1',
 '0.0',
 '0.0',
 '0.0',
 '9.3',
 '0.0',
 '0.0',
 '0.0',
 '0.0',
 '0.0',
 '0.0',
 '0.0',
 '8.4',
 '44.4',
 '4.0',
 '32.1',
 '0.0',
 '24.3',
 '0.0',
 '7.65',
 '0.0',
 '14.8',
 '0.0',
 '0.0',
 '11.0',
 '9.5',
 '0.0',
 '37.0',
 '0.0',
 '16.8',
 '0.0',
 '0.0',
 '0.0',
 '0.0',
 '0.0',
 '0.0',
 '0.0',
 '15.4',
 '0.0',
 '0.0',
 '0.0',
 '0.0',
 '0.0',
 '0.0',
 '0.0',
 '15.4',
 '14.6',
 '0.0',
 '0.0',
 '0.0',
 '0.0',
 '0.0',
 '3.7',
 '20.7',
 '0.0',
 '0.0',
 '0.0',
 '40.7',
 '0.0',
 '7.4',
 '7.4',
 '8.5',
 '15.0',
 '0.0',
 '7.5',
 '15.9',
 '0.0',
 '8.9',
 '15.8',
 '0.0',
 '17.0',
 '7.4',
 '0.0',
 '9.7',
 '0.0',
 '0.0',
 '0.0',
 '7.4',
 '14.6',
 '0.0',
 '16.8',
 '3.6',
 '3.7',
 '22.2',
 '9.5',
 '28

In [48]:
# get the transaction amount of the first observation
# (index 0 of the AMOUNT column; still a string at this point)
data_dict["AMOUNT"][0]

'16.9'

Let's now have a look at the data type of each feature.

In [49]:
# print every feature together with the type of its first value
for feature, values in data_dict.items():
    print(feature, type(values[0]))

HVVISITRESULT_NRID <class 'str'>
SO0_NRID <class 'str'>
AM0_NRID <class 'str'>
HVOUTCOME_NRID <class 'str'>
VISITDATE <class 'str'>
DATEONLY <class 'str'>
AMOUNT <class 'str'>
PAYMENTTERM <class 'str'>


Note that all the data types are str. However, we want the ID features to be integers and the AMOUNT feature to be a float.

In [50]:
# target type per column: the ID columns become int, the amount becomes float
column_casts = {
    "HVVISITRESULT_NRID": int,
    "SO0_NRID": int,
    "AM0_NRID": int,
    "HVOUTCOME_NRID": int,
    "AMOUNT": float,
}

# number of observations, taken from the AMOUNT column
num_obs = len(data_dict["AMOUNT"])

# convert the values in place, one column at a time
for column, cast in column_casts.items():
    values = data_dict[column]
    for idx in range(num_obs):
        values[idx] = cast(values[idx])

Let's check whether the data types of the features were converted correctly.

In [51]:
# print every column together with the type of its first value
for column, values in data_dict.items():
    print(column, type(values[0]))

HVVISITRESULT_NRID <class 'int'>
SO0_NRID <class 'int'>
AM0_NRID <class 'int'>
HVOUTCOME_NRID <class 'int'>
VISITDATE <class 'str'>
DATEONLY <class 'str'>
AMOUNT <class 'float'>
PAYMENTTERM <class 'str'>


In [52]:
# check amount value for the first 5 observations (the slice [:5] keeps the output short)
data_dict["AMOUNT"][:5]

[16.9, 7.4, 18.9, 0.0, 34.9]

# 4. Basic analytics

## 4.1. Total number of customers

In [53]:
# a set keeps every customer id of the transactions only once
cust_ids = set(data_dict["SO0_NRID"])
# so its size is the total number of customers
num_cust = len(cust_ids)

In [54]:
# check: report the number of distinct customer ids
print("number of customers that made transaction: %s" %num_cust)

number of customers that made transaction: 5612


## 4.2. Min and Max purchase amount

In [55]:
# all purchase amounts
amounts = data_dict["AMOUNT"]
# get the minimum and the maximum purchase amount
min_amount, max_amount = min(amounts), max(amounts)

In [56]:
# check: report the smallest and the largest amount
print("min amount: %s" %min_amount)
print("max amount: %s" %max_amount)

min amount: -50.0
max amount: 1476.8


## 4.3. Average purchase amount

In [57]:
# all transaction amounts
# NOTE(review): this includes the observations with an amount of 0.0 -
# confirm whether those should count as purchases for this average
amounts = data_dict["AMOUNT"]

# number of transactions and the sum of their amounts
num_transactions = len(amounts)
total_amount = sum(amounts)

# average amount per transaction
avg_amount = total_amount / num_transactions

In [58]:
# check: report the average amount per transaction
print("average purchase amount: %s" %avg_amount)

average purchase amount: 4.427551525487817


## 4.4. Average purchases per customer

In [59]:
# customer id of every transaction
customer_column = data_dict["SO0_NRID"]

# total number of transactions
# NOTE(review): every observation is counted here, whatever its outcome -
# confirm that this is the intended definition of a purchase
total_transactions = len(data_dict["AMOUNT"])
# total number of distinct customers
total_cust = len(set(customer_column))

# average number of transactions per customer
avg_transactions = total_transactions / total_cust

In [60]:
# check: report the average number of transactions per customer
print("average purchases per customer: %s" %avg_transactions)

average purchases per customer: 42.83945117605132


## 4.5. Customer CLV

Let's now have a look at the CLV, or total amount purchased per customer. 
For this, we will only take into account successful visits, i.e. visits where the customer actually bought something.
This is indicated by the HVOUTCOME_NRID column, which equals 2 for a successful visit.

In [62]:
# outcome id that marks a successful visit
SUCCESSFUL_OUTCOME_ID = 2

# maps every customer id to the summed amount of its successful visits
cust_clv_dict = {}

# walk through the customer, amount and outcome columns side by side
for cust_id, amount, outcome_id in zip(
    data_dict["SO0_NRID"], data_dict["AMOUNT"], data_dict["HVOUTCOME_NRID"]
):
    # visits that were not successful do not count towards the clv
    if outcome_id != SUCCESSFUL_OUTCOME_ID:
        continue

    if cust_id in cust_clv_dict:
        # known customer: add the amount to the running total
        cust_clv_dict[cust_id] += amount
    else:
        # first successful visit of this customer: start the total
        cust_clv_dict[cust_id] = amount
            
    
    

In [63]:
# inspect clv of the first 5 customers (in the order they were added to the
# dictionary); the slice also copes with fewer than 5 customers
for i_cust_id, i_clv in list(cust_clv_dict.items())[:5]:
    # print customer id together with its clv
    print("customer %s has a CLV of %s" %(i_cust_id, i_clv))

customer 719952 has a CLV of 296.00000000000006
customer 1453609 has a CLV of 493.2999999999999
customer 1453614 has a CLV of 401.54999999999995
customer 1252497 has a CLV of 1129.7
customer 1251884 has a CLV of 200.50000000000003


# 5. Next Steps

- Try to import the other datasets (customers, employees, products, ...)
- Try to come up with some meaningful analytics for these datasets
- Try to find those customers with the highest / lowest CLV
- Try to find the number of customers per employee
- ...