In [1]:
# Import dependencies
from datetime import datetime
from datetime import date
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Open the Excel workbook once and load each worksheet into its own dataframe.
# header=1 takes the column names from the second row of every sheet.
xls = pd.ExcelFile("./Resources/KPMG_VI_New_raw_data_update_final.xlsx")
(
    transactions_xls,
    new_customer_list_xls,
    customer_demographic_xls,
    customer_address_xls,
) = (
    pd.read_excel(xls, sheet_name=sheet, header=1)
    for sheet in (
        "Transactions",
        "NewCustomerList",
        "CustomerDemographic",
        "CustomerAddress",
    )
)

## Data Cleaning

In [3]:
# Create function to drop all customer_ids above a maximum id (3500 by default)
def consistent_id(excel_sheet, max_id=3500):
    """Return the rows of ``excel_sheet`` whose customer_id is at most ``max_id``.

    Parameters
    ----------
    excel_sheet : pandas.DataFrame
        Frame that contains a ``customer_id`` column.
    max_id : int, default 3500
        Largest customer_id to keep (inclusive).

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows. Returning a copy means
        later column assignments on the result do not trigger
        SettingWithCopyWarning and never touch ``excel_sheet``.
    """
    return excel_sheet.loc[excel_sheet["customer_id"] <= max_id].copy()

In [9]:
# Restrict the three sheets keyed by customer_id to the shared id range (<= 3500)
(
    cleaned_transactions_xls,
    cleaned_customer_demographic_xls,
    cleaned_customer_address_xls,
) = map(
    consistent_id,
    (transactions_xls, customer_demographic_xls, customer_address_xls),
)

### Transactions

In [15]:
# Preview the first rows of the id-filtered transactions before any further cleaning
cleaned_transactions_xls.head()

Unnamed: 0,transaction_id,product_id,customer_id,transaction_date,online_order,order_status,brand,product_line,product_class,product_size,list_price,standard_cost,product_first_sold_date
0,1,2,2950,2017-02-25,0.0,Approved,Solex,Standard,medium,medium,71.49,53.62,41245.0
1,2,3,3120,2017-05-21,1.0,Approved,Trek Bicycles,Standard,medium,large,2091.47,388.92,41701.0
2,3,37,402,2017-10-16,0.0,Approved,OHM Cycles,Standard,low,medium,1793.43,248.82,36361.0
3,4,88,3135,2017-08-31,0.0,Approved,Norco Bicycles,Standard,medium,medium,1198.46,381.1,36145.0
4,5,78,787,2017-10-01,1.0,Approved,Giant Bicycles,Standard,medium,large,1765.3,709.48,42226.0


In [59]:
# Convert online order column to boolean.
# Rows with a missing online_order are removed first: astype(bool) turns NaN
# into True, which would silently record unknown orders as online and hide
# them from any later null handling.
# assign() returns a new frame instead of writing a column into a frame that
# was derived by filtering, which can trigger SettingWithCopyWarning.
cleaned_transactions_xls = (
    cleaned_transactions_xls
    .dropna(subset=["online_order"])
    .assign(online_order=lambda df: df["online_order"].astype(bool))
)

In [21]:
# Discard every transaction row that has at least one missing field
cleaned_transactions_xls = cleaned_transactions_xls.dropna(axis="index", how="any")

In [60]:
# Preview the first rows of the cleaned transactions dataframe
cleaned_transactions_xls.head()

Unnamed: 0,transaction_id,product_id,customer_id,transaction_date,online_order,order_status,brand,product_line,product_class,product_size,list_price,standard_cost,product_first_sold_date
0,1,2,2950,2017-02-25,False,Approved,Solex,Standard,medium,medium,71.49,53.62,41245.0
1,2,3,3120,2017-05-21,True,Approved,Trek Bicycles,Standard,medium,large,2091.47,388.92,41701.0
2,3,37,402,2017-10-16,False,Approved,OHM Cycles,Standard,low,medium,1793.43,248.82,36361.0
3,4,88,3135,2017-08-31,False,Approved,Norco Bicycles,Standard,medium,medium,1198.46,381.1,36145.0
4,5,78,787,2017-10-01,True,Approved,Giant Bicycles,Standard,medium,large,1765.3,709.48,42226.0


### Customer Demographics

In [30]:
# Keep only the customers whose DOB and job_title are both present;
# nulls in every other column are left untouched
cleaned_customer_demographic_xls = cleaned_customer_demographic_xls.dropna(
    axis="index",
    how="any",
    subset=["DOB", "job_title"],
)

In [32]:
# Remove the "default" metadata column from the customer demographics
cleaned_customer_demographic_xls = cleaned_customer_demographic_xls.drop(
    "default", axis="columns"
)

In [33]:
# Count the non-null values in each column to see where nulls remain.
# Nulls in last_name and job_industry_category are intentionally kept because
# they do not affect future analysis.
cleaned_customer_demographic_xls.count()

customer_id                            3004
first_name                             3004
last_name                              2908
gender                                 3004
past_3_years_bike_related_purchases    3004
DOB                                    3004
job_title                              3004
job_industry_category                  2531
wealth_segment                         3004
deceased_indicator                     3004
owns_car                               3004
tenure                                 3004
dtype: int64

In [39]:
# Look up the earliest DOB to check for an implausible outlier
cleaned_customer_demographic_xls["DOB"].min()

Timestamp('1843-12-21 00:00:00')

In [40]:
# Exclude any row carrying the implausible 1843-12-21 birth date found above
cleaned_customer_demographic_xls = cleaned_customer_demographic_xls.loc[
    lambda frame: frame["DOB"] != "1843-12-21 00:00:00"
]

In [49]:
# Preview the first rows of the cleaned customer demographics dataframe
cleaned_customer_demographic_xls.head()

Unnamed: 0,customer_id,first_name,last_name,gender,past_3_years_bike_related_purchases,DOB,job_title,job_industry_category,wealth_segment,deceased_indicator,owns_car,tenure
0,1,Laraine,Medendorp,F,93,1953-10-12,Executive Secretary,Health,Mass Customer,N,Yes,11.0
1,2,Eli,Bockman,Male,81,1980-12-16,Administrative Officer,Financial Services,Mass Customer,N,Yes,16.0
2,3,Arlin,Dearle,Male,61,1954-01-20,Recruiting Manager,Property,Mass Customer,N,Yes,15.0
4,5,Sheila-kathryn,Calton,Female,56,1977-05-13,Senior Editor,,Affluent Customer,N,Yes,8.0
7,8,Rod,Inder,Male,31,1962-03-30,Media Manager I,,Mass Customer,N,No,7.0


### Customer Address

In [50]:
# Check for null values: count() reports the non-null entries per column,
# so equal counts across columns mean no column has extra missing values
cleaned_customer_address_xls.count()

customer_id           3496
address               3496
postcode              3496
state                 3496
country               3496
property_valuation    3496
dtype: int64

In [51]:
# List the distinct state labels and their frequencies to spot inconsistent spellings
cleaned_customer_address_xls["state"].value_counts()

NSW                1783
VIC                 801
QLD                 744
New South Wales      86
Victoria             82
Name: state, dtype: int64

In [57]:
# Replace inconsistent values in state column: full state names become their
# abbreviations. The mapping is explicit so each name sits next to its code.
# assign() returns a new frame instead of writing a column into a frame that
# was derived by filtering, which can trigger SettingWithCopyWarning.
cleaned_customer_address_xls = cleaned_customer_address_xls.assign(
    state=cleaned_customer_address_xls["state"].replace(
        {"New South Wales": "NSW", "Victoria": "VIC"}
    )
)

In [58]:
# Re-count the state labels to confirm only the abbreviated forms remain
cleaned_customer_address_xls["state"].value_counts()

NSW    1869
VIC     883
QLD     744
Name: state, dtype: int64

In [63]:
# Preview the first rows of the cleaned customer address dataframe
cleaned_customer_address_xls.head()

Unnamed: 0,customer_id,address,postcode,state,country,property_valuation
0,1,060 Morning Avenue,2016,NSW,Australia,10
1,2,6 Meadow Vale Court,2153,NSW,Australia,10
2,4,0 Holy Cross Court,4211,QLD,Australia,9
3,5,17979 Del Mar Point,2448,NSW,Australia,4
4,6,9 Oakridge Court,3216,VIC,Australia,9
