In [17]:
import pandas as pd
from datetime import datetime

# Data Exploration & Model Building

Sprocket Central Pty Ltd has given us a new list of 1000 potential customers with their demographics and attributes. However, these customers do not have prior transaction history with the organisation.

The marketing team at Sprocket Central Pty Ltd is sure that, if correctly analysed, the data would reveal useful customer insights which could help optimise resource allocation for targeted marketing and, in turn, improve performance by focusing on high-value customers.

In [2]:
# Helper for reading in the datasets, skipping the leading non-data row(s)

def read_data(data_path, skiprows=1):
  """Load a CSV file into a DataFrame, skipping leading rows.

  Parameters
  ----------
  data_path : str, path object or file-like
      Location of the CSV; passed straight to ``pandas.read_csv``.
  skiprows : int or list-like, default 1
      Forwarded to ``pandas.read_csv``. The default of 1 skips the first
      line of the file, so the second line is used as the header row.

  Returns
  -------
  pandas.DataFrame
      The parsed contents of the file.
  """
  return pd.read_csv(data_path, skiprows=skiprows)

In [3]:
# Locations of the four source CSV files
TRANSACTIONS_URL = 'https://raw.githubusercontent.com/idowujames/KPMG-Virtual-Data-Analytics-Internship/main/KPMG%20-%20Transactions.csv'
CUSTOMER_ADDRESS_URL = 'https://raw.githubusercontent.com/idowujames/KPMG-Virtual-Data-Analytics-Internship/main/KPMG%20-%20CustomerAddress.csv'
CUSTOMER_DEMOGRAPHIC_URL = 'https://raw.githubusercontent.com/idowujames/KPMG-Virtual-Data-Analytics-Internship/main/KPMG%20-%20CustomerDemographic.csv'
NEW_CUSTOMER_LIST_URL = 'https://raw.githubusercontent.com/idowujames/KPMG-Virtual-Data-Analytics-Internship/main/KPMG%20-%20NewCustomerList.csv'

# Load each dataset into its own DataFrame via read_data
transaction = read_data(TRANSACTIONS_URL)
cust_address = read_data(CUSTOMER_ADDRESS_URL)
cust_demographic = read_data(CUSTOMER_DEMOGRAPHIC_URL)
cust_new = read_data(NEW_CUSTOMER_LIST_URL)

### Exploration of New Customer Dataset
We explore the new customer dataset and engineer features from it to find insights on how to better target these customers.

In [5]:
# Placeholder-named columns to discard from the new customer list
# (presumably header-less spreadsheet columns; verify against the raw file)
unnamed_cols = ['Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18', 'Unnamed: 19', 'Unnamed: 20']

# drop() returns a new frame, so cust_new itself is left untouched
df = cust_new.drop(labels=unnamed_cols, axis='columns')
df.head()

Unnamed: 0,first_name,last_name,gender,past_3_years_bike_related_purchases,DOB,job_title,job_industry_category,wealth_segment,deceased_indicator,owns_car,tenure,address,postcode,state,country,property_valuation,Rank,Value
0,Chickie,Brister,Male,86,1957-07-12,General Manager,Manufacturing,Mass Customer,N,Yes,14,45 Shopko Center,4500,QLD,Australia,6.0,1,1.71875
1,Morly,Genery,Male,69,1970-03-22,Structural Engineer,Property,Mass Customer,N,No,16,14 Mccormick Park,2113,NSW,Australia,11.0,1,1.71875
2,Ardelis,Forrester,Female,10,1974-08-28,Senior Cost Accountant,Financial Services,Affluent Customer,N,No,10,5 Colorado Crossing,3505,VIC,Australia,5.0,1,1.71875
3,Lucine,Stutt,Female,64,1979-01-28,Account Representative III,Manufacturing,Affluent Customer,N,Yes,5,207 Annamark Plaza,4814,QLD,Australia,1.0,4,1.703125
4,Melinda,Hadlee,Female,34,1965-09-21,Financial Analyst,Financial Services,Affluent Customer,N,No,19,115 Montana Place,2093,NSW,Australia,9.0,4,1.703125


In [6]:
# Summarise column dtypes and non-null counts of the new-customer frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 18 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   first_name                           1000 non-null   object 
 1   last_name                            971 non-null    object 
 2   gender                               1000 non-null   object 
 3   past_3_years_bike_related_purchases  1000 non-null   int64  
 4   DOB                                  983 non-null    object 
 5   job_title                            894 non-null    object 
 6   job_industry_category                835 non-null    object 
 7   wealth_segment                       1000 non-null   object 
 8   deceased_indicator                   1000 non-null   object 
 9   owns_car                             1000 non-null   object 
 10  tenure                               1000 non-null   int64  
 11  address                        

In [9]:
# Percentage of missing values per column:
# count the NaNs in each column, divide by the row count, scale to percent
df.isna().sum().div(len(df)).mul(100)

first_name                              0.0
last_name                               2.9
gender                                  0.0
past_3_years_bike_related_purchases     0.0
DOB                                     1.7
job_title                              10.6
job_industry_category                  16.5
wealth_segment                          0.0
deceased_indicator                      0.0
owns_car                                0.0
tenure                                  0.0
address                                 0.0
postcode                                0.0
state                                   0.0
country                                 0.0
property_valuation                      0.0
Rank                                    0.0
Value                                   0.0
dtype: float64

In [14]:
# Dropping every row that has a missing value in any column
df = df.dropna()
# info() prints its own report and returns None, so it is called directly;
# wrapping it in print() would emit an extra "None" line after the report
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 715 entries, 0 to 999
Data columns (total 18 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   first_name                           715 non-null    object 
 1   last_name                            715 non-null    object 
 2   gender                               715 non-null    object 
 3   past_3_years_bike_related_purchases  715 non-null    int64  
 4   DOB                                  715 non-null    object 
 5   job_title                            715 non-null    object 
 6   job_industry_category                715 non-null    object 
 7   wealth_segment                       715 non-null    object 
 8   deceased_indicator                   715 non-null    object 
 9   owns_car                             715 non-null    object 
 10  tenure                               715 non-null    int64  
 11  address                         

Unnamed: 0,first_name,last_name,gender,past_3_years_bike_related_purchases,DOB,job_title,job_industry_category,wealth_segment,deceased_indicator,owns_car,tenure,address,postcode,state,country,property_valuation,Rank,Value
0,Chickie,Brister,Male,86,1957-07-12,General Manager,Manufacturing,Mass Customer,N,Yes,14,45 Shopko Center,4500,QLD,Australia,6.0,1,1.71875
1,Morly,Genery,Male,69,1970-03-22,Structural Engineer,Property,Mass Customer,N,No,16,14 Mccormick Park,2113,NSW,Australia,11.0,1,1.71875
2,Ardelis,Forrester,Female,10,1974-08-28,Senior Cost Accountant,Financial Services,Affluent Customer,N,No,10,5 Colorado Crossing,3505,VIC,Australia,5.0,1,1.71875
3,Lucine,Stutt,Female,64,1979-01-28,Account Representative III,Manufacturing,Affluent Customer,N,Yes,5,207 Annamark Plaza,4814,QLD,Australia,1.0,4,1.703125
4,Melinda,Hadlee,Female,34,1965-09-21,Financial Analyst,Financial Services,Affluent Customer,N,No,19,115 Montana Place,2093,NSW,Australia,9.0,4,1.703125


## Feature Engineering

### Getting the age and age brackets of the new customers

In [15]:
# Converting the DOB column to pandas datetimes so that date arithmetic
# (e.g. subtracting it from a reference date) can be performed on it
df['DOB'] = pd.to_datetime(df['DOB'])

In [22]:
# Getting the ages of customers as of 2018 (time the dataset was given).
# Age is counted in completed calendar years: the difference in years, minus
# one when the birthday falls after the reference date within the year.
# Casting the timedelta to '<m8[Y]' is avoided: recent pandas releases reject
# year resolution for timedeltas, and an average-length year can be off by
# one for birthdays close to the reference date.
age_ref = datetime(2018, 1, 1)
dob_month = df['DOB'].dt.month
dob_day = df['DOB'].dt.day
birthday_pending = (
    (dob_month > age_ref.month)
    | ((dob_month == age_ref.month) & (dob_day > age_ref.day))
)
# Stored as float so the column keeps the same form as the recorded output (e.g. 22.0)
df['age'] = (age_ref.year - df['DOB'].dt.year - birthday_pending.astype(int)).astype(float)

In [19]:
# Bucketing each age into a labelled age group.
# pd.cut uses right-closed intervals by default, so the brackets are
# (0, 18], (18, 35], (35, 60] and (60, 100]; ages outside them become NaN.
# NOTE(review): this puts age 18 in 'child' and leaves age 0 unbinned - confirm this is intended.
labels = ['child', 'young adult', 'middle aged', 'senior']
bins = [0, 18, 35, 60, 100]
df['age_group'] = pd.cut(x=df['age'], bins=bins, labels=labels)


In [21]:
# Spot-check a few random rows; a fixed seed keeps the sample reproducible across re-runs
df.sample(n=8, random_state=42)

Unnamed: 0,first_name,last_name,gender,past_3_years_bike_related_purchases,DOB,job_title,job_industry_category,wealth_segment,deceased_indicator,owns_car,tenure,address,postcode,state,country,property_valuation,Rank,Value,age,age_group
384,Palmer,Heaven,Male,82,1995-05-18,Staff Scientist,Financial Services,Affluent Customer,N,Yes,9,5 Hoard Parkway,3754,VIC,Australia,6.0,382,0.95,22.0,young adult
468,Norah,Mapis,Female,75,2000-12-09,Assistant Manager,Argiculture,Mass Customer,N,No,10,057 Victoria Crossing,2263,NSW,Australia,9.0,468,0.889844,17.0,child
175,Tanya,Kiefer,Female,54,1992-07-05,Speech Pathologist,Telecommunications,High Net Worth,N,No,7,4 Warner Park,2146,NSW,Australia,9.0,174,1.16875,25.0,young adult
417,Freddi,Litherborough,Female,46,1989-01-14,Product Engineer,Financial Services,Mass Customer,N,No,7,7873 Meadow Vale Plaza,2460,NSW,Australia,3.0,418,0.918,28.0,young adult
268,Raff,Waycott,Male,70,1951-12-16,Engineer IV,Manufacturing,Affluent Customer,N,Yes,14,94694 Eagle Crest Terrace,3977,VIC,Australia,7.0,259,1.0625,66.0,senior
556,Worthington,Ahmed,Male,79,1972-03-24,Senior Cost Accountant,Financial Services,High Net Worth,N,No,13,39408 Manufacturers Road,3335,VIC,Australia,3.0,555,0.8125,45.0,middle aged
126,Rebeca,Aggas,Female,66,1953-02-27,Social Worker,Health,Affluent Customer,N,No,21,7026 Katie Lane,3818,VIC,Australia,1.0,127,1.25,64.0,senior
581,Debbie,Tillman,Female,3,1990-07-06,Account Coordinator,Manufacturing,High Net Worth,N,Yes,13,527 Jay Trail,4551,QLD,Australia,8.0,582,0.79,27.0,young adult
