# Churn Analysis
### details
words



In [1]:
# Import Packages
import pandas as pd
import numpy as np
np.random.seed(1)
import matplotlib.pyplot as plt

#import scikit-learn
#import jupyterlab
#import seaborn
#import xgboost

# Display preferences: show every column of wide frames and render floats with two decimals.
pd.options.display.max_columns = None
pd.set_option('display.float_format', "{:.2f}".format)



### Clean Data

In [2]:
### UK based e-commerce retailer sales. The rows shown in this notebook run from Dec 1, 2009 to Dec 9, 2011.
### Mainly sells unique all-occasion gift-ware.
# Parse the workbook once and pull both sheets from the returned {sheet_name: DataFrame} dict
# (reading the same .xlsx twice doubles the slowest step of the notebook).
DATA_PATH = "data/online_retail_II.xlsx"
sheets = pd.read_excel(DATA_PATH, sheet_name=['Year 2009-2010', 'Year 2010-2011'], header=0)
df1 = sheets['Year 2009-2010']
df2 = sheets['Year 2010-2011']

### Note: Both excel sheets contain December 2010 sales. Duplicates must be removed before concatenating.
# The sheets overlap by roughly one week: compare the last date of sheet 1 with the first date of sheet 2.
print(df1['InvoiceDate'].max())
print(df2['InvoiceDate'].min())

# Remove duplicate entries: drop every sheet-1 row whose Invoice number also appears in sheet 2.
df1 = df1[~df1["Invoice"].isin(df2['Invoice'].unique())]
# Note (from the original author): achieves an identical result as -> df1[df1['InvoiceDate']<df2['InvoiceDate'].min()]

2010-12-09 20:01:00
2010-12-01 08:26:00


In [None]:
# Stack the de-duplicated first sheet on top of the second one and renumber the rows 0..n-1.
frames = [df1, df2]
df = pd.concat(frames, ignore_index=True)
df.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.00,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.00,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.00,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.10,13085.00,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.00,United Kingdom
...,...,...,...,...,...,...,...,...
1044843,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680.00,France
1044844,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.00,France
1044845,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.00,France
1044846,581587,22138,BAKING SET 9 PIECE RETROSPOT,3,2011-12-09 12:50:00,4.95,12680.00,France


In [4]:
### Null Customer IDs (required to determine churn).
### Reason for missing Customer ID is unknown. For simplicity, these entries will be excluded from the analysis. This likely creates a source of bias.
### Recommendation is to update the ordering platform to require purchasers to create an account, generating a unique Customer ID.
# The mean of a boolean mask is the share of True values (vectorised, no Python-level loop).
null_id_pct = df['Customer ID'].isnull().mean()
print(f'Percent of entries that are missing Customer ID: {null_id_pct*100:.2f}%')


### Some items have a price of $0. These entries will not count towards orders.
### Primarily consisting of car flags, stickers, shipping materials & invoice notes.
non_items = (df['Price'] == 0).mean()
print(f'Percent of entries that had a price of $0: {non_items*100:.2f}%')


### Remove entries with no Customer ID or Price = $0
has_customer_id = df['Customer ID'].notnull()
has_nonzero_price = df['Price'] != 0
# drop=True discards the old row labels instead of inserting them as a stray 'index' column
# (and keeps the cell safe to re-run, since no extra column accumulates).
df = df[has_customer_id & has_nonzero_price].reset_index(drop=True)

# Stock codes mix numeric and alphanumeric values; store them all as strings so .str methods work.
df["StockCode"] = df["StockCode"].astype(str)

Percent of entries that are missing Customer ID: 22.52%
Percent of entries that had a price of $0: 0.58%


In [5]:
### Examining the Stock Codes, entries starting with alphabetic characters are atypical invoice entries that do not represent merchandise sales.
### Ex. Test products, adjustments, postage, carriage, etc.
# Total quantity per (StockCode, Description), sorted descending so letter-prefixed codes sort first.
stockcode = (
    df.groupby(["StockCode", "Description"])["Quantity"]
      .sum()
      .reset_index()
      .sort_values("StockCode", ascending=False)
    )

# Jupyter only renders the final expression of a cell, so show the table explicitly
# (`display` is provided by the IPython kernel).
display(stockcode.head(25))

### Remove entries for Stock Codes beginning with alphabetic characters
# Keep only codes that start with a digit. .copy() makes df an independent frame so that
# columns added in later cells do not raise SettingWithCopyWarning.
df = df[df['StockCode'].str.match(r'^\d')].copy()

### Create Calculated Columns

In [6]:
### Line-level revenue: unit price times quantity
df['Invoice Amount'] = df['Price'].mul(df['Quantity'])

### Sales are overwhelmingly from the UK.
is_uk = df['Country'].eq('United Kingdom')
uk_pct = is_uk.sum() / len(df)
print(f'Percent of entries sold in the UK: {uk_pct*100:.2f}%')

### Binary market flag: 1 = United Kingdom, 0 = any other country.
df['UK_Ind'] = np.where(is_uk, 1, 0)


Percent of entries sold in the UK: 90.12%


In [7]:
### Review revenue by Stock Code / Description.
### 'Invoice Amount' (Price * Quantity) already exists from the previous cell, so it is not recomputed here.
### Stock Codes that do not start with a digit were removed earlier, so only the remaining codes are listed.
stockcode = (
    df.groupby(["StockCode", "Description"])["Invoice Amount"]
      .sum()
      .reset_index()
      .sort_values("StockCode", ascending=False)
    )

stockcode.head(25)

Unnamed: 0,StockCode,Description,Invoice Amount
5314,90214Z,"LETTER ""Z"" BLING KEY RING",9.73
5313,90214Y,"LETTER ""Y"" BLING KEY RING",21.15
5312,90214W,"LETTER ""W"" BLING KEY RING",5.98
5311,90214V,"LETTER ""V"" BLING KEY RING",15.98
5310,90214U,"LETTER ""U"" BLING KEY RING",4.73
5309,90214T,"LETTER ""T"" BLING KEY RING",42.23
5308,90214S,"LETTER ""S"" BLING KEY RING",108.5
5307,90214R,"LETTER ""R"" BLING KEY RING",53.77
5306,90214P,"LETTER ""P"" BLING KEY RING",41.0
5305,90214O,"LETTER ""O"" BLING KEY RING",27.23


In [None]:
### Aggregate invoice lines to one row per (Customer ID, UK_Ind).
# A line is a sale when Quantity >= 0 and a return when Quantity < 0. The split is computed once
# here, as masked helper columns, so every metric below can use a fast built-in aggregation
# instead of a per-group Python lambda that looks rows up in the global `df`.
is_sale = df['Quantity'] >= 0
is_return = df['Quantity'] < 0

txn_lines = df.assign(
    # Masked-out rows become NaN, which 'nunique' ignores.
    _sale_invoice=df['Invoice'].where(is_sale),
    _return_invoice=df['Invoice'].where(is_return),
    _sale_sku=df['StockCode'].where(is_sale),
    # Masked-out rows contribute 0 to the sums.
    _sale_amt=df['Invoice Amount'].where(is_sale, 0),
    _return_amt=df['Invoice Amount'].where(is_return, 0),
    _sale_qty=df['Quantity'].where(is_sale, 0),
    _return_qty=df['Quantity'].where(is_return, 0),
)

df_aggregated = txn_lines.groupby(['Customer ID', 'UK_Ind'], as_index=False).agg(
    # First & last txn in the measurement period
    first_txn=('InvoiceDate', 'min'),
    last_txn=('InvoiceDate', 'max'),
    # Transaction count (distinct invoices)
    total_txns=('Invoice', 'nunique'),
    sale_txns=('_sale_invoice', 'nunique'),
    return_txns=('_return_invoice', 'nunique'),
    # Sales amount ($)
    total_amt=('Invoice Amount', 'sum'),
    sale_amt=('_sale_amt', 'sum'),
    return_amt=('_return_amt', 'sum'),
    # Sale quantity
    total_qty=('Quantity', 'sum'),
    sale_qty=('_sale_qty', 'sum'),
    return_qty=('_return_qty', 'sum'),
    # Variety of SKUs
    total_skus=('StockCode', 'nunique'),
    sale_skus=('_sale_sku', 'nunique'),
)

# NOTE(review): sale_amt and total_qty can be 0 for some customers (e.g. returns only, or returns
# that exactly offset sales); the ratios below are then inf/NaN -- confirm how downstream steps handle that.
# Returns Percent (return_amt sums return lines, which carry negative quantities)
df_aggregated['return_pct'] = df_aggregated['return_amt'] / df_aggregated['sale_amt']
# Units per Transaction
df_aggregated['upt'] = df_aggregated['total_qty'] / df_aggregated['total_txns']
# Average Unit Retail
df_aggregated['aur'] = df_aggregated['total_amt'] / df_aggregated['total_qty']
# Average Order Value
df_aggregated['aov'] = df_aggregated['total_amt'] / df_aggregated['total_txns']

df_aggregated.head()

Unnamed: 0,Customer ID,UK_Ind,first_txn,last_txn,total_txns,sale_txns,return_txns,total_amt,sale_amt,return_amt,total_qty,sale_qty,return_qty,total_skus,sale_skus
0,12346.0,1,2010-03-02 13:08:00,2011-01-18 10:17:00,4,3,1,169.36,77352.96,-77183.6,24,74239,-74215,25,25
1,12347.0,0,2010-10-31 14:20:00,2011-12-07 15:52:00,8,8,0,4921.53,4921.53,0.0,2967,2967,0,126,126
2,12348.0,0,2010-09-27 14:59:00,2011-09-25 13:13:00,5,5,0,1658.4,1658.4,0.0,2704,2704,0,24,24
3,12349.0,0,2009-12-04 12:49:00,2011-11-21 09:51:00,4,3,1,3654.54,3678.69,-24.15,1616,1621,-5,138,137
4,12350.0,0,2011-02-02 16:01:00,2011-02-02 16:01:00,1,1,0,294.4,294.4,0.0,196,196,0,16,16


In [None]:
### Sanity check: each Customer ID should appear under only one UK_Ind value.
# df_aggregated has one row per (Customer ID, UK_Ind), so a per-customer count above 1
# means that customer shows up in both the UK and the global market.
cx_market = df_aggregated.groupby("Customer ID")["UK_Ind"].count()
multi_market_cx = int(cx_market.gt(1).sum())
print('Number of customers in multiple markets:', multi_market_cx)

Number of customers in multiple markets: 0
