In [1]:
# import relevant libraries
import pandas as pd
import scipy.stats
import numpy as np
import datetime as dt
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import datetime as dt
import warnings
warnings.filterwarnings("ignore")

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('float_format', '{:,.2f}'.format)

# Importing dataset

In [2]:
# Load the cleaned transaction-level extract; the first CSV column is used as the row index.
df = pd.read_csv(
    'cleaned_data.csv',
    index_col=0,
)

In [3]:
df.shape

(380401, 35)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 380401 entries, 0 to 544593
Data columns (total 35 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   Site_Level_Price_Index          380401 non-null  float64
 1   CUSTOMER_ID                     380401 non-null  int64  
 2   CUSTOMER_SITE_ID                380401 non-null  int64  
 3   SHORT_VERTICAL                  380401 non-null  object 
 4   POSTAL_CODE                     380401 non-null  int64  
 5   CUSTOMER_CLASS                  380401 non-null  object 
 6   TERRITORY_TYPE                  380401 non-null  object 
 7   SUPPLIES_SEGMENTATION           380401 non-null  object 
 8   SUPPLIES_DECLINE_REASONS        380401 non-null  object 
 9   DUNS_NUMBER                     380401 non-null  int64  
 10  TRX_DATE                        380401 non-null  object 
 11  TRX_AMT_USD                     380401 non-null  float64
 12  Margin          

In [5]:
# Parse the transaction date as datetime and keep the transaction year as a string label.
df = df.assign(
    TRX_DATE=pd.to_datetime(df['TRX_DATE']),
    TRX_YEAR=df['TRX_YEAR'].astype(str),
)

In [6]:
df.head()

Unnamed: 0,Site_Level_Price_Index,CUSTOMER_ID,CUSTOMER_SITE_ID,SHORT_VERTICAL,POSTAL_CODE,CUSTOMER_CLASS,TERRITORY_TYPE,SUPPLIES_SEGMENTATION,SUPPLIES_DECLINE_REASONS,DUNS_NUMBER,TRX_DATE,TRX_AMT_USD,Margin,SALES_CHANNEL,QUANTITY,ORDER_TYPE,ORDER_NUM,PRODUCT_FAMILY,PRODUCT_MODEL,Total_SVC_Incidents,Total_Repeat_Calls,Total_FTF_Calls,Most_Frequent_Interaction_Type,Total_Visits,Total_Cases,Max_Case_Origin,Max_Case_Reason,Num_of_Active_Install_Bases,Total_Contracts,Contract_length,Contract_Category,Num_of_Inactive_Install_Bases,STRATEGIC_ACCOUNTS,TERRITORY_REGION,TRX_YEAR
0,0.8,117841,609636,FRUIT & VEGETABLE,97301,END USER,Industrial,S,Over Stocked / Timing,78842640,2016-02-05,207.72,188.59,Online,1,STANDARD DOMESTIC,511759211.0,CIJ,MAKE-UP,11.0,3.0,8.0,Call,18.0,1.97,unknown,unknown,6.0,0.0,0.0,No Contract,0.0,0,NW,2016
1,0.91,113032,578406,AERO/AUTO,32539,END USER,Industrial,S,,43202248,2016-12-16,214.79,198.94,Esker,1,STANDARD DOMESTIC,511868043.0,CIJ,VALUE PACK,21.0,5.0,16.0,Email,58.0,1.0,Email - VTI NACC,Customer Experience,4.0,4.0,1011.25,FSMA,0.0,0,SE,2016
2,0.8,117841,609636,FRUIT & VEGETABLE,97301,END USER,Industrial,S,Over Stocked / Timing,78842640,2016-08-26,207.72,188.59,Online,1,STANDARD DOMESTIC,511823154.0,CIJ,MAKE-UP,11.0,3.0,8.0,Call,18.0,1.97,unknown,unknown,6.0,0.0,0.0,No Contract,0.0,0,NW,2016
3,0.91,113032,578406,AERO/AUTO,32539,END USER,Industrial,S,,43202248,2016-10-19,429.58,397.89,Esker,2,STANDARD DOMESTIC,511849315.0,CIJ,VALUE PACK,21.0,5.0,16.0,Email,58.0,1.0,Email - VTI NACC,Customer Experience,4.0,4.0,1011.25,FSMA,0.0,0,SE,2016
4,0.8,117841,609636,FRUIT & VEGETABLE,97301,END USER,Industrial,S,Over Stocked / Timing,78842640,2016-08-19,623.16,565.77,Online,3,STANDARD DOMESTIC,511823154.0,CIJ,MAKE-UP,11.0,3.0,8.0,Call,18.0,1.97,unknown,unknown,6.0,0.0,0.0,No Contract,0.0,0,NW,2016


# Calculating aggregation variables

## Recency, Frequency, Tenure, Churn

In [7]:
# Report the span of transaction dates covered by the data.
# Series.min()/max() are vectorised and skip NaT, unlike the Python built-ins,
# which iterate over the column element by element.
print('The earliest transaction date: ', df['TRX_DATE'].min())
print('The latest transaction date: ', df['TRX_DATE'].max())

The earliest transaction date:  2015-01-02 00:00:00
The latest transaction date:  2020-06-03 00:00:00


In [8]:
# Reference date for recency: one day after the latest transaction in the data,
# so the most recent purchaser gets a recency of 1 day rather than 0.
NOW = df['TRX_DATE'].max() + pd.Timedelta(days=1)

In [9]:
# Recency: whole days between the reference date NOW and each site's latest transaction.
last_trx_by_site = df.groupby('CUSTOMER_SITE_ID')['TRX_DATE'].max()
rec = (
    (NOW - last_trx_by_site)
    .dt.days
    .astype(int)
    .rename('Recency')
    .reset_index()
)
rec.head(10)

Unnamed: 0,CUSTOMER_SITE_ID,Recency
0,24,24
1,90,50
2,111,415
3,114,6
4,126,49
5,141,80
6,158,78
7,234,447
8,247,17
9,287,64


In [10]:
# Start the per-site variables table from the recency table.
# Copy it so that columns added to `variables` later do not also appear on `rec`.
variables = rec.copy()

In [11]:
# calculate frequency
#   days         : length of each site's purchase window in days (first to last transaction, inclusive)
#   order_counts : number of transaction rows recorded for the site
#   Frequency    : average number of days per purchase interval = days / (order_counts - 1)
trx_dates_by_site = df.groupby('CUSTOMER_SITE_ID')['TRX_DATE']
freq = pd.DataFrame({
    'days': (trx_dates_by_site.max() - trx_dates_by_site.min()).dt.days + 1,
    'order_counts': trx_dates_by_site.size(),
}).reset_index()

# A site with a single transaction has no interval to average: store 0 explicitly
# instead of letting the division by zero produce inf.
has_interval = freq['order_counts'] > 1
freq['Frequency'] = (freq['days'] / (freq['order_counts'] - 1)).where(has_interval, 0)
freq.head(10)

Unnamed: 0,CUSTOMER_SITE_ID,days,order_counts,Frequency
0,24,53,4.0,17.67
1,90,1829,98.0,18.86
2,111,1457,21.0,72.85
3,114,1968,174.0,11.38
4,126,1115,19.0,61.94
5,141,1883,92.0,20.69
6,158,1896,65.0,29.62
7,234,1522,176.0,8.7
8,247,1869,54.0,35.26
9,287,1882,43.0,44.81


In [12]:
(freq['Frequency'] == np.inf).sum()

0

In [13]:
# Frequency is inf when order_counts - 1 is zero; store 0 for those sites instead.
freq.loc[freq['Frequency'] == np.inf, 'Frequency'] = 0

In [14]:
# update variables table
# Look the values up by CUSTOMER_SITE_ID rather than relying on `variables` and `freq`
# happening to share the same row index.
freq_by_site = freq.set_index('CUSTOMER_SITE_ID')
variables['Frequency'] = variables['CUSTOMER_SITE_ID'].map(freq_by_site['Frequency'])
variables['Tenure'] = variables['CUSTOMER_SITE_ID'].map(freq_by_site['days'])
variables.head()

Unnamed: 0,CUSTOMER_SITE_ID,Recency,Frequency,Tenure
0,24,24,17.67,53
1,90,50,18.86,1829
2,111,415,72.85,1457
3,114,6,11.38,1968
4,126,49,61.94,1115


In [15]:
(variables['Frequency'] == np.inf).sum()

0

In [16]:
# Flag a site as churned (1) when its recency is more than 365 days, otherwise 0.
# astype converts the boolean mask to integers directly, without depending on
# how Series.replace infers the resulting dtype.
variables['Churned_365'] = (variables['Recency'] > 365).astype('int64')
variables.head()

Unnamed: 0,CUSTOMER_SITE_ID,Recency,Frequency,Tenure,Churned_365
0,24,24,17.67,53,0
1,90,50,18.86,1829,0
2,111,415,72.85,1457,1
3,114,6,11.38,1968,0
4,126,49,61.94,1115,0


In [17]:
variables['Churned_365'].sum()/len(variables)

0.2604278713995458

In [18]:
variables.head()

Unnamed: 0,CUSTOMER_SITE_ID,Recency,Frequency,Tenure,Churned_365
0,24,24,17.67,53,0
1,90,50,18.86,1829,0
2,111,415,72.85,1457,1
3,114,6,11.38,1968,0
4,126,49,61.94,1115,0


## Transaction variables

In [19]:
# Transaction-level working frame: only the columns used by the per-site aggregates,
# ordered by site and then by transaction date.
trx_columns = ['CUSTOMER_SITE_ID', 'TRX_DATE', 'Margin',
               'TRX_AMT_USD', 'QUANTITY', 'PRODUCT_FAMILY', 'PRODUCT_MODEL']
df1 = df[trx_columns].sort_values(by=['CUSTOMER_SITE_ID', 'TRX_DATE'])

df1.head()

Unnamed: 0,CUSTOMER_SITE_ID,TRX_DATE,Margin,TRX_AMT_USD,QUANTITY,PRODUCT_FAMILY,PRODUCT_MODEL
540298,24,2020-03-20,1381.2,1855.74,6,LASER,FUME EXTRACTION
81899,24,2020-04-15,537.02,552.68,2,LCM,INK
81898,24,2020-05-04,4843.37,5000.0,20,LCM,INK
81900,24,2020-05-11,4843.37,5000.0,20,LCM,INK
68989,90,2015-04-14,405.99,563.34,1,CIJ,INK


### Number of transactions: count

In [20]:
# Number of transaction rows (non-null TRX_DATE values) per site.
num_trxn_overall = (
    df1.groupby('CUSTOMER_SITE_ID')['TRX_DATE']
    .count()
    .rename('Num_of_Trxns')
    .reset_index()
)

In [21]:
# Attach the transaction count to the variables table (left join on site id);
# any missing value in the merged frame is filled with 0.
variables = variables.merge(
    num_trxn_overall,
    on='CUSTOMER_SITE_ID',
    how='left',
).fillna(0)

In [22]:
variables.head()

Unnamed: 0,CUSTOMER_SITE_ID,Recency,Frequency,Tenure,Churned_365,Num_of_Trxns
0,24,24,17.67,53,0,4
1,90,50,18.86,1829,0,98
2,111,415,72.85,1457,1,21
3,114,6,11.38,1968,0,174
4,126,49,61.94,1115,0,19


### TRX_AMT_USD: average

In [23]:
# calculate average transaction amount per site
# (built-in grouped mean instead of a per-group Python lambda)
amt_avg_overall = (
    df1.groupby('CUSTOMER_SITE_ID')['TRX_AMT_USD']
    .mean()
    .rename('Avg_Trxn_Amt')
    .reset_index()
)

In [24]:
# Attach the average transaction amount (left join on site id); missing values become 0.
variables = variables.merge(
    amt_avg_overall,
    on='CUSTOMER_SITE_ID',
    how='left',
).fillna(0)
variables.head()

Unnamed: 0,CUSTOMER_SITE_ID,Recency,Frequency,Tenure,Churned_365,Num_of_Trxns,Avg_Trxn_Amt
0,24,24,17.67,53,0,4,3102.11
1,90,50,18.86,1829,0,98,233.9
2,111,415,72.85,1457,1,21,220.75
3,114,6,11.38,1968,0,174,145.43
4,126,49,61.94,1115,0,19,835.78


### Margin: average

In [25]:
# calculate average margin amount per site
# (built-in grouped mean instead of a per-group Python lambda)
margin_avg_overall = (
    df1.groupby('CUSTOMER_SITE_ID')['Margin']
    .mean()
    .rename('Avg_Margin')
    .reset_index()
)

In [26]:
# Attach the average margin (left join on site id); missing values become 0.
variables = variables.merge(
    margin_avg_overall,
    on='CUSTOMER_SITE_ID',
    how='left',
).fillna(0)
variables.head()

Unnamed: 0,CUSTOMER_SITE_ID,Recency,Frequency,Tenure,Churned_365,Num_of_Trxns,Avg_Trxn_Amt,Avg_Margin
0,24,24,17.67,53,0,4,3102.11,2901.24
1,90,50,18.86,1829,0,98,233.9,172.39
2,111,415,72.85,1457,1,21,220.75,188.43
3,114,6,11.38,1968,0,174,145.43,103.28
4,126,49,61.94,1115,0,19,835.78,605.25


### QUANTITY: average

In [27]:
# calculate average quantity per site
# (built-in grouped mean instead of a per-group Python lambda)
qt_avg_overall = (
    df1.groupby('CUSTOMER_SITE_ID')['QUANTITY']
    .mean()
    .rename('Avg_Quantity')
    .reset_index()
)

In [28]:
# Attach the average quantity (left join on site id); missing values become 0.
variables = variables.merge(
    qt_avg_overall,
    on='CUSTOMER_SITE_ID',
    how='left',
).fillna(0)
variables.head()

Unnamed: 0,CUSTOMER_SITE_ID,Recency,Frequency,Tenure,Churned_365,Num_of_Trxns,Avg_Trxn_Amt,Avg_Margin,Avg_Quantity
0,24,24,17.67,53,0,4,3102.11,2901.24,12.0
1,90,50,18.86,1829,0,98,233.9,172.39,11.37
2,111,415,72.85,1457,1,21,220.75,188.43,1.05
3,114,6,11.38,1968,0,174,145.43,103.28,4.43
4,126,49,61.94,1115,0,19,835.78,605.25,8.84


### PRODUCT_FAMILY / PRODUCT_MODEL: mode, count

In [29]:
# most frequent (modal) product family / product model per site


def _first_mode(values):
    """Return the first entry of ``values.mode()`` for one group."""
    return values.mode()[0]


mode_column_names = {'PRODUCT_FAMILY': 'Mode_of_Product_Family',
                     'PRODUCT_MODEL': 'Mode_of_Product_Model'}
prod_mode_overall = (
    df1.groupby('CUSTOMER_SITE_ID')
    .agg({'PRODUCT_FAMILY': _first_mode, 'PRODUCT_MODEL': _first_mode})
    .reset_index()
    .rename(columns=mode_column_names)
)

In [30]:
# Attach the modal product family / model (left join on site id).
variables = variables.merge(
    prod_mode_overall,
    on='CUSTOMER_SITE_ID',
    how='left',
)
variables.head()

Unnamed: 0,CUSTOMER_SITE_ID,Recency,Frequency,Tenure,Churned_365,Num_of_Trxns,Avg_Trxn_Amt,Avg_Margin,Avg_Quantity,Mode_of_Product_Family,Mode_of_Product_Model
0,24,24,17.67,53,0,4,3102.11,2901.24,12.0,LCM,INK
1,90,50,18.86,1829,0,98,233.9,172.39,11.37,CIJ,MAKE-UP
2,111,415,72.85,1457,1,21,220.75,188.43,1.05,CIJ,MAKE-UP
3,114,6,11.38,1968,0,174,145.43,103.28,4.43,CIJ,MAKE-UP
4,126,49,61.94,1115,0,19,835.78,605.25,8.84,TIJ,INK


In [31]:
# number of distinct product families / product models bought by each site
count_column_names = {'PRODUCT_FAMILY': 'Types_of_Product_Family',
                      'PRODUCT_MODEL': 'Types_of_Product_Model'}
prod_count_overall = (
    df1.groupby('CUSTOMER_SITE_ID')
    .agg({'PRODUCT_FAMILY': 'nunique', 'PRODUCT_MODEL': 'nunique'})
    .reset_index()
    .rename(columns=count_column_names)
)

In [32]:
# Attach the distinct product counts (left join on site id); missing values become 0.
variables = variables.merge(
    prod_count_overall,
    on='CUSTOMER_SITE_ID',
    how='left',
).fillna(0)
variables.head()

Unnamed: 0,CUSTOMER_SITE_ID,Recency,Frequency,Tenure,Churned_365,Num_of_Trxns,Avg_Trxn_Amt,Avg_Margin,Avg_Quantity,Mode_of_Product_Family,Mode_of_Product_Model,Types_of_Product_Family,Types_of_Product_Model
0,24,24,17.67,53,0,4,3102.11,2901.24,12.0,LCM,INK,2,2
1,90,50,18.86,1829,0,98,233.9,172.39,11.37,CIJ,MAKE-UP,1,3
2,111,415,72.85,1457,1,21,220.75,188.43,1.05,CIJ,MAKE-UP,1,2
3,114,6,11.38,1968,0,174,145.43,103.28,4.43,CIJ,MAKE-UP,3,4
4,126,49,61.94,1115,0,19,835.78,605.25,8.84,TIJ,INK,1,1


## Other variables

### SALES_CHANNEL: mode

In [33]:
# most frequent sales channel per site (first entry of the per-site mode)
sales = (
    df.groupby('CUSTOMER_SITE_ID')['SALES_CHANNEL']
    .agg(lambda channel: channel.mode()[0])
    .rename('Most_Frequent_Sales_Channel')
    .reset_index()
)
sales.head()

Unnamed: 0,CUSTOMER_SITE_ID,Most_Frequent_Sales_Channel
0,24,Copy
1,90,EDI
2,111,EDI
3,114,EDI
4,126,EDI


In [34]:
# update variables table
# Look the value up by CUSTOMER_SITE_ID instead of relying on `variables` and `sales`
# sharing the same row order/index after the earlier merges.
variables['Most_Frequent_Sales_Channel'] = variables['CUSTOMER_SITE_ID'].map(
    sales.set_index('CUSTOMER_SITE_ID')['Most_Frequent_Sales_Channel'])
variables.head()

Unnamed: 0,CUSTOMER_SITE_ID,Recency,Frequency,Tenure,Churned_365,Num_of_Trxns,Avg_Trxn_Amt,Avg_Margin,Avg_Quantity,Mode_of_Product_Family,Mode_of_Product_Model,Types_of_Product_Family,Types_of_Product_Model,Most_Frequent_Sales_Channel
0,24,24,17.67,53,0,4,3102.11,2901.24,12.0,LCM,INK,2,2,Copy
1,90,50,18.86,1829,0,98,233.9,172.39,11.37,CIJ,MAKE-UP,1,3,EDI
2,111,415,72.85,1457,1,21,220.75,188.43,1.05,CIJ,MAKE-UP,1,2,EDI
3,114,6,11.38,1968,0,174,145.43,103.28,4.43,CIJ,MAKE-UP,3,4,EDI
4,126,49,61.94,1115,0,19,835.78,605.25,8.84,TIJ,INK,1,1,EDI


### ORDER_TYPE: mode

In [35]:
# most frequent order type per site (first entry of the per-site mode)
ot = (
    df.groupby('CUSTOMER_SITE_ID')['ORDER_TYPE']
    .agg(lambda order_type: order_type.mode()[0])
    .rename('Most_Frequent_Order_Type')
    .reset_index()
)
ot.head()

Unnamed: 0,CUSTOMER_SITE_ID,Most_Frequent_Order_Type
0,24,STANDARD DOMESTIC
1,90,EDI
2,111,EDI
3,114,EDI
4,126,EDI


In [36]:
# update variables table
# Look the value up by CUSTOMER_SITE_ID instead of relying on `variables` and `ot`
# sharing the same row order/index after the earlier merges.
variables['Most_Frequent_Order_Type'] = variables['CUSTOMER_SITE_ID'].map(
    ot.set_index('CUSTOMER_SITE_ID')['Most_Frequent_Order_Type'])
variables.head()

Unnamed: 0,CUSTOMER_SITE_ID,Recency,Frequency,Tenure,Churned_365,Num_of_Trxns,Avg_Trxn_Amt,Avg_Margin,Avg_Quantity,Mode_of_Product_Family,Mode_of_Product_Model,Types_of_Product_Family,Types_of_Product_Model,Most_Frequent_Sales_Channel,Most_Frequent_Order_Type
0,24,24,17.67,53,0,4,3102.11,2901.24,12.0,LCM,INK,2,2,Copy,STANDARD DOMESTIC
1,90,50,18.86,1829,0,98,233.9,172.39,11.37,CIJ,MAKE-UP,1,3,EDI,EDI
2,111,415,72.85,1457,1,21,220.75,188.43,1.05,CIJ,MAKE-UP,1,2,EDI,EDI
3,114,6,11.38,1968,0,174,145.43,103.28,4.43,CIJ,MAKE-UP,3,4,EDI,EDI
4,126,49,61.94,1115,0,19,835.78,605.25,8.84,TIJ,INK,1,1,EDI,EDI


### Site_Level_Price_Index: average

In [37]:
# calculate average site-level price index per site
# (built-in grouped mean instead of a per-group Python lambda)
pi_avg = (
    df.groupby('CUSTOMER_SITE_ID')['Site_Level_Price_Index']
    .mean()
    .rename('Avg_Price_Index')
    .reset_index()
)
pi_avg.head()

Unnamed: 0,CUSTOMER_SITE_ID,Avg_Price_Index
0,24,0.79
1,90,1.35
2,111,1.28
3,114,1.25
4,126,0.75


In [38]:
# update variables table
# Look the value up by CUSTOMER_SITE_ID instead of relying on `variables` and `pi_avg`
# sharing the same row order/index after the earlier merges.
variables['Avg_Price_Index'] = variables['CUSTOMER_SITE_ID'].map(
    pi_avg.set_index('CUSTOMER_SITE_ID')['Avg_Price_Index'])
variables.head()

Unnamed: 0,CUSTOMER_SITE_ID,Recency,Frequency,Tenure,Churned_365,Num_of_Trxns,Avg_Trxn_Amt,Avg_Margin,Avg_Quantity,Mode_of_Product_Family,Mode_of_Product_Model,Types_of_Product_Family,Types_of_Product_Model,Most_Frequent_Sales_Channel,Most_Frequent_Order_Type,Avg_Price_Index
0,24,24,17.67,53,0,4,3102.11,2901.24,12.0,LCM,INK,2,2,Copy,STANDARD DOMESTIC,0.79
1,90,50,18.86,1829,0,98,233.9,172.39,11.37,CIJ,MAKE-UP,1,3,EDI,EDI,1.35
2,111,415,72.85,1457,1,21,220.75,188.43,1.05,CIJ,MAKE-UP,1,2,EDI,EDI,1.28
3,114,6,11.38,1968,0,174,145.43,103.28,4.43,CIJ,MAKE-UP,3,4,EDI,EDI,1.25
4,126,49,61.94,1115,0,19,835.78,605.25,8.84,TIJ,INK,1,1,EDI,EDI,0.75


In [39]:
variables.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8367 entries, 0 to 8366
Data columns (total 16 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   CUSTOMER_SITE_ID             8367 non-null   int64  
 1   Recency                      8367 non-null   int64  
 2   Frequency                    8367 non-null   float64
 3   Tenure                       8367 non-null   int64  
 4   Churned_365                  8367 non-null   int64  
 5   Num_of_Trxns                 8367 non-null   int64  
 6   Avg_Trxn_Amt                 8367 non-null   float64
 7   Avg_Margin                   8367 non-null   float64
 8   Avg_Quantity                 8367 non-null   float64
 9   Mode_of_Product_Family       8367 non-null   object 
 10  Mode_of_Product_Model        8367 non-null   object 
 11  Types_of_Product_Family      8367 non-null   int64  
 12  Types_of_Product_Model       8367 non-null   int64  
 13  Most_Frequent_Sale

# Joining calculated variables with CUSTOMER_SITE_ID unique variables

In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 380401 entries, 0 to 544593
Data columns (total 35 columns):
 #   Column                          Non-Null Count   Dtype         
---  ------                          --------------   -----         
 0   Site_Level_Price_Index          380401 non-null  float64       
 1   CUSTOMER_ID                     380401 non-null  int64         
 2   CUSTOMER_SITE_ID                380401 non-null  int64         
 3   SHORT_VERTICAL                  380401 non-null  object        
 4   POSTAL_CODE                     380401 non-null  int64         
 5   CUSTOMER_CLASS                  380401 non-null  object        
 6   TERRITORY_TYPE                  380401 non-null  object        
 7   SUPPLIES_SEGMENTATION           380401 non-null  object        
 8   SUPPLIES_DECLINE_REASONS        380401 non-null  object        
 9   DUNS_NUMBER                     380401 non-null  int64         
 10  TRX_DATE                        380401 non-null  datetim

In [41]:
# Pull out the columns that describe the site itself rather than a single transaction.
# NOTE(review): these are presumed constant within each CUSTOMER_SITE_ID - confirm.
site_level_columns = [
    'CUSTOMER_SITE_ID', 'SHORT_VERTICAL', 'POSTAL_CODE',
    'CUSTOMER_CLASS', 'TERRITORY_TYPE',
    'SUPPLIES_SEGMENTATION', 'SUPPLIES_DECLINE_REASONS', 'DUNS_NUMBER',
    'Total_SVC_Incidents', 'Total_Repeat_Calls',
    'Total_FTF_Calls', 'Most_Frequent_Interaction_Type', 'Total_Visits',
    'Total_Cases', 'Num_of_Active_Install_Bases',
    'Total_Contracts', 'Contract_length',
    'Contract_Category',
    'STRATEGIC_ACCOUNTS', 'TERRITORY_REGION',
]
df_unique_vairables = df[site_level_columns]
df_unique_vairables.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 380401 entries, 0 to 544593
Data columns (total 20 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   CUSTOMER_SITE_ID                380401 non-null  int64  
 1   SHORT_VERTICAL                  380401 non-null  object 
 2   POSTAL_CODE                     380401 non-null  int64  
 3   CUSTOMER_CLASS                  380401 non-null  object 
 4   TERRITORY_TYPE                  380401 non-null  object 
 5   SUPPLIES_SEGMENTATION           380401 non-null  object 
 6   SUPPLIES_DECLINE_REASONS        380401 non-null  object 
 7   DUNS_NUMBER                     380401 non-null  int64  
 8   Total_SVC_Incidents             380401 non-null  float64
 9   Total_Repeat_Calls              380401 non-null  float64
 10  Total_FTF_Calls                 380401 non-null  float64
 11  Most_Frequent_Interaction_Type  380401 non-null  object 
 12  Total_Visits    

In [42]:
# drop duplicate rows, intended to leave one row for each CUSTOMER_SITE_ID
# (re-assign instead of mutating in place: the frame is a column slice of df)
df_unique_vairables = df_unique_vairables.drop_duplicates()
df_unique_vairables.shape

(8367, 20)

In [43]:
# Order the site-level table by site id.
df_unique_vairables = df_unique_vairables.sort_values('CUSTOMER_SITE_ID')

In [44]:
df_unique_vairables.shape

(8367, 20)

In [45]:
variables.shape

(8367, 16)

In [46]:
# combine the calculated variables and CUSTOMER_SITE_ID unique variables
# validate='one_to_one' makes the merge fail loudly if either table has more than
# one row for a site, instead of silently multiplying rows.
all_variables = pd.merge(variables, df_unique_vairables,
                         how='inner', on='CUSTOMER_SITE_ID',
                         validate='one_to_one')

In [47]:
all_variables.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8367 entries, 0 to 8366
Data columns (total 35 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   CUSTOMER_SITE_ID                8367 non-null   int64  
 1   Recency                         8367 non-null   int64  
 2   Frequency                       8367 non-null   float64
 3   Tenure                          8367 non-null   int64  
 4   Churned_365                     8367 non-null   int64  
 5   Num_of_Trxns                    8367 non-null   int64  
 6   Avg_Trxn_Amt                    8367 non-null   float64
 7   Avg_Margin                      8367 non-null   float64
 8   Avg_Quantity                    8367 non-null   float64
 9   Mode_of_Product_Family          8367 non-null   object 
 10  Mode_of_Product_Model           8367 non-null   object 
 11  Types_of_Product_Family         8367 non-null   int64  
 12  Types_of_Product_Model          83

In [48]:
all_variables.head()

Unnamed: 0,CUSTOMER_SITE_ID,Recency,Frequency,Tenure,Churned_365,Num_of_Trxns,Avg_Trxn_Amt,Avg_Margin,Avg_Quantity,Mode_of_Product_Family,Mode_of_Product_Model,Types_of_Product_Family,Types_of_Product_Model,Most_Frequent_Sales_Channel,Most_Frequent_Order_Type,Avg_Price_Index,SHORT_VERTICAL,POSTAL_CODE,CUSTOMER_CLASS,TERRITORY_TYPE,SUPPLIES_SEGMENTATION,SUPPLIES_DECLINE_REASONS,DUNS_NUMBER,Total_SVC_Incidents,Total_Repeat_Calls,Total_FTF_Calls,Most_Frequent_Interaction_Type,Total_Visits,Total_Cases,Num_of_Active_Install_Bases,Total_Contracts,Contract_length,Contract_Category,STRATEGIC_ACCOUNTS,TERRITORY_REGION
0,24,24,17.67,53,0,4,3102.11,2901.24,12.0,LCM,INK,2,2,Copy,STANDARD DOMESTIC,0.79,GRAPHICS,60085,END USER,Industrial,S,,144782380,13.0,7.0,6.0,Call,81.0,1.97,5.0,0.0,0.0,No Contract,0,MW
1,90,50,18.86,1829,0,98,233.9,172.39,11.37,CIJ,MAKE-UP,1,3,EDI,EDI,1.35,CHEMICALS,65802,END USER,Industrial,S,,43937895,57.0,13.0,44.0,Call,53.0,3.03,6.0,6.0,1003.0,FSMA,0,MC
2,111,415,72.85,1457,1,21,220.75,188.43,1.05,CIJ,MAKE-UP,1,2,EDI,EDI,1.28,GRAPHICS,56073,END USER,Industrial,S,,119130057,1.0,0.0,1.0,Call,22.0,3.03,2.0,0.0,0.0,No Contract,0,MW
3,114,6,11.38,1968,0,174,145.43,103.28,4.43,CIJ,MAKE-UP,3,4,EDI,EDI,1.25,PHARMA & MEDICAL,92614,END USER,Industrial,M,,84160407,57.0,14.0,43.0,Call,70.0,1.0,15.0,11.0,521.64,FSMA,0,NW
4,126,49,61.94,1115,0,19,835.78,605.25,8.84,TIJ,INK,1,1,EDI,EDI,0.75,PHARMA & MEDICAL,8822,END USER,Industrial,S,Over Stocked / Timing,36781508,1.0,0.0,1.0,Call,13.0,1.0,2.0,0.0,0.0,No Contract,0,NE


In [49]:
all_variables.shape

(8367, 35)

In [50]:
all_variables.head()

Unnamed: 0,CUSTOMER_SITE_ID,Recency,Frequency,Tenure,Churned_365,Num_of_Trxns,Avg_Trxn_Amt,Avg_Margin,Avg_Quantity,Mode_of_Product_Family,Mode_of_Product_Model,Types_of_Product_Family,Types_of_Product_Model,Most_Frequent_Sales_Channel,Most_Frequent_Order_Type,Avg_Price_Index,SHORT_VERTICAL,POSTAL_CODE,CUSTOMER_CLASS,TERRITORY_TYPE,SUPPLIES_SEGMENTATION,SUPPLIES_DECLINE_REASONS,DUNS_NUMBER,Total_SVC_Incidents,Total_Repeat_Calls,Total_FTF_Calls,Most_Frequent_Interaction_Type,Total_Visits,Total_Cases,Num_of_Active_Install_Bases,Total_Contracts,Contract_length,Contract_Category,STRATEGIC_ACCOUNTS,TERRITORY_REGION
0,24,24,17.67,53,0,4,3102.11,2901.24,12.0,LCM,INK,2,2,Copy,STANDARD DOMESTIC,0.79,GRAPHICS,60085,END USER,Industrial,S,,144782380,13.0,7.0,6.0,Call,81.0,1.97,5.0,0.0,0.0,No Contract,0,MW
1,90,50,18.86,1829,0,98,233.9,172.39,11.37,CIJ,MAKE-UP,1,3,EDI,EDI,1.35,CHEMICALS,65802,END USER,Industrial,S,,43937895,57.0,13.0,44.0,Call,53.0,3.03,6.0,6.0,1003.0,FSMA,0,MC
2,111,415,72.85,1457,1,21,220.75,188.43,1.05,CIJ,MAKE-UP,1,2,EDI,EDI,1.28,GRAPHICS,56073,END USER,Industrial,S,,119130057,1.0,0.0,1.0,Call,22.0,3.03,2.0,0.0,0.0,No Contract,0,MW
3,114,6,11.38,1968,0,174,145.43,103.28,4.43,CIJ,MAKE-UP,3,4,EDI,EDI,1.25,PHARMA & MEDICAL,92614,END USER,Industrial,M,,84160407,57.0,14.0,43.0,Call,70.0,1.0,15.0,11.0,521.64,FSMA,0,NW
4,126,49,61.94,1115,0,19,835.78,605.25,8.84,TIJ,INK,1,1,EDI,EDI,0.75,PHARMA & MEDICAL,8822,END USER,Industrial,S,Over Stocked / Timing,36781508,1.0,0.0,1.0,Call,13.0,1.0,2.0,0.0,0.0,No Contract,0,NE


In [51]:
# Move the churn flag to the last column: pop removes it, re-assigning appends it at the end.
all_variables['Churned_365'] = all_variables.pop('Churned_365')
all_variables.head()

Unnamed: 0,CUSTOMER_SITE_ID,Recency,Frequency,Tenure,Num_of_Trxns,Avg_Trxn_Amt,Avg_Margin,Avg_Quantity,Mode_of_Product_Family,Mode_of_Product_Model,Types_of_Product_Family,Types_of_Product_Model,Most_Frequent_Sales_Channel,Most_Frequent_Order_Type,Avg_Price_Index,SHORT_VERTICAL,POSTAL_CODE,CUSTOMER_CLASS,TERRITORY_TYPE,SUPPLIES_SEGMENTATION,SUPPLIES_DECLINE_REASONS,DUNS_NUMBER,Total_SVC_Incidents,Total_Repeat_Calls,Total_FTF_Calls,Most_Frequent_Interaction_Type,Total_Visits,Total_Cases,Num_of_Active_Install_Bases,Total_Contracts,Contract_length,Contract_Category,STRATEGIC_ACCOUNTS,TERRITORY_REGION,Churned_365
0,24,24,17.67,53,4,3102.11,2901.24,12.0,LCM,INK,2,2,Copy,STANDARD DOMESTIC,0.79,GRAPHICS,60085,END USER,Industrial,S,,144782380,13.0,7.0,6.0,Call,81.0,1.97,5.0,0.0,0.0,No Contract,0,MW,0
1,90,50,18.86,1829,98,233.9,172.39,11.37,CIJ,MAKE-UP,1,3,EDI,EDI,1.35,CHEMICALS,65802,END USER,Industrial,S,,43937895,57.0,13.0,44.0,Call,53.0,3.03,6.0,6.0,1003.0,FSMA,0,MC,0
2,111,415,72.85,1457,21,220.75,188.43,1.05,CIJ,MAKE-UP,1,2,EDI,EDI,1.28,GRAPHICS,56073,END USER,Industrial,S,,119130057,1.0,0.0,1.0,Call,22.0,3.03,2.0,0.0,0.0,No Contract,0,MW,1
3,114,6,11.38,1968,174,145.43,103.28,4.43,CIJ,MAKE-UP,3,4,EDI,EDI,1.25,PHARMA & MEDICAL,92614,END USER,Industrial,M,,84160407,57.0,14.0,43.0,Call,70.0,1.0,15.0,11.0,521.64,FSMA,0,NW,0
4,126,49,61.94,1115,19,835.78,605.25,8.84,TIJ,INK,1,1,EDI,EDI,0.75,PHARMA & MEDICAL,8822,END USER,Industrial,S,Over Stocked / Timing,36781508,1.0,0.0,1.0,Call,13.0,1.0,2.0,0.0,0.0,No Contract,0,NE,0


In [52]:
all_variables.shape

(8367, 35)

## Churn definition update

In [53]:
# Clear the churn flag for sites that are marked churned, have a Frequency above 365,
# and whose supplies decline reason is one of the "weak" reasons listed below.
weak_churn_reason = [
    'Over Stocked / Timing',
    'Migration to 1000 Line/TIJ/TTO/LCM/LPA',
    'Moved Equipment',
    'Production Down (timing)',
    'Migration to Lasers',
    'Production / Code Reduction',
    'Recent Regain/Win-back',
    'Served by Authorized Distributor',
    'Seasonal Producer',
    'Project Based',
    'Printing/EQ downtime Issues',
    'VJ Operations Issues',
    'Pricing / Discounting',
    'Financial Distress/Credit Hold',
]

flagged_churned = all_variables['Churned_365'] == 1
buys_rarely = all_variables['Frequency'] > 365
has_weak_reason = all_variables['SUPPLIES_DECLINE_REASONS'].isin(weak_churn_reason)
all_variables.loc[flagged_churned & buys_rarely & has_weak_reason, 'Churned_365'] = 0

In [54]:
# Set the churn flag for sites that are not marked churned but show churn signals:
# no contract, a "strong" decline reason, and a recency of at least twice the Frequency.
strong_churn_reason = [
    'Off Brand',
    'Site Closed',
    'AP Competitive Displacement',
    'No More Coding Requirement',
]

not_flagged = all_variables['Churned_365'] == 0
no_contract = all_variables['Contract_Category'] == "No Contract"
has_strong_reason = all_variables['SUPPLIES_DECLINE_REASONS'].isin(strong_churn_reason)
overdue = (all_variables['Recency'] / all_variables['Frequency']) >= 2
all_variables.loc[not_flagged & no_contract & has_strong_reason & overdue, 'Churned_365'] = 1

In [55]:
# Recompute tenure from the final churn flag:
#   not churned (0) -> Recency + Tenure
#   churned (1)     -> Tenure unchanged
# Any other flag value yields NaN. The new column replaces Tenure and sits last.
not_churned = all_variables['Churned_365'] == 0
churned = all_variables['Churned_365'] == 1
new_tenure = np.select(
    [not_churned, churned],
    [all_variables['Recency'] + all_variables['Tenure'], all_variables['Tenure']],
    default=np.nan,
)
all_variables = all_variables.drop(columns='Tenure').assign(Tenure=new_tenure)

# Exporting result table

In [56]:
all_variables.head()

Unnamed: 0,CUSTOMER_SITE_ID,Recency,Frequency,Num_of_Trxns,Avg_Trxn_Amt,Avg_Margin,Avg_Quantity,Mode_of_Product_Family,Mode_of_Product_Model,Types_of_Product_Family,Types_of_Product_Model,Most_Frequent_Sales_Channel,Most_Frequent_Order_Type,Avg_Price_Index,SHORT_VERTICAL,POSTAL_CODE,CUSTOMER_CLASS,TERRITORY_TYPE,SUPPLIES_SEGMENTATION,SUPPLIES_DECLINE_REASONS,DUNS_NUMBER,Total_SVC_Incidents,Total_Repeat_Calls,Total_FTF_Calls,Most_Frequent_Interaction_Type,Total_Visits,Total_Cases,Num_of_Active_Install_Bases,Total_Contracts,Contract_length,Contract_Category,STRATEGIC_ACCOUNTS,TERRITORY_REGION,Churned_365,Tenure
0,24,24,17.67,4,3102.11,2901.24,12.0,LCM,INK,2,2,Copy,STANDARD DOMESTIC,0.79,GRAPHICS,60085,END USER,Industrial,S,,144782380,13.0,7.0,6.0,Call,81.0,1.97,5.0,0.0,0.0,No Contract,0,MW,0,77.0
1,90,50,18.86,98,233.9,172.39,11.37,CIJ,MAKE-UP,1,3,EDI,EDI,1.35,CHEMICALS,65802,END USER,Industrial,S,,43937895,57.0,13.0,44.0,Call,53.0,3.03,6.0,6.0,1003.0,FSMA,0,MC,0,1879.0
2,111,415,72.85,21,220.75,188.43,1.05,CIJ,MAKE-UP,1,2,EDI,EDI,1.28,GRAPHICS,56073,END USER,Industrial,S,,119130057,1.0,0.0,1.0,Call,22.0,3.03,2.0,0.0,0.0,No Contract,0,MW,1,1457.0
3,114,6,11.38,174,145.43,103.28,4.43,CIJ,MAKE-UP,3,4,EDI,EDI,1.25,PHARMA & MEDICAL,92614,END USER,Industrial,M,,84160407,57.0,14.0,43.0,Call,70.0,1.0,15.0,11.0,521.64,FSMA,0,NW,0,1974.0
4,126,49,61.94,19,835.78,605.25,8.84,TIJ,INK,1,1,EDI,EDI,0.75,PHARMA & MEDICAL,8822,END USER,Industrial,S,Over Stocked / Timing,36781508,1.0,0.0,1.0,Call,13.0,1.0,2.0,0.0,0.0,No Contract,0,NE,0,1164.0


In [57]:
# Write the final per-site table to disk without the row index.
all_variables.to_csv(path_or_buf='variables.csv', index=False)