In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
%matplotlib inline

In [13]:
# Load the raw transactions; trans_date is parsed into datetimes while reading.
df = pd.read_csv(
    "rfm_xmas19.txt",
    sep=",",
    parse_dates=["trans_date"],
)
df.head()

Unnamed: 0,customer_id,trans_date,tran_amount
0,FM5295,2017-11-11,35
1,FM4768,2019-12-15,39
2,FM2122,2017-11-26,52
3,FM1217,2016-08-16,99
4,FM1850,2018-08-20,78


In [14]:
# (number of rows, number of columns) of the transactions table
df.shape

(125000, 3)

In [15]:
# Total amount spent per customer. Use the built-in groupby reduction instead of
# passing np.sum to agg(): recent pandas releases emit a FutureWarning for NumPy
# reducers handed to agg(), while .sum() gives the same result without it.
df.groupby('customer_id')['tran_amount'].sum()

customer_id
FM1112    1012
FM1113    1490
FM1114    1432
FM1115    1659
FM1116     857
          ... 
FM8996     582
FM8997     543
FM8998     624
FM8999     383
FM9000     533
Name: tran_amount, Length: 6889, dtype: int64

### Reason behind the request
* Surveying the best customers to understand why they are the best customers
* Investing in digital marketing for a specific segment
* Creating a rewards program for the best customers

* nebulous language

In [21]:
# Inspect column dtypes and non-null counts.
df.info()
# Every column reports 125000 non-null values out of 125000 rows, so the data
# has no missing values to clean.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125000 entries, 0 to 124999
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   customer_id  125000 non-null  object        
 1   trans_date   125000 non-null  datetime64[ns]
 2   tran_amount  125000 non-null  int64         
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 2.9+ MB


## Find churned customers who haven't purchased anything in over three months

In [23]:
# Group the transactions by customer once; the groupby object is reused below.
group_by_customer = df.groupby(by='customer_id')
# Most recent purchase date for every customer.
last_transaction = group_by_customer.trans_date.max()
last_transaction.sample(n=5)

customer_id
FM7322   2019-05-26
FM8193   2019-02-25
FM5903   2019-07-20
FM1305   2019-09-01
FM3418   2019-11-07
Name: trans_date, dtype: datetime64[ns]

In [26]:
# Another random peek at the per-customer latest purchase dates.
last_transaction.sample(n=5)

customer_id
FM7759   2019-08-29
FM8745   2019-07-14
FM4218   2019-11-19
FM8981   2019-01-20
FM5107   2019-11-21
Name: trans_date, dtype: datetime64[ns]

Recall that in this scenario, a churned customer is one who hasn't purchased anything since October 16, 2019 (three months), i.e. whose most recent purchase falls before that date.

*    Begin by using pd.DataFrame() to have the data in last_transaction as a dataframe. Assign it to best_churn.
*    Add a column called churned that should have the value of 1 if the customer has churned and 0 otherwise.
*        Create a datetime object representing October 16, 2019. Assign it to cutoff_day.
*        Use best_churn["trans_date"].apply() with an appropriate function to code the rationale in the instruction.


In [31]:
# Customers whose latest purchase is strictly before this date count as churned.
cutoff_day = dt.datetime(year=2019, month=10, day=16)

# One row per customer, indexed by customer_id, holding the latest purchase date.
best_churn = pd.DataFrame(last_transaction)

# 1 = churned, 0 = still active.
best_churn['churned'] = best_churn['trans_date'].apply(
    lambda last_purchase: int(last_purchase < cutoff_day)
)

## Simple weighted sum model to classify customers

In [33]:
# Number of purchases per customer, aligned to best_churn on the customer_id index.
best_churn = best_churn.assign(nr_of_transactions=group_by_customer.size())
best_churn.sample(n=5)

Unnamed: 0_level_0,trans_date,churned,nr_of_transactions
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
FM2905,2019-09-20,1,17
FM7138,2019-12-03,0,15
FM4324,2019-11-03,0,20
FM1632,2019-12-08,0,17
FM4046,2019-09-13,1,11


In [37]:
# Number of distinct customers (one group per customer_id).
group_by_customer.ngroups

6889

In [38]:
# Spot-check a few random customers after adding the transaction count.
best_churn.sample(n=5)

Unnamed: 0_level_0,trans_date,churned,nr_of_transactions
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
FM3423,2019-11-18,0,17
FM3277,2019-10-16,0,20
FM8626,2019-04-05,1,7
FM2616,2019-12-16,0,9
FM3908,2019-10-29,0,20


In [39]:
# Total spend per customer. Select tran_amount explicitly: calling .sum() on the
# whole groupby aggregates every non-key column and returns a DataFrame, and on
# pandas 2.x (numeric_only=False by default) it fails on the datetime column.
# A single Series aligns cleanly with best_churn's customer_id index.
best_churn['amount_spent'] = group_by_customer['tran_amount'].sum()
best_churn.sample(5)

Unnamed: 0_level_0,trans_date,churned,nr_of_transactions,amount_spent
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
FM5676,2019-10-17,0,21,1311
FM8981,2019-01-20,1,9,453
FM5193,2019-11-28,0,24,1586
FM6096,2019-09-21,1,19,1386
FM3706,2019-12-05,0,29,1929


In [41]:
# trans_date was only needed to derive `churned`. Rebind instead of dropping in
# place, and tolerate the column already being gone, so re-running this cell
# does not raise a KeyError.
best_churn = best_churn.drop(columns='trans_date', errors='ignore')
best_churn.sample(5)

Unnamed: 0_level_0,churned,nr_of_transactions,amount_spent
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
FM7149,1,19,857
FM4470,1,24,1688
FM6040,0,15,1001
FM2584,1,25,1814
FM7569,1,9,478


In [45]:
# Observed range of each raw feature (the inputs to the min-max scaling).
(
    best_churn
    .loc[:, ['nr_of_transactions', 'amount_spent']]
    .describe()
    .loc[['min', 'max']]
)

Unnamed: 0,nr_of_transactions,amount_spent
min,4.0,149.0
max,39.0,2933.0


## Min-max feature scaling

In [48]:
def min_max_scale(values):
    """Rescale a numeric Series linearly so its minimum maps to 0 and its maximum to 1."""
    lowest = values.min()
    highest = values.max()
    return (values - lowest) / (highest - lowest)


# Put both features on a common [0, 1] scale so they can be combined in one score.
best_churn['scaled_tran'] = min_max_scale(best_churn['nr_of_transactions'])
best_churn['scaled_amount'] = min_max_scale(best_churn['amount_spent'])
best_churn.sample(n=5)

Unnamed: 0_level_0,churned,nr_of_transactions,amount_spent,scaled_tran,scaled_amount
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
FM2585,0,21,1484,0.485714,0.479526
FM4096,0,20,1481,0.457143,0.478448
FM4109,0,20,1284,0.457143,0.407687
FM3453,0,25,1726,0.6,0.566451
FM1203,0,21,1536,0.485714,0.498204


In [49]:
# Equal-weight sum of the two scaled features, expressed on a 0-100 scale.
tran_weight = 0.5
amount_weight = 0.5
weighted_sum = (
    tran_weight * best_churn['scaled_tran']
    + amount_weight * best_churn['scaled_amount']
)
best_churn['score'] = 100 * weighted_sum
best_churn.sample(n=5)

Unnamed: 0_level_0,churned,nr_of_transactions,amount_spent,scaled_tran,scaled_amount,score
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
FM4590,0,24,1625,0.571429,0.530172,55.080049
FM4735,0,18,1311,0.4,0.417385,40.869253
FM3451,0,23,1736,0.542857,0.570043,55.645012
FM2276,1,14,1071,0.285714,0.331178,30.844622
FM7736,1,11,535,0.2,0.138649,16.932471


# Rank customers from the highest score to the lowest.
best_churn = best_churn.sort_values('score', ascending=False)
best_churn.head(n=5)

## Decide on a threshold to determine which customers are "the best"
* Clustering methods (k-means, hierarchical clustering, ...) could be used to find the threshold, but they would be a waste of time for this request.

* Here are some factors that you decided to take into account:

    The budget is $1,000.
    No indication was given about how much each coupon would be worth — it's for you to decide.
    The coupons need to be good enough to prompt people to actually use them.
    They can't be too high because:
        That reduces the number of customers who get them.
        It would be like giving away money.
        Due to price dumping, it could be illegal.
    From your experience, you know that a 30% discount is already very enticing;
 *   With all this in mind, you decide to employ the following strategy to determine the cutoff point:

    Find the mean of the transactions and compute 30% of that. Make this the value of the coupon;
    Divide the budget by the value obtained above to get the number of coupons you're going to be sending out;
    Pick the first *n* churned customers, where *n* is the result of the calculation done in the previous step. This is your cutoff point.


In [54]:
# Coupon value = 30% of the mean transaction amount;
# number of coupons = budget divided by the coupon value.
discount_rate = 0.3
budget = 1000
coupon = df.tran_amount.mean() * discount_rate
nr_of_customers = budget / coupon
nr_of_customers

51.28843314123969

In [55]:
# Show the coupon value and the number of coupons on separate lines.
print("\n".join(map(str, (coupon, nr_of_customers))))

19.4975736
51.28843314123969


* Send the coupon to the top 50 churned customers
* Once you complete this urgent request, you email your manager the text file and bring up the following points:

    Given the budget, you decided to send $20 coupons to the 50 best customers.
    A brief mention that you ranked customers on number of purchases made and amount spent — without getting into too much detail.
    The deliverable has more than just the customer IDs, but the recipients should not worry about the other columns.


In [58]:
# Keep churned customers only, then take the first 50 rows; this relies on
# best_churn already being sorted by score in descending order.
churned_only = best_churn.loc[best_churn['churned'] == 1]
top_50_churned = churned_only.head(50)
top_50_churned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 50 entries, FM4320 to FM1332
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   churned             50 non-null     int64  
 1   nr_of_transactions  50 non-null     int64  
 2   amount_spent        50 non-null     int64  
 3   scaled_tran         50 non-null     float64
 4   scaled_amount       50 non-null     float64
 5   score               50 non-null     float64
dtypes: float64(3), int64(3)
memory usage: 2.7+ KB


In [59]:
# Write the deliverable as comma-separated text; with to_csv's defaults the
# customer_id index is written as the first column, followed by the other columns.
top_50_churned.to_csv('best_customers.txt')