In [1]:
#importing the required libraries and modules
import pandas as pd
from datetime import datetime
import numpy as np
from scipy import stats
#sklearn: helps in splitting the dataframe, creating decision tree classifier and guaging the accuracy of the predictions
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [2]:
#reading file
df = pd.read_csv(r"C:\Users\kanak\OneDrive\Desktop\ChurnPrediction_data.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,merchant,time,amount_usd_in_cents
0,1,faa029c6b0,2034-06-17 23:34:14,6349
1,2,ed7a7d91aa,2034-12-27 00:40:38,3854
2,3,5608f200cf,2034-04-30 01:29:42,789
3,4,15b1a0d61e,2034-09-16 01:06:23,4452
4,5,4770051790,2034-07-22 16:21:42,20203


In [3]:
#removing the column that won't aid us in our evaluation/assessment
df = df.drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,merchant,time,amount_usd_in_cents
0,faa029c6b0,2034-06-17 23:34:14,6349
1,ed7a7d91aa,2034-12-27 00:40:38,3854
2,5608f200cf,2034-04-30 01:29:42,789
3,15b1a0d61e,2034-09-16 01:06:23,4452
4,4770051790,2034-07-22 16:21:42,20203


In [4]:
#converting cents to usd
df['amount_usd'] = df['amount_usd_in_cents'].apply(lambda x: x/100)
df.head()

Unnamed: 0,merchant,time,amount_usd_in_cents,amount_usd
0,faa029c6b0,2034-06-17 23:34:14,6349,63.49
1,ed7a7d91aa,2034-12-27 00:40:38,3854,38.54
2,5608f200cf,2034-04-30 01:29:42,789,7.89
3,15b1a0d61e,2034-09-16 01:06:23,4452,44.52
4,4770051790,2034-07-22 16:21:42,20203,202.03


In [5]:
#converting string to timestamp for the 'time' column - it would aid us later in the analysis
df['time'] = df['time'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
df.head()

Unnamed: 0,merchant,time,amount_usd_in_cents,amount_usd
0,faa029c6b0,2034-06-17 23:34:14,6349,63.49
1,ed7a7d91aa,2034-12-27 00:40:38,3854,38.54
2,5608f200cf,2034-04-30 01:29:42,789,7.89
3,15b1a0d61e,2034-09-16 01:06:23,4452,44.52
4,4770051790,2034-07-22 16:21:42,20203,202.03


In [6]:
#checking for null values
df.isnull().sum()
#we could see from the result that no null values were found

merchant               0
time                   0
amount_usd_in_cents    0
amount_usd             0
dtype: int64

In [7]:
#checking for NA values
df.isna().sum()
#we could see from the result that no NA values were found

merchant               0
time                   0
amount_usd_in_cents    0
amount_usd             0
dtype: int64

In [8]:
#part 1
#categorizing merchants into 3 groups -> Large Scale, Medium Scale, Small Scale
#for this, first we find out the sum of transaction amount for all the merchants
merchants_sum_transaction = df.groupby(['merchant'])['amount_usd'].sum().sort_values(ascending=False).reset_index()
merchants_sum_transaction2 = merchants_sum_transaction.copy()
merchants_sum_transaction

Unnamed: 0,merchant,amount_usd
0,44ee3ec72c,2369072.31
1,bdebc6831e,2188085.67
2,654930c922,1511162.95
3,ce61387781,1429055.63
4,82897075ed,1105732.29
...,...,...
14346,6af3314454,2.31
14347,de9bc54fd5,2.27
14348,12ecce9f0d,2.19
14349,fe9bf14103,2.09


In [9]:
#removing the outliers
merchants_sum_transaction[(np.abs(stats.zscore(merchants_sum_transaction.amount_usd)) < 3)]

Unnamed: 0,merchant,amount_usd
199,5608f200cf,209115.71
200,ff2221ba77,208877.78
201,36745cc693,208545.22
202,6044a863a1,208374.75
203,6db46eecd9,206912.71
...,...,...
14346,6af3314454,2.31
14347,de9bc54fd5,2.27
14348,12ecce9f0d,2.19
14349,fe9bf14103,2.09


In [10]:
#getting quantile ranges to divide the data into three categories
#values that lie between 0 and 0.33 would be categorized as Small Scale Merchant
#values that lie between 0.33 and 0.66 would be categorized as Medium Scale Expenditure Merchant
#values that lie between 0.66 and 1 would be categorized as Large Scale Expenditure Merchant
merchants_sum_transaction.quantile([0.33,0.66,1])

Unnamed: 0,amount_usd
0.33,592.01
0.66,4292.71
1.0,2369072.31


In [11]:
#assigning categories to all merchants by introducing a new column
merchants_sum_transaction2['category'] = merchants_sum_transaction2['amount_usd'].apply(lambda x: 'Small Scale' if x < 592.01 else ('Medium Scale' if (x > 592.01 and x < 4292.71) else 'Large Scale'))
merchants_sum_transaction2.sort_values(by='amount_usd', ascending=False).head(10)

Unnamed: 0,merchant,amount_usd,category
0,44ee3ec72c,2369072.31,Large Scale
1,bdebc6831e,2188085.67,Large Scale
2,654930c922,1511162.95,Large Scale
3,ce61387781,1429055.63,Large Scale
4,82897075ed,1105732.29,Large Scale
5,78639fbae7,1089211.62,Large Scale
6,366ba8e0da,1019238.87,Large Scale
7,005e8bb6fb,988351.7,Large Scale
8,a5e82e285b,957098.71,Large Scale
9,207cc1c4a7,954091.46,Large Scale


In [12]:
merchants_sum_transaction2.sort_values(by='amount_usd', ascending=False).tail(10)

Unnamed: 0,merchant,amount_usd,category
14341,62fadb655e,2.47,Small Scale
14342,607cd654f6,2.46,Small Scale
14343,6de638a30a,2.33,Small Scale
14344,1b5a2ea8d2,2.33,Small Scale
14345,e3c114e84e,2.32,Small Scale
14346,6af3314454,2.31,Small Scale
14347,de9bc54fd5,2.27,Small Scale
14348,12ecce9f0d,2.19,Small Scale
14349,fe9bf14103,2.09,Small Scale
14350,83c1813b63,2.01,Small Scale


In [13]:
#part 2
#getting merchants that have possibly churned

According to the analysis performed, a merchant has churned if, after making some initial transactions, the merchant stops making transactions until the end of a given time period. A merchant can be considered churned after a certain period of time which depends on the amount of the last transaction made by the merchant.
For example: A merchant 0002b63b92 made the first transaction worth $33.79 on 2033-05-16 after which no transaction has been made till the end date of the given dataset i.e. 12-31-2034. So, as the merchant did not make any transaction for 594 days, it was assumed that this merchant has churned.

In [14]:
#checking for number of transactions from each merchant
merchant_transaction_count_df = df.groupby(['merchant'])['time'].count().sort_values(ascending=False).reset_index()
merchant_transaction_count_df.rename(columns={"time": "transaction_count"}, inplace=True)
merchant_transaction_count_df

Unnamed: 0,merchant,transaction_count
0,5608f200cf,25512
1,53b3fbeae2,12178
2,1ddaea9838,12042
3,89e2d29885,11969
4,654930c922,11222
...,...,...
14346,9847b12953,1
14347,985bf67891,1
14348,9865dca5a3,1
14349,988269b4df,1


In [15]:
#checking total expense for each merchant
df.groupby(['merchant'])['amount_usd'].sum().sort_values(ascending=False)

merchant
44ee3ec72c    2369072.31
bdebc6831e    2188085.67
654930c922    1511162.95
ce61387781    1429055.63
82897075ed    1105732.29
                 ...    
6af3314454          2.31
de9bc54fd5          2.27
12ecce9f0d          2.19
fe9bf14103          2.09
83c1813b63          2.01
Name: amount_usd, Length: 14351, dtype: float64

In [16]:
#checking patterns for merchants
#we see the consecutive transaction values for merchants - that can help us in finding patterns
groups = df.groupby(['merchant', 'amount_usd'])['time'].apply(list)
merchant_grouped_transactions = groups.reset_index()
merchant_grouped_transactions

Unnamed: 0,merchant,amount_usd,time
0,0002b63b92,33.79,[2033-05-16 20:07:57]
1,0002d07bba,20.57,[2034-12-15 09:56:19]
2,0002d07bba,55.49,[2034-10-11 17:02:26]
3,0002d07bba,378.30,[2034-10-17 17:57:32]
4,0002d07bba,438.42,[2034-11-13 15:42:55]
...,...,...,...
958245,fff1754102,128.71,[2034-07-20 13:26:26]
958246,fff1754102,271.50,[2033-08-08 12:15:11]
958247,fff1754102,459.03,[2034-07-24 22:55:29]
958248,fff1754102,464.11,[2034-07-14 22:30:29]


In [17]:
#sorting values
merchant_time_df = df.sort_values(['merchant','time'])

In [18]:
#creating dataframe having merchants and their last transaction amount
merchant_amount_last = merchant_time_df.groupby(['merchant'])['amount_usd'].last()
merchant_amount_last_df = merchant_amount_last.reset_index()
merchant_amount_last_df.rename(columns={"merchant": "merchant1", "amount_usd": "last_transaction_amt"}, inplace=True)
merchant_amount_last_df

Unnamed: 0,merchant1,last_transaction_amt
0,0002b63b92,33.79
1,0002d07bba,20.57
2,00057d4302,12.48
3,000bcff341,78.26
4,000ddbf0ca,102.99
...,...,...
14346,ffd3e45675,53.98
14347,ffe1f6b51a,65.17
14348,ffe26b900d,14.92
14349,ffec05edb9,52.18


In [19]:
#creating dataframe having merchants and their last date of transaction
merchant_time_last = merchant_time_df.groupby(['merchant'])['time'].last()
merchant_time_last_df = merchant_time_last.reset_index()
merchant_time_last_df.rename(columns={"merchant": "merchant2", "time": "last_transaction_dt"}, inplace=True)
merchant_time_last_df

Unnamed: 0,merchant2,last_transaction_dt
0,0002b63b92,2033-05-16 20:07:57
1,0002d07bba,2034-12-15 09:56:19
2,00057d4302,2033-08-04 04:26:40
3,000bcff341,2033-08-09 20:18:36
4,000ddbf0ca,2033-06-02 13:25:12
...,...,...
14346,ffd3e45675,2033-01-27 00:32:30
14347,ffe1f6b51a,2034-02-19 01:33:10
14348,ffe26b900d,2034-11-21 14:02:34
14349,ffec05edb9,2034-02-14 22:34:54


In [20]:
#creating dataframe having merchants and their first date of transaction
merchant_time_first = merchant_time_df.groupby(['merchant'])['time'].first()
merchant_time_first_df = merchant_time_first.reset_index()
merchant_time_first_df.rename(columns={"time": "first_transaction_dt"}, inplace=True)
merchant_time_first_df

Unnamed: 0,merchant,first_transaction_dt
0,0002b63b92,2033-05-16 20:07:57
1,0002d07bba,2034-10-11 17:02:26
2,00057d4302,2033-05-30 01:30:52
3,000bcff341,2033-08-09 20:18:36
4,000ddbf0ca,2033-06-02 13:25:12
...,...,...
14346,ffd3e45675,2033-01-04 04:35:29
14347,ffe1f6b51a,2033-06-04 00:11:12
14348,ffe26b900d,2033-12-22 04:01:55
14349,ffec05edb9,2034-01-25 20:14:36


In [21]:
#merging first and last transaction dates in one dataframe
merchant_time_first_last_df = pd.concat([merchant_time_first_df, merchant_time_last_df, merchant_amount_last_df], axis=1)
merchant_time_first_last_df.drop(['merchant1','merchant2'], axis=1, inplace = True)
merchant_time_first_last_df

Unnamed: 0,merchant,first_transaction_dt,last_transaction_dt,last_transaction_amt
0,0002b63b92,2033-05-16 20:07:57,2033-05-16 20:07:57,33.79
1,0002d07bba,2034-10-11 17:02:26,2034-12-15 09:56:19,20.57
2,00057d4302,2033-05-30 01:30:52,2033-08-04 04:26:40,12.48
3,000bcff341,2033-08-09 20:18:36,2033-08-09 20:18:36,78.26
4,000ddbf0ca,2033-06-02 13:25:12,2033-06-02 13:25:12,102.99
...,...,...,...,...
14346,ffd3e45675,2033-01-04 04:35:29,2033-01-27 00:32:30,53.98
14347,ffe1f6b51a,2033-06-04 00:11:12,2034-02-19 01:33:10,65.17
14348,ffe26b900d,2033-12-22 04:01:55,2034-11-21 14:02:34,14.92
14349,ffec05edb9,2034-01-25 20:14:36,2034-02-14 22:34:54,52.18


In [22]:
#adding difference between the first and last transaction dates as a new column
merchant_time_first_last_df['time_difference'] = merchant_time_first_last_df['last_transaction_dt'] - merchant_time_first_last_df['first_transaction_dt']
merchant_time_first_last_df

Unnamed: 0,merchant,first_transaction_dt,last_transaction_dt,last_transaction_amt,time_difference
0,0002b63b92,2033-05-16 20:07:57,2033-05-16 20:07:57,33.79,0 days 00:00:00
1,0002d07bba,2034-10-11 17:02:26,2034-12-15 09:56:19,20.57,64 days 16:53:53
2,00057d4302,2033-05-30 01:30:52,2033-08-04 04:26:40,12.48,66 days 02:55:48
3,000bcff341,2033-08-09 20:18:36,2033-08-09 20:18:36,78.26,0 days 00:00:00
4,000ddbf0ca,2033-06-02 13:25:12,2033-06-02 13:25:12,102.99,0 days 00:00:00
...,...,...,...,...,...
14346,ffd3e45675,2033-01-04 04:35:29,2033-01-27 00:32:30,53.98,22 days 19:57:01
14347,ffe1f6b51a,2033-06-04 00:11:12,2034-02-19 01:33:10,65.17,260 days 01:21:58
14348,ffe26b900d,2033-12-22 04:01:55,2034-11-21 14:02:34,14.92,334 days 10:00:39
14349,ffec05edb9,2034-01-25 20:14:36,2034-02-14 22:34:54,52.18,20 days 02:20:18


In [23]:
#merging transaction count as a new column
merchant_time_first_last_df = merchant_time_first_last_df.merge(merchant_transaction_count_df)
merchant_time_first_last_df

Unnamed: 0,merchant,first_transaction_dt,last_transaction_dt,last_transaction_amt,time_difference,transaction_count
0,0002b63b92,2033-05-16 20:07:57,2033-05-16 20:07:57,33.79,0 days 00:00:00,1
1,0002d07bba,2034-10-11 17:02:26,2034-12-15 09:56:19,20.57,64 days 16:53:53,4
2,00057d4302,2033-05-30 01:30:52,2033-08-04 04:26:40,12.48,66 days 02:55:48,28
3,000bcff341,2033-08-09 20:18:36,2033-08-09 20:18:36,78.26,0 days 00:00:00,1
4,000ddbf0ca,2033-06-02 13:25:12,2033-06-02 13:25:12,102.99,0 days 00:00:00,1
...,...,...,...,...,...,...
14346,ffd3e45675,2033-01-04 04:35:29,2033-01-27 00:32:30,53.98,22 days 19:57:01,5
14347,ffe1f6b51a,2033-06-04 00:11:12,2034-02-19 01:33:10,65.17,260 days 01:21:58,53
14348,ffe26b900d,2033-12-22 04:01:55,2034-11-21 14:02:34,14.92,334 days 10:00:39,81
14349,ffec05edb9,2034-01-25 20:14:36,2034-02-14 22:34:54,52.18,20 days 02:20:18,3


In [24]:
#adding the nummber of days from last transaction date until the end of timeframe (the end date for the given dataframe)
dt_string = "2034-12-31 23:59:59"
dfEnd_date = datetime.strptime(dt_string, '%Y-%m-%d %H:%M:%S')
merchant_time_first_last_df['days_lastdt_from_dfEnd'] = (dfEnd_date - merchant_time_first_last_df['last_transaction_dt'])
merchant_time_first_last_df

Unnamed: 0,merchant,first_transaction_dt,last_transaction_dt,last_transaction_amt,time_difference,transaction_count,days_lastdt_from_dfEnd
0,0002b63b92,2033-05-16 20:07:57,2033-05-16 20:07:57,33.79,0 days 00:00:00,1,594 days 03:52:02
1,0002d07bba,2034-10-11 17:02:26,2034-12-15 09:56:19,20.57,64 days 16:53:53,4,16 days 14:03:40
2,00057d4302,2033-05-30 01:30:52,2033-08-04 04:26:40,12.48,66 days 02:55:48,28,514 days 19:33:19
3,000bcff341,2033-08-09 20:18:36,2033-08-09 20:18:36,78.26,0 days 00:00:00,1,509 days 03:41:23
4,000ddbf0ca,2033-06-02 13:25:12,2033-06-02 13:25:12,102.99,0 days 00:00:00,1,577 days 10:34:47
...,...,...,...,...,...,...,...
14346,ffd3e45675,2033-01-04 04:35:29,2033-01-27 00:32:30,53.98,22 days 19:57:01,5,703 days 23:27:29
14347,ffe1f6b51a,2033-06-04 00:11:12,2034-02-19 01:33:10,65.17,260 days 01:21:58,53,315 days 22:26:49
14348,ffe26b900d,2033-12-22 04:01:55,2034-11-21 14:02:34,14.92,334 days 10:00:39,81,40 days 09:57:25
14349,ffec05edb9,2034-01-25 20:14:36,2034-02-14 22:34:54,52.18,20 days 02:20:18,3,320 days 01:25:05


In [25]:
diff_180 = pd.to_timedelta('180 days 23:59:59')
diff_365 = pd.to_timedelta('365 days 23:59:59')

In [26]:
#defining the criteria if a merchant has churned:
# - if  days_lastdt_from_dfEnd > 180 and last_transaction_amt < 5000 - merchant has churned
# - if  days_lastdt_from_dfEnd > 365 and last_transaction_amt between (5000,10001) - merchant has churned
# - if  last_transaction_amt > 10000 - merchant has not churned
# - else merchant has not churned
merchant_time_first_last_df['hasChurned'] = merchant_time_first_last_df.apply(lambda x: True if ((x.last_transaction_amt < 5000) and (x.days_lastdt_from_dfEnd > diff_180)) else (True if ((x.last_transaction_amt)in range(5000,10001) and (x.days_lastdt_from_dfEnd > diff_365)) else (False if (x.last_transaction_amt > 10000) else False)),axis=1)
merchant_time_first_last_df

Unnamed: 0,merchant,first_transaction_dt,last_transaction_dt,last_transaction_amt,time_difference,transaction_count,days_lastdt_from_dfEnd,hasChurned
0,0002b63b92,2033-05-16 20:07:57,2033-05-16 20:07:57,33.79,0 days 00:00:00,1,594 days 03:52:02,True
1,0002d07bba,2034-10-11 17:02:26,2034-12-15 09:56:19,20.57,64 days 16:53:53,4,16 days 14:03:40,False
2,00057d4302,2033-05-30 01:30:52,2033-08-04 04:26:40,12.48,66 days 02:55:48,28,514 days 19:33:19,True
3,000bcff341,2033-08-09 20:18:36,2033-08-09 20:18:36,78.26,0 days 00:00:00,1,509 days 03:41:23,True
4,000ddbf0ca,2033-06-02 13:25:12,2033-06-02 13:25:12,102.99,0 days 00:00:00,1,577 days 10:34:47,True
...,...,...,...,...,...,...,...,...
14346,ffd3e45675,2033-01-04 04:35:29,2033-01-27 00:32:30,53.98,22 days 19:57:01,5,703 days 23:27:29,True
14347,ffe1f6b51a,2033-06-04 00:11:12,2034-02-19 01:33:10,65.17,260 days 01:21:58,53,315 days 22:26:49,True
14348,ffe26b900d,2033-12-22 04:01:55,2034-11-21 14:02:34,14.92,334 days 10:00:39,81,40 days 09:57:25,False
14349,ffec05edb9,2034-01-25 20:14:36,2034-02-14 22:34:54,52.18,20 days 02:20:18,3,320 days 01:25:05,True


In [27]:
merchant_time_first_last_df['last_transaction_year'] = pd.to_datetime(merchant_time_first_last_df['last_transaction_dt']).dt.year
merchant_time_first_last_df['last_transaction_month'] = pd.to_datetime(merchant_time_first_last_df['last_transaction_dt']).dt.month
merchant_time_first_last_df['last_transaction_day'] = pd.to_datetime(merchant_time_first_last_df['last_transaction_dt']).dt.day
merchant_time_first_last_df['onlydays_lastdt_from_dfEnd'] = pd.to_numeric(merchant_time_first_last_df['days_lastdt_from_dfEnd'].dt.days, downcast='integer')
merchant_time_first_last_df

Unnamed: 0,merchant,first_transaction_dt,last_transaction_dt,last_transaction_amt,time_difference,transaction_count,days_lastdt_from_dfEnd,hasChurned,last_transaction_year,last_transaction_month,last_transaction_day,onlydays_lastdt_from_dfEnd
0,0002b63b92,2033-05-16 20:07:57,2033-05-16 20:07:57,33.79,0 days 00:00:00,1,594 days 03:52:02,True,2033,5,16,594
1,0002d07bba,2034-10-11 17:02:26,2034-12-15 09:56:19,20.57,64 days 16:53:53,4,16 days 14:03:40,False,2034,12,15,16
2,00057d4302,2033-05-30 01:30:52,2033-08-04 04:26:40,12.48,66 days 02:55:48,28,514 days 19:33:19,True,2033,8,4,514
3,000bcff341,2033-08-09 20:18:36,2033-08-09 20:18:36,78.26,0 days 00:00:00,1,509 days 03:41:23,True,2033,8,9,509
4,000ddbf0ca,2033-06-02 13:25:12,2033-06-02 13:25:12,102.99,0 days 00:00:00,1,577 days 10:34:47,True,2033,6,2,577
...,...,...,...,...,...,...,...,...,...,...,...,...
14346,ffd3e45675,2033-01-04 04:35:29,2033-01-27 00:32:30,53.98,22 days 19:57:01,5,703 days 23:27:29,True,2033,1,27,703
14347,ffe1f6b51a,2033-06-04 00:11:12,2034-02-19 01:33:10,65.17,260 days 01:21:58,53,315 days 22:26:49,True,2034,2,19,315
14348,ffe26b900d,2033-12-22 04:01:55,2034-11-21 14:02:34,14.92,334 days 10:00:39,81,40 days 09:57:25,False,2034,11,21,40
14349,ffec05edb9,2034-01-25 20:14:36,2034-02-14 22:34:54,52.18,20 days 02:20:18,3,320 days 01:25:05,True,2034,2,14,320


In [28]:
#the final model that we'll develop will be based on the pattern that emerge for each category of business that we have identified and frequency of transaction

In [29]:
#part 3
#for building a model, we'll apply the logistic regression for the fact that the nature of the target variable is boolean
#initializing the target variable | the aspect that needs to be predicted
targetVariableHasChurned = merchant_time_first_last_df.hasChurned

#initializing feature elements
#declaring the features in the dataset
featureElementsMerchantHasChurned = ['last_transaction_amt','last_transaction_year','last_transaction_month', 'last_transaction_day', 'onlydays_lastdt_from_dfEnd']

#extracting and separting the features from the dataset and storing it in another collection
featureSetMerchantHasChurned = merchant_time_first_last_df[featureElementsMerchantHasChurned]
featureSetMerchantHasChurned

Unnamed: 0,last_transaction_amt,last_transaction_year,last_transaction_month,last_transaction_day,onlydays_lastdt_from_dfEnd
0,33.79,2033,5,16,594
1,20.57,2034,12,15,16
2,12.48,2033,8,4,514
3,78.26,2033,8,9,509
4,102.99,2033,6,2,577
...,...,...,...,...,...
14346,53.98,2033,1,27,703
14347,65.17,2034,2,19,315
14348,14.92,2034,11,21,40
14349,52.18,2034,2,14,320


In [30]:
#shuffling the dataset - this is achieved by setting the shuffle attribute as True
#splitting the dataset in the ratio of 20:80::testing:training sets - this is achieved by setting the test_size attribute to 0.2
#supplying the feature set and the target variable initialized above as the inputs, the function returns the shuffled and split data sets
featureElementsTrainMerchantHasChurned, featureElementsTestMerchantHasChurned, targetVariableHasChurnedTrain, targetVariableHasChurnedTest = train_test_split(featureSetMerchantHasChurned, targetVariableHasChurned, test_size=0.2, shuffle= True)

In [31]:
#taking a look at the features and the target variable for the training set
#features for the training dataset
featureElementsTrainMerchantHasChurned.head()

Unnamed: 0,last_transaction_amt,last_transaction_year,last_transaction_month,last_transaction_day,onlydays_lastdt_from_dfEnd
4391,32.32,2034,6,10,204
10679,44.35,2034,12,22,9
7908,619.24,2034,12,1,30
1050,333.15,2034,11,3,58
9923,291.7,2034,4,15,260


In [32]:
#target variable for the training dataset
targetVariableHasChurned.head()

0     True
1    False
2     True
3     True
4     True
Name: hasChurned, dtype: bool

In [33]:
len(featureElementsTrainMerchantHasChurned)

11480

In [34]:
######################################REMOVE THIS CELL#######################################
#initializing LogisticRegression model variable
#note that here we've set the maximum number of iterations as 100000 to avoid the warning for convergence
LogReg = LogisticRegression(solver='lbfgs', max_iter= 100000)
LogReg.fit(featureElementsTrainMerchantHasChurned, targetVariableHasChurnedTrain)

LogisticRegression(max_iter=100000)

In [35]:
#running predictions for the train data
targetVariableSHasChurnedPredictedTrain = LogReg.predict(featureElementsTrainMerchantHasChurned)

In [36]:
#running predictions for the test data
targetVariableHasChurnedPredictedTest = LogReg.predict(featureElementsTestMerchantHasChurned)

In [37]:
#guaging the accuracy for the isSpam prediction for the train data
metrics.accuracy_score(targetVariableHasChurnedTrain, targetVariableSHasChurnedPredictedTrain)

0.9930313588850174

In [38]:
#guaging the accuracy for the isSpam prediction for the test data
metrics.accuracy_score(targetVariableHasChurnedTest, targetVariableHasChurnedPredictedTest)

0.9937304075235109

In [39]:
#note that the accuracy level here is high, since the conditions mentioned for churning are few (3)
#introducing more conditions would improve the model