### Exercise 2 - Create a model to predict customer churn

### 0. Import packages

In [102]:
import pandas as pd
import os
import gc

### 1. Data Preparation

#### Read the data

In [103]:
# Directory holding the input CSVs. The original hard-coded location stays the
# default; set the CHURN_DATA_DIR environment variable to run the notebook on
# another machine without editing this cell.
path = os.environ.get('CHURN_DATA_DIR', '~/Documents/Git/Qustodio-Technical-Test/')

# Load the two input tables: sales.csv and user_activity.csv.
sales = pd.read_csv(os.path.join(path, 'sales.csv'))
user_activity = pd.read_csv(os.path.join(path, 'user_activity.csv'))

In [104]:
# Preview the first five rows of the sales table as loaded.
sales.head(n=5)

Unnamed: 0,order_id,account_id,start_date,plan,amount,currency,Unnamed: 6
0,C5G1ckzVUC1V,36369294,2019-03-17,MEDIUM,12.95,EUR,
1,LyPKxILXvkiu,36369294,2019-04-17,MEDIUM,12.95,EUR,
2,729R0C9dVx49,36369294,2019-05-17,MEDIUM,12.95,EUR,
3,RrxBXQYG9Qn8,13708705,2020-08-28,SMALL,8.95,EUR,
4,iYemtey2MjLT,940537915,2020-07-17,SMALL,8.95,EUR,


In [105]:
# Preview the first five rows of the user activity table as loaded.
user_activity.head(n=5)

Unnamed: 0,account_id,gender,genre1,genre2,type,games,age,hours
0,101530,male,role-playing,action,mobile,8,21,7.573853
1,731892,female,adventure,action,computer,9,25,4.620231
2,856432,male,action,role-playing,mobile,19,35,13.608988
3,1425820,male,adventure,adventure,mobile,8,20,8.648719
4,1881252,male,action,strategy,computer,6,18,8.929738


#### Data Exploration & Cleaning

In [106]:
# Show the dtype pandas inferred for each column of the sales table (run before any cleaning).
sales.dtypes

order_id        object
 account_id      int64
 start_date     object
 plan           object
 amount        float64
 currency       object
Unnamed: 6     float64
dtype: object

The `Unnamed: 6` column should be dropped, and the `start_date` column should be converted to a datetime type. Additionally, some column names appear to have leading or trailing whitespace that needs cleaning.

In [107]:
# Clean the sales table. Written without in-place mutation and with a tolerant
# drop so the cell can be re-run on an already-cleaned frame instead of failing
# with a KeyError because 'Unnamed: 6' is no longer present.
sales = (
    sales
    .rename(columns=lambda x: x.strip())           # strips leading/trailing whitespace from column names
    .drop(columns='Unnamed: 6', errors='ignore')   # removes the undesired column (no-op if already gone)
)
sales['start_date'] = pd.to_datetime(sales.start_date)  # transforms start_date into a datetime format

In [108]:
# Summarise the object-typed (string) columns: count, unique values, most frequent value.
sales.describe(include=['object'])

Unnamed: 0,order_id,plan,currency
count,14788,14788,14788
unique,14788,3,2
top,C5G1ckzVUC1V,SMALL,USD
freq,1,12440,7413


In [109]:
# Count the missing values in each column of the sales table.
sales.isnull().sum()

order_id      0
account_id    0
start_date    0
plan          0
amount        0
currency      0
dtype: int64

No NaN values have been found.

In [110]:
# List rows that are exact repeats of an earlier row (all columns compared).
sales.loc[sales.duplicated()]

Unnamed: 0,order_id,account_id,start_date,plan,amount,currency


No duplicates have been found.

In [111]:
# Show the dtype pandas inferred for each column of the user activity table.
user_activity.dtypes

account_id      int64
gender         object
genre1         object
genre2         object
type           object
games           int64
age             int64
hours         float64
dtype: object

In [112]:
# Summarise the object-typed (string) columns: count, unique values, most frequent value.
user_activity.describe(include=['object'])

Unnamed: 0,gender,genre1,genre2,type
count,2000,2000,2000,2000
unique,2,6,6,3
top,male,action,adventure,computer
freq,1400,588,555,1250


In [113]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
user_activity.describe()

Unnamed: 0,account_id,games,age,hours
count,2000.0,2000.0,2000.0,2000.0
mean,507339900.0,8.1575,25.997,9.134307
std,289071400.0,4.880226,6.093238,4.324985
min,101530.0,0.0,15.0,0.0
25%,256578300.0,5.0,22.0,6.340461
50%,504000000.0,8.0,25.0,8.635053
75%,754996700.0,11.0,29.0,11.566813
max,999643000.0,33.0,55.0,30.0


In [114]:
# Count the missing values in each column of the user activity table.
user_activity.isnull().sum()

account_id    0
gender        0
genre1        0
genre2        0
type          0
games         0
age           0
hours         0
dtype: int64

No NaN values have been found.

In [115]:
# List rows that are exact repeats of an earlier row (all columns compared).
user_activity.loc[user_activity.duplicated()]

Unnamed: 0,account_id,gender,genre1,genre2,type,games,age,hours


No duplicate values have been found.

In [116]:
# Use order_id as the index of the sales table and account_id as the index of
# the user activity table.
sales = sales.set_index('order_id')
user_activity = user_activity.set_index('account_id')

### 2. Create Churn Label

In [117]:
# Per-account order summary, computed in a single groupby pass with named
# aggregation. This replaces three separate groupbys stitched together by merges
# and a rename that depended on pandas' automatic _x/_y merge suffixes.
order_summary = (
    sales
    .groupby('account_id')
    .agg(
        Orders=('start_date', 'count'),          # how many purchases each user has made
        Date_First_Order=('start_date', 'min'),  # date of the first purchase for each user
        Date_Last_Order=('start_date', 'max'),   # date of the last purchase for each user
    )
    .reset_index()
)

# Attach the plan and currency found on each account's orders, then collapse the
# repeated rows. An account that appears with more than one plan/currency
# combination keeps one row per combination.
useful_information_sales = (
    order_summary
    .merge(sales[['account_id', 'plan', 'currency']], how='left', on='account_id')
    .drop_duplicates()
)
useful_information_sales

Unnamed: 0,account_id,Orders,Date_First_Order,Date_Last_Order,plan,currency
0,101530,4,2019-09-03,2019-12-03,SMALL,EUR
4,731892,11,2020-02-12,2020-12-12,SMALL,EUR
15,856432,12,2020-01-25,2020-12-25,SMALL,EUR
27,1425820,2,2019-06-25,2019-07-25,SMALL,EUR
29,1881252,1,2019-12-22,2019-12-22,SMALL,USD
...,...,...,...,...,...,...
14739,998101168,3,2020-10-13,2020-12-13,SMALL,EUR
14742,998103378,3,2020-03-14,2020-05-14,SMALL,EUR
14745,998392913,18,2019-07-03,2020-12-03,MEDIUM,USD
14763,999546295,17,2019-03-19,2020-07-19,SMALL,EUR


In [67]:
# Build the dataframe used for modelling with info from sales.csv and user_activity.csv.
model_df = user_activity.merge(useful_information_sales, how= 'left', on= 'account_id').set_index('account_id')

# The left merge must leave exactly one row per user. The previous parenthesised
# form `assert(cond, msg)` asserted a non-empty tuple, which is always true, so
# the check could never fail; the statement form below actually evaluates it.
assert model_df.shape[0] == user_activity.shape[0], 'Error! There are duplicates in the data'

del useful_information_sales, sales, user_activity # release memory by deleting dataframes that won't be used anymore
gc.collect()

30065

In [69]:
model_df.head()

Unnamed: 0_level_0,gender,genre1,genre2,type,games,age,hours,Orders,Date_First_Order,plan,currency
account_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
101530,male,role-playing,action,mobile,8,21,7.573853,4,2019-09-03,SMALL,EUR
731892,female,adventure,action,computer,9,25,4.620231,11,2020-02-12,SMALL,EUR
856432,male,action,role-playing,mobile,19,35,13.608988,12,2020-01-25,SMALL,EUR
1425820,male,adventure,adventure,mobile,8,20,8.648719,2,2019-06-25,SMALL,EUR
1881252,male,action,strategy,computer,6,18,8.929738,1,2019-12-22,SMALL,USD


In [None]:
# TODO: remove users with fewer than 3 orders



### 3. Feature Engineering

encoding

### 4. Model Building

### 5. Model Evaluation

### 6. Conclusions

### 7. Predictions