## Introduction

### A machine learning model that predicts whether a user will buy the product within the next 3 months, based on the user's past activity and user-level information.

### Import the necessary packages.

In [128]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # visualization
from matplotlib import pyplot as plt # visualization
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
import re
#import klib

### Import train and test dataset

In [111]:
# Load the labelled training rows and the unlabelled test rows.
# NOTE(review): the relative "../input/..." paths look like a Kaggle-style input mount — confirm before running elsewhere.
train=pd.read_csv("../input/jobathon-june-22/train_wn75k28.csv")
test=pd.read_csv("../input/jobathon-june-22/test_Wf7sxXF.csv")

### Pre-processing function


In [44]:
# Columns that jointly define a user group. The same key is used for the
# median imputation, the group-size feature and the per-group statistics.
_GROUP_COLS = [
    'campaign_var_1', 'campaign_var_2',
    'user_activity_var_1', 'user_activity_var_2',
    'user_activity_var_3', 'user_activity_var_4',
    'user_activity_var_5', 'user_activity_var_6',
    'user_activity_var_7', 'user_activity_var_8',
    'user_activity_var_9', 'user_activity_var_10',
    'user_activity_var_11', 'user_activity_var_12',
]

# Per-group aggregations of products_purchased. Kept as a list (not a set) so
# the resulting grp_* columns always come out in this order.
_GROUP_STATS = ['min', 'median', 'max', 'mean', 'count']


def _iso_week(dates):
    """Return the ISO week number for each timestamp in the Series *dates*.

    Replaces the ``Series.dt.week`` accessor, which newer pandas no longer
    provides. The dtype follows what that accessor produced: int64 when every
    date is present, float64 (NaN for missing dates) otherwise.
    """
    week = dates.dt.isocalendar().week
    return week.astype('float64' if week.isna().any() else 'int64')


def _add_date_features(df):
    """Add calendar features derived from ``created_at`` to *df* (in place).

    Also parses ``signup_date`` into ``date_time2`` for later use.
    Returns the same DataFrame for convenient chaining.
    """
    # Going through str turns missing values into the text 'nan', which
    # to_datetime then parses as NaT.
    df['date_time'] = pd.to_datetime(df["created_at"].astype('str'))
    df['date_time2'] = pd.to_datetime(df["signup_date"].astype('str'))

    created = df['date_time']
    df['day'] = created.dt.day                      # day of the month (1-31)
    df['day_label'] = created.dt.day_name()         # e.g. 'Monday'
    df['day_number'] = created.dt.dayofweek         # Monday=0 ... Sunday=6
    df['month_number'] = created.dt.month
    df['month_label'] = created.dt.strftime('%b')   # abbreviated month name
    df['year_quarter'] = created.dt.quarter
    df['week_of_year'] = _iso_week(created)
    df['year'] = created.dt.year
    # Despite the column name, this is the number of days in the month
    # (28-31); the day of the month itself is already in 'day'.
    df['dayofmonth'] = created.dt.daysinmonth
    df['dayofyear'] = created.dt.day_of_year
    # Monday-Friday vs Saturday/Sunday flags; both are 0 for a missing date.
    df['weekday'] = (df['day_number'] < 5).astype('int')
    df['weekend'] = (df['day_number'] >= 5).astype('int')
    df['month_start'] = created.dt.is_month_start.astype('int')
    df['month_end'] = created.dt.is_month_end.astype('int')
    df['quarter_start'] = created.dt.is_quarter_start.astype('int')
    df['quarter_end'] = created.dt.is_quarter_end.astype('int')
    df['year_start'] = created.dt.is_year_start.astype('int')
    df['year_end'] = created.dt.is_year_end.astype('int')
    return df


def _add_group_stats(df):
    """Left-join per-group statistics of ``products_purchased`` onto *df*.

    Adds ``grp_min``, ``grp_median``, ``grp_max``, ``grp_mean`` and
    ``grp_count`` (in that order) and returns the merged DataFrame, which is
    a new object with a fresh 0..n-1 index.
    """
    stats = (
        df.groupby(_GROUP_COLS)['products_purchased']
        .agg(_GROUP_STATS)
        .add_prefix('grp_')
        .reset_index()
    )
    return pd.merge(df, stats, on=_GROUP_COLS, how='left')


def pre_process(df):
    """Build the model features for a raw train or test DataFrame.

    The input must contain ``id``, ``created_at``, ``signup_date``,
    ``products_purchased`` and the 14 ``campaign_var_*`` /
    ``user_activity_var_*`` grouping columns. The input frame itself is left
    untouched; a new DataFrame is returned with the original columns followed
    by, in order:

    * ``products_purchased_null`` - 1 where products_purchased was missing;
    * the calendar features of ``created_at`` (``date_time`` ... ``year_end``);
    * ``user_count`` - number of rows sharing the same grouping key;
    * ``grp_min/median/max/mean/count`` - group statistics of the
      (median-imputed) products_purchased;
    * ``lead_after_signup`` - True when signup_date is later than created_at;
    * ``lead_after_signup_null`` - 1 where either date is missing;
    * ``date_diff`` - whole days from signup_date to created_at (0 if unknown);
    * ``date_diff_null`` - 1 where that difference could not be computed.

    The column order is part of the contract: downstream code selects
    features by position.
    """
    # Work on a copy so the caller's frame is not left half-modified (the
    # merge below returns a new object anyway, so callers must use the
    # return value either way).
    df = df.copy()

    # Record missingness before the imputation below erases it.
    df['products_purchased_null'] = df['products_purchased'].isnull().astype('int')

    df = _add_date_features(df)

    # Missing value filling: use the median of the rows sharing the same
    # grouping key. Rows whose group has no observed value stay NaN.
    group_median = df.groupby(_GROUP_COLS)['products_purchased'].transform('median')
    df['products_purchased'] = df['products_purchased'].fillna(group_median)

    # New feature: size of the group each row belongs to.
    df['user_count'] = df.groupby(_GROUP_COLS)['id'].transform('count')

    df = _add_group_stats(df)

    signup = df['date_time2']
    created = df['date_time']

    # New feature. A comparison involving NaT evaluates to False, so this
    # column never contains missing values and needs no filling.
    df['lead_after_signup'] = signup > created
    # The missing indicator therefore has to come from the dates themselves;
    # taking isnull() of the comparison result would always give 0.
    df['lead_after_signup_null'] = (signup.isnull() | created.isnull()).astype('int')

    # New feature: days between signup and lead creation (NaN if unknown).
    df['date_diff'] = (created - signup).dt.days
    df['date_diff_null'] = df['date_diff'].isnull().astype('int')
    df['date_diff'] = df['date_diff'].fillna(0)

    return df

### Apply the pre-processing steps to train dataset

In [112]:
# Rebind to the returned frame: pre_process ends with a merge, so the fully
# featured data is the return value rather than the object passed in.
train=pre_process(train)

### Install and import PyCaret

In [5]:
!pip install --user pycaret -full
!pip install numba==0.53

In [13]:
import pycaret

In [14]:
from pycaret.classification import *

In [46]:
# Show column positions and dtypes; the positional feature selection in the
# next step depends on this exact column order.
train.info()

### Split the predictors (X) and the response variable (y)

In [50]:
# Positional indices of the feature columns in the pre-processed frame.
# NOTE(review): selection is by position, so any change to the column order
# produced by pre_process silently changes the feature set — verify against
# train.info() that the target column 'buy' is not among these positions.
feature_positions = np.r_[2:5, 6:18, 19, 22, 24, 25, 27:50]

X = train.iloc[:, feature_positions]
y = train['buy']

### Create a new dataframe by concatenating the selected predictors and the response variable.

In [51]:
# Put features and target side by side in one frame (column-wise concat),
# which is then passed to setup() with the target named by column.
train1=pd.concat([X,y],axis=1)

### Setup model to train

In [52]:
# Configure the PyCaret classification experiment on the combined frame.
clf_pycaret = setup(
    data=train1,
    target='buy',
    session_id=112,  # fixed seed so runs are repeatable
    # Train / hold-out split: shuffled and stratified.
    data_split_shuffle=True,
    data_split_stratify=True,
    # Cross-validation: 5 shuffled stratified folds.
    fold_strategy='stratifiedkfold',
    fold=5,
    fold_shuffle=True,
    use_gpu=True,
)

### Compare all classifier models then select the top 3 models based on the f1 score.

In [53]:
# Rank the candidate classifiers by F1 and keep the three best (n_select=3).
top3 = compare_models(sort = 'F1',n_select=3)

### Blend the top 3 models

In [54]:
# Combine the three selected models into a single ensemble estimator.
blender = blend_models(top3)

### Finalize the blended model

In [55]:
# Refit the blended model for deployment. Per the PyCaret documentation,
# finalize_model trains on the whole dataset, hold-out set included, so
# `final` should not be used for hold-out evaluation.
final = finalize_model(blender)

### Model ROC curve

In [63]:
# Plot the ROC curve for the blended model *before* finalization:
# finalize_model refits on the full dataset (hold-out included), so a curve
# drawn from `final` would be scored on data the model has already seen.
plt.figure(figsize = (10,10))
plot_model(blender, plot = 'auc')

### Model precision and recall curve

In [64]:
# Precision-recall curve of the non-finalized blend, so the hold-out rows
# used for scoring were not part of the fit (the finalized model is refit on
# them and would look better than it is).
plt.figure(figsize = (10,10))
plot_model(blender, plot = 'pr')

### Model confusion matrix for hold out data

In [66]:
# Hold-out confusion matrix (use_train_data=False), shown as percentages.
# Uses the non-finalized blend: the finalized model has been refit on the
# hold-out rows, which would make this matrix misleadingly good.
plt.figure(figsize = (10,10))
plot_model(blender, plot = 'confusion_matrix',use_train_data = False, plot_kwargs = {'percent' : True})

### Error plot

In [68]:
# Class prediction error plot for the non-finalized blend; the finalized
# model was refit on the hold-out rows and cannot be evaluated on them fairly.
plt.figure(figsize = (10,10))
plot_model(blender, plot = 'error')

### Catboost model feature importance plot

In [79]:
# Feature importance of the CatBoost member of the blend.
# NOTE(review): assumes an estimator named 'catboost' was among the models
# picked by compare_models; the lookup fails otherwise — confirm.
plt.figure(figsize = (10,10))
plot_model(final.named_estimators_['catboost'], plot = 'feature')

### Validation curve for catboost model

In [76]:
# Validation curve of the CatBoost member of the blend.
# NOTE(review): assumes an estimator named 'catboost' was among the models
# picked by compare_models; the lookup fails otherwise — confirm.
plt.figure(figsize = (10,10))
plot_model(final.named_estimators_['catboost'], plot = 'vc')

### Apply the pre-process steps to test dataset

In [113]:
# Apply the same feature engineering to the test rows. Group-based features
# (median fill, user_count, grp_*) are computed from the test frame alone.
test=pre_process(test)

### Predict the test data by using finalized blended model

In [115]:
# Score the test rows using exactly the feature columns the model was trained
# on (selected by name via X.columns) and store the predicted class.
# NOTE(review): the 'Label' output column name is release-specific in PyCaret — confirm it matches the installed version.
test['buy']=(predict_model(final, data=test.loc[:,X.columns])['Label'])

### Prediction Plot

In [125]:
# Tag each frame with its origin so the two can be told apart after stacking.
train['data']='train'
test['data']='test'

In [127]:
# Stack the actual (train) and predicted (test) labels row-wise, keeping only
# the columns needed for the comparison plot.
plot_cols = ['date_time', 'buy', 'data']
combine_buy = pd.concat([train[plot_cols], test[plot_cols]], axis=0)

In [147]:
# Number of rows per (date, class, dataset) combination.
buy_counts = (
    combine_buy
    .groupby(['date_time', 'buy', 'data'])['buy']
    .agg(['count'])
    .reset_index()
)

# One panel per dataset (train / test), one line per class, independent axes.
g = sns.FacetGrid(
    buy_counts,
    col="data",
    hue='buy',
    height=8,
    aspect=.9,
    sharex=False,
    sharey=False,
);
g.map_dataframe(sns.lineplot, x="date_time", y='count');
g.add_legend();

### Create a result dataframe

In [58]:
# Submission frame: the row identifier and the predicted label only.
res = test[['id', 'buy']].copy()

### Write the result dataframe to csv

In [59]:
# Write the submission file without the DataFrame index column.
res.to_csv("pycaret_blend_final.csv",index=False)