In [1]:
# import libraries
import warnings 
warnings.filterwarnings("ignore")

import matplotlib
matplotlib.use('nbagg')
%matplotlib inline

import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import seaborn as sn
import pickle
from datetime import datetime

import lightgbm
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

import tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,BatchNormalization
from tensorflow.keras.callbacks import ModelCheckpoint

    

In [2]:

# function to get median values for all numerical features
def get_median(data,numerical_columns):
    '''
    Compute the NaN-ignoring median of each requested column.

    data              : DataFrame (or any mapping of column name -> array-like)
    numerical_columns : iterable of column names to summarise

    Returns a dict mapping every name in numerical_columns to
    np.nanmedian of that column.
    '''
    return {name: np.nanmedian(data[name]) for name in numerical_columns}

#=================================================================


# function to get mode values for all categorical features
def get_mode(data,categorical_columns):
    '''
    Find the most frequent value of each requested column.

    data                : DataFrame holding the columns
    categorical_columns : iterable of column names to summarise

    Returns a dict mapping every name in categorical_columns to the first
    label of that column's value_counts() (i.e. the highest-count value;
    value_counts() skips NaN by default).
    '''
    return {
        name: data[name].value_counts().index[0]
        for name in categorical_columns
    }
    

In [3]:
# https://www.kaggle.com/hjsdssz/kkbox-churn-prediction-model 

# function to fill NULL values of numerical features with the median

def fill_with_median(data,numerical_columns,train_data_median):
    '''
    Replace missing values of numerical columns, modifying `data` in place.

    data              : DataFrame to be filled (mutated; nothing is returned)
    numerical_columns : iterable of column names to fill
    train_data_median : dict mapping column name -> fill value
                        (e.g. the output of get_median on the training data)

    Raises KeyError if a column is missing from `data` or `train_data_median`.
    '''
    for column in numerical_columns:
        # Assign the filled column back instead of calling
        # data[column].fillna(..., inplace=True): the chained in-place form
        # operates on a temporary under pandas Copy-on-Write and would leave
        # `data` unchanged.
        data[column] = data[column].fillna(train_data_median[column])

#===============================================================================

# function to fill NULL values of categorical features with the most frequently occurring value

def fill_with_mode(data,categorical_columns,train_data_mode):
    '''
    Replace missing values of categorical columns, modifying `data` in place.

    data                : DataFrame to be filled (mutated; nothing is returned)
    categorical_columns : iterable of column names to fill
    train_data_mode     : dict mapping column name -> fill value
                          (e.g. the output of get_mode on the training data)

    Raises KeyError if a column is missing from `data` or `train_data_mode`.
    '''
    for column in categorical_columns:
        # Assign the filled column back instead of calling
        # data[column].fillna(..., inplace=True): the chained in-place form
        # operates on a temporary under pandas Copy-on-Write and would leave
        # `data` unchanged.
        data[column] = data[column].fillna(train_data_mode[column])

In [4]:
def get_best_parameters(x,y,n_iter=10,cv=10,scoring=None,random_state=None):
    '''
    Identify good hyper-parameters for LGBMClassifier using RandomizedSearchCV.

    x, y         : training features and labels passed to the search's fit()
    n_iter       : number of parameter settings sampled (default 10)
    cv           : number of cross-validation folds (default 10)
    scoring      : scoring passed to RandomizedSearchCV; None uses the
                   estimator's default score
    random_state : seed for the parameter sampling; None leaves the search
                   unseeded, so repeated calls may return different settings

    Returns the `best_params_` dict of the fitted search, with one entry per
    key of the search space below.
    '''

    # NOTE(review): per the LightGBM docs, `subsample` (bagging_fraction) only
    # takes effect when `subsample_freq` is non-zero; it is left at its default
    # here, so the sampled `subsample` values may have no influence - confirm.
    search_space = {
        'max_depth':[2,3,4,5,6,7,8,9,10,11,12,13,14,15],
        'learning_rate':[0.01,0.05,0.1,0.15,0.2,0.3,0.5,0.8,1],
        'subsample':[0.2,0.3,0.5,0.8,0.9,1],
        'colsample_bytree':[0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1],
        'n_estimators':[50,100,150,200,250,300,400,500,700,800],
        'num_leaves':[31,35,40,45,50,55,60,70,80,90,100,200]
    }

    # NOTE(review): both the estimator and the search request n_jobs=-1, which
    # may oversubscribe the CPU - kept as in the original.
    estimator = LGBMClassifier(n_jobs=-1)

    search = RandomizedSearchCV(estimator, param_distributions=search_space, n_iter=n_iter,
                                scoring=scoring, cv=cv, random_state=random_state,
                                verbose=1, n_jobs=-1)
    search.fit(x,y)

    return search.best_params_


In [5]:
def onehot(col, num_classes=2):
    '''
    One-hot encode an iterable of integer class labels.

    col         : iterable of integer labels, each usable as a column index
                  into a row of width `num_classes`
    num_classes : width of the encoding (default 2, the binary churn label)

    Returns a float array of shape (len(col), num_classes) holding a 1 in the
    column given by each label and 0 elsewhere. An empty input yields an array
    of shape (0, num_classes).

    Raises IndexError if a label is out of range or is not an integer.
    '''
    labels = np.asarray(list(col))
    arr = np.zeros((labels.shape[0], num_classes))

    if labels.size == 0:
        # an empty array is float-typed and cannot be used as an index
        return arr

    if labels.dtype.kind == 'b':
        # a boolean array would be interpreted as a mask, not as class indices
        labels = labels.astype(np.intp)

    # one vectorised assignment instead of a Python loop over every row
    arr[np.arange(labels.shape[0]), labels] = 1
    return arr


In [6]:
def _fit_tuned_lgbm(model_no, x_train, y_train, x_test, y_test):
    '''
    Tune, fit and evaluate one LGBMClassifier for train_model().

    model_no         : number used only in the printed progress messages
    x_train, y_train : data used for the hyper-parameter search and the fit
    x_test, y_test   : hold-out data used only for the printed metrics

    Returns (fitted model, predict_proba on x_train, predict_proba on x_test).
    '''
    print("LightGBM {}".format(model_no))
    started = datetime.now()

    # getting best parameters
    best_param = get_best_parameters(x_train,y_train)
    print("LGBMClassifier-{} best parameters:".format(model_no), best_param)

    # training the classifier with the best parameters identified
    model = LGBMClassifier(max_depth=best_param['max_depth'],subsample=best_param['subsample'],
                           learning_rate=best_param['learning_rate'],n_estimators=best_param['n_estimators'],
                           num_leaves=best_param['num_leaves'],colsample_bytree=best_param['colsample_bytree'],
                           n_jobs=-1)
    model.fit(x_train,y_train)

    print("Train accuracy of LGBM-{} = ".format(model_no), accuracy_score(y_train,model.predict(x_train)))
    print("Test accuracy of LGBM-{} = ".format(model_no), accuracy_score(y_test,model.predict(x_test)))

    train_prob = model.predict_proba(x_train)
    test_prob = model.predict_proba(x_test)
    print("Train loss of LGBM-{} = ".format(model_no), log_loss(y_train,train_prob))
    print("Test loss of LGBM-{} = ".format(model_no), log_loss(y_test,test_prob))
    print("Time taken to train LightGBM Model-{} = ".format(model_no), (datetime.now() - started))
    print("="*50)

    return model, train_prob, test_prob


def train_model(random_state=None):
    '''
    Train the churn model: three tuned LGBMClassifiers stacked under a small
    feed-forward neural network (FFNN).

    Reads train_v2.csv, transactions_v2.csv, members_v3.csv and
    user_logs_v2.csv from input_files/, builds one feature row per user,
    splits 75:25, trains the models, prints metrics, and writes to the
    working directory:
        median_data.pickle, mode_data.pickle  - imputation values
        LightGBM_1/2/3.pickle                 - fitted LightGBM models
        best_weights.h5, model_ffnn.json      - FFNN weights and architecture

    random_state : optional seed forwarded to train_test_split only; the
                   hyper-parameter search and the FFNN remain unseeded.

    Returns None.
    '''
    # reading data from csv files
    trainv2 = pd.read_csv("input_files/train_v2.csv",dtype={'is_churn':np.int8})

    transactions_v2 = pd.read_csv("input_files/transactions_v2.csv",
                              dtype={'payment_method_id':np.int8,'payment_plan_days':np.int16,'plan_list_price':np.int16,
                                    'actual_amount_paid':np.int16,'is_auto_renew':np.int8,'is_cancel':np.int8})
    members = pd.read_csv("input_files/members_v3.csv",
                     dtype={'city':np.int8,'bd':np.int16,'registered_via':np.int8})
    user_logs_v2 = pd.read_csv("input_files/user_logs_v2.csv",
                     dtype={'num_25': np.int16,'num_50': np.int16,'num_75': np.int16,'num_985': np.int16,'num_100': np.int16})

    #============================================================================================================

    # transactions: one row per user plus the number of transactions of that user
    transaction_count_per_user = pd.DataFrame(transactions_v2.groupby("msno")['msno'].count())
    transaction_count_per_user.columns = ['trans_count']
    transaction_count_per_user = transaction_count_per_user.reset_index()

    # keeping a single transaction record for each user
    # NOTE(review): keep='first' keeps the first row in file order; that is the
    # latest transaction only if the CSV is sorted newest-first - confirm.
    transactions_v2 = transactions_v2.drop_duplicates(subset=['msno'], keep='first')
    transactions_v2 = transactions_v2.drop(columns=['membership_expire_date','actual_amount_paid','is_cancel'])
    transactions_v2 = transactions_v2.merge(transaction_count_per_user, how='left',on='msno')

    #============================================================================================================

    # user logs: per-user sum and mean of every listening counter
    user_logs_v2 = user_logs_v2.drop_duplicates()

    # removing datapoints with negative value of num_100
    user_logs_v2 = user_logs_v2.drop(index=user_logs_v2[user_logs_v2.num_100<0].index)

    # converting total seconds to total minutes; the new column is appended
    # last, which the hard-coded column names for the mean frame rely on
    user_logs_v2['total_minutes'] = user_logs_v2['total_secs']/60
    user_logs_v2 = user_logs_v2.drop(columns=['total_secs'])

    # taking sum of each column for each user
    user_logs_sum = user_logs_v2.groupby('msno',as_index=False).sum()
    user_logs_sum = user_logs_sum.drop(columns=['date'])

    # taking mean of each column for each user
    user_logs_mean = user_logs_v2.groupby('msno',as_index=False).mean()
    user_logs_mean.columns = ['msno','date','mean_num_25','mean_num_50','mean_num_75','mean_num_985','mean_num_100','mean_num_unq','mean_total_min']
    user_logs_mean = user_logs_mean.drop(columns=['date'])

    user_logs = user_logs_sum.merge(user_logs_mean,on='msno')

    #============================================================================================================

    # members: duplicates are removed before 'gender' is dropped
    members = members.drop_duplicates()
    members = members.drop(columns=['gender'])  # dropping 'gender' column as 65% values are NULL

    # one row per labelled user; users missing from a source get NaN features
    train = trainv2.merge(transactions_v2,how='left',on='msno')
    train = train.merge(members,how='left',on='msno')
    train = train.merge(user_logs,how='left',on='msno')
    print("Shape after merging all dataframes = ",train.shape)

    numerical_columns = ['payment_plan_days','plan_list_price','num_25', 'num_50', 'num_75', 'num_985', 'num_100',
                     'num_unq', 'total_minutes', 'mean_num_25', 'mean_num_50', 'mean_num_75','mean_num_985',
                     'mean_num_100', 'mean_num_unq', 'mean_total_min','bd','trans_count']

    categorical_columns = ['payment_method_id','is_auto_renew','city','registered_via','transaction_date','registration_init_time']

    # getting median and mode values from train data; they are saved below so
    # that prediction can impute with the same values
    train_data_median = get_median(train,numerical_columns)
    train_data_mode = get_mode(train,categorical_columns)

    # filling missing numerical values with median
    fill_with_median(train,numerical_columns,train_data_median)

    # filling missing categorical values and missing dates with mode
    fill_with_mode(train,categorical_columns,train_data_mode)

    train = train.drop(columns=['bd'])

    #============================================================================================================

    # feature engineering - the order in which columns are added is the
    # feature order seen by the models, so it must not change

    # creating new features out of transaction date (stored as YYYYMMDD)

    # day of month
    train['trans_day_of_month'] = train.transaction_date.apply(lambda x: np.int8(str(int(x))[6:]))

    # month and day of month combined (MMDD), i.e. the date without the year
    train['trans_month_day'] = train.transaction_date.apply(lambda x: np.int16(str(int(x))[4:]))

    # day of week, Monday is 1 and Sunday is 7
    train['trans_day_of_week'] = train.transaction_date.apply(lambda x: datetime.strptime(str(int(x)), "%Y%m%d").isoweekday())

    # creating new features out of registration date (stored as YYYYMMDD)

    # day of month
    train['reg_day_of_month'] = train.registration_init_time.apply(lambda x: np.int8(str(int(x))[6:]))

    # month of registration
    train['reg_month'] = train.registration_init_time.apply(lambda x: np.int8(str(int(x))[4:6]) )

    # month and day of month combined (MMDD), i.e. the date without the year
    train['reg_month_day'] = train.registration_init_time.apply(lambda x: np.int16(str(int(x))[4:]))

    # day of week, Monday is 1 and Sunday is 7
    train['reg_day_of_week'] = train.registration_init_time.apply(lambda x: datetime.strptime(str(int(x)), "%Y%m%d").isoweekday())

    # price_per_day: list price divided by plan length, 0 for zero-day plans
    # (vectorised; replaces a Python loop over every row)
    plan_days = train['payment_plan_days']
    train['price_per_day'] = (train['plan_list_price']/plan_days).where(plan_days != 0, 0)

    # total number of songs listened by the user, over all completion buckets
    songs_total = train['num_25']+train['num_50']+train['num_75']+train['num_985']+train['num_100']

    # min_per_song: average minutes listened per song
    train['min_per_song'] = train['total_minutes']/songs_total

    # avg_min_per_unq: average minutes listened per unique song
    train['avg_min_per_unq'] = train['mean_total_min']/train['mean_num_unq']

    train['total_num'] = songs_total

    # ratio_num_XX: share of all songs that fall in each completion bucket
    # (num_XX divided by total_num)
    train['ratio_num_25'] = train['num_25']/train['total_num']
    train['ratio_num_50'] = train['num_50']/train['total_num']
    train['ratio_num_75'] = train['num_75']/train['total_num']
    train['ratio_num_985'] = train['num_985']/train['total_num']
    train['ratio_num_100'] = train['num_100']/train['total_num']

    # total_by_unq: total_num divided by num_unq
    train['total_by_unq'] = train['total_num']/train['num_unq']

    print("Shape after feature engineering = ",train.shape)

    #============================================================================================================

    # splitting data into train and test set
    X = train.drop(columns=['msno','is_churn','transaction_date','registration_init_time']).reset_index(drop=True)
    Y = train['is_churn']

    # 75% of the data for training and 25% for testing, stratified on the label
    x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.25,stratify=Y,random_state=random_state)

    print("Shape after splitting:")
    print("Train shape: x_train = ",x_train.shape,", y_train = ",y_train.shape)
    print("Test shape: x_test = ",x_test.shape,", y_test = ",y_test.shape)

    #============================================================================================================

    # training three LightGBM models; each one runs its own randomized search
    lgb_models = []
    train_probs = []
    test_probs = []
    for model_no in (1, 2, 3):
        model, train_prob, test_prob = _fit_tuned_lgbm(model_no, x_train, y_train, x_test, y_test)
        lgb_models.append(model)
        train_probs.append(train_prob)
        test_probs.append(test_prob)

    #============================================================================================================

    # preparing data for the Feed Forward Neural Network (FFNN):
    # the predicted probabilities of the three LGBM models, side by side
    # NOTE(review): the FFNN is trained on predictions the LGBM models make for
    # the rows they were fitted on, not on out-of-fold predictions - confirm
    # that this is intended.
    pred_train = np.hstack(train_probs)
    pred_test = np.hstack(test_probs)

    print("Shape of predicted train probabilities",pred_train.shape)
    print("Shape of predicted test probabilities",pred_test.shape)

    # one-hot encoding class label for Neural Network
    ytrain_ohe = onehot(y_train)
    ytest_ohe = onehot(y_test)

    # defining architecture for FFNN
    # input width 6 = 3 models x 2 class probabilities
    model_nn = Sequential()
    model_nn.add(Dense(24,activation='sigmoid',input_shape=(6,)))
    model_nn.add(Dropout(0.7))
    model_nn.add(Dense(12,activation='sigmoid'))
    model_nn.add(Dropout(0.7))
    model_nn.add(Dense(6,activation='sigmoid'))
    model_nn.add(Dropout(0.3))
    model_nn.add(Dense(2,activation='softmax'))
    opt = tensorflow.keras.optimizers.Adam(0.0001)
    model_nn.compile(optimizer=opt, loss='binary_crossentropy',metrics=['accuracy'])

    # checkpoint to save best weights of model
    # NOTE(review): the test split doubles as validation data, so the weights
    # kept here are selected on the same rows the final test metrics use.
    checkpt = ModelCheckpoint("best_weights.h5",monitor='val_loss', save_best_only=True,
                          save_weights_only=True, mode='min', save_freq='epoch')

    model_nn.fit(pred_train,ytrain_ohe,batch_size=256, epochs=25,validation_data=(pred_test,ytest_ohe),callbacks=[checkpt],verbose=2)

    # restoring the weights of the epoch with the lowest val_loss
    model_nn.load_weights("best_weights.h5")

    # predicting on train data
    train_pred_prob_nn = model_nn.predict(pred_train)
    train_pred_nn = np.argmax(train_pred_prob_nn, axis=1)
    print("Final accuracy on train data = ", accuracy_score(y_train,train_pred_nn))
    print("Final loss on train data = ", log_loss(y_train,train_pred_prob_nn))

    # predicting on test data
    test_pred_prob_nn = model_nn.predict(pred_test)
    test_pred_nn = np.argmax(test_pred_prob_nn, axis=1)
    print("Final accuracy on test data = ", accuracy_score(y_test,test_pred_nn))
    print("Final loss on test data = ", log_loss(y_test,test_pred_prob_nn))

    print("Model training completed")

    #============================================================================================================

    # saving imputation values, models and FFNN architecture for predictions
    # (the FFNN weights were already written to best_weights.h5 by the checkpoint)

    print("Please wait...saving trained model...")

    with open("median_data.pickle",'wb') as file:
        pickle.dump(train_data_median,file)

    with open("mode_data.pickle",'wb') as file:
        pickle.dump(train_data_mode,file)

    for model_no, model in enumerate(lgb_models, start=1):
        with open("LightGBM_{}.pickle".format(model_no),'wb') as file:
            pickle.dump(model,file)

    with open("model_ffnn.json","w") as file:
        file.write(model_nn.to_json())

    print("Models saved.")
    print("DONE")
    #=============================================================================================

In [7]:
# Run the full training pipeline defined in train_model(); it prints progress
# and metrics and writes the model artifacts to the working directory.
train_model()

Shape after merging all dataframes =  (970960, 26)
Shape after feature engineering =  (970960, 42)
Shape after splitting:
Train shape: x_train =  (728220, 38) , y_train =  (728220,)
Test shape: x_test =  (242740, 38) , y_test =  (242740,)
LightGBM 1
Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  8.4min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 19.7min finished


LGBMClassifier-1 best parameters: {'subsample': 0.5, 'num_leaves': 100, 'n_estimators': 200, 'max_depth': 13, 'learning_rate': 0.05, 'colsample_bytree': 0.6}
Train accuracy of LGBM-1 =  0.9625236878965148
Test accuracy of LGBM-1 =  0.9613125154486282
Train loss of LGBM-1 =  0.10925273771694329
Test loss of LGBM-1 =  0.11650275727421662
Time taken to train LightGBM Model-1 =  0:20:52.621027
LightGBM 2
Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 11.9min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 23.6min finished


LGBMClassifier-2 best parameters: {'subsample': 0.5, 'num_leaves': 80, 'n_estimators': 300, 'max_depth': 10, 'learning_rate': 0.1, 'colsample_bytree': 0.7}
Train accuracy of LGBM-2 =  0.9656903133668396
Test accuracy of LGBM-2 =  0.9610859355689215
Train loss of LGBM-2 =  0.09871427276954055
Test loss of LGBM-2 =  0.11697049258910246
Time taken to train LightGBM Model-2 =  0:24:45.572408
LightGBM 3
Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  7.1min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 16.7min finished


LGBMClassifier-3 best parameters: {'subsample': 0.5, 'num_leaves': 35, 'n_estimators': 500, 'max_depth': 11, 'learning_rate': 0.05, 'colsample_bytree': 0.8}
Train accuracy of LGBM-3 =  0.9618892642333361
Test accuracy of LGBM-3 =  0.9607522452006262
Train loss of LGBM-3 =  0.1113491552745651
Test loss of LGBM-3 =  0.11776111488662007
Time taken to train LightGBM Model-3 =  0:18:08.838687
Shape of predicted train probabilities (728220, 6)
Shape of predicted test probabilities (242740, 6)
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Train on 728220 samples, validate on 242740 samples
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 1/25
728220/728220 - 13s - loss: 0.3462 - acc: 0.9100 - val_loss: 0.2939 - val_acc: 0.9101
Epoch 2/25
728220/728220 - 12s - loss: 0.3039 - acc: 0.9101 - val_loss: 0.2604 - val_acc: 0.9101
Epoch 3/25
728220/728220 - 12s - loss: 0.2689 - ac

In [8]:

def churn_predictor():

    '''
    Predict customer churn for the users listed in
    input_files/sample_submission_v2.csv.

    Loads the artifacts written by train_model() (median_data.pickle,
    mode_data.pickle, LightGBM_1/2/3.pickle, model_ffnn.json,
    best_weights.h5), rebuilds the same features from the csv files in
    input_files/, and stacks the three LightGBM probability outputs into
    the neural network.

    Prints accuracy and log-loss against the 'is_churn' column of the
    sample submission file and writes:
        submission_test.csv        - msno, is_churn (probability of class 1)
        submission_test_class.csv  - the same plus is_churn_class (argmax)

    Returns None.
    '''

    # NOTE: pickle.load can execute arbitrary code from the file; only load
    # artifacts that were produced by train_model() on a trusted machine.
    with open("median_data.pickle",'rb') as file:
        train_data_median = pickle.load(file)

    with open("mode_data.pickle",'rb') as file:
        train_data_mode = pickle.load(file)

    with open("LightGBM_1.pickle",'rb') as file:
        lgb1 = pickle.load(file)

    with open("LightGBM_2.pickle",'rb') as file:
        lgb2 = pickle.load(file)

    with open("LightGBM_3.pickle",'rb') as file:
        lgb3 = pickle.load(file)

    with open("model_ffnn.json","r") as file:
        model_json = file.read()

    # rebuilding the network from its saved architecture and best weights
    model_nn = tensorflow.keras.models.model_from_json(model_json)
    model_nn.load_weights("best_weights.h5")

    sample_submission_v2 = pd.read_csv("input_files/sample_submission_v2.csv",dtype={'is_churn':np.int8})

    transactions_v2 = pd.read_csv("input_files/transactions_v2.csv",
                              dtype={'payment_method_id':np.int8,'payment_plan_days':np.int16,'plan_list_price':np.int16,
                                    'actual_amount_paid':np.int16,'is_auto_renew':np.int8,'is_cancel':np.int8})
    members = pd.read_csv("input_files/members_v3.csv",
                     dtype={'city':np.int8,'bd':np.int16,'registered_via':np.int8})
    user_logs_v2 = pd.read_csv("input_files/user_logs_v2.csv",
                     dtype={'num_25': np.int16,'num_50': np.int16,'num_75': np.int16,'num_985': np.int16,'num_100': np.int16})

    #=======================================================================================================

    # transactions: one row per user plus the number of transactions of that user
    transaction_count_per_user = pd.DataFrame(transactions_v2.groupby("msno")['msno'].count())
    transaction_count_per_user.columns = ['trans_count']
    transaction_count_per_user = transaction_count_per_user.reset_index()

    # keeping a single transaction record for each user
    # NOTE(review): keep='first' keeps the first row in file order; that is the
    # latest transaction only if the CSV is sorted newest-first - confirm.
    transactions_v2 = transactions_v2.drop_duplicates(subset=['msno'], keep='first')
    transactions_v2 = transactions_v2.drop(columns=['membership_expire_date','actual_amount_paid','is_cancel'])
    transactions_v2 = transactions_v2.merge(transaction_count_per_user, how='left',on='msno')

    #=======================================================================================================

    # user logs: per-user sum and mean of every listening counter
    user_logs_v2 = user_logs_v2.drop_duplicates()

    # removing datapoints with negative value of num_100
    user_logs_v2 = user_logs_v2.drop(index=user_logs_v2[user_logs_v2.num_100<0].index)

    # converting total seconds to total minutes; the new column is appended
    # last, which the hard-coded column names for the mean frame rely on
    user_logs_v2['total_minutes'] = user_logs_v2['total_secs']/60
    user_logs_v2 = user_logs_v2.drop(columns=['total_secs'])

    # taking sum of each column for each user
    user_logs_sum = user_logs_v2.groupby('msno',as_index=False).sum()
    user_logs_sum = user_logs_sum.drop(columns=['date'])

    # taking mean of each column for each user
    user_logs_mean = user_logs_v2.groupby('msno',as_index=False).mean()
    user_logs_mean.columns = ['msno','date','mean_num_25','mean_num_50','mean_num_75','mean_num_985','mean_num_100','mean_num_unq','mean_total_min']
    user_logs_mean = user_logs_mean.drop(columns=['date'])

    user_logs = user_logs_sum.merge(user_logs_mean,on='msno')

    #=======================================================================================================

    # members: duplicates are removed before 'gender' is dropped
    members = members.drop_duplicates()
    members = members.drop(columns=['gender'])  # dropping 'gender' column as 65% values are NULL

    # one row per user of the submission file; users missing from a source get NaN features
    test = sample_submission_v2.merge(transactions_v2,how='left',on='msno')
    test = test.merge(members,how='left',on='msno')
    test = test.merge(user_logs, how='left',on='msno')

    numerical_columns = ['payment_plan_days','plan_list_price','num_25', 'num_50', 'num_75', 'num_985', 'num_100',
                     'num_unq', 'total_minutes', 'mean_num_25', 'mean_num_50', 'mean_num_75','mean_num_985',
                     'mean_num_100', 'mean_num_unq', 'mean_total_min','bd','trans_count']

    categorical_columns = ['payment_method_id','is_auto_renew','city','registered_via','transaction_date','registration_init_time']

    # filling categorical features (and dates) with the mode values loaded from mode_data.pickle
    fill_with_mode(test,categorical_columns,train_data_mode)

    # filling numerical features with the median values loaded from median_data.pickle
    fill_with_median(test,numerical_columns,train_data_median)

    test = test.drop(columns=['bd'])  # drop 'bd' (age) column

    #=======================================================================================================

    # feature engineering - the order in which columns are added is the
    # feature order handed to the models, so it must not change

    # creating new features out of transaction date (stored as YYYYMMDD)

    # day of month
    test['trans_day_of_month'] = test.transaction_date.apply(lambda x: np.int8(str(int(x))[6:]))

    # month and day of month combined (MMDD), i.e. the date without the year
    test['trans_month_day'] = test.transaction_date.apply(lambda x: np.int16(str(int(x))[4:]))

    # day of week, Monday is 1 and Sunday is 7
    test['trans_day_of_week'] = test.transaction_date.apply(lambda x: datetime.strptime(str(int(x)), "%Y%m%d").isoweekday())

    # creating new features out of registration date (stored as YYYYMMDD)

    # day of month
    test['reg_day_of_month'] = test.registration_init_time.apply(lambda x: np.int8(str(int(x))[6:]))

    # month of registration
    test['reg_month'] = test.registration_init_time.apply(lambda x: np.int8(str(int(x))[4:6]) )

    # month and day of month combined (MMDD), i.e. the date without the year
    test['reg_month_day'] = test.registration_init_time.apply(lambda x: np.int16(str(int(x))[4:]))

    # day of week, Monday is 1 and Sunday is 7
    test['reg_day_of_week'] = test.registration_init_time.apply(lambda x: datetime.strptime(str(int(x)), "%Y%m%d").isoweekday())

    # price_per_day: list price divided by plan length, 0 for zero-day plans
    # (vectorised; replaces a Python loop over every row)
    plan_days = test['payment_plan_days']
    test['price_per_day'] = (test['plan_list_price']/plan_days).where(plan_days != 0, 0)

    # total number of songs listened by the user, over all completion buckets
    songs_total = test['num_25']+test['num_50']+test['num_75']+test['num_985']+test['num_100']

    # min_per_song: average minutes listened per song
    test['min_per_song'] = test['total_minutes']/songs_total

    # avg_min_per_unq: average minutes listened per unique song
    test['avg_min_per_unq'] = test['mean_total_min']/test['mean_num_unq']

    test['total_num'] = songs_total

    # ratio_num_XX: share of all songs that fall in each completion bucket
    # (num_XX divided by total_num)
    test['ratio_num_25'] = test['num_25']/test['total_num']
    test['ratio_num_50'] = test['num_50']/test['total_num']
    test['ratio_num_75'] = test['num_75']/test['total_num']
    test['ratio_num_985'] = test['num_985']/test['total_num']
    test['ratio_num_100'] = test['num_100']/test['total_num']

    # total_by_unq: total_num divided by num_unq
    test['total_by_unq'] = test['total_num']/test['num_unq']

    #=======================================================================================================

    # features and labels of the test data provided by kaggle
    test_x = test.drop(columns=['msno','is_churn','transaction_date','registration_init_time'])
    test_y = test['is_churn'].copy()

    # class probabilities of the three LGBM classifiers; the hard class
    # predictions are not needed, so predict() is not called
    test_pred_prob1 = lgb1.predict_proba(test_x)
    test_pred_prob2 = lgb2.predict_proba(test_x)
    test_pred_prob3 = lgb3.predict_proba(test_x)

    # concatenating predicted probabilities by 3 LGBM Classifiers for input to Neural Network
    test_x_nn = np.hstack([test_pred_prob1,test_pred_prob2,test_pred_prob3])

    # predicting is_churn
    test_pred_prob_nn = model_nn.predict(test_x_nn)
    test_pred_nn = np.argmax(test_pred_prob_nn, axis=1)

    # NOTE(review): test_y is the 'is_churn' column of sample_submission_v2.csv;
    # if that file holds placeholder values rather than true labels, the two
    # numbers below are not a real evaluation - confirm.
    print("Accuracy = ", accuracy_score(test_y,test_pred_nn))
    print("Loss = ", log_loss(test_y,test_pred_prob_nn,labels=[0,1]))

    # taking out probabilities of class '1' i.e. churn
    churn_prob = [p[1] for p in test_pred_prob_nn]

    # saving predicted probabilities in a csv file
    submission = pd.DataFrame()
    submission['msno'] = test['msno']
    submission['is_churn'] = churn_prob
    submission.to_csv("submission_test.csv",index=False)

    # second file additionally carries the predicted class
    submission['is_churn_class'] = test_pred_nn
    submission.to_csv("submission_test_class.csv",index=False)

In [10]:
# Silence library warnings for this cell (the same filter is also set in the
# first cell of the notebook).
import warnings
warnings.filterwarnings("ignore")

# Predict on the test users provided by Kaggle; prints the metrics and writes
# the submission csv files.
churn_predictor()

Accuracy =  0.9501328417106442
Loss =  0.06224723566259524
