In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# Any results you write to the current directory are saved as output.

In [None]:
from sklearn import preprocessing

# Open the workbook once; every sheet below is read through this single handle.
xls = pd.ExcelFile('/kaggle/input/Data.xlsx')

# read every sheet in the Excel file
data, prodbal, socdem, inout, sales = (
    pd.read_excel(xls, sheet_name=sheet)
    for sheet in (
        'Description',          # description of the data
        'Products_ActBalance',  # product account balance data set
        'Soc_Dem',              # social demography data set
        'Inflow_Outflow',       # behavior as cash inflow and outflow
        'Sales_Revenues',       # sales and revenue
    )
)

# Training set: clients found in Soc_Dem, Sales_Revenues and Products_ActBalance
# (inner joins on Client); Inflow_Outflow is attached with a left join, so clients
# without cash-flow rows are kept.
df = pd.merge(socdem, sales, on='Client', how='inner')
df1 = pd.merge(df, prodbal, on='Client', how='inner')
X = pd.merge(df1, inout, on='Client', how='left')

# Product counters (CA, SA, MF, OVD, CC, CL): a missing count is replaced by 0.
counters = ['Count_CA', 'Count_SA', 'Count_MF', 'Count_OVD', 'Count_CC', 'Count_CL']
X[counters] = X[counters].fillna(value=0)

# The six targets: one sale column and one revenue column per product.
MF_y, CC_y, CL_y = (X[col] for col in ('Sale_MF', 'Sale_CC', 'Sale_CL'))
rMF_y, rCC_y, rCL_y = (X[col] for col in ('Revenue_MF', 'Revenue_CC', 'Revenue_CL'))

# Only predictors stay in X: the client id and all six targets are removed.
X = X.drop(
    columns=['Client', 'Sale_MF', 'Sale_CC', 'Sale_CL', 'Revenue_MF', 'Revenue_CC', 'Revenue_CL']
)

# Scoring ("test") set: Soc_Dem joined to Products_ActBalance (inner) and to
# Inflow_Outflow (left), restricted to clients absent from the sales sheet.
df2 = pd.merge(socdem, prodbal, on='Client', how='inner')
tdata = pd.merge(df2, inout, on='Client', how='left')
has_sales_record = tdata['Client'].isin(sales['Client'])
test = tdata[~has_sales_record]

# One-hot encode the text columns that have fewer than 10 distinct values.
# The column list is derived from the training frame and applied to both frames.
low_cardinality_cols = [cname for cname in X.columns
                        if X[cname].dtype == "object" and X[cname].nunique() < 10]

X = pd.get_dummies(X, columns=low_cardinality_cols)
test = pd.get_dummies(test, columns=low_cardinality_cols)

# The two frames are encoded independently, so a category that occurs in only one
# of them would yield different dummy columns. Align the scoring frame to the
# training columns (plus the Client id it still carries): dummies missing from
# `test` are added as 0 and columns the models never saw are dropped, which keeps
# later selections such as test[<selected features>] from raising KeyError.
test = test.reindex(columns=['Client'] + list(X.columns), fill_value=0)

# make sure all data are positive
X = X.abs()
test = test.abs()

# clean data: remaining missing values become 0 and newline characters are
# stripped from any string cells
X = X.fillna(0).replace('\n', '', regex=True)
test = test.fillna(0).replace('\n', '', regex=True)

In [None]:
#import dependencies
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

## Sales

### Mutual Fund Sales

In [None]:
# Split the data set: 70% training, 30% validation, fixed seed.
MF_X_train, MF_X_valid, MF_y_train, MF_y_valid = train_test_split(
    X, MF_y, test_size=0.3, train_size=0.7, random_state=1)

# Standardize the training features; the scaled copy is used for feature selection only.
sc = StandardScaler()
MF_X_train_scaled = sc.fit_transform(MF_X_train)

# Recursive feature elimination with a logistic regression, keeping 2 features.
# n_features_to_select is passed by keyword because current scikit-learn releases
# no longer accept it as a positional argument.
MF_feat = LogisticRegression()
rfe = RFE(MF_feat, n_features_to_select=2)
fit = rfe.fit(MF_X_train_scaled, MF_y_train)

# support_ is the boolean mask of the retained features; translate it into column
# positions and then into column names of the frame that was actually fitted.
MF_features = [i for i, selected in enumerate(fit.support_) if selected]
MF_F = list(MF_X_train.columns[MF_features])

# training and validation set with selected features
MF_X_train = MF_X_train[MF_F]
MF_X_valid = MF_X_valid[MF_F]

In [None]:
# Random forest classifier for mutual fund sales, trained on the selected features.
MF_model = RandomForestClassifier(random_state=0, max_depth=5, n_estimators=100)
MF_model.fit(MF_X_train, MF_y_train)

# Predict the validation rows and measure the share of correct predictions.
MF_preds = MF_model.predict(MF_X_valid)
MF_accuracy = accuracy_score(MF_y_valid, MF_preds)

print("Mutual funds sales prediction accuracy: ")
print(MF_accuracy)

In [None]:
# Find the best hyperparameters for the mutual fund sales random forest using
# 10-fold cross-validated accuracy over n_estimators x max_depth.
# max_depth runs from 1 to 10: a depth of 0 is not a valid value for a random
# forest, so it must not be part of the grid.
parameters = [{'n_estimators': list(range(50, 501, 50)), 'max_depth': range(1, 11)}]
grid_search = GridSearchCV(estimator=MF_model, param_grid=parameters, scoring='accuracy', cv=10, n_jobs=-1)
grid = grid_search.fit(MF_X_train, MF_y_train)
print(grid_search.best_params_)

### Credit Card Sales

In [None]:
# Split the data set: 70% training, 30% validation, fixed seed.
CC_X_train, CC_X_valid, CC_y_train, CC_y_valid = train_test_split(
    X, CC_y, test_size=0.3, train_size=0.7, random_state=1)

# Standardize the training features with a scaler created in this cell, so the
# cell does not depend on a scaler object left behind by another cell.
sc = StandardScaler()
CC_X_train_scaled = sc.fit_transform(CC_X_train)

# Recursive feature elimination with a logistic regression, keeping 2 features.
# n_features_to_select is passed by keyword because current scikit-learn releases
# no longer accept it as a positional argument.
CC_feat = LogisticRegression()
rfe = RFE(CC_feat, n_features_to_select=2)
fit = rfe.fit(CC_X_train_scaled, CC_y_train)

# support_ is the boolean mask of the retained features; translate it into column
# positions and then into column names of the frame that was actually fitted.
CC_features = [i for i, selected in enumerate(fit.support_) if selected]
CC_F = list(CC_X_train.columns[CC_features])

# training and validation set with selected features
CC_X_train = CC_X_train[CC_F]
CC_X_valid = CC_X_valid[CC_F]

In [None]:
# Random forest classifier for credit card sales, trained on the selected features.
CC_model = RandomForestClassifier(random_state=1, max_depth=10, n_estimators=100)
CC_model.fit(CC_X_train, CC_y_train)

# Predict the validation rows and measure the share of correct predictions.
CC_preds = CC_model.predict(CC_X_valid)
CC_accuracy = accuracy_score(CC_y_valid, CC_preds)

print("Credit cards sales prediction accuracy: ")
print(CC_accuracy)

In [None]:
# Find the best hyperparameters for the credit card sales random forest using
# 10-fold cross-validated accuracy over n_estimators x max_depth.
# max_depth runs from 1 to 10: a depth of 0 is not a valid value for a random
# forest, so it must not be part of the grid.
parameters = [{'n_estimators': list(range(50, 501, 50)), 'max_depth': range(1, 11)}]
grid_search = GridSearchCV(estimator=CC_model, param_grid=parameters, scoring='accuracy', cv=10, n_jobs=-1)
grid = grid_search.fit(CC_X_train, CC_y_train)
print(grid_search.best_params_)

### Consumer Loans Sales

In [None]:
# Split the data set: 70% training, 30% validation, fixed seed.
CL_X_train, CL_X_valid, CL_y_train, CL_y_valid = train_test_split(
    X, CL_y, test_size=0.3, train_size=0.7, random_state=1)

# Standardize the training features with a scaler created in this cell, so the
# cell does not depend on a scaler object left behind by another cell.
sc = StandardScaler()
CL_X_train_scaled = sc.fit_transform(CL_X_train)

# Recursive feature elimination with a logistic regression, keeping 2 features.
# n_features_to_select is passed by keyword because current scikit-learn releases
# no longer accept it as a positional argument.
CL_feat = LogisticRegression()
rfe = RFE(CL_feat, n_features_to_select=2)
fit = rfe.fit(CL_X_train_scaled, CL_y_train)

# support_ is the boolean mask of the retained features; translate it into column
# positions and then into column names of the frame that was actually fitted.
CL_features = [i for i, selected in enumerate(fit.support_) if selected]
CL_F = list(CL_X_train.columns[CL_features])

# training and validation set with selected features
CL_X_train = CL_X_train[CL_F]
CL_X_valid = CL_X_valid[CL_F]

In [None]:
# Random forest classifier for consumer loan sales, trained on the selected features.
CL_model = RandomForestClassifier(random_state=0, max_depth=5, n_estimators=50)
CL_model.fit(CL_X_train, CL_y_train)

# Predict the validation rows and measure the share of correct predictions.
CL_preds = CL_model.predict(CL_X_valid)
CL_accuracy = accuracy_score(CL_y_valid, CL_preds)

print("Consumer loans sales prediction accuracy: ")
print(CL_accuracy)

In [None]:
# Find the best hyperparameters for the consumer loan sales random forest using
# 10-fold cross-validated accuracy over n_estimators x max_depth.
# max_depth runs from 1 to 10: a depth of 0 is not a valid value for a random
# forest, so it must not be part of the grid.
parameters = [{'n_estimators': list(range(50, 501, 50)), 'max_depth': range(1, 11)}]
grid_search = GridSearchCV(estimator=CL_model, param_grid=parameters, scoring='accuracy', cv=10, n_jobs=-1)
grid = grid_search.fit(CL_X_train, CL_y_train)
print(grid_search.best_params_)

## Revenues

### Mutual Funds Revenues

In [None]:
# Revenue is modelled only on training clients who bought a mutual fund.
# A boolean mask is used (rather than joining Sale_MF onto X) so the constant
# sale flag does not become a feature column.
MF_rX = X.loc[MF_y == 1]

# Response variable: revenue of exactly those clients, selected through the row
# index so that features and target always have matching rows.
rMF_y = rMF_y.loc[MF_rX.index]

# Scoring set for mutual fund revenue: the unlabeled clients that the sales
# classifier predicts as buyers. Sale_MF is only known for training clients and
# its row index is unrelated to the rows of `test`, so it cannot be joined onto them.
# NOTE(review): using the classifier's prediction as the filter is a modelling
# choice — confirm it matches the intended business rule.
MF_rtest = test[MF_model.predict(test[MF_F]) == 1]

In [None]:
# Split the buyers' data: 80% training, 20% validation, fixed seed.
rMF_X_train, rMF_X_valid, rMF_y_train, rMF_y_valid = train_test_split(
    MF_rX, rMF_y, test_size=0.2, random_state=0)

# Standardize the training features; the scaled copy is used for feature selection only.
sc = StandardScaler()
rMF_X_train_scaled = sc.fit_transform(rMF_X_train)

# Recursive feature elimination with a linear regression (continuous response),
# keeping 3 features. n_features_to_select is passed by keyword because current
# scikit-learn releases no longer accept it as a positional argument.
rMF_feat = LinearRegression()
rfe = RFE(rMF_feat, n_features_to_select=3)
fit = rfe.fit(rMF_X_train_scaled, rMF_y_train)

# support_ is the boolean mask of the retained features. The names are read from
# the frame that was fitted (not from X), so positions and names cannot drift
# apart if that frame carries a different set of columns than X.
rMF_features = [i for i, selected in enumerate(fit.support_) if selected]
rMF_F = list(rMF_X_train.columns[rMF_features])

# training and validation set with selected features
rMF_X_train = rMF_X_train[rMF_F]
rMF_X_valid = rMF_X_valid[rMF_F]

In [None]:
# Gradient-boosted regressor for mutual fund revenue, fitted on the selected features.
rMF_model = XGBRegressor(random_state=1, learning_rate=0.1, n_estimators=100)
rMF_model.fit(rMF_X_train, rMF_y_train)

# Predictions for the validation rows and for the mutual fund scoring set.
rMF_pred = rMF_model.predict(rMF_X_valid)
rMF = rMF_model.predict(MF_rtest[rMF_F])

# MAE score on the validation rows
rMF_mae = mean_absolute_error(rMF_y_valid, rMF_pred)
print("Mean Absolute Error:", rMF_mae)

### Credit Cards Revenues

In [None]:
# Revenue is modelled only on training clients who bought a credit card.
# A boolean mask is used (rather than joining Sale_CC onto X) so the constant
# sale flag does not become a feature column.
CC_rX = X.loc[CC_y == 1]

# Response variable: revenue of exactly those clients, selected through the row
# index so that features and target always have matching rows.
rCC_y = rCC_y.loc[CC_rX.index]

# Scoring set for credit card revenue: the unlabeled clients that the sales
# classifier predicts as buyers. Sale_CC is only known for training clients and
# its row index is unrelated to the rows of `test`, so it cannot be joined onto them.
# NOTE(review): using the classifier's prediction as the filter is a modelling
# choice — confirm it matches the intended business rule.
CC_rtest = test[CC_model.predict(test[CC_F]) == 1]

In [None]:
# Split the buyers' data: 80% training, 20% validation, fixed seed.
rCC_X_train, rCC_X_valid, rCC_y_train, rCC_y_valid = train_test_split(
    CC_rX, rCC_y, test_size=0.2, random_state=0)

# Standardize the training features; the scaled copy is used for feature selection only.
sc = StandardScaler()
rCC_X_train_scaled = sc.fit_transform(rCC_X_train)

# Recursive feature elimination with a linear regression (continuous response),
# keeping 5 features. n_features_to_select is passed by keyword because current
# scikit-learn releases no longer accept it as a positional argument.
rCC_feat = LinearRegression()
rfe = RFE(rCC_feat, n_features_to_select=5)
fit = rfe.fit(rCC_X_train_scaled, rCC_y_train)

# support_ is the boolean mask of the retained features. The names are read from
# the frame that was fitted (not from X), so positions and names cannot drift
# apart if that frame carries a different set of columns than X.
rCC_features = [i for i, selected in enumerate(fit.support_) if selected]
rCC_F = list(rCC_X_train.columns[rCC_features])

# training and validation set with selected features
rCC_X_train = rCC_X_train[rCC_F]
rCC_X_valid = rCC_X_valid[rCC_F]

In [None]:
# Gradient-boosted regressor for credit card revenue, fitted on the selected features.
rCC_model = XGBRegressor(random_state=1, learning_rate=0.1, n_estimators=100)
rCC_model.fit(rCC_X_train, rCC_y_train)

# Predictions for the validation rows and for the credit card scoring set.
rCC_pred = rCC_model.predict(rCC_X_valid)
rCC = rCC_model.predict(CC_rtest[rCC_F])

# MAE score on the validation rows
rCC_mae = mean_absolute_error(rCC_y_valid, rCC_pred)
print("Mean Absolute Error:", rCC_mae)

### Consumer Loans Revenues

In [None]:
# Revenue is modelled only on training clients who took a consumer loan.
# A boolean mask is used (rather than joining Sale_CL onto X) so the constant
# sale flag does not become a feature column.
CL_rX = X.loc[CL_y == 1]

# Response variable: revenue of exactly those clients, selected through the row
# index so that features and target always have matching rows.
rCL_y = rCL_y.loc[CL_rX.index]

# Scoring set for consumer loan revenue: the unlabeled clients that the sales
# classifier predicts as buyers. Sale_CL is only known for training clients and
# its row index is unrelated to the rows of `test`, so it cannot be joined onto them.
# NOTE(review): using the classifier's prediction as the filter is a modelling
# choice — confirm it matches the intended business rule.
CL_rtest = test[CL_model.predict(test[CL_F]) == 1]

In [None]:
# Split the buyers' data: 80% training, 20% validation, fixed seed.
rCL_X_train, rCL_X_valid, rCL_y_train, rCL_y_valid = train_test_split(
    CL_rX, rCL_y, test_size=0.2, random_state=0)

# Standardize the training features; the scaled copy is used for feature selection only.
sc = StandardScaler()
rCL_X_train_scaled = sc.fit_transform(rCL_X_train)

# Recursive feature elimination with a linear regression (continuous response),
# keeping 5 features. n_features_to_select is passed by keyword because current
# scikit-learn releases no longer accept it as a positional argument.
rCL_feat = LinearRegression()
rfe = RFE(rCL_feat, n_features_to_select=5)
fit = rfe.fit(rCL_X_train_scaled, rCL_y_train)

# support_ is the boolean mask of the retained features. The names are read from
# the frame that was fitted (not from X), so positions and names cannot drift
# apart if that frame carries a different set of columns than X.
rCL_features = [i for i, selected in enumerate(fit.support_) if selected]
rCL_F = list(rCL_X_train.columns[rCL_features])

# training and validation set with selected features
rCL_X_train = rCL_X_train[rCL_F]
rCL_X_valid = rCL_X_valid[rCL_F]

In [None]:
# Gradient-boosted regressor for consumer loan revenue, fitted on the selected features.
rCL_model = XGBRegressor(random_state=1, learning_rate=0.1, n_estimators=100)
rCL_model.fit(rCL_X_train, rCL_y_train)

# Predictions for the validation rows and for the consumer loan scoring set.
rCL_pred = rCL_model.predict(rCL_X_valid)
rCL = rCL_model.predict(CL_rtest[rCL_F])

# MAE score on the validation rows
rCL_mae = mean_absolute_error(rCL_y_valid, rCL_pred)
print("Mean Absolute Error:", rCL_mae)

# Results

In [None]:
# Sales predictions as probabilities: for each product, take the second column
# of predict_proba (presumably the "sale" class — verify against model.classes_).
MF_preds = MF_model.predict_proba(test[MF_F])[:, 1]
CC_preds = CC_model.predict_proba(test[CC_F])[:, 1]
CL_preds = CL_model.predict_proba(test[CL_F])[:, 1]

# One row per scoring client with the three sale probabilities.
sales_output = pd.DataFrame({
    'Client': test.Client,
    'MF_sales': MF_preds,
    'CC_sales': CC_preds,
    'CL_sales': CL_preds,
})

# Predicted revenue per product, keyed by the client it was predicted for.
MF_revenues = pd.DataFrame({'Client': MF_rtest.Client, 'MF_Revenue': rMF})
CC_revenues = pd.DataFrame({'Client': CC_rtest.Client, 'CC_Revenue': rCC})
CL_revenues = pd.DataFrame({'Client': CL_rtest.Client, 'CL_Revenue': rCL})

## Top 100 prospects

The revenue predictions are merged with the top 100 clients ranked by predicted sales probability, and the total predicted revenue for those clients is then computed.

In [None]:
# Average the three sale probabilities of each client into a single score column.
probability_columns = ['MF_sales', 'CC_sales', 'CL_sales']
sales_output['mean'] = sales_output[probability_columns].mean(axis=1, numeric_only=True)
sales_output.head()

In [None]:
# Rank clients by mean sale probability (highest first) and keep the first 100.
ranked_output = sales_output.sort_values(by='mean', ascending=False)
prospects = ranked_output.head(100)
prospects.head()

In [None]:
# Client ids of the selected prospects.
print(prospects['Client'])

In [None]:
# Prospects whose mutual fund sale probability exceeds 0.5.
likely_MF = prospects['MF_sales'] > 0.5
propects_MF = prospects[likely_MF]
print(propects_MF['Client'])

In [None]:
# Prospects whose credit card sale probability exceeds 0.5.
likely_CC = prospects['CC_sales'] > 0.5
propects_CC = prospects[likely_CC]
print(propects_CC['Client'])

In [None]:
# Prospects whose consumer loan sale probability exceeds 0.5.
likely_CL = prospects['CL_sales'] > 0.5
propects_CL = prospects[likely_CL]
print(propects_CL['Client'])

In [None]:
# For each product, keep only the revenue predictions that belong to a selected
# prospect (inner join on Client).
rMF_data = pd.merge(MF_revenues, prospects, on='Client', how='inner')
rCC_data = pd.merge(CC_revenues, prospects, on='Client', how='inner')
rCL_data = pd.merge(CL_revenues, prospects, on='Client', how='inner')

# Total predicted revenue per product. These names are rebound here from the
# per-client prediction arrays to scalar totals.
rMF = rMF_data['MF_Revenue'].sum()
rCC = rCC_data['CC_Revenue'].sum()
rCL = rCL_data['CL_Revenue'].sum()

In [None]:
# Overall predicted revenue: the three per-product totals added together.
predicted_revenue = sum((rMF, rCC, rCL))
print(predicted_revenue)