# Implementation of Forecast model

The aim of this notebook is to implement a forecast model using linear regression, and hence predict the number of consumers, total revenue, and number of transactions in 2022 based on 2021-2022 data.

Three separate models will be trained, one each for the number of consumers, total revenue, and number of transactions; the implementations are shown in that order below.

In [1]:
import pandas as pd
from statsmodels.formula.api import ols
from sklearn.metrics import *
from sklearn.model_selection import *
from sklearn.linear_model import *


### Read in Data

In [2]:
# Load the curated train/test splits and discard the two columns that are
# removed before modelling ('merchant_take_rate' and 'year').
unused_columns = ['merchant_take_rate', 'year']
train_data = pd.read_parquet('../data/curated/train_data/').drop(columns=unused_columns)
test_data = pd.read_parquet('../data/curated/test_data/').drop(columns=unused_columns)

In [3]:
# Keep only the rows that have a value in every column
train_data, test_data = train_data.dropna(), test_data.dropna()

In [4]:
# Use the merchant ABN as the row index of both splits
train_data, test_data = (
    split.set_index('merchant_abn') for split in (train_data, test_data)
)

In [5]:
# One-hot encode the categorical columns of both splits.
categorical_features = ['category', 'merchant_revenue_lvl']
train_encoded = pd.get_dummies(train_data, columns=categorical_features)
test_encoded = pd.get_dummies(test_data, columns=categorical_features)

# The two splits are encoded independently, so a level that appears only in
# the training split would leave the test split without that indicator column,
# and the later `test_data[features]` selection would raise a KeyError.
# Add any such column to the test split as all zeros (the level is absent there).
dummy_columns = [col for col in train_encoded.columns if col not in train_data.columns]
for col in dummy_columns:
    if col not in test_encoded.columns:
        test_encoded[col] = 0

train_data, test_data = train_encoded, test_encoded

train_data.head()

Unnamed: 0_level_0,total_num_consumer,total_num_trans,total_revenue,zone_mean_population,zone_avg_num_earners,zone_avg_age,zone_avg_total_income,zone_avg_income,avg_monthly_post,next_total_num_consumer,...,category_shoe,category_stationery,category_telecom,category_tent,category_watch,merchant_revenue_lvl_a,merchant_revenue_lvl_b,merchant_revenue_lvl_c,merchant_revenue_lvl_d,merchant_revenue_lvl_e
merchant_abn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
38700038932,4106,4134,5535580.95,10385.0,6818.0,44.91,293816400.0,51213.09,298.538462,4365.0,...,0,0,0,1,0,1,0,0,0,0
41956465747,153,153,36275.63,10348.5,6838.5,44.57,319406000.0,50381.9,12.75,177.0,...,0,0,0,0,0,0,1,0,0,0
48214071373,289,289,86416.64,9877.5,6317.0,44.96,251017300.0,49916.165,24.0,345.0,...,0,0,0,0,0,0,1,0,0,0
92202115241,78,78,24513.3,9523.0,6186.0,46.3,282809200.0,49269.645,6.5,65.0,...,0,0,0,0,0,1,0,0,0,0
15700338102,117,117,18938.77,11181.5,6518.0,45.205,267153500.0,50454.77,9.75,136.0,...,0,0,0,0,0,1,0,0,0,0


## 1. Predicting Total Number of Consumers 

In [6]:
# Split the training frame into predictors and the consumer-count target
labels = ['next_total_num_consumer', 'next_total_revenue', 'next_total_num_trans']
features = [column for column in train_data.columns if column not in labels]
X, y = train_data[features], train_data['next_total_num_consumer']
# Same predictor columns, taken from the data to be forecast
test = test_data[features]

In [7]:
# Hold out 10% of the training rows, fit a linear regression on the rest,
# and display the most extreme coefficients.
# NOTE(review): this split uses test_size=0.1 while the revenue and
# transaction models use 0.2 -- confirm whether the difference is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)
lr = LinearRegression()
fitted_model = lr.fit(X_train, y_train)
coef = pd.DataFrame({"feature": features, "coef": lr.coef_})
# First the 5 smallest coefficients, then the 5 largest
for largest_first in (False, True):
    print(coef.sort_values(by='coef', ascending=not largest_first).head(5))

               feature       coef
18   category_florists -12.051717
19  category_furniture -10.677050
14      category_cable  -8.201633
15   category_computer  -5.441599
26      category_music  -4.139070
                   feature      coef
25          category_motor  9.598678
11  category_artist supply  6.588512
21           category_gift  4.370475
22         category_health  3.831363
37  merchant_revenue_lvl_e  3.321446


In [8]:
# Predict the hold-out rows and pair each prediction with its true value
y_pred = fitted_model.predict(X_test)
result = pd.DataFrame(
    {'predicted_total_num_consumer': y_pred}, index=y_test.index
).assign(true_total_num_consumer=y_test)

In [9]:
# Score the hold-out predictions of the linear model
r_2, mae = r2_score(y_test, y_pred), mean_absolute_error(y_test, y_pred)
print(f'R-squared score is {r_2}', f'Mean Absolute Error is {mae}', sep='\n')

R-squared score is 0.9999120885519618
Mean Absolute Error is 32.18840727924995


In [10]:
# Forecast consumer counts for the test data, rounded to whole numbers
num_of_con = fitted_model.predict(test)
test_data = test_data.assign(predicted_total_number_consumers=num_of_con.round())

## 2. Predicting Revenue 

In [11]:
# Predictors are every training column except the three targets;
# the target for this section is next-period revenue.
labels = ['next_total_num_consumer', 'next_total_revenue', 'next_total_num_trans']
features = [name for name in train_data.columns if name not in labels]
y = train_data['next_total_revenue']
X = train_data[features]
test = test_data[features]

In [12]:
# Hold out 20% of the training rows, fit a linear regression on the rest,
# and display the most extreme coefficients.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
lr = LinearRegression()
fitted_model = lr.fit(X_train, y_train)
coef = pd.DataFrame({"feature": features, "coef": lr.coef_})
# First the 5 smallest coefficients, then the 5 largest
for largest_first in (False, True):
    print(coef.sort_values(by='coef', ascending=not largest_first).head(5))

                   feature          coef
10            category_art -13136.843378
9         category_antique -10815.290888
24        category_jewelry  -6570.702391
36  merchant_revenue_lvl_d  -4372.816948
20         category_garden  -3718.242775
                   feature         coef
12        category_bicycle  4457.244937
16        category_digital  4450.149232
37  merchant_revenue_lvl_e  3968.280168
21           category_gift  3939.095610
27      category_opticians  3812.676896


In [13]:
# Predict the hold-out rows and pair each prediction with its true value
y_pred = fitted_model.predict(X_test)
result = pd.DataFrame(
    {'predicted_revenue': y_pred}, index=y_test.index
).assign(true_revenue=y_test)

In [14]:
# Score the hold-out predictions of the linear model
r_2 = r2_score(y_true=y_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print(f'R-squared score is {r_2}', f'Mean Absolute Error is {mae}', sep='\n')

R-squared score is 0.9974573377314002
Mean Absolute Error is 16196.60115858211


In [15]:
# Forecast revenue for the test data (left unrounded)
revenue = fitted_model.predict(test)
test_data = test_data.assign(predicted_revenue=revenue)

## 3. Predicting Total Number of Transactions

In [16]:
# Predictors are every training column except the three targets;
# the target for this section is the next-period transaction count.
labels = ['next_total_num_consumer', 'next_total_revenue', 'next_total_num_trans']
features = [field for field in train_data.columns if field not in labels]
X, y = train_data[features], train_data['next_total_num_trans']
test = test_data[features]

In [17]:
# Hold out 20% of the training rows, fit a linear regression on the rest,
# and display the most extreme coefficients.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
lr = LinearRegression()
fitted_model = lr.fit(X_train, y_train)
coef = pd.DataFrame({"feature": features, "coef": lr.coef_})
# First the 5 smallest coefficients, then the 5 largest
for largest_first in (False, True):
    print(coef.sort_values(by='coef', ascending=not largest_first).head(5))

               feature       coef
18   category_florists -14.293507
14      category_cable -13.562761
15   category_computer  -5.894622
26      category_music  -5.255395
19  category_furniture  -3.976033
                   feature      coef
11  category_artist supply  8.331837
27      category_opticians  6.354866
28           category_shoe  5.910555
25          category_motor  5.775343
20         category_garden  4.466838


In [18]:
# Predict the hold-out rows; the comparison table shows rounded predictions
# next to the true values, while y_pred itself stays unrounded for scoring.
y_pred = fitted_model.predict(X_test)
result = pd.DataFrame(
    {'predicted_total_num_trans': y_pred.round(0)}, index=y_test.index
).assign(true_total_num_trans=y_test)

In [19]:
# Score the hold-out predictions of the linear model
r_2, mae = r2_score(y_test, y_pred), mean_absolute_error(y_test, y_pred)
for line in (f'R-squared score is {r_2}', f'Mean Absolute Error is {mae}'):
    print(line)

R-squared score is 0.9999514705369851
Mean Absolute Error is 35.76329679664923


In [20]:
# Forecast transaction counts for the test data, rounded to whole numbers
num_transactions = fitted_model.predict(test)
test_data = test_data.assign(
    predicted_total_number_transactions=num_transactions.round()
)

In [21]:
# Keep only the columns whose name contains 'predicted'
is_prediction_column = ['predicted' in str(name) for name in test_data.columns]
predicted_data = test_data.loc[:, is_prediction_column]

In [22]:
# Preview the first five forecast rows
predicted_data.head(5)

Unnamed: 0_level_0,predicted_total_number_consumers,predicted_revenue,predicted_total_number_transactions
merchant_abn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10142254217,1909.0,69346.445477,1901.0
10430380319,111.0,34926.54677,109.0
10463252268,50.0,27034.483863,50.0
10487253336,1587.0,728885.195019,1597.0
10553813474,512.0,148145.360857,516.0


In [23]:
# Number of non-missing forecasts in each column
predicted_data.notna().sum()

predicted_total_number_consumers       4018
predicted_revenue                      4018
predicted_total_number_transactions    4018
dtype: int64

In [24]:
# Write the forecasts to CSV; the merchant_abn index is written as the first column
predicted_data.to_csv('../data/curated/LM_prediction.csv', index=True)