# Feature Engineering with Modelling
## Author: Dulan Wijeratne 1181873

In this notebook we will make new features using modelling techniques.

First we will start by creating a Spark session and reading in the joined aggregated data.

In [2]:
from pyspark.sql import SparkSession, functions as f

In [3]:
# Build (or reuse) the Spark session used throughout this notebook.
spark = (
    SparkSession.builder.appName("feature_engineering")
    # Render DataFrames eagerly when they are the last expression of a cell.
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    # Pin the session time zone to UTC (set once; repeating the option has no effect).
    .config("spark.sql.session.timeZone", "Etc/UTC")
    # Resource limits for the driver and executors.
    .config("spark.driver.memory", "3g")
    .config("spark.executor.memory", "4g")
    .config("spark.executor.instances", "2")
    .config("spark.executor.cores", "2")
    .getOrCreate()
)

your 131072x1 screen size is bogus. expect trouble
23/09/27 03:38:56 WARN Utils: Your hostname, DulanComputer resolves to a loopback address: 127.0.1.1; using 172.30.15.25 instead (on interface eth0)
23/09/27 03:38:56 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/09/27 03:38:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Load the joined, aggregated data (the path is relative to this notebook's location).
joined = spark.read.parquet("../../../data/insights/joined.parquet")

                                                                                

In [10]:
# Inspect the rows with the smallest consumer_diff_over_period first.
joined.orderBy("consumer_diff_over_period", ascending=True).show()

+------------+--------------------+-------------+---------+----------------------------------+--------------------------+----------------------------------+--------------------------+----------------------------------------+----------------------+----------------+---------------------+--------------------------+------------------------------+-------------------------+--------------------+------------------------+--------------------------+-------------------------+-------------------------+-------------------+---------------------+--------------------------------+---------------------------+------------------+------------------+--------------------+---------------------------------+--------------------+
|merchant_abn|                name|revenue_level|take_rate|average_merchant_fraud_probability|number_of_unique_consumers|average_consumer_fraud_probability|number_of_repeat_consumers|average_repeat_transactions_per_consumer|consumer_retainability|number_of_orders|average_cost_of_order|a

Changing NULLs to 0s

As we are going to be using modelling techniques we need to change the NULLs to an interpretable value.

In [7]:
# Replace NULLs with 0 so downstream modelling code receives no missing values.
# NOTE: a numeric fill value only applies to numeric columns; NULLs in string
# columns are left untouched.
joined = joined.fillna(0)

Next we want to convert the categorical features into integer values so that we can check their correlation with the target variables.

In the dataset there are 2 categorical features:
- Revenue Level
- Segment

In [12]:
from pyspark.ml.feature import StringIndexer

In [11]:
# Names of the categorical columns and of their integer-indexed counterparts.
# NOTE(review): the visible cells below pass these names as literals and never
# reference these two lists — confirm whether they are still needed.
input_cols = ["revenue_level","segment"]
output_cols = ["revenue_level_indexed","segment_indexed"]

In [15]:
# One StringIndexer per categorical column; each writes a numeric index column.
segment_indexer = StringIndexer(
    inputCol="segment",
    outputCol="segment_indexed",
)
revenue_level_indexer = StringIndexer(
    inputCol="revenue_level",
    outputCol="revenue_level_indexed",
)

In [36]:
# Index `segment` first, then `revenue_level`; both indexers are fitted on `joined`.
segment_indexed_df = segment_indexer.fit(joined).transform(joined)
fully_indexed_df = revenue_level_indexer.fit(joined).transform(segment_indexed_df)

# Drop the original categorical columns plus the name and date columns so that
# only the remaining columns are passed on to the correlation step.
pre_correlation_df = fully_indexed_df.drop(
    "revenue_level",
    "segment",
    "name",
    "first_recorded_transaction",
    "last_recorded_transaction",
)

In [37]:
# Convert the Spark DataFrame to pandas for the correlation analysis and modelling below.
correlation_df = pre_correlation_df.toPandas()

Now we will check the correlation matrix

In [38]:
import pandas as pd

In [51]:
# Preview the first rows, including the two new *_indexed columns.
correlation_df.head()

Unnamed: 0,merchant_abn,take_rate,average_merchant_fraud_probability,number_of_unique_consumers,average_consumer_fraud_probability,number_of_repeat_consumers,average_repeat_transactions_per_consumer,consumer_retainability,number_of_orders,average_cost_of_order,...,number_of_postcodes,most_popular_postcode,avg_total_weekly_personal_income,avg_total_weekly_fam_income,avg_median_age,avg_household_size,postcode_reach,avg_num_of_consumers_per_postcode,segment_indexed,revenue_level_indexed
0,10023283211,0.18,0.0,2525,0.095502,174,1.071683,0.068911,2706,215.797947,...,1628,3275,786.702328,1971.123799,43.031966,2.456914,0.6169,1.662162,2.0,4.0
1,10142254217,4.22,0.0,2389,0.064356,151,1.064881,0.063206,2544,38.59147,...,1591,6438,792.25,1983.427083,42.850629,2.464025,0.60288,1.598994,1.0,1.0
2,10165489824,4.4,0.0,0,0.0,0,0.0,0.0,4,8885.894209,...,4,2534,817.5,2066.125,41.625,2.475,0.001516,1.0,3.0,1.0
3,10187291046,3.29,0.0,291,0.058022,1,1.003436,0.003436,292,115.99557,...,273,5067,796.547945,1961.171233,43.125,2.449418,0.103448,1.069597,3.0,1.0
4,10192359162,6.33,0.0,321,0.036126,2,1.006231,0.006231,323,460.347109,...,303,2062,808.877709,2024.267802,43.294118,2.44548,0.114816,1.066007,0.0,0.0


In [45]:
# Pairwise correlation between all columns of the pandas frame.
# NOTE(review): merchant_abn is an identifier, so its correlations are not meaningful.
corr_matrix = correlation_df.corr()

### Feature Engineering

1. Predicting number of consumers in 3 years

In [44]:
# Correlation of every column with number_of_unique_consumers (the modelling target).
corr_matrix.loc["number_of_unique_consumers"]

merchant_abn                                0.005000
take_rate                                   0.040962
average_merchant_fraud_probability         -0.012018
number_of_unique_consumers                  1.000000
average_consumer_fraud_probability         -0.047828
number_of_repeat_consumers                  0.859005
average_repeat_transactions_per_consumer    0.665024
consumer_retainability                      0.976162
number_of_orders                            0.713317
average_cost_of_order                      -0.166808
average_spend_per_consumer                 -0.159572
average_monthly_diff_consumers              0.772166
consumer_diff_over_period                   0.772211
average_growth                              0.447964
merchant_revenue_rounded                    0.648542
transcation_period_months                   0.197559
number_of_postcodes                         0.847743
most_popular_postcode                       0.116245
avg_total_weekly_personal_income            0.

In [48]:
# Correlation of every column with consumer_retainability (a candidate predictor).
corr_matrix.loc["consumer_retainability"]

merchant_abn                                0.009474
take_rate                                   0.039260
average_merchant_fraud_probability         -0.008699
number_of_unique_consumers                  0.976162
average_consumer_fraud_probability         -0.038619
number_of_repeat_consumers                  0.947489
average_repeat_transactions_per_consumer    0.707421
consumer_retainability                      1.000000
number_of_orders                            0.819932
average_cost_of_order                      -0.141634
average_spend_per_consumer                 -0.133854
average_monthly_diff_consumers              0.870185
consumer_diff_over_period                   0.870215
average_growth                              0.367334
merchant_revenue_rounded                    0.663081
transcation_period_months                   0.166579
number_of_postcodes                         0.744230
most_popular_postcode                       0.117988
avg_total_weekly_personal_income            0.

In [49]:
# Correlation of every column with postcode_reach (a candidate predictor).
corr_matrix.loc["postcode_reach"]

merchant_abn                               -0.007819
take_rate                                   0.053763
average_merchant_fraud_probability         -0.024144
number_of_unique_consumers                  0.847743
average_consumer_fraud_probability         -0.046839
number_of_repeat_consumers                  0.514494
average_repeat_transactions_per_consumer    0.624319
consumer_retainability                      0.744230
number_of_orders                            0.444686
average_cost_of_order                      -0.254117
average_spend_per_consumer                 -0.248717
average_monthly_diff_consumers              0.498233
consumer_diff_over_period                   0.498296
average_growth                              0.661497
merchant_revenue_rounded                    0.536521
transcation_period_months                   0.308889
number_of_postcodes                         1.000000
most_popular_postcode                       0.092763
avg_total_weekly_personal_income            0.

In [50]:
# Correlation of every column with consumer_diff_over_period.
corr_matrix.loc["consumer_diff_over_period"]

merchant_abn                                0.012623
take_rate                                   0.028378
average_merchant_fraud_probability         -0.004928
number_of_unique_consumers                  0.772211
average_consumer_fraud_probability         -0.031323
number_of_repeat_consumers                  0.927064
average_repeat_transactions_per_consumer    0.781994
consumer_retainability                      0.870215
number_of_orders                            0.994600
average_cost_of_order                      -0.089203
average_spend_per_consumer                 -0.081439
average_monthly_diff_consumers              0.999999
consumer_diff_over_period                   1.000000
average_growth                              0.225224
merchant_revenue_rounded                    0.598710
transcation_period_months                   0.104117
number_of_postcodes                         0.498296
most_popular_postcode                       0.108086
avg_total_weekly_personal_income            0.

Now we want to predict the number of unique consumers in 3 years' time, therefore we need to include:
- transaction period in months (the `transcation_period_months` column)

Furthermore, we include features that correlate highly with the target and do not correlate strongly with the other features, such as:
- consumer retainability
- average growth rate
- postcode reach
- average number of consumers per postcode  

In [96]:
# Merchant identifier, the five predictors, and the target, in this column order.
modelling_columns = [
    "merchant_abn",
    "consumer_retainability",
    "average_growth",
    "postcode_reach",
    "avg_num_of_consumers_per_postcode",
    "transcation_period_months",
    "number_of_unique_consumers",
]
modelling_df = correlation_df[modelling_columns]

In [79]:
# Preview the modelling frame.
modelling_df.head()

Unnamed: 0,merchant_abn,consumer_retainability,average_growth,postcode_reach,avg_num_of_consumers_per_postcode,transcation_period_months,number_of_unique_consumers
0,10023283211,0.068911,1.604633,0.6169,1.662162,19.677419,2525
1,10142254217,0.063206,5.252767,0.60288,1.598994,19.322581,2389
2,10165489824,0.0,0.0,0.001516,1.0,6.903226,0
3,10187291046,0.003436,0.091955,0.103448,1.069597,18.677419,291
4,10192359162,0.006231,0.17899,0.114816,1.066007,18.741935,321


For this project we are not expecting more merchants to be added, so we will not use a train/test split.

In [80]:
from sklearn.linear_model import LinearRegression

In [81]:
# Predictor columns for the regression; merchant_abn and the target are excluded.
features_columns = ["consumer_retainability", "average_growth", "postcode_reach", "avg_num_of_consumers_per_postcode", "transcation_period_months"]

In [82]:
# Split the modelling frame into the target vector and the predictor matrix.
number_of_unique_customer = modelling_df["number_of_unique_consumers"]
features_unique_customers = modelling_df.loc[:, features_columns]

In [83]:
# Linear regression with scikit-learn's default settings.
num_of_unique_customers_model = LinearRegression()

In [84]:
# Fit on every merchant; no hold-out set is kept (see the note on train/test splitting above).
num_of_unique_customers_model.fit(features_unique_customers, number_of_unique_customer)

Next we will predict the number of customers in 3 years

In [104]:
# Independent copy of the modelling frame, ordered by merchant ABN, that will hold
# the "3 years from now" feature values.
future_modelling_df = modelling_df.sort_values(by="merchant_abn").copy()

In [105]:
# Forecast horizon: 3 years expressed in months.
FORECAST_HORIZON_MONTHS = 36

# Derive the shifted period from the untouched `modelling_df` (pandas aligns the
# assignment on the row index) instead of incrementing the column in place, so
# re-running this cell does not add the horizon a second time.
future_modelling_df["transcation_period_months"] = (
    modelling_df["transcation_period_months"] + FORECAST_HORIZON_MONTHS
)

In [106]:
# Check that transcation_period_months has been shifted forward by 36 months.
future_modelling_df.head()

Unnamed: 0,merchant_abn,consumer_retainability,average_growth,postcode_reach,avg_num_of_consumers_per_postcode,transcation_period_months,number_of_unique_consumers
0,10023283211,0.068911,1.604633,0.6169,1.662162,55.677419,2525
1,10142254217,0.063206,5.252767,0.60288,1.598994,55.322581,2389
2,10165489824,0.0,0.0,0.001516,1.0,42.903226,0
3,10187291046,0.003436,0.091955,0.103448,1.069597,54.677419,291
4,10192359162,0.006231,0.17899,0.114816,1.066007,54.741935,321


In [107]:
# Same predictor columns as used for fitting, taken from the shifted frame.
future_features_unique_customers = future_modelling_df[features_columns]

In [109]:
# Predict the customer count per merchant from the shifted features.
# The raw predictions can be negative; they are clipped to 0 in a later cell.
predicted_num_of_unique_customers= num_of_unique_customers_model.predict(future_features_unique_customers)

In [110]:
# New frame: the shifted modelling data plus one column holding the predictions.
results = future_modelling_df.assign(
    predicted_num_of_unique_customers=predicted_num_of_unique_customers
)

In [114]:
# Preview the predictions next to the current number_of_unique_consumers.
results.head()

Unnamed: 0,merchant_abn,consumer_retainability,average_growth,postcode_reach,avg_num_of_consumers_per_postcode,transcation_period_months,number_of_unique_consumers,predicted_num_of_unique_customers
0,10023283211,0.068911,1.604633,0.6169,1.662162,55.677419,2525,2217.315515
1,10142254217,0.063206,5.252767,0.60288,1.598994,55.322581,2389,1807.921562
2,10165489824,0.0,0.0,0.001516,1.0,42.903226,0,-579.023073
3,10187291046,0.003436,0.091955,0.103448,1.069597,54.677419,291,-479.603821
4,10192359162,0.006231,0.17899,0.114816,1.066007,54.741935,321,-387.940564


In [120]:
# Only the join key and the prediction are needed on the Spark side, so convert
# just those two columns instead of shipping the whole pandas frame across.
results_df = spark.createDataFrame(
    results[["merchant_abn", "predicted_num_of_unique_customers"]]
)

In [121]:
# Keep only merchant_abn and the prediction; the dropped columns already exist in
# `joined` (or hold the shifted period) and must not be duplicated by the join.
results_df = results_df.drop("consumer_retainability","average_growth","postcode_reach", "avg_num_of_consumers_per_postcode","transcation_period_months", "number_of_unique_consumers")

In [122]:
# Attach the prediction to the merchant data by ABN.
# `joined` is reassigned here, so drop any prediction column left over from a
# previous run of this cell first (a no-op on the first run); otherwise re-running
# would produce a duplicate, ambiguous `predicted_num_of_unique_customers` column.
joined = (
    joined.drop("predicted_num_of_unique_customers")
    .join(results_df, on="merchant_abn", how="inner")
)

In [126]:
# A customer count cannot be negative: replace negative predictions with 0 and
# round the remaining values.
predicted = f.col("predicted_num_of_unique_customers")
joined2 = joined.withColumn(
    "predicted_num_of_unique_customers",
    f.when(predicted < 0, 0).otherwise(f.round(predicted)),
)

In [127]:
# Display the merchant data with the cleaned prediction column appended.
joined2.show()

+------------+--------------------+-------------+---------+----------------------------------+--------------------------+----------------------------------+--------------------------+----------------------------------------+----------------------+----------------+---------------------+--------------------------+------------------------------+-------------------------+-------------------+------------------------+--------------------------+-------------------------+-------------------------+-------------------+---------------------+--------------------------------+---------------------------+------------------+------------------+--------------------+---------------------------------+--------------------+---------------------------------+
|merchant_abn|                name|revenue_level|take_rate|average_merchant_fraud_probability|number_of_unique_consumers|average_consumer_fraud_probability|number_of_repeat_consumers|average_repeat_transactions_per_consumer|consumer_retainability|number_

In [8]:
# Shut down the Spark session.
# NOTE(review): no visible cell writes `joined2` to disk before the session is
# stopped — confirm the engineered feature is persisted elsewhere.
spark.stop()