# Feature Engineering with Modelling
## Author: Dulan Wijeratne 1181873

In this notebook we will make new features using modelling techniques.

First we will start by creating a Spark session and reading in the joined aggregated data.

In [None]:
from pyspark.sql import SparkSession, functions as f

In [None]:
# Build (or reuse, if one is already running) the Spark session for this notebook.
spark = (
    SparkSession.builder.appName("feature_engineering")
    # Eager evaluation lets a DataFrame render when it is the last expression in a cell.
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.parquet.cacheMetadata", "true")
    # Session time zone; set exactly once.
    .config("spark.sql.session.timeZone", "Etc/UTC")
    # Resource limits for the driver and the executors.
    .config("spark.driver.memory", "3g")
    .config("spark.executor.memory", "4g")
    .config("spark.executor.instances", "2")
    .config("spark.executor.cores", "2")
    .getOrCreate()
)

In [None]:
# Load the curated parquet dataset into a Spark DataFrame.
# NOTE(review): the file name suggests outliers were already removed upstream — confirm.
joined = spark.read.parquet("../../../data/curated/removed_outliers.parquet")

In [None]:
# Inspect the rows with the smallest consumer_diff_over_period first.
joined.sort(f.asc("consumer_diff_over_period")).show()

In [None]:
# Spot-check the record(s) of one specific merchant, looked up by ABN.
joined.where(f.col("merchant_abn") == 71118957552).show()

In [None]:
# Inspect the rows with the largest average_growth_consumers first.
joined.sort(f.desc("average_growth_consumers")).show()

Changing NULLs to 0s

As we are going to be using modelling techniques we need to change the NULLs to an interpretable value.

In [None]:
# Replace NULLs with 0 so the modelling steps below receive concrete values.
# With an integer fill value Spark only fills numeric columns; NULLs in string
# columns are left untouched.
joined = joined.na.fill(0)

Next we want to convert the categorical features into integer values so that we can check their correlation with the target variable.

In the dataset there are 2 categorical features:
- Revenue Level
- Segment

In [None]:
from pyspark.ml.feature import StringIndexer

In [None]:
# Categorical columns to encode, and the name each encoded column will receive
# (the original name with an "_indexed" suffix).
input_cols = ["revenue_level", "segment"]
output_cols = [f"{column}_indexed" for column in input_cols]

In [None]:
# One StringIndexer per categorical column, built from the input/output column
# name lists defined above (revenue_level first, then segment).
revenue_level_indexer, segment_indexer = (
    StringIndexer(inputCol=source_col, outputCol=indexed_col)
    for source_col, indexed_col in zip(input_cols, output_cols)
)

In [None]:
# Fit each indexer on the full dataset and apply them in turn:
# first the segment index column is added, then the revenue-level one.
segment_indexed_df = segment_indexer.fit(joined).transform(joined)
pre_correlation_df = revenue_level_indexer.fit(joined).transform(segment_indexed_df)

# Drop the raw categorical columns (now replaced by their indexed versions)
# together with the name and first/last transaction columns.
dropped_cols = [
    "revenue_level",
    "segment",
    "name",
    "first_recorded_transaction",
    "last_recorded_transaction",
]
pre_correlation_df = pre_correlation_df.drop(*dropped_cols)

In [None]:
# Collect the Spark DataFrame to the driver as a pandas DataFrame so that
# pandas / scikit-learn can be used from here on.
correlation_df = pre_correlation_df.toPandas()

Now we will check the correlation matrix

In [None]:
import pandas as pd

In [None]:
# Preview the first rows of the pandas copy.
correlation_df.head()

In [None]:
# Pairwise correlation between all columns (pandas' default method is Pearson).
corr_matrix = correlation_df.corr()

### Feature Engineering

Predicting number of consumers in 3 years

In [None]:
# Correlation of every column with the column we are going to predict.
corr_matrix.loc["number_of_unique_consumers"]

Next we separate the features and the target variable

In [None]:
# Work on a copy so that correlation_df itself is left unchanged by the modelling steps.
modelling_df = correlation_df.copy()

In [None]:
# Preview the modelling data.
modelling_df.head()

In [None]:
# Name of the column the regression model will predict.
target_variable = "number_of_unique_consumers"

In [None]:
# Target: the column named by target_variable.
number_of_unique_customer = modelling_df[target_variable]

# Features: every remaining column except the merchant identifier and the target itself.
non_feature_cols = ["merchant_abn", target_variable]
features_unique_customers = modelling_df.drop(columns=non_feature_cols)

Feature Selection

In [None]:
from sklearn.feature_selection import f_regression, SelectKBest

In [None]:
# Keep the 5 features with the highest univariate F-statistic against the target.
# NOTE(review): the selector is fitted on all rows, i.e. before the train/test
# split further down, so the held-out rows influence feature selection.
selector = SelectKBest(score_func=f_regression, k=5)
features_unique_customers_selected = selector.fit_transform(
    features_unique_customers,
    y=number_of_unique_customer,
)

In [None]:
# Positional indices of the columns kept by the selector ...
selected_feature_indices = selector.get_support(indices=True)
# ... mapped back to their column names in the feature frame.
selected_features = features_unique_customers.columns[selected_feature_indices]
print(selected_features)

Splitting the data for train and test
 - We will use a 67 - 33 split (`test_size=0.33`)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 33% of the rows for testing, using only the selected features.
# The fixed random_state makes the split reproducible.
(
    features_unique_customers_train,
    features_unique_customers_test,
    number_of_unique_customer_train,
    number_of_unique_customer_test,
) = train_test_split(
    features_unique_customers[selected_features],
    number_of_unique_customer,
    test_size=0.33,
    random_state=42,
)

Fitting the model
- We will use a linear regression model

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Ordinary least-squares regression fitted on the training split
# (fit() returns the fitted estimator itself).
num_of_unique_customers_model = LinearRegression().fit(
    features_unique_customers_train,
    number_of_unique_customer_train,
)

Model Evaluation

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Score the model on the held-out test split.
num_of_unique_customer_pred = num_of_unique_customers_model.predict(
    features_unique_customers_test
)

# Coefficient of determination.
r2 = r2_score(
    y_true=number_of_unique_customer_test,
    y_pred=num_of_unique_customer_pred,
)

# Mean squared error and its square root (RMSE).
mse = mean_squared_error(
    y_true=number_of_unique_customer_test,
    y_pred=num_of_unique_customer_pred,
)
rmse = mse ** 0.5

In [None]:
# Report both evaluation metrics computed in the previous cell: RMSE (same units
# as the target) was calculated but never shown, so print it alongside R2.
print(f'RMSE: {rmse}')
print(f'R-squared (R2): {r2}')

Next we will predict the number of customers in 3 years

In [None]:
# Independent copy of the modelling data, ordered by merchant ABN, on which the
# future scenario is built.
future_modelling_df = modelling_df.copy().sort_values(by='merchant_abn')

In [None]:
# Project every merchant 3 years (36 months) ahead.
# The baseline is read from `modelling_df`, which is never modified, rather than
# from `future_modelling_df` itself. That keeps this cell idempotent: re-running
# it no longer adds a further 36 months each time. pandas aligns the assigned
# Series on the index, so the row order of `future_modelling_df` does not matter.
FORECAST_HORIZON_MONTHS = 36
future_modelling_df["transcation_period_months"] = (
    modelling_df["transcation_period_months"] + FORECAST_HORIZON_MONTHS
)

In [None]:
# Preview the future scenario data.
future_modelling_df.head()

In [None]:
# Same feature layout as used for training: drop the merchant identifier and the target.
future_features_unique_customers = future_modelling_df.drop(columns = ["merchant_abn",target_variable])

In [None]:
# List the columns available for prediction.
future_features_unique_customers.columns

In [None]:
# Predict with the fitted model, using only the columns chosen during feature selection.
# NOTE(review): the 36-month shift only changes these predictions if
# `transcation_period_months` is among `selected_features` — confirm.
predicted_num_of_unique_customers= num_of_unique_customers_model.predict(future_features_unique_customers[selected_features])

In [None]:
# New frame: the future scenario data plus one column holding the model's predictions.
results = future_modelling_df.assign(
    predicted_num_of_unique_customers=predicted_num_of_unique_customers
)

In [None]:
# Preview the predictions next to the data they were made from.
results.head()

In [None]:
# Convert the pandas results back into a Spark DataFrame so they can be joined to `joined`.
results_df = spark.createDataFrame(results)

In [None]:
# Only the join key and the prediction are needed from here on.
results_df = results_df.select("merchant_abn", "predicted_num_of_unique_customers")

In [None]:
# Attach the prediction to each merchant row (inner join on the merchant ABN).
# NOTE(review): `joined` at this point is the null-filled frame produced by the
# earlier fill step, and re-running this cell would join the predictions a second time.
joined = joined.join(results_df, on = "merchant_abn", how = "inner")

In [None]:
# A customer count cannot be negative or fractional: negative predictions become 0,
# all others are rounded.
predicted_col = f.col("predicted_num_of_unique_customers")
joined = joined.withColumn(
    "predicted_num_of_unique_customers",
    f.when(predicted_col < 0, 0).otherwise(f.round(predicted_col)),
)

In [None]:
# Inspect the final data, starting with the merchants that have the fewest unique consumers.
joined.sort(f.asc("number_of_unique_consumers")).show()

In [None]:
# Persist the enriched dataset as parquet, replacing any previous output at this path.
joined.write.mode("overwrite").parquet("../../../data/ranking_data.parquet")

In [None]:
# Shut down the Spark session now that the output has been written.
spark.stop()