# Interactive analysis via jupyter notebook

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml import Transformer
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable
from pyspark.sql import DataFrame
from pyspark.sql.types import FloatType
import math
import pyspark.sql.functions as F
from pyspark.ml.linalg import Vectors, VectorUDT
import os
from pyspark.ml.regression import GBTRegressor, RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pprint import pprint

## Read Hive tables

### Connect to Hive

In [2]:
TEAM = 'team24'

# location of Hive database in HDFS
WAREHOUSE = "project/hive/warehouse"

spark = SparkSession.builder\
        .appName(f"{TEAM} - spark ML")\
        .master("yarn")\
        .config("hive.metastore.uris",
                "thrift://hadoop-02.uni.innopolis.ru:9883")\
        .config("spark.sql.warehouse.dir", WAREHOUSE)\
        .config("spark.sql.avro.compression.codec", "snappy")\
        .enableHiveSupport()\
        .getOrCreate()

### Read Hive table

In [3]:
transactions = spark.read.format("avro")\
                    .table('team24_projectdb.transactions_part_buck')

### Explore the table

In [4]:
transactions.printSchema()

root
 |-- time: timestamp (nullable = true)
 |-- hour_of_day: integer (nullable = true)
 |-- sending_address: string (nullable = true)
 |-- receiving_address: string (nullable = true)
 |-- amount: float (nullable = true)
 |-- location_region: string (nullable = true)
 |-- ip_prefix: string (nullable = true)
 |-- login_frequency: integer (nullable = true)
 |-- session_duration: integer (nullable = true)
 |-- purchase_pattern: string (nullable = true)
 |-- age_group: string (nullable = true)
 |-- risk_score: float (nullable = true)
 |-- anomaly: string (nullable = true)
 |-- transaction_type: string (nullable = true)



In [5]:
transactions.show(10)

+-------------------+-----------+--------------------+--------------------+---------+---------------+---------+---------------+----------------+----------------+---------+----------+---------+----------------+
|               time|hour_of_day|     sending_address|   receiving_address|   amount|location_region|ip_prefix|login_frequency|session_duration|purchase_pattern|age_group|risk_score|  anomaly|transaction_type|
+-------------------+-----------+--------------------+--------------------+---------+---------------+---------+---------------+----------------+----------------+---------+----------+---------+----------------+
|2022-12-12 16:15:16|         19|0x87cd446adc9d04f...|0x1f1d8ed2ce1b2cb...|658.88055|  South America|    172.0|              2|              36|          random|      new|      94.5|high_risk|            scam|
|2022-08-28 10:15:22|         13|0xec2d86edc26d619...|0x5449125c46f3e2a...|912.27344|         Europe|     10.0|              2|              36|          random

In [6]:
print("(row, colunm):", (transactions.count(), len(transactions.columns)))

(row, colunm): (78600, 14)


## Preprocessing the data

### Drop data with missing values

In [7]:
transactions = transactions.dropna()

print("row:", transactions.count())

row: 78600


### Class CustomTransformer for preprocessing timestamp data type

In [8]:
def cos_sin_time(list_time):
    output = []
    parm = [0, 12, 31, 24, 60, 60]

    for i in range(len(list_time)):
        if i == 0:
            output.append(float(list_time[i]))
        else:
            cos = math.cos((2*list_time[i]*math.pi)/parm[i])
            sin = math.sin((2*list_time[i]*math.pi)/parm[i])
            output.append(cos)
            output.append(sin)
    return Vectors.dense(output)

In [9]:
class CustomTransformer(Transformer, DefaultParamsReadable,
                        DefaultParamsWritable):

    def _transform(self, dataset: DataFrame):
        input_col = dataset['time']
        transform_udf = F.udf(lambda x: cos_sin_time(
                            list(map(int, ':'.join('-'.join(str(x).split())
                                                   .split('-'))
                                     .split(':')))
                        ), VectorUDT())
        return dataset.withColumn("sin_cos_time", transform_udf(input_col))

### Select the features and preprocessing

In [10]:
categorical_features = ['transaction_type', 'location_region',
                        'purchase_pattern']
numerical_features = ['amount', 'login_frequency', 'session_duration',
                      'ip_prefix']
time = 'time'
label = 'risk_score'

In [11]:
# Convert from string to float type column 'ip_prefix'
ip_prefix_udf = F.udf(lambda x: float(x), FloatType())
transactions = transactions.withColumn('ip_prefix', ip_prefix_udf('ip_prefix'))

In [12]:
# Create string indexer for categorical features
indexers = [StringIndexer(inputCol=col, outputCol=col+"_index",
                          handleInvalid="skip") for col in categorical_features]

# One-hot encode categorical features
onehot_encoders = [OneHotEncoder(inputCol=col+"_index",
                                 outputCol=col+"_onehot")
                   for col in categorical_features]

In [13]:
custom_transformer = CustomTransformer()

In [14]:
# Assemble all features into a single vector
assembler = VectorAssembler(
    inputCols=[col+"_onehot" for col in categorical_features]
    + numerical_features + ["sin_cos_time"] + [label],
    outputCol="features")

### Build the Pipeline

In [15]:
# Create pipeline
pipeline = Pipeline(stages=indexers + onehot_encoders + [custom_transformer]
                    + [assembler])

In [16]:
# Fit pipeline to data
pipeline_model = pipeline.fit(transactions)

In [17]:
# Transform data using the fitted pipeline
transformed_data = pipeline_model.transform(transactions)

In [18]:
transformed_data.head(3)

[Row(time=datetime.datetime(2022, 6, 25, 11, 27, 37), hour_of_day=14, sending_address='0x6ea0e02fb6ee893dc3b70b98df1a48165d28eb09', receiving_address='0xc28cbdb253f12174f7aa80ff6c6660f2e09397d7', amount=523.9479370117188, location_region='North America', ip_prefix=172.0, login_frequency=4, session_duration=56, purchase_pattern='focused', age_group='established', risk_score=15.75, anomaly='low_risk', transaction_type='transfer', transaction_type_index=2.0, location_region_index=0.0, purchase_pattern_index=2.0, transaction_type_onehot=SparseVector(4, {2: 1.0}), location_region_onehot=SparseVector(4, {0: 1.0}), purchase_pattern_onehot=SparseVector(2, {}), sin_cos_time=DenseVector([2022.0, -1.0, 0.0, 0.3473, -0.9378, -0.9659, 0.2588, -0.9511, 0.309, -0.7431, -0.6691]), features=DenseVector([0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 523.9479, 4.0, 56.0, 172.0, 2022.0, -1.0, 0.0, 0.3473, -0.9378, -0.9659, 0.2588, -0.9511, 0.309, -0.7431, -0.6691, 15.75])),
 Row(time=datetime.datetime

In [19]:
df = transformed_data['features', 'risk_score']

df.head(3)

[Row(features=DenseVector([1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 665.4055, 8.0, 114.0, 172.16, 2022.0, 1.0, -0.0, 0.9795, 0.2013, 0.2588, -0.9659, -0.4067, 0.9135, 0.809, -0.5878, 52.5]), risk_score=52.5),
 Row(features=DenseVector([1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 762.8003, 6.0, 153.0, 172.0, 2022.0, 0.5, 0.866, -0.4404, 0.8978, -0.2588, 0.9659, -0.5878, -0.809, -0.9511, 0.309, 62.5]), risk_score=62.5),
 Row(features=DenseVector([1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 453.8714, 6.0, 89.0, 172.0, 2022.0, 0.5, -0.866, -0.8743, -0.4853, 0.2588, -0.9659, -0.866, -0.5, -0.9135, 0.4067, 52.5]), risk_score=52.5)]

### Split data

In [19]:
# Split the data into training and testing sets
train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)

In [20]:
train_data.head(3)

[Row(features=SparseVector(26, {9: 1.0, 10: 0.01, 11: 1.0, 12: 36.0, 13: 10.0, 14: 2022.0, 15: -0.0, 16: -1.0, 17: 0.9795, 18: 0.2013, 19: -0.9659, 20: 0.2588, 21: -0.2079, 22: -0.9781, 23: 1.0, 25: 100.0}), risk_score=100.0),
 Row(features=SparseVector(26, {9: 1.0, 10: 327.2854, 11: 1.0, 12: 30.0, 13: 10.0, 14: 2022.0, 15: 0.866, 16: 0.5, 17: 0.919, 18: 0.3944, 19: -0.7071, 20: 0.7071, 21: 0.9781, 22: 0.2079, 23: 1.0, 25: 100.0}), risk_score=100.0),
 Row(features=SparseVector(26, {9: 1.0, 10: 649.1923, 11: 2.0, 12: 21.0, 13: 192.168, 14: 2022.0, 15: 0.866, 16: -0.5, 17: 0.1514, 18: 0.9885, 19: -0.866, 20: 0.5, 21: 0.4067, 22: 0.9135, 23: 1.0, 25: 100.0}), risk_score=100.0)]

In [21]:
print('train_data, test_data:', (train_data.count(), test_data.count()))

train_data, test_data: (63112, 15466)


### Save train and test data to HDFS

In [22]:
def run(command):
    return os.popen(command).read()

In [23]:
# train_data.select("features", "risk_score")\
#     .coalesce(1)\
#     .write\
#     .mode("overwrite")\
#     .format("json")\
#     .save("project/data/train")

In [24]:
# test_data.select("features", "risk_score")\
#     .coalesce(1)\
#     .write\
#     .mode("overwrite")\
#     .format("json")\
#     .save("project/data/test")

In [25]:
# run("hdfs dfs -cat project/data/train/*.json >\
#     ~/project/bigdata_project_team24/data/train.json")
# run("hdfs dfs -cat project/data/test/*.json >\
#     ~/project/bigdata_project_team24/data/test.json")

## Model 1 (GBTRegressor)

### Model training

In [26]:
gbt = GBTRegressor(featuresCol="features", labelCol="risk_score")

In [27]:
model_gbt = gbt.fit(train_data)

### Prediction

In [28]:
predictions_gbt = model_gbt.transform(test_data)

In [29]:
predictions_gbt.show(10)

+--------------------+----------+------------------+
|            features|risk_score|        prediction|
+--------------------+----------+------------------+
|[0.0,0.0,0.0,1.0,...|     100.0|100.01853424776044|
|[0.0,0.0,0.0,1.0,...|      90.0| 89.99690730183153|
|[0.0,0.0,0.0,1.0,...|      90.0| 89.99690730183153|
|[0.0,0.0,0.0,1.0,...|      90.0| 89.99690730183153|
|[0.0,0.0,0.0,1.0,...|     100.0|100.01853424776044|
|[0.0,0.0,0.0,1.0,...|     100.0|100.01853424776044|
|[0.0,0.0,0.0,1.0,...|      90.0| 89.99690730183153|
|[0.0,0.0,0.0,1.0,...|      90.0| 89.99690730183153|
|[0.0,0.0,0.0,1.0,...|     100.0|100.01853424776044|
|[0.0,0.0,0.0,1.0,...|      90.0| 89.99690730183153|
+--------------------+----------+------------------+
only showing top 10 rows



### Evaluation

In [30]:
evaluator_rmse = RegressionEvaluator(labelCol="risk_score",
                                     predictionCol="prediction",
                                     metricName="rmse")

evaluator_r2 = RegressionEvaluator(labelCol="risk_score",
                                   predictionCol="prediction", metricName="r2")

In [31]:
rmse_gbt = evaluator_rmse.evaluate(predictions_gbt)
r2_gbt = evaluator_r2.evaluate(predictions_gbt)

In [32]:
print("RMSE for GBTRegressor:", rmse_gbt)
print("R2 for GBTRegressor:", r2_gbt)

RMSE for GBTRegressor: 0.015894595778895824
R2 for GBTRegressor: 0.999999455585549


### Hyperparameter optimization

In [33]:
grid = ParamGridBuilder()

In [34]:
param_grid_gbt = grid.addGrid(model_gbt.maxDepth, [2, 4, 6]) \
    .addGrid(model_gbt.maxBins, [16, 32, 64]) \
    .build()

In [35]:
cv_gbt = CrossValidator(estimator=gbt,
                        estimatorParamMaps=param_grid_gbt,
                        evaluator=evaluator_rmse,
                        parallelism=5,
                        numFolds=3)

In [36]:
cvModel_gbt = cv_gbt.fit(train_data)

### Select the best model

In [37]:
model_best_gbt = cvModel_gbt.bestModel
model_best_gbt

GBTRegressionModel: uid=GBTRegressor_c8346a47d3b9, numTrees=20, numFeatures=26

In [38]:
pprint(model_best_gbt.extractParamMap())

{Param(parent='GBTRegressor_c8346a47d3b9', name='featuresCol', doc='features column name.'): 'features',
 Param(parent='GBTRegressor_c8346a47d3b9', name='cacheNodeIds', doc='If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees. Users can set how often should the cache be checkpointed or disable it by setting checkpointInterval.'): False,
 Param(parent='GBTRegressor_c8346a47d3b9', name='featureSubsetStrategy', doc="The number of features to consider for splits at each tree node. Supported options: 'auto' (choose automatically for task: If numTrees == 1, set to 'all'. If numTrees > 1 (forest), set to 'sqrt' for classification and to 'onethird' for regression), 'all' (use all features), 'onethird' (use 1/3 of the features), 'sqrt' (use sqrt(number of features)), 'log2' (use log2(number of features)), 'n' (when n is in the range (0, 1.0], use n * number

### Save the model to HDFS

In [39]:
# model_best_gbt.write().overwrite().save("project/models/model1")

# # Run it from root directory of the repository
# run("hdfs dfs -get project/models/model1\
#     ~/project/bigdata_project_team24/models/model1")

### Prediction for the best model

In [40]:
predictions_best_gbt = model_best_gbt.transform(test_data)
predictions_best_gbt.show()

+--------------------+----------+----------+
|            features|risk_score|prediction|
+--------------------+----------+----------+
|(26,[1,4,10,11,12...|   42.1875|   42.1875|
|(26,[1,6,10,11,12...|   35.4375|   35.4375|
|(26,[1,7,10,11,12...|     26.25|     26.25|
|(26,[1,7,10,11,12...|   42.1875|   42.1875|
|(26,[1,10,11,12,1...|      25.0|      25.0|
|(26,[1,10,11,12,1...|     26.25|     26.25|
|(26,[1,10,11,12,1...|   35.4375|   35.4375|
|(26,[1,10,11,12,1...|     26.25|     26.25|
|(26,[1,10,11,12,1...|     31.25|     31.25|
|(26,[1,10,11,12,1...|     31.25|     31.25|
|(26,[1,10,11,12,1...|     31.25|     31.25|
|(26,[1,10,11,12,1...|     33.75|     33.75|
|(26,[1,10,11,12,1...|     33.75|     33.75|
|(26,[1,10,11,12,1...|     33.75|     33.75|
|(26,[1,10,11,12,1...|     26.25|     26.25|
|(26,[1,10,11,12,1...|     26.25|     26.25|
|(26,[1,10,11,12,1...|     26.25|     26.25|
|(26,[1,10,11,12,1...|     33.75|     33.75|
|(26,[1,10,11,12,1...|   35.4375|   35.4375|
|(26,[1,10

### Save prediction of the best model to HDFS

In [41]:
# predictions_best_gbt.select("risk_score", "prediction")\
#     .coalesce(1)\
#     .write\
#     .mode("overwrite")\
#     .format("csv")\
#     .option("sep", ",")\
#     .option("header", "true")\
#     .save("project/output/model1_predictions.csv")

# # Run it from root directory of the repository
# run("hdfs dfs -cat project/output/model1_predictions.csv/*.csv >\
#     ~/project/bigdata_project_team24/output/model1_predictions.csv")

### Evaluation for the best model 

In [42]:
rmse_best_gbt = evaluator_rmse.evaluate(predictions_best_gbt)
r2_best_gbt = evaluator_r2.evaluate(predictions_best_gbt)

print("RMSE for the best GBTRegressor:", rmse_best_gbt)
print("R2 for the best GBTRegressor:", r2_best_gbt)

RMSE for the best GBTRegressor: 0.0009512903246989098
R2 for the best GBTRegressor: 0.9999999980550562


## Model 2 (RandomForestRegressor)

### Model training

In [43]:
rf = RandomForestRegressor(featuresCol="features", labelCol="risk_score")

In [44]:
model_rf = rf.fit(train_data)

### Prediction

In [45]:
predictions_rf = model_rf.transform(test_data)

In [46]:
predictions_rf.show(10)

+--------------------+----------+------------------+
|            features|risk_score|        prediction|
+--------------------+----------+------------------+
|(26,[0,8,10,11,12...|    70.875| 69.09707330228932|
|[1.0,0.0,0.0,0.0,...|      50.0|51.914561288012614|
|[1.0,0.0,0.0,0.0,...|      50.0| 52.06205364399314|
|[1.0,0.0,0.0,0.0,...|      50.0| 52.06205364399314|
|[1.0,0.0,0.0,0.0,...|      67.5| 68.92832330228931|
|[1.0,0.0,0.0,0.0,...|      50.0| 51.84768984981709|
|[1.0,0.0,0.0,0.0,...|      67.5| 69.09524827968585|
|[1.0,0.0,0.0,0.0,...|      50.0|52.025637196635195|
|[1.0,0.0,0.0,0.0,...|      50.0| 51.90752897503462|
|[1.0,0.0,0.0,0.0,...|      50.0| 52.06205364399314|
+--------------------+----------+------------------+
only showing top 10 rows



### Evaluation

In [47]:
rmse_rf = evaluator_rmse.evaluate(predictions_rf)
r2_rf = evaluator_r2.evaluate(predictions_rf)

In [48]:
print("RMSE for RandomForestRegressor:", rmse_rf)
print("R2 for RandomForestRegressor:", r2_rf)

RMSE for RandomForestRegressor: 2.0072857305101093
R2 for RandomForestRegressor: 0.9914588296948514


### Hyperparameter optimization

In [49]:
param_grid_rf = grid.addGrid(model_rf.maxDepth, [2, 4, 6]) \
    .addGrid(model_rf.maxBins, [16, 32, 64]) \
    .build()

In [50]:
cv_rf = CrossValidator(estimator=rf,
                       estimatorParamMaps=param_grid_rf,
                       evaluator=evaluator_rmse,
                       parallelism=5,
                       numFolds=3)

In [51]:
cvModel_rf = cv_rf.fit(train_data)

### Select the best model

In [52]:
model_best_rf = cvModel_rf.bestModel
model_best_rf

RandomForestRegressionModel: uid=RandomForestRegressor_b2e6363f39a8, numTrees=20, numFeatures=26

In [53]:
pprint(model_best_rf.extractParamMap())

{Param(parent='RandomForestRegressor_b2e6363f39a8', name='leafCol', doc='Leaf indices column name. Predicted leaf index of each instance in each tree by preorder.'): '',
 Param(parent='RandomForestRegressor_b2e6363f39a8', name='maxBins', doc='Max number of bins for discretizing continuous features.  Must be >=2 and >= number of categories for any categorical feature.'): 64,
 Param(parent='RandomForestRegressor_b2e6363f39a8', name='maxMemoryInMB', doc='Maximum memory in MB allocated to histogram aggregation. If too small, then 1 node will be split per iteration, and its aggregates may exceed this size.'): 256,
 Param(parent='RandomForestRegressor_b2e6363f39a8', name='seed', doc='random seed.'): -8588119780064033430,
 Param(parent='RandomForestRegressor_b2e6363f39a8', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes. Must be in range [0, 30].'): 6,
 Param(parent='RandomForestRegressor_b2e6363f39a8', name

### Save the model to HDFS

In [54]:
# model_best_rf.write().overwrite().save("project/models/model2")

# # Run it from root directory of the repository
# run("hdfs dfs -get project/models/model2 models/model2")

### Prediction for the best model

In [55]:
predictions_best_rf = model_best_rf.transform(test_data)
predictions_best_rf.show()

+--------------------+----------+------------------+
|            features|risk_score|        prediction|
+--------------------+----------+------------------+
|(26,[0,8,10,11,12...|    70.875| 70.97619056428002|
|[1.0,0.0,0.0,0.0,...|      50.0|50.723714086969984|
|[1.0,0.0,0.0,0.0,...|      50.0|50.856973933623685|
|[1.0,0.0,0.0,0.0,...|      50.0|50.836213008181815|
|[1.0,0.0,0.0,0.0,...|      67.5| 69.77153667638076|
|[1.0,0.0,0.0,0.0,...|      50.0|50.974864753617446|
|[1.0,0.0,0.0,0.0,...|      67.5| 69.64035663941068|
|[1.0,0.0,0.0,0.0,...|      50.0| 50.72024449506914|
|[1.0,0.0,0.0,0.0,...|      50.0|50.728037710727264|
|[1.0,0.0,0.0,0.0,...|      50.0|50.852650309866405|
|[1.0,0.0,0.0,0.0,...|      50.0| 50.84972984139665|
|[1.0,0.0,0.0,0.0,...|      50.0| 50.94557297482727|
|[1.0,0.0,0.0,0.0,...|      50.0| 50.71311679700837|
|[1.0,0.0,0.0,0.0,...|      50.0|50.856361810125726|
|[1.0,0.0,0.0,0.0,...|      67.5| 69.63154910545067|
|[1.0,0.0,0.0,0.0,...|      50.0| 50.856119901

### Save prediction of the best model to HDFS

In [56]:
# predictions_best_rf.select("risk_score", "prediction")\
#     .coalesce(1)\
#     .write\
#     .mode("overwrite")\
#     .format("csv")\
#     .option("sep", ",")\
#     .option("header","true")\
#     .save("project/output/model2_predictions.csv")

# # Run it from root directory of the repository
# run("hdfs dfs -cat project/output/model2_predictions.csv/*.csv >\
#     output/model2_predictions.csv")

### Evaluation for the best model 

In [57]:
rmse_best_rf = evaluator_rmse.evaluate(predictions_best_rf)
r2_best_rf = evaluator_r2.evaluate(predictions_best_rf)

print("RMSE for the best RandomForestRegressor:", rmse_best_rf)
print("R2 for the best RandomForestRegressor:", r2_best_rf)

RMSE for the best RandomForestRegressor: 1.1895396985820008
R2 for the best RandomForestRegressor: 0.9969815154026349


## Compare the best models

### Create data frame to compare performance of the models

In [58]:
models = [[str(model_best_gbt), rmse_best_gbt, r2_best_gbt],
          [str(model_best_rf), rmse_best_rf, r2_best_rf]]

In [59]:
comp_models = spark.createDataFrame(models, ["model", "RMSE", "R2"])
comp_models.show(truncate=False)

+------------------------------------------------------------------------------------------------+--------------------+------------------+
|model                                                                                           |RMSE                |R2                |
+------------------------------------------------------------------------------------------------+--------------------+------------------+
|GBTRegressionModel: uid=GBTRegressor_c8346a47d3b9, numTrees=20, numFeatures=26                  |9.512903246989098E-4|0.9999999980550562|
|RandomForestRegressionModel: uid=RandomForestRegressor_b2e6363f39a8, numTrees=20, numFeatures=26|1.1895396985820008  |0.9969815154026349|
+------------------------------------------------------------------------------------------------+--------------------+------------------+



### Save comparation models to HDFS

In [60]:
# comp_models.coalesce(1)\
#     .write\
#     .mode("overwrite")\
#     .format("csv")\
#     .option("sep", ",")\
#     .option("header", "true")\
#     .save("project/output/evaluation.csv")


# run("hdfs dfs -cat project/output/evaluation.csv/*.csv >\
#     output/evaluation.csv")