In [1]:
import os 
import warnings
import time
import numpy as np
import scipy.stats
import sys
import sklearn
import sklearn.datasets

from pyspark.sql import SparkSession
warnings.filterwarnings('ignore')
import pandas as pd

# Environment for PySpark, set before the session is created.
# The first three entries are the workaround for py4j issues on Windows
# (you may also need to update your PATH): they point the driver and the
# workers at the interpreter running this notebook.
# The last entry carries the driver-memory setting used to start Spark.
os.environ.update({
    'PYSPARK_DRIVER_PYTHON_OPTS': "notebook",
    'PYSPARK_DRIVER_PYTHON': sys.executable,
    'PYSPARK_PYTHON': sys.executable,
    'PYSPARK_SUBMIT_ARGS': "--conf spark.driver.memory=4g  pyspark-shell",
})

# Start (or reuse) a local Spark session from the notebook, using all cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("feature_selection")
    .getOrCreate()
)

sc = spark.sparkContext

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/25 09:16:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/05/25 09:16:43 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/05/25 09:16:43 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
22/05/25 09:16:43 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
22/05/25 09:16:43 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.
22/05/25 09:16:43 WARN Utils: Service 'SparkUI' could not bind on port 4044. Attempting port 4045.


In [2]:
# Engineered per-session features. The file has no header row, so Spark names
# the columns positionally (_c0, _c1, ...); column types are inferred.
train_sessions_engineered = spark.read.csv(
    '../Data/session_engineered_features.txt',
    header=False,
    inferSchema=True,
)

# Purchases table (later cells use its session_id, item_id and date columns).
# Read with the same built-in CSV reader as above instead of the legacy
# 'com.databricks.spark.csv' format alias, and with real booleans for options.
train_purchases = spark.read.csv(
    '../Data/train_purchases.csv',
    header=True,
    inferSchema=True,
)

                                                                                

In [3]:
# Offsets of the columns to keep, relative to the first feature column.
# -1 resolves to file column 0 (_c0), the session id that the join uses later.
selected_features_indices = [-1, 36, 4, 0, 34, 17, 35, 18, 32, 5, 33, 16, 31, 21, 19, 28, 26, 30, 14, 23, 13]

# Resolve offsets to Spark's positional column names (_c0, _c1, ...) instead of
# indexing into `.columns`. Indexing only worked on the untouched frame:
# re-running the cell after the frame had been narrowed raised IndexError.
# Selecting by name gives the same columns on the first run and on any re-run.
columns_to_keep = [f"_c{offset + 1}" for offset in selected_features_indices]
train_sessions_engineered = train_sessions_engineered.select(columns_to_keep)
train_sessions_engineered.take(1)

[Row(_c0=720, _c37=0.0, _c5=2021.0, _c1=23943.0, _c35=0.0, _c18=0.0, _c36=0.0, _c19=0.0, _c33=0.0, _c6=21890.0, _c34=0.0, _c17=0.0, _c32=0.0, _c22=0.0, _c20=0.0, _c29=0.0, _c27=0.0, _c31=0.0, _c15=0.0, _c24=0.0, _c14=0.0)]

In [4]:
# Attach each session's purchase to its features with an inner join on the
# session id, then drop the two id columns and the date in a single call.
# A single varargs drop avoids leaving a global loop variable named `col`,
# the name conventionally used for pyspark.sql.functions.col.
train_df = (
    train_sessions_engineered
    .join(
        train_purchases,
        train_sessions_engineered._c0 == train_purchases.session_id,
        "inner",
    )
    .drop('_c0', 'session_id', 'date')
)

In [5]:
# Sanity check: show one joined row (feature columns followed by item_id).
train_df.head(1)

                                                                                

[Row(_c37=0.0, _c5=2020.0, _c1=0.0, _c35=0.0, _c18=0.0, _c36=0.0, _c19=1.0, _c33=0.0, _c6=15654.0, _c34=0.0, _c17=0.0, _c32=0.0, _c22=0.0, _c20=0.0, _c29=0.0, _c27=0.0, _c31=0.0, _c15=0.0, _c24=0.0, _c14=0.0, item_id=18626)]

In [6]:
from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorIndexer,VectorAssembler
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# One seed for every stochastic component in this cell, so that a
# Restart & Run All reproduces the same forests and the same CV folds.
SEED = 42

# Every column except the last one goes into the feature vector.
# NOTE(review): this relies on item_id (the label) being the last column of train_df.
assembler = VectorAssembler(inputCols=train_df.columns[:-1], outputCol='features')

# The indexer is fitted on the full frame here, so the pipeline receives an
# already-fitted model whose label index covers every item_id in train_df.
labelIndexer = StringIndexer(inputCol='item_id', outputCol='indexedLabel').fit(train_df)

rf = RandomForestClassifier(labelCol='indexedLabel', featuresCol='features', seed=SEED)

pipeline = Pipeline(stages=[labelIndexer, assembler, rf])

# 2 x 2 grid: number of trees x maximum tree depth.
paramGrid = (
    ParamGridBuilder()
    .addGrid(rf.numTrees, [60, 100])
    .addGrid(rf.maxDepth, [4, 5])
    .build()
)

evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel",
    predictionCol="prediction",
    metricName="accuracy",
)

crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=2,
    seed=SEED,
)

                                                                                

In [None]:
# Hold out roughly 20% of the rows for the final evaluation. The split is
# seeded so the train/test partition is the same on every run.
training_data, test_data = train_df.randomSplit([0.8, 0.2], seed=42)

# Grid search with cross-validation on the training split only.
CV_model = crossval.fit(training_data)

22/05/25 09:18:51 WARN DAGScheduler: Broadcasting large task binary with size 3.2 MiB
22/05/25 09:19:21 WARN DAGScheduler: Broadcasting large task binary with size 1780.6 KiB
22/05/25 09:19:48 WARN DAGScheduler: Broadcasting large task binary with size 4.5 MiB
22/05/25 09:20:17 WARN DAGScheduler: Broadcasting large task binary with size 1780.6 KiB
22/05/25 09:20:43 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB
22/05/25 09:21:11 WARN DAGScheduler: Broadcasting large task binary with size 1780.6 KiB
22/05/25 09:21:40 WARN DAGScheduler: Broadcasting large task binary with size 9.7 MiB
22/05/25 09:22:09 WARN DAGScheduler: Broadcasting large task binary with size 1928.4 KiB
22/05/25 09:22:38 WARN DAGScheduler: Broadcasting large task binary with size 4.9 MiB
22/05/25 09:23:07 WARN DAGScheduler: Broadcasting large task binary with size 1928.4 KiB
22/05/25 09:23:36 WARN DAGScheduler: Broadcasting large task binary with size 7.5 MiB
22/05/25 09:24:07 WARN DAGScheduler: Br

In [None]:
# Score the held-out split with the fitted cross-validation model.
# The previous code called `cross_validated`, which is never defined and
# raised NameError; the model returned by crossval.fit is `CV_model`.
transformed_data = CV_model.transform(test_data)

# The label is the evaluator's metric name; the old label printed it twice.
print(f"{evaluator.getMetricName()}:", evaluator.evaluate(transformed_data))

In [None]:
# The pipeline was built as [labelIndexer, assembler, rf], so index 2 of the
# best fitted pipeline's stages is the trained random-forest model.
fitted_stages = CV_model.bestModel.stages
tree_model = fitted_stages[2]
print(tree_model)