In [1]:
# `os` is used by later cells to set PySpark-related environment variables.
import os 
import warnings
# Silence all Python warnings from this point on. The filter is installed
# before pandas is imported, so warnings raised during that import are hidden too.
warnings.filterwarnings('ignore')
import pandas as pd

In [2]:
# Monitor Spark tasks locally at http://127.0.0.1:4040/

In [3]:
# Run this cell if py4j gives you trouble on Windows
# (you may also need to update your PATH).
import sys

# Point both the PySpark driver and the workers at the interpreter that is
# running this kernel, and set the driver options to "notebook".
os.environ.update({
    'PYSPARK_DRIVER_PYTHON_OPTS': "notebook",
    'PYSPARK_DRIVER_PYTHON': sys.executable,
    'PYSPARK_PYTHON': sys.executable,
})

In [4]:
# Start (or reuse) a Spark session from inside the notebook.

# Driver options are handed to PySpark through the environment:
# 2 GB of driver memory, followed by the mandatory `pyspark-shell` token.
os.environ.update({'PYSPARK_SUBMIT_ARGS': "--conf spark.driver.memory=2g  pyspark-shell"})
from pyspark.sql import SparkSession

# Local master, application name "load_explore".
spark = (
    SparkSession.builder
    .master("local")
    .appName("load_explore")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/09 11:00:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Load the relevant CSV files into Spark DataFrames.

# Directory holding the CSV files, relative to the notebook.
DATA_DIR = '../Data'


def load_csv(file_name):
    """Read one CSV file from DATA_DIR into a Spark DataFrame.

    The first line of the file is treated as the header and column types
    are inferred from the data. Relies on the ``spark`` session created
    earlier in the notebook.

    Parameters
    ----------
    file_name : str
        Name of the CSV file inside DATA_DIR, e.g. ``'train_sessions.csv'``.

    Returns
    -------
    pyspark.sql.DataFrame
    """
    # Use the built-in CSV reader rather than the legacy
    # 'com.databricks.spark.csv' source name used by the old external package.
    return spark.read.csv(f"{DATA_DIR}/{file_name}", header=True, inferSchema=True)


train_sessions = load_csv('train_sessions.csv')
train_purchases = load_csv('train_purchases.csv')
candidate_items = load_csv('candidate_items.csv')
item_features = load_csv('item_features.csv')

# Every loaded dataset, in a fixed order, for the exploration loops below.
datas = [train_sessions, train_purchases, candidate_items, item_features]

                                                                                

In [6]:
# First look at each dataset: one sample row, the row count and the schema.
for data_set in datas:
    sample_rows = data_set.take(1)   # list holding the first Row
    row_total = data_set.count()
    print(f"{sample_rows} , count = {row_total}")
    data_set.printSchema()

[Row(session_id=3, item_id=9655, date='2020-12-18 21:25:00.373')] , count = 4743820
root
 |-- session_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- date: string (nullable = true)

[Row(session_id=3, item_id=15085, date='2020-12-18 21:26:47.986')] , count = 1000000
root
 |-- session_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- date: string (nullable = true)

[Row(item_id=4)] , count = 4990
root
 |-- item_id: integer (nullable = true)

[Row(item_id=2, feature_category_id=56, feature_value_id=365)] , count = 471751
root
 |-- item_id: integer (nullable = true)
 |-- feature_category_id: integer (nullable = true)
 |-- feature_value_id: integer (nullable = true)



In [6]:
# Get some insight into our datasets.

from pyspark.sql.functions import isnan, when, count, col, lit

# Spark already has some built-in methods for extracting statistics (see tp):
print(train_purchases.describe().toPandas().set_index('summary').transpose())
# ---> describe() is not very informative here: the columns are ids and a date string.

# Example: count missing values (null OR NaN) per column, for every dataset.
# isnan() alone never flags null, and missing CSV fields are loaded as null,
# so both conditions have to be checked for the count to mean anything.
for data_set in datas:
    missing_counts = [
        # Count a literal, not the column itself: count() skips nulls, so
        # counting the column value would silently ignore the null rows.
        count(
            when(col(column_name).isNull() | isnan(col(column_name)), lit(1))
        ).alias(column_name)
        for column_name in data_set.columns
    ]
    data_set.select(missing_counts).show()

# Note: computing this in Spark is far faster than converting to pandas and
# computing the same features naively, so avoid the pandas route on the full data.

                                                                                

summary       count            mean              stddev  \
session_id  1000000  2221071.286897  1281017.5859886208   
item_id     1000000    13978.825051   8187.993593462088   
date        1000000            None                None   

summary                        min                      max  
session_id                       3                  4440001  
item_id                          3                    28143  
date        2020-01-01 00:02:11.06  2021-05-31 23:47:07.764  


                                                                                

+----------+-------+----+
|session_id|item_id|date|
+----------+-------+----+
|         0|      0|   0|
+----------+-------+----+



                                                                                

+----------+-------+----+
|session_id|item_id|date|
+----------+-------+----+
|         0|      0|   0|
+----------+-------+----+

+-------+
|item_id|
+-------+
|      0|
+-------+

+-------+-------------------+----------------+
|item_id|feature_category_id|feature_value_id|
+-------+-------------------+----------------+
|      0|                  0|               0|
+-------+-------------------+----------------+

