In [1]:
# Mount Google Drive at /content/drive/ so the project files stored there are reachable from the notebook
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
# Import libraries
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import matplotlib.pyplot as plt
import pandas as pd
from pyspark.sql.functions import col
from pyspark.sql.functions import to_timestamp

In [3]:
# Create the SparkSession used by the rest of the notebook
# (getOrCreate returns the already-running session if there is one).
spark = (
    SparkSession.builder
    .appName("PetCo")
    .getOrCreate()
)

In [4]:
# Register the project's helper modules with Spark, then import them
from pyspark import SparkFiles

# Folder on Google Drive that holds the helper scripts
SCRIPTS_DIR = "/content/drive/MyDrive/Colab Notebooks/diploma/scripts"

# addPyFile (rather than the generic addFile) is the SparkContext call
# documented for adding .py/.zip dependencies: it ships the file with the
# Spark job so the module is available wherever the code runs.
for script_name in ("process_data.py", "tests_for_datasets.py"):
    spark.sparkContext.addPyFile(f"{SCRIPTS_DIR}/{script_name}")


import process_data
from tests_for_datasets import test_actions_petco, test_items_petco

In [5]:
# Switch the kernel's current working directory to the project scripts folder
%cd "/content/drive/MyDrive/Colab Notebooks/diploma/scripts"

/content/drive/MyDrive/Colab Notebooks/diploma/scripts


In [6]:
# Project root on Google Drive. Data paths in later cells are built by plain
# string concatenation with this value, so the trailing slash is required.
WORKING_DIRECTORY = '/content/drive/MyDrive/Colab Notebooks/diploma/'

In [7]:
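# Disabled one-off step: export the helper notebooks to .py scripts with nbconvert.
# Uncomment to regenerate process_data.py / tests_for_datasets.py from the notebooks.
# !jupyter nbconvert --to script process_data.ipynb
# !jupyter nbconvert --to script tests_for_datasets.ipynb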

# Приведем датасет с логами юзеров к формату для SASRec

In [8]:
# Load the behavioural-log dataset from parquet and preview it
# NOTE(review): show() prints every column, including ac_key and the user/session
# identifiers — clear this cell's output before sharing or committing the notebook.
data_logs = spark.read.parquet(f"{WORKING_DIRECTORY}data/needed_beh_logs")
data_logs.show()

+----------------+--------------------+-----+--------------------+--------------------+--------------------+-----------------+-------------+--------------------+--------------------+-----------+--------------------+------------+--------------------+--------------------+--------------------+-------+
|customer_user_id|              ac_key|ac_id|             user_id|          session_id|           timestamp|         raw_term|  filter_name|        filter_value|              action|customer_id|           item_name|variation_id|      variation_name|               items|            items_v2|revenue|
+----------------+--------------------+-----+--------------------+--------------------+--------------------+-----------------+-------------+--------------------+--------------------+-----------+--------------------+------------+--------------------+--------------------+--------------------+-------+
|       435675978|key_afiSr5Y4gCaaSW5X| 2560|e5fbf045-94fa-498...|e5fbf045-94fa-498...|2024-11-29 10

In [9]:
# Bring the raw logs into a presentable form via the project helper and preview the result
data_conversions_cleaned = process_data.clean_petco_conversions(data_logs)
data_conversions_cleaned.show()

+-------+-------+----------+--------------------+-------+
|user_id|item_id|conversion|           timestamp|revenue|
+-------+-------+----------+--------------------+-------+
|  -1002|5181461|         1|2024-06-14 15:17:...|  10.69|
|  -1002|5206702|         1|2024-06-14 15:19:...|  16.49|
|  -1002|5119257|         1|2024-06-14 15:21:...|   4.39|
|  -1002|5107211|         1|2024-06-28 19:06:...|  69.48|
|  -1002|5008694|         1|2024-07-01 18:43:...|  59.98|
|  -1002|5181462|         1|2024-07-17 10:11:...|  23.73|
|  -1002|5000991|         1|2024-07-17 10:12:...|  55.99|
|  -1002|5000991|         1|2024-07-17 10:19:...|  55.99|
|  -1002|5080929|         1|2024-07-20 21:55:...|  36.99|
|  -1002|5183021|         1|2024-07-25 17:05:...|  67.99|
|  -1002|5183309|         1|2024-07-30 17:02:...|  68.98|
|  -1002|5142756|         1|2024-08-14 17:22:...|  17.18|
|  -1002|5206288|         1|2024-08-17 14:32:...|  15.88|
|  -1002|5181804|         1|2024-08-21 01:23:...|  19.99|
|  -1002|51184

In [10]:
# Save the presentable (cleaned) conversions dataset with the project's parquet helper
cleaned_data_folder_path = f"{WORKING_DIRECTORY}data/cleaned_data/"
output_name = "data_conversions"
process_data.save_dataset_parquet(
    data_conversions_cleaned,
    cleaned_data_folder_path,
    output_name,
)

Папка /content/drive/MyDrive/Colab Notebooks/diploma/data/cleaned_data/ уже существует.
Датасет сохранён по пути /content/drive/MyDrive/Colab Notebooks/diploma/data/cleaned_data/data_conversions.


In [11]:
# Now convert the cleaned conversions to the SASRec input format,
# then print the result's shape and preview its first 10 rows.
# NOTE(review): the result is used through .shape/.head, so it is presumably a pandas DataFrame — confirm in process_data.
data_conversions_processed = process_data.actions_to_sasrec_format(data_conversions_cleaned)
print(data_conversions_processed.shape)
data_conversions_processed.head(10)

[('user_id', 'int'), ('item_id', 'int'), ('datetime', 'timestamp'), ('weight', 'int')]
(441616, 4)


Unnamed: 0,user_id,item_id,datetime,weight
0,-1002,5181461,2024-06-14 15:17:25,1
1,-1002,5206702,2024-06-14 15:19:15,1
2,-1002,5119257,2024-06-14 15:21:21,1
3,-1002,5107211,2024-06-28 19:06:10,1
4,-1002,5008694,2024-07-01 18:43:40,1
5,-1002,5181462,2024-07-17 10:11:22,1
6,-1002,5000991,2024-07-17 10:12:30,1
7,-1002,5000991,2024-07-17 10:19:52,1
8,-1002,5080929,2024-07-20 21:55:22,1
9,-1002,5183021,2024-07-25 17:05:36,1


In [12]:
# Save the SASRec-format actions dataset with the project's pickle helper
sasrec_data_folder_path = f"{WORKING_DIRECTORY}data/sasrec_format/"
output_name = "actions.pkl"
process_data.save_dataset_pickle(
    data_conversions_processed,
    sasrec_data_folder_path,
    output_name,
)

Папка /content/drive/MyDrive/Colab Notebooks/diploma/data/sasrec_format/ уже существует.
Датасет сохранён по пути /content/drive/MyDrive/Colab Notebooks/diploma/data/sasrec_format/actions.pkl.


In [13]:
# Run the project's checks that the SASRec actions dataset was assembled correctly,
# comparing the final frame against the original logs.
test_actions_petco(
    df_final=data_conversions_processed,
    df_original=data_logs,
)

✅ Тест структуры данных пройден
✅ Тест на пропущенные значения пройден
✅ Тест на добавления в корзину пройден 


# Приведем датасет с признаками товаров к формату SASRec

In [14]:
# Load the items dataset from parquet and preview it
# NOTE(review): show() prints every column, including ac_key — clear this cell's
# output before sharing or committing the notebook.
data_items = spark.read.parquet(f"{WORKING_DIRECTORY}data/data_set_items")
data_items.show()

+----------+---------------+--------------------+-----+-----------+--------------------+--------------------+--------------+-------------+--------------------+------------+-------------------+-------------------+----------+--------------------+
|        id|autocomplete_id|                name|score|customer_id|          name_lower|       metadata_json|computed_score|boost_or_bury|    ds_metadata_json|section_name|         created_at|         updated_at|       day|              ac_key|
+----------+---------------+--------------------+-----+-----------+--------------------+--------------------+--------------+-------------+--------------------+------------+-------------------+-------------------+----------+--------------------+
|3668992957|           1947|Kaytee Forti-Diet...|   -1|    5017277|kaytee forti-diet...|{"url": "/shop/en...|         68803|            0|{"weighted_keywor...|    Products|2022-05-16 14:44:33|2024-06-26 02:09:22|2024-06-26|key_afiSr5Y4gCaaSW5X|
|3668994348|        

In [15]:
# Bring the items dataset into a presentable form via the project helper;
# truncate=False prints full cell values instead of shortened ones.
data_items_cleaned = process_data.clean_petco_items(data_items)
data_items_cleaned.show(truncate=False)

+-----------+-------------------------------------------------------------------------------+-------------------------------------------------------------------------------+-------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------+-----------------------+-------------+----------------+----------------+----------------------+-----------------------+
|customer_id|name                                      

In [16]:
# Save the presentable (cleaned) items dataset with the project's parquet helper
cleaned_data_folder_path = f"{WORKING_DIRECTORY}data/cleaned_data/"
output_name = "data_items"
process_data.save_dataset_parquet(
    data_items_cleaned,
    cleaned_data_folder_path,
    output_name,
)

Папка /content/drive/MyDrive/Colab Notebooks/diploma/data/cleaned_data/ уже существует.
Датасет сохранён по пути /content/drive/MyDrive/Colab Notebooks/diploma/data/cleaned_data/data_items.


In [17]:
# Convert the cleaned items to the SASRec input format,
# then print the result's shape and preview its first 5 rows.
# NOTE(review): the result is used through .shape/.head, so it is presumably a pandas DataFrame — confirm in process_data.
data_items_processed = process_data.items_to_sasrec_format(data_items_cleaned)
print(data_items_processed.shape)
data_items_processed.head(5)

(222585, 3)


Unnamed: 0,id,value,feature
0,119675,repeat-delivery-eligible-products,Category
1,119675,buy-online-pick-up-in-store-reptile-products,Category
2,119675,dry-reptile-food,Category
3,119675,same-day-delivery-reptile-products,Category
4,5103084,tall-dog-gates,Category


In [18]:
# Save the SASRec-format items dataset with the project's pickle helper
sasrec_data_folder_path = f"{WORKING_DIRECTORY}data/sasrec_format/"
output_name = "items.pkl"
process_data.save_dataset_pickle(
    data_items_processed,
    sasrec_data_folder_path,
    output_name,
)

Папка /content/drive/MyDrive/Colab Notebooks/diploma/data/sasrec_format/ уже существует.
Датасет сохранён по пути /content/drive/MyDrive/Colab Notebooks/diploma/data/sasrec_format/items.pkl.


In [20]:
# Run the project's checks that the SASRec items dataset was assembled correctly,
# comparing the final frame against the original items data.
test_items_petco(
    df_final=data_items_processed,
    df_original=data_items,
)

✅ Тест структуры данных пройден
✅ Тест на пропущенные значения пройден
✅ Тест на множество товаров пройден
