In [1]:
# Get access to Google disk
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
# Import libraries
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import matplotlib.pyplot as plt
import pandas as pd
from pyspark.sql.functions import col
from pyspark.sql.functions import to_timestamp
import pyspark

In [3]:
# Создаём SparkSession
spark = SparkSession.builder \
    .appName("PetCo_tests") \
    .getOrCreate()

In [4]:
# # Добавляем файлы  в Spark-контекст
from pyspark import SparkFiles

spark.sparkContext.addFile("/content/drive/MyDrive/Colab Notebooks/diploma/scripts/process_data.py")
spark.sparkContext.addFile("/content/drive/MyDrive/Colab Notebooks/diploma/scripts/tests_for_datasets.py")


import process_data
from tests_for_datasets import test_actions_sasrec, test_items_sasrec, common_tests_sasrec

In [5]:
%cd "/content/drive/MyDrive/Colab Notebooks/diploma/tests/tests_petco"

/content/drive/MyDrive/Colab Notebooks/diploma/tests/tests_petco


In [6]:
WORKING_DIRECTORY = '/content/drive/MyDrive/Colab Notebooks/diploma/'

# Создаем сырой датасет каталога для теста

In [7]:
# Read dataset with items
data_items = spark.read.parquet(WORKING_DIRECTORY+'data/data_set_items')
data_items.dtypes

[('id', 'bigint'),
 ('autocomplete_id', 'int'),
 ('name', 'string'),
 ('score', 'int'),
 ('customer_id', 'string'),
 ('name_lower', 'string'),
 ('metadata_json', 'string'),
 ('computed_score', 'int'),
 ('boost_or_bury', 'int'),
 ('ds_metadata_json', 'string'),
 ('section_name', 'string'),
 ('created_at', 'timestamp'),
 ('updated_at', 'timestamp'),
 ('day', 'string'),
 ('ac_key', 'string')]

In [8]:
from pyspark.sql.functions import col
from pyspark.sql import Window
from pyspark.sql import functions as F
import random

# Получаем список уникальных customer_id
unique_customers = [5017277, 5023023, 5024527]

# Фильтруем датасет по выбранным customer_id
filtered_items = data_items.filter(col('customer_id').isin(unique_customers))

# Оставляем только по 2 строки для каждого customer_id
filtered_items = (filtered_items
    .withColumn("row_num", F.row_number().over(Window.partitionBy("customer_id").orderBy(F.rand())))
    .filter(col("row_num") <= 2)
    .drop("row_num"))

print(filtered_items.dtypes)
filtered_items.show()

[('id', 'bigint'), ('autocomplete_id', 'int'), ('name', 'string'), ('score', 'int'), ('customer_id', 'string'), ('name_lower', 'string'), ('metadata_json', 'string'), ('computed_score', 'int'), ('boost_or_bury', 'int'), ('ds_metadata_json', 'string'), ('section_name', 'string'), ('created_at', 'timestamp'), ('updated_at', 'timestamp'), ('day', 'string'), ('ac_key', 'string')]
+----+---------------+--------------------+-----+-----------+--------------------+--------------------+--------------+-------------+--------------------+------------+-------------------+-------------------+----------+--------------------+
|  id|autocomplete_id|                name|score|customer_id|          name_lower|       metadata_json|computed_score|boost_or_bury|    ds_metadata_json|section_name|         created_at|         updated_at|       day|              ac_key|
+----+---------------+--------------------+-----+-----------+--------------------+--------------------+--------------+-------------+-----------

In [52]:
# Сохраним отфильтрованный датасет
cleaned_data_folder_path = WORKING_DIRECTORY+'tests/tests_petco/toy_examples/toy_example_1/original'
output_name = 'items'
process_data.save_dataset_parquet(filtered_items, cleaned_data_folder_path, output_name)

Папка /content/drive/MyDrive/Colab Notebooks/diploma/tests/tests_petco/toy_examples/toy_example_1/original уже существует.
Датасет сохранён по пути /content/drive/MyDrive/Colab Notebooks/diploma/tests/tests_petco/toy_examples/toy_example_1/original/items.


# Теперь вручную заполняем ожидаемый датасет с признаками товаров в формате SASRec

In [67]:
process_data.print_metadata(filtered_items.filter(col('customer_id')==5023023))

Основные метаданные:
url: /shop/en/petcostore/product/ideal-pet-screen-guard
image_url: https://assets.petco.com/petco/image/upload/f_auto,q_auto,w_190/dpr_auto/2765752-center-1
group_ids: ['cat-doors-and-flaps', 'dog-patio-mount-doors']
PTC_OMNI_IN_STORE_ONLY_FLAG: No
startDate: 27860825
PTC_OMNI_PRIMARY_ITEM_FLAG: Yes
UPC_NUMBER: 30559100913
PTC_OMNI_BOPUS_FLAG: No
PTC_OMNI_BRAND_PRIMARY: Perfect Pet
PTC_OMNI_PROP_65_FLAG: Yes
PTC_OMNI_PERSONALIZED_ITEM_FL: No
itemurl: /shop/en/petcostore/product/ideal-pet-screen-guard
deactivated: False
PTC_OMNI_SAME_DAY_DELIVERY_FG: No
itemname: Perfect Pet Screen Guard Pet Door in White
itemimg: https://assets.petco.com/petco/image/upload/f_auto,q_auto,w_190/dpr_auto/2765752-center-1
mfName: Perfect Pet
PTC_OMNI_REPEAT_DELIVERY_FL: No
AverageRating: 5.0
PTC_OMNI_TAXONOMY: Pet Containment|Gates, Fences & Doors
parentCatEntryIDAsString: 1272003
TotalReviewCount: 1
parentCatEntryID: 1272003

Facets:
Material: ['Plastic']
How to get it: ['One Time Del

In [None]:
process_data.print_metadata(filtered_items.filter((col('customer_id')==5023023) & (col('computed_score')==4690)))

In [23]:
group_ids_1_1 = ['small-animal-repeat-delivery-products', 'repeat-delivery-eligible-products', 'rat-food', 'small-animal-food', 'buy-online-pick-up-in-store-small-animal-products', 'same-day-delivery-small-pet-products']
group_ids_1 = ['small-animal-repeat-delivery-products', 'repeat-delivery-eligible-products', 'rat-food', 'small-animal-food', 'buy-online-pick-up-in-store-small-animal-products', 'same-day-delivery-small-pet-products']
deliveries_1 = ['Same Day Delivery', 'Free Pickup Today', 'One Time Delivery', 'Repeat Delivery']
brands_1 = ['Kaytee']
pets_1 = ['Small Animal']

In [24]:
group_ids_2_1 = ['cat-doors-and-flaps', 'dog-patio-mount-doors']
group_ids_2 = ['cat-doors-and-flaps', 'dog-patio-mount-doors']
deliveries_2 = ["One Time Delivery"]
brands_2 = ['Perfect Pet']
pets_2 = ['Cat', 'Dog']

In [25]:
group_ids_3_1 = ["repeat-delivery-eligible-products", "premium-cat-food", "canned-cat-food", "fall-shop-food"]
group_ids_3 = ["fall-shop-food","canned-cat-food","premium-cat-food","repeat-delivery-eligible-products"]
deliveries_3 = ["One Time Delivery"]
brands_3 = ["Tiki Cat"]
pets_3 = ['Cat']

In [28]:
from pyspark.sql import SparkSession
from pyspark.sql import Row

def create_product_dataset(data):
    """
    Функция для создания PySpark DataFrame с указанными данными.
    :param data: Список данных, каждый элемент — это кортеж с (id, value, feature)
    :return: PySpark DataFrame
    """
    # Инициализация Spark
    spark = SparkSession.builder.appName("ProductDataset").getOrCreate()

    # Преобразование данных в список Row
    rows = [Row(id=item[0], value=item[1], feature=item[2]) for item in data]

    # Создание DataFrame из списка Row
    df = spark.createDataFrame(rows)

    return df

In [36]:
# Пример данных
item_1 = [*[[5017277, category, 'Category'] for category in group_ids_1], \
         *[[5017277, delivery, 'Delivery'] for delivery in deliveries_1], \
        *[[5017277, brand, 'Brand'] for brand in brands_1], \
        *[[5017277, pet, 'Pet'] for pet in pets_1]
]

item_1

[[5017277, 'small-animal-repeat-delivery-products', 'Category'],
 [5017277, 'repeat-delivery-eligible-products', 'Category'],
 [5017277, 'rat-food', 'Category'],
 [5017277, 'small-animal-food', 'Category'],
 [5017277, 'buy-online-pick-up-in-store-small-animal-products', 'Category'],
 [5017277, 'same-day-delivery-small-pet-products', 'Category'],
 [5017277, 'Same Day Delivery', 'Delivery'],
 [5017277, 'Free Pickup Today', 'Delivery'],
 [5017277, 'One Time Delivery', 'Delivery'],
 [5017277, 'Repeat Delivery', 'Delivery'],
 [5017277, 'Kaytee', 'Brand'],
 [5017277, 'Small Animal', 'Pet']]

In [37]:
item_2 = [*[[5023023, category, 'Category'] for category in group_ids_2], \
         *[[5023023, delivery, 'Delivery'] for delivery in deliveries_2], \
        *[[5023023, brand, 'Brand'] for brand in brands_2], \
        *[[5023023, pet, 'Pet'] for pet in pets_2]
]

item_2

[[5023023, 'cat-doors-and-flaps', 'Category'],
 [5023023, 'dog-patio-mount-doors', 'Category'],
 [5023023, 'One Time Delivery', 'Delivery'],
 [5023023, 'Perfect Pet', 'Brand'],
 [5023023, 'Cat', 'Pet'],
 [5023023, 'Dog', 'Pet']]

In [62]:
item_3 = [*[[5024527, category, 'Category'] for category in group_ids_3], \
         *[[5024527, delivery, 'Delivery'] for delivery in deliveries_3], \
        *[[5024527, brand, 'Brand'] for brand in brands_3], \
        *[[5024527, pet, 'Pet'] for pet in pets_3]
]

item_3

[[5024527, 'fall-shop-food', 'Category'],
 [5024527, 'canned-cat-food', 'Category'],
 [5024527, 'premium-cat-food', 'Category'],
 [5024527, 'repeat-delivery-eligible-products', 'Category'],
 [5024527, 'One Time Delivery', 'Delivery'],
 [5024527, 'Tiki Cat', 'Brand'],
 [5024527, 'Cat', 'Pet']]

In [63]:
items = item_1 + item_2 + item_3
items

[[5017277, 'small-animal-repeat-delivery-products', 'Category'],
 [5017277, 'repeat-delivery-eligible-products', 'Category'],
 [5017277, 'rat-food', 'Category'],
 [5017277, 'small-animal-food', 'Category'],
 [5017277, 'buy-online-pick-up-in-store-small-animal-products', 'Category'],
 [5017277, 'same-day-delivery-small-pet-products', 'Category'],
 [5017277, 'Same Day Delivery', 'Delivery'],
 [5017277, 'Free Pickup Today', 'Delivery'],
 [5017277, 'One Time Delivery', 'Delivery'],
 [5017277, 'Repeat Delivery', 'Delivery'],
 [5017277, 'Kaytee', 'Brand'],
 [5017277, 'Small Animal', 'Pet'],
 [5023023, 'cat-doors-and-flaps', 'Category'],
 [5023023, 'dog-patio-mount-doors', 'Category'],
 [5023023, 'One Time Delivery', 'Delivery'],
 [5023023, 'Perfect Pet', 'Brand'],
 [5023023, 'Cat', 'Pet'],
 [5023023, 'Dog', 'Pet'],
 [5024527, 'fall-shop-food', 'Category'],
 [5024527, 'canned-cat-food', 'Category'],
 [5024527, 'premium-cat-food', 'Category'],
 [5024527, 'repeat-delivery-eligible-products', 'C

In [64]:
# Вызов функции
items = create_product_dataset(items)

# Печать результата
items = items.withColumn("id", col("id").cast("int")).select('id', 'value', 'feature')
print(items.count())
print(items.dtypes)
items.show()

25
[('id', 'int'), ('value', 'string'), ('feature', 'string')]
+-------+--------------------+--------+
|     id|               value| feature|
+-------+--------------------+--------+
|5017277|small-animal-repe...|Category|
|5017277|repeat-delivery-e...|Category|
|5017277|            rat-food|Category|
|5017277|   small-animal-food|Category|
|5017277|buy-online-pick-u...|Category|
|5017277|same-day-delivery...|Category|
|5017277|   Same Day Delivery|Delivery|
|5017277|   Free Pickup Today|Delivery|
|5017277|   One Time Delivery|Delivery|
|5017277|     Repeat Delivery|Delivery|
|5017277|              Kaytee|   Brand|
|5017277|        Small Animal|     Pet|
|5023023| cat-doors-and-flaps|Category|
|5023023|dog-patio-mount-d...|Category|
|5023023|   One Time Delivery|Delivery|
|5023023|         Perfect Pet|   Brand|
|5023023|                 Cat|     Pet|
|5023023|                 Dog|     Pet|
|5024527|      fall-shop-food|Category|
|5024527|     canned-cat-food|Category|
+-------+--------

In [65]:
# Сохраним отфильтрованный датасет
cleaned_data_folder_path = WORKING_DIRECTORY+'tests/tests_petco/toy_examples/toy_example_1/sasrec_format_expected'
output_name = 'items'
process_data.save_dataset_parquet(items, cleaned_data_folder_path, output_name)

Папка /content/drive/MyDrive/Colab Notebooks/diploma/tests/tests_petco/toy_examples/toy_example_1/sasrec_format_expected уже существует.
Датасет сохранён по пути /content/drive/MyDrive/Colab Notebooks/diploma/tests/tests_petco/toy_examples/toy_example_1/sasrec_format_expected/items.


# Создаем сырой с логами юзеров

In [46]:
# Read dataset with beh logs
data_logs = spark.read.parquet(WORKING_DIRECTORY+'data/needed_beh_logs')
print(data_logs.dtypes)
# data_logs.show()

[('customer_user_id', 'string'), ('ac_key', 'string'), ('ac_id', 'bigint'), ('user_id', 'string'), ('session_id', 'string'), ('timestamp', 'timestamp'), ('raw_term', 'string'), ('filter_name', 'string'), ('filter_value', 'string'), ('action', 'string'), ('customer_id', 'string'), ('item_name', 'string'), ('variation_id', 'bigint'), ('variation_name', 'string'), ('items', 'array<struct<customer_id:string,item_id:bigint,item_name:string,price:double>>'), ('items_v2', 'array<struct<customer_id:string,item_id:bigint,item_name:string,price:double,variation_customer_id:string,variation_id:bigint,variation_name:string,count:bigint>>'), ('revenue', 'double')]


In [45]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import random
from datetime import datetime, timedelta

# Инициализация Spark
spark = SparkSession.builder.appName("ToyUserLogs").getOrCreate()

# Фиксированные значения
customer_ids = [5017277, 5023023, 5024527]

# Генерация данных
data = []
for i in range(3):  # 3 уникальных юзера
    user_id = random.randint(10**11, 10**12 - 1)  # 12-значный ID юзера

    num_purchases = random.randint(1, 3)  # от 1 до 3 покупок
    num_conversions = random.randint(2, 4)  # от 2 до 4 конверсий

    for _ in range(num_purchases):
        # Генерируем случайное время для покупки с использованием datetime
        timestamp = datetime.now() + timedelta(seconds=random.randint(0, 100000))
        timestamp_str = timestamp.strftime('%Y-%m-%d %H:%M:%S.%f')

        # Добавляем строку с покупкой
        data.append((user_id, random.choice(customer_ids), "purchase", timestamp_str, round(random.uniform(10, 100), 2)))

    for _ in range(num_conversions):
        # Генерируем случайное время для конверсии
        timestamp = datetime.now() + timedelta(seconds=random.randint(0, 100000))
        timestamp_str = timestamp.strftime('%Y-%m-%d %H:%M:%S.%f')

        # Добавляем строку с конверсией
        data.append((user_id, random.choice(customer_ids), "conversion", timestamp_str, None))

# Создание DataFrame
columns = ["customer_user_id", "customer_id", "action", "timestamp", "revenue"]
df_logs = spark.createDataFrame(data, columns)

# Преобразуем столбцы в тип string и timestamp
data_logs_example = df_logs.withColumn("customer_user_id", col("customer_user_id").cast("string")) \
                 .withColumn("customer_id", col("customer_id").cast("string")) \
                 .withColumn("timestamp", col("timestamp").cast("timestamp"))

# Показываем результат
print(data_logs_example.dtypes)
data_logs_example.show(truncate=False)

[('customer_user_id', 'string'), ('customer_id', 'string'), ('action', 'string'), ('timestamp', 'timestamp'), ('revenue', 'double')]
+----------------+-----------+----------+--------------------------+-------+
|customer_user_id|customer_id|action    |timestamp                 |revenue|
+----------------+-----------+----------+--------------------------+-------+
|858327868722    |5017277    |purchase  |2025-03-08 13:55:22.112712|45.91  |
|858327868722    |5023023    |purchase  |2025-03-07 23:07:09.112748|56.66  |
|858327868722    |5024527    |purchase  |2025-03-08 22:34:27.112757|55.46  |
|858327868722    |5023023    |conversion|2025-03-08 15:14:07.112764|NULL   |
|858327868722    |5024527    |conversion|2025-03-08 18:24:01.112769|NULL   |
|958934768514    |5023023    |purchase  |2025-03-08 10:50:26.112776|95.81  |
|958934768514    |5017277    |purchase  |2025-03-08 13:59:52.112784|94.03  |
|958934768514    |5023023    |conversion|2025-03-08 19:29:39.112791|NULL   |
|958934768514    |50

In [51]:
# Сохраним отфильтрованный датасет
cleaned_data_folder_path = WORKING_DIRECTORY+'tests/tests_petco/toy_examples/toy_example_1/original'
output_name = 'actions'
process_data.save_dataset_parquet(data_logs_example, cleaned_data_folder_path, output_name)

Папка /content/drive/MyDrive/Colab Notebooks/diploma/tests/tests_petco/toy_examples/toy_example_1/original уже существует.
Датасет сохранён по пути /content/drive/MyDrive/Colab Notebooks/diploma/tests/tests_petco/toy_examples/toy_example_1/original/actions.


# Теперь вручную заполняем ожидаемый датасет с действиями юзеров в формате SASRec

In [59]:
# Пример данных в виде списка списков
data_logs_expected = data_logs_example.filter(col('action')=='conversion').withColumn("weight", F.lit(1)).drop('action', 'revenue').withColumn("timestamp", F.date_format("timestamp", "yyyy-MM-dd HH:mm:ss"))
data_logs_expected = data_logs_expected \
    .withColumn("user_id", col("customer_user_id").cast("bigint")) \
    .withColumn("item_id", col("customer_id").cast("int")) \
    .withColumn("datetime", col("timestamp").cast("timestamp")) \
    .withColumn("weight", col("weight").cast("int")).select('user_id', 'item_id', 'datetime', 'weight').orderBy(['customer_user_id', 'timestamp'])

# Показываем результат
print(data_logs_expected.dtypes)
data_logs_expected.show(truncate=False)

[('user_id', 'bigint'), ('item_id', 'int'), ('datetime', 'timestamp'), ('weight', 'int')]
+------------+-------+-------------------+------+
|user_id     |item_id|datetime           |weight|
+------------+-------+-------------------+------+
|574363016960|5024527|2025-03-08 02:13:41|1     |
|574363016960|5017277|2025-03-08 03:54:25|1     |
|574363016960|5023023|2025-03-08 16:07:14|1     |
|858327868722|5023023|2025-03-08 15:14:07|1     |
|858327868722|5024527|2025-03-08 18:24:01|1     |
|958934768514|5024527|2025-03-08 00:31:48|1     |
|958934768514|5024527|2025-03-08 01:05:49|1     |
|958934768514|5023023|2025-03-08 19:29:39|1     |
+------------+-------+-------------------+------+



In [60]:
# Сохраним отфильтрованный датасет
cleaned_data_folder_path = WORKING_DIRECTORY+'tests/tests_petco/toy_examples/toy_example_1/sasrec_format_expected'
output_name = 'actions'
process_data.save_dataset_parquet(data_logs_expected, cleaned_data_folder_path, output_name)

Папка /content/drive/MyDrive/Colab Notebooks/diploma/tests/tests_petco/toy_examples/toy_example_1/sasrec_format_expected уже существует.
Датасет сохранён по пути /content/drive/MyDrive/Colab Notebooks/diploma/tests/tests_petco/toy_examples/toy_example_1/sasrec_format_expected/actions.
