In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive/ so that the
# project files stored on Drive can be reached by path from this notebook.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [3]:
# Import libraries
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import matplotlib.pyplot as plt
import pandas as pd
from pyspark.sql.functions import col
from pyspark.sql.functions import to_timestamp
import pyspark

In [4]:
# Change the working directory to the project's scripts folder before importing
# the project-local helper module.
# NOTE(review): presumably process_data.py lives in this folder and the import
# only resolves because of the %cd above — confirm.
%cd "/content/drive/MyDrive/Colab Notebooks/diploma/scripts/"
import process_data

/content/drive/MyDrive/Colab Notebooks/diploma/scripts


In [5]:
# Create the Spark session for this notebook (getOrCreate reuses an existing one).
spark = (
    SparkSession.builder
    .appName("PetCo")
    .getOrCreate()
)

In [6]:
# Root folder of the project on the mounted Google Drive. Data paths below are
# built by plain string concatenation, so the trailing slash is required.
HEAD_DIRECTORY = '/content/drive/MyDrive/Colab Notebooks/diploma/'

# Приведем датасет с логами юзеров к формату для SASRec

In [7]:
# Load the behaviour-logs dataset (parquet) from the project data folder.
# Call data_logs.show() to preview it while exploring.
data_logs = spark.read.parquet(f"{HEAD_DIRECTORY}data/needed_beh_logs")

In [8]:
# Filter the logs down to the action type of interest ('conversion').
data_actions_cleaned = process_data.clean_data_actions(data_logs, 'conversion')

# Bring the filtered logs to the SASRec format.
# The result is cached because count() and show() below, as well as the later
# parquet save, are separate Spark actions; without caching each of them
# would re-run the whole read -> clean -> transform pipeline.
data_actions_processed = process_data.actions_to_sasrec_format(data_actions_cleaned).cache()

# count() is the first action on the DataFrame, so it also fills the cache.
print(f"Размер датасета: ({data_actions_processed.count()}, {len(data_actions_processed.columns)})")
data_actions_processed.show(5)

Размер датасета: (449982, 4)
+-------+-------+-------------------+------+
|user_id|item_id|           datetime|weight|
+-------+-------+-------------------+------+
|  -1002|5181461|2024-06-14 15:17:25|     1|
|  -1002|5206702|2024-06-14 15:19:15|     1|
|  -1002|5119257|2024-06-14 15:21:21|     1|
|  -1002|5107211|2024-06-28 19:06:10|     1|
|  -1002|5008694|2024-07-01 18:43:40|     1|
+-------+-------+-------------------+------+
only showing top 5 rows



In [9]:
# Save the SASRec-format actions dataset under data/sasrec_format/.
output_name = 'actions'
sasrec_data_folder_path = f"{HEAD_DIRECTORY}data/sasrec_format/"
process_data.save_dataset_parquet(
    data_actions_processed,
    sasrec_data_folder_path,
    output_name,
)
# To reuse the saved result instead of recomputing it, reload it with:
#   data_actions_processed = spark.read.parquet(sasrec_data_folder_path + output_name)

Папка /content/drive/MyDrive/Colab Notebooks/diploma/data/sasrec_format/ уже существует.
Датасет сохранён по пути /content/drive/MyDrive/Colab Notebooks/diploma/data/sasrec_format/actions.


# Приведем датасет с признаками товаров к формату SASRec

In [10]:
# Load the items dataset (parquet) from the project data folder.
data_items = spark.read.parquet(f"{HEAD_DIRECTORY}data/data_set_items")
# Optional inspection helpers while exploring:
#   data_items.show()
#   process_data.print_metadata(data_items)  # print metadata and facets columns

In [11]:
# Clean the items dataset: expand the metadata column and keep only the
# metadata fields and facets listed below.
metadata = ['url', 'image_url', 'itemname', 'group_ids']
facets = ['How to get it', 'Primary Brand', 'Pet Type', 'Primary Pet Type']
data_items_cleaned = process_data.clean_data_items(data_items, metadata=metadata, facets=facets)

# Bring the dataset to the SASRec format.
# `features` are the source columns and `features_final_names` the labels that
# show up in the resulting `feature` column.
# NOTE(review): the two lists are presumably paired by position, and the
# underscored names ('How_to_get_it', ...) and 'group_ids_intersect' are
# presumably produced by clean_data_items — confirm against process_data.
features = ['group_ids_intersect', 'How_to_get_it', 'Primary_Brand', 'Primary_Pet_Type']
features_final_names = ['Category', 'Delivery', 'Brand', 'Pet']
# The result is cached because count() and show() below, as well as the later
# parquet save, are separate Spark actions; without caching each of them
# would re-run the whole read -> clean -> transform pipeline.
data_items_processed = process_data.items_to_sasrec_format(
    data_items_cleaned, features, features_final_names
).cache()

# count() is the first action on the DataFrame, so it also fills the cache.
print(f"Размер датасета: ({data_items_processed.count()}, {len(data_items_processed.columns)})")
data_items_processed.show(5)

Размер датасета: (223730, 3)
+----+--------------------+--------+
|  id|               value| feature|
+----+--------------------+--------+
|1697|                 API|   Brand|
|1697|     betta-fish-shop|Category|
|1697|fish-tank-salt-mixes|Category|
|1697|     pond-water-care|Category|
|1697|repeat-delivery-a...|Category|
+----+--------------------+--------+
only showing top 5 rows



In [12]:
# Save the SASRec-format items dataset under data/sasrec_format/.
output_name = 'items'
sasrec_data_folder_path = f"{HEAD_DIRECTORY}data/sasrec_format/"
process_data.save_dataset_parquet(
    data_items_processed,
    sasrec_data_folder_path,
    output_name,
)
# To reuse the saved result instead of recomputing it, reload it with:
#   data_items_processed = spark.read.parquet(sasrec_data_folder_path + output_name)

Папка /content/drive/MyDrive/Colab Notebooks/diploma/data/sasrec_format/ уже существует.
Датасет сохранён по пути /content/drive/MyDrive/Colab Notebooks/diploma/data/sasrec_format/items.


# Проведем прямые и косвенные тесты на корректность собранных датасетов

In [13]:
# Run the project's pytest suite. pytest is started without arguments, so it
# collects tests from the current directory — hence the %cd into tests_petco.
# Note that the working directory stays changed for any cells that follow.
%cd "/content/drive/MyDrive/Colab Notebooks/diploma/tests/tests_petco"
!pytest

/content/drive/MyDrive/Colab Notebooks/diploma/tests/tests_petco
platform linux -- Python 3.11.11, pytest-8.3.5, pluggy-1.5.0
rootdir: /content/drive/MyDrive/Colab Notebooks/diploma/tests/tests_petco
plugins: typeguard-4.4.2, anyio-3.7.1, langsmith-0.3.13
collected 11 items                                                                                 [0m

test_petco_direct.py [32m.[0m[32m.[0m[32m                                                                      [ 18%][0m
test_petco_indirect.py [32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m                                                             [100%][0m

