#### Подключаемся к серверу

```bash
# Open an SSH session to the remote host, authenticating with the private key file passed via -i.
ssh 305_koryagin@37.139.32.56 -i ./id_rsa_305_koryagin.txt
```

#### Запускаем spark

```bash
# Launch the interactive PySpark shell (the path suggests a Spark 2.4 install — confirm on the server).
/spark2.4/bin/pyspark
```

```python
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Get the existing Spark session or create a new one; the builder is
# configured with the application name "gogin_spark".
spark = (
    SparkSession.builder
    .appName("gogin_spark")
    .getOrCreate()
)
```

## Создание DataFrame

```python
# Load the prepared dataset from Parquet and preview the first 5 rows
# (long cell values are truncated in the printout).
data = spark.read.format("parquet").load("input_csv_for_recommend_system/data.parquet")
data.show(5, truncate=True)
```

<details>
    <summary> → вывод консоли SPARK</summary>
    
![Title](../images/2021-04-25_125511.jpg)

</details>

## GroupBy

#### Посмотрим диапазон дат

```python
# Earliest and latest sale dates, plus the number of days between them.
(
    data
    .agg(
        F.min('sale_date_date').alias('first_date'),
        F.max('sale_date_date').alias('last_date'),
        F.datediff(F.max('sale_date_date'), F.min('sale_date_date')).alias('datediff'),
    )
    .show(truncate=False)
)
```

```bash
+----------+----------+--------+
|first_date|last_date |datediff|
+----------+----------+--------+
|2018-01-01|2018-12-09|342     |
+----------+----------+--------+
```

#### Разреженность матрицы

```python
# Number of distinct contact_id values, distinct product_id values,
# and the total number of rows (interactions) in the dataset.
n_users = data.agg(F.countDistinct('contact_id')).first()[0]
n_items = data.agg(F.countDistinct('product_id')).first()[0]
n_interactions = data.count()
```

```python
# In [8]:
n_users, n_items, n_interactions = 1642379, 36549, 20000000
print(f'Разряженность матрицы = {round(n_interactions / (n_users * n_items), 6) * 100}%')
```

```bash
Разряженность матрицы = 0.0333%
```


#### Популярность

```python
# Popularity by item: total quantity summed per product_id.
popularity_by_items = (
    data
    .groupBy("product_id")
    .agg(F.sum("quantity").alias("total_quantity"))
)

# Summary statistics (count, mean, stddev, min, max) of the per-product totals.
popularity_by_items.select('total_quantity').describe().show(truncate=False)
```
```bash
+-------+-----------------+
|summary|total_quantity   |
+-------+-----------------+
|count  |36549            |
|mean   |2483.608399971266|
|stddev |64484.54503307139|
|min    |-5741.0          |
|max    |5681946.0        |
+-------+-----------------+
```

```python
# Top 5 products by total quantity, largest first.
# The column is cast with the "int" type-name string, so nothing has to be
# imported from pyspark.sql.types for this snippet to run.
(
    popularity_by_items
    .orderBy(F.col("total_quantity").desc())
    .withColumn("total_quantity", F.col("total_quantity").cast("int"))
    .show(n=5)
)
```
```bash
+----------+--------------+
|product_id|total_quantity|
+----------+--------------+
|     93554|       5681946|
|     33645|       5434930|
|    141757|       4739478|
|     38560|       3937244|
|    106053|       2939204|
+----------+--------------+
only showing top 5 rows
```

---

```python
# Popularity by user: total quantity summed per contact_id.
popularity_by_users = (
    data
    .groupBy("contact_id")
    .agg(F.sum("quantity").alias("total_quantity"))
)

# Summary statistics (count, mean, stddev, min, max) of the per-user totals.
popularity_by_users.select('total_quantity').describe().show(truncate=False)
```
```bash
+-------+------------------+
|summary|total_quantity    |
+-------+------------------+
|count  |1642379           |
|mean   |55.26946180543576 |
|stddev |440.10752485241625|
|min    |-6.0              |
|max    |74439.63600009121 |
+-------+------------------+
```

```python
# Top 5 users by total quantity, largest first.
# The column is cast with the "int" type-name string, so nothing has to be
# imported from pyspark.sql.types for this snippet to run.
(
    popularity_by_users
    .orderBy(F.col("total_quantity").desc())
    .withColumn("total_quantity", F.col("total_quantity").cast("int"))
    .show(n=5)
)
```
```bash
+----------+--------------+
|contact_id|total_quantity|
+----------+--------------+
|    903455|         74439|
|    983545|         51697|
|   1962163|         48000|
|    795400|         45384|
|    850395|         45330|
+----------+--------------+
```

## Train-test split
В рекомендательных системах корректнее использовать train-test split по времени, а не случайно.
Я возьму последние 3 недели в качестве теста

```python
# Range of week numbers present in the data: min, max and their difference.
(
    data
    .agg(
        F.min('week_of_year'),
        F.max('week_of_year'),
        (F.max('week_of_year') - F.min('week_of_year')).alias('week_diff'),
    )
    .show(truncate=False)
)
```
```bash
+-----------------+-----------------+---------+
|min(week_of_year)|max(week_of_year)|week_diff|
+-----------------+-----------------+---------+
|1                |49               |48       |
+-----------------+-----------------+---------+
```

```python
def train_test_split_by_week(df, week_col_name, test_size_weeks):
    """
    Split a DataFrame into train and test parts by week number.

    The test part holds the rows whose week number falls within the last
    ``test_size_weeks`` week numbers of ``df``; all earlier rows go to train.

    NOTE: the split works on week *numbers*, so it assumes the data does not
    wrap across a year boundary — confirm for datasets spanning several years.

    :param df: source Spark DataFrame
    :param week_col_name: name of the column holding the week-of-year number
    :param test_size_weeks: number of trailing weeks to put into the test part
    :return: tuple ``(train, test)`` of DataFrames
    """
    # Take the maximum from the DataFrame that was passed in, not from a
    # module-level variable, so the function works for any input.
    last_week = int(df.select(F.max(week_col_name)).first()[0])

    # First week number that belongs to the test part. The "+ 1" makes the
    # closed range [first_test_week, last_week] contain exactly
    # test_size_weeks week numbers.
    first_test_week = last_week - test_size_weeks + 1

    week = F.col(week_col_name)
    train = df.filter(week < first_test_week)
    test = df.filter(week >= first_test_week)
    return train, test


# Split the dataset into training and test sets by week number.
data_train, data_test = train_test_split_by_week(
    data,
    week_col_name='week_of_year',
    test_size_weeks=3,
)
```