### Start SparkSession

In [202]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F


# Build (or reuse) the Spark session used by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName("halltape_pyspark_local")
    .getOrCreate()
)

# Print the address of this session's Spark web UI.
print("Активные Spark сессии:", spark.sparkContext.uiWebUrl)

Активные Spark сессии: http://macbookpro:4040


### Read

In [122]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, row_number
from pyspark.sql.window import Window
from datetime import datetime, timedelta

# Get the Spark session (getOrCreate reuses an already running one).
spark = SparkSession.builder.appName("Example").getOrCreate()

# Source data: ';'-separated CSV with a header row.
df = spark.read.csv('data/output.csv', sep=';', header=True)

# Four dates that are handed out to the rows in round-robin order.
dates = [
    datetime(2024, 1, 1),
    datetime(2024, 2, 1),
    datetime(2024, 3, 1),
    datetime(2024, 4, 1)
]

# Global window ordered by pid, used only to number the rows.
# It has no partitionBy, so Spark warns that all data is moved to a single partition.
window_spec = Window.orderBy("pid")

# Sequential row number (1, 2, 3, ...) over the whole DataFrame.
df_with_row_num = df.withColumn("row_num", row_number().over(window_spec))

# Pick the date from the row number modulo 4; the four branches cover every remainder.
df_final = df_with_row_num.withColumn(
    "date",
    when(col("row_num") % 4 == 1, dates[0])
    .when(col("row_num") % 4 == 2, dates[1])
    .when(col("row_num") % 4 == 3, dates[2])
    .when(col("row_num") % 4 == 0, dates[3])
)

# Drop the helper column row_num and the "crawled at" column
# (NOTE(review): "crawled at" is not in the header shown in the log - confirm it exists).
df_final = df_final.drop("row_num", "crawled at")

# Save the result under the path that the following cells read ('data/output_data.csv').
# mode('overwrite') keeps the cell re-runnable: without it the write fails once the
# target directory exists. .csv() already selects the format, so .format('csv') is not needed.
df_final.write \
    .mode('overwrite') \
    .options(header='True', delimiter=';') \
    .csv("data/output_data.csv")

24/11/29 18:55:43 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/11/29 18:55:43 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/11/29 18:55:43 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/11/29 18:55:43 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: ,  id, actual PRICE, average rating, brand, category, description, discount, Изображение, out of stock, pid, product details, seller, selling price, sub  category, TITLE , url
 Schema: _c0,  id, actual PRICE, average rating, brand, category, description, discount, Изображение, out of stock, pid, product details, seller, selling price, sub  category, TITLE , url
Expected: _c0 but found: 
CSV file: file:///User

In [123]:
# Read with default options (no sep, no header): in the recorded output every
# ';'-separated line, including the header line, lands in a single _c0 column.
spark.read.csv('data/output_data.csv').show()

+--------------------+
|                 _c0|
+--------------------+
|_c0;id;actual PRI...|
|299;17142be6-72a0...|
|"going strong. Th...|
|308;c839dcbf-4218...|
|"going strong. Th...|
|312;9ac18a59-6085...|
|"going strong. Th...|
|355;3d42a088-ba2c...|
|World is full of ...|
|"100% Cotton, Sli...|
|358;91c7ccd4-06e6...|
|"100% Cotton, Reg...|
|364;bb3690f3-ec5f...|
|"World is full of...|
|378;194f3f3a-c3b2...|
|"World is full of...|
|393;cf00407e-716a...|
|"World is full of...|
|395;6ab2fa7b-d04d...|
|"World is full of...|
+--------------------+
only showing top 20 rows



In [124]:
# Same file, now split on ';' with the first line used as column names; preview 5 rows.
spark.read.csv('data/output_data.csv', sep=';', header=True).show(5)

+--------------------+--------------------+------------+----------------+--------------------+--------------------+-----------+--------------------+--------------------+------------+----+---------------+------+-------------+-------------+-----+----+--------------------+
|                 _c0|                  id|actual PRICE|  average rating|               brand|            category|description|            discount|         Изображение|out of stock| pid|product details|seller|selling price|sub  category|TITLE| url|                date|
+--------------------+--------------------+------------+----------------+--------------------+--------------------+-----------+--------------------+--------------------+------------+----+---------------+------+-------------+-------------+-----+----+--------------------+
|                 299|17142be6-72a0-578...|       2,299|             3.9|      adidas Origina|Clothing and Acce...|    33% off|An instant classi...|                null|        null|null|

In [125]:
# Load the dataset and discard the leading _c0 column.
df = (
    spark.read
    .csv('data/output_data.csv', header=True, sep=';')
    .drop('_c0')
)

df.show(n=5)

+--------------------+------------+----------------+--------------------+--------------------+-----------+--------------------+--------------------+------------+----+---------------+------+-------------+-------------+-----+----+--------------------+
|                  id|actual PRICE|  average rating|               brand|            category|description|            discount|         Изображение|out of stock| pid|product details|seller|selling price|sub  category|TITLE| url|                date|
+--------------------+------------+----------------+--------------------+--------------------+-----------+--------------------+--------------------+------------+----+---------------+------+-------------+-------------+-----+----+--------------------+
|17142be6-72a0-578...|       2,299|             3.9|      adidas Origina|Clothing and Acce...|    33% off|An instant classi...|                null|        null|null|           null|  null|         null|         null| null|null|2024-01-01T00:00:...|


In [126]:
# Two rows, untruncated values, one field per line (vertical layout).
df.show(n=2, truncate=False, vertical=True)

-RECORD 0--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 id              | 17142be6-72a0-578f-8c82-0bc52b3194c4                                                                                                                           

In [None]:
### PrintSchema

In [127]:
# Print column names and types; in the recorded output every column is a string.
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- actual PRICE: string (nullable = true)
 |-- average rating: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- category: string (nullable = true)
 |-- description: string (nullable = true)
 |-- discount: string (nullable = true)
 |-- Изображение: string (nullable = true)
 |-- out of stock: string (nullable = true)
 |-- pid: string (nullable = true)
 |-- product details: string (nullable = true)
 |-- seller: string (nullable = true)
 |-- selling price: string (nullable = true)
 |-- sub  category: string (nullable = true)
 |-- TITLE: string (nullable = true)
 |-- url: string (nullable = true)
 |-- date: string (nullable = true)



In [128]:
# Normalise every column name: trim surrounding whitespace, lower-case it,
# and turn each remaining space into an underscore.
new_columns = [
    name.strip().lower().replace(" ", "_")
    for name in df.columns
]

print(new_columns)

['id', 'actual_price', 'average_rating', 'brand', 'category', 'description', 'discount', 'изображение', 'out_of_stock', 'pid', 'product_details', 'seller', 'selling_price', 'sub__category', 'title', 'url', 'date']


In [129]:
# Rename all columns at once: the new names are applied positionally.
new_df = df.toDF(*new_columns)
new_df.columns

['id',
 'actual_price',
 'average_rating',
 'brand',
 'category',
 'description',
 'discount',
 'изображение',
 'out_of_stock',
 'pid',
 'product_details',
 'seller',
 'selling_price',
 'sub__category',
 'title',
 'url',
 'date']

In [130]:
# Rename the two columns that still have awkward names after normalisation:
# the double underscore in sub__category and the Cyrillic image column.
result = (
    new_df
    .withColumnRenamed("sub__category", "sub_category")
    .withColumnRenamed("изображение", "image")
)

result.columns

['id',
 'actual_price',
 'average_rating',
 'brand',
 'category',
 'description',
 'discount',
 'image',
 'out_of_stock',
 'pid',
 'product_details',
 'seller',
 'selling_price',
 'sub_category',
 'title',
 'url',
 'date']

### Select

In [131]:
# Preview ten distinct brand values without truncating them.
(
    result
    .select('brand')
    .distinct()
    .show(n=10, truncate=False)
)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|brand                                                                                                                                         

In [133]:
# Keep rows whose brand does not contain '[' and project the four columns used later.
only_brands = (
    result
    .where(~F.col('brand').contains('['))
    .select('id', 'brand', 'actual_price', 'date')
)

only_brands.show(n=5, truncate=False)

# Number of distinct brand values left after the filter.
print(f'Total brands: {only_brands.select("brand").distinct().count()}')

+------------------------------------+--------------+------------+-----------------------------+
|id                                  |brand         |actual_price|date                         |
+------------------------------------+--------------+------------+-----------------------------+
|17142be6-72a0-578f-8c82-0bc52b3194c4|adidas Origina|2,299       |2024-01-01T00:00:00.000+03:00|
|c839dcbf-4218-5d49-8a00-03f52be09384|adidas Origina|2,299       |2024-03-01T00:00:00.000+03:00|
|9ac18a59-6085-54d3-8f73-95574c1de763|adidas Origina|2,299       |2024-01-01T00:00:00.000+03:00|
|3d42a088-ba2c-5f8e-8142-69d834e0e6f9|SayItLo       |799         |2024-03-01T00:00:00.000+03:00|
|bb3690f3-ec5f-5552-a30b-9e3400493836|SayItLo       |799         |2024-04-01T00:00:00.000+03:00|
+------------------------------------+--------------+------------+-----------------------------+
only showing top 5 rows

Total brands: 328


In [134]:
# Row count per brand, largest groups first.
(
    only_brands
    .groupBy('brand')
    .agg(F.count('*').alias('total_rows'))
    .orderBy(F.desc('total_rows'))
    .show()
)

[Stage 302:>                                                        (0 + 8) / 8]

+------------+----------+
|       brand|total_rows|
+------------+----------+
|        ARBO|       999|
|          Pu|       996|
|     True Bl|       996|
|        REEB|       996|
|    ECKO Unl|       993|
|Free Authori|       864|
|         Keo|       668|
|         Amp|       621|
|  Black Beat|       560|
|        PixF|       528|
|   yellowvib|       507|
|    vims rai|       504|
|  Marca Disa|       444|
|         Oka|       419|
|      Gracew|       405|
|     TEE BUD|       398|
|       Shoef|       358|
|           V|       346|
|    CupidSto|       338|
|     Urban D|       324|
+------------+----------+
only showing top 20 rows



                                                                                

### Save to CSV

In [197]:
# Derive load_date by casting the string column date to a date type.
load_date_col = only_brands['date'].cast('date')
final = only_brands.withColumn('load_date', load_date_col)

print(final.count())
final.show(n=2, truncate=False)

27936
+------------------------------------+--------------+------------+-----------------------------+----------+
|id                                  |brand         |actual_price|date                         |load_date |
+------------------------------------+--------------+------------+-----------------------------+----------+
|17142be6-72a0-578f-8c82-0bc52b3194c4|adidas Origina|2,299       |2024-01-01T00:00:00.000+03:00|2024-01-01|
|c839dcbf-4218-5d49-8a00-03f52be09384|adidas Origina|2,299       |2024-03-01T00:00:00.000+03:00|2024-03-01|
+------------------------------------+--------------+------------+-----------------------------+----------+
only showing top 2 rows



In [198]:
# Сохранение неконтроллируемое по кол-ву файлов
# All four writes use mode('overwrite') so the cell can be re-run: without an explicit
# save mode Spark refuses to write into an output directory that already exists.
# .csv(path) already selects the CSV format, so .format('csv') is not repeated.

# 1) Save without controlling the number of output files.
(
    final
    .write
    .mode('overwrite')
    .options(header='True', sep=';')
    .csv('data/final_no_control')
)

partition_num = final.rdd.getNumPartitions()
print(f'Кол-во партиций {partition_num}')

# 2) Save with a controlled number of files: coalesce(1) leaves a single partition.
final_single = final.coalesce(1)
(
    final_single
    .write
    .mode('overwrite')
    .options(header='True', sep=';')
    .csv('data/final_one_file')
)

partition_num = final_single.rdd.getNumPartitions()
print(f'Кол-во партиций {partition_num}')


# 3) Save partitioned by load_date (one sub-directory per distinct load_date value).
(
    final
    .write
    .mode('overwrite')
    .partitionBy('load_date')
    .options(header='True', sep=';')
    .csv('data/final_partitioned')
)

print_df = final.select('load_date').distinct()
print(f'Load_date distinct: {print_df.count()}')


# 4) Save partitioned by load_date after repartitioning into a single partition keyed
#    by load_date, so the number of files written per load_date directory is controlled.
final_by_date = final.repartition(1, 'load_date')
(
    final_by_date
    .write
    .mode('overwrite')
    .partitionBy('load_date')
    .options(header='True', sep=';')
    .csv('data/final_partitioned_repart')
)

partition_num = final_by_date.rdd.getNumPartitions()
print(f'Кол-во партиций {partition_num}')

Кол-во партиций 8
Кол-во партиций 1


                                                                                

Load_date distinct: 4
Кол-во партиций 1


In [201]:
# Stop the current Spark session. Cells that run after this one need a live session
# again (the recorded execution counts show the session-start cell was re-run afterwards).
spark.stop()

In [204]:
# The preceding cell calls spark.stop(), so get a live session first; getOrCreate
# returns the active session if one is already running, otherwise it starts a new one.
spark = SparkSession.builder.appName("halltape_pyspark_local").getOrCreate()

# Read the dataset partitioned by load_date and keep a single load_date value.
reader1 = (
    spark
    .read
    .csv('data/final_partitioned_repart', header=True, sep=';')
    .where(''' load_date = "2024-01-01" ''')
)

print(reader1.count())


# reader2 = spark\
#             .read\
#             .csv('data/final_no_control', header=True, sep=';')\
#             .where(''' load_date = "2024-01-01" ''')


# reader2.count()


# Read the single-file output by directory rather than by part-file name: the
# part-*.csv name contains an id generated on every write, so a hard-coded file
# name stops existing as soon as the save cell is re-run.
reader3 = (
    spark
    .read
    .csv('data/final_one_file', header=True, sep=';')
    .where(''' load_date = "2024-01-01" ''')
)

print(reader3.count())

6974
6974


### Filter

In [None]:
# Variant 1: the filter condition is given as a SQL expression string.
(
    new_df
    .where(''' brand != "York" ''')
    .select('id', 'actual_price', 'average_rating', 'brand')
    .show(n=10, truncate=False)
)


# Variant 2: the same condition built from a Column expression.
(
    new_df
    .where(F.col("brand") != "York")
    .select('id', 'actual_price', 'average_rating', 'brand')
    .show(n=10, truncate=False)
)

### JOIN

In [None]:
# Small demo dataset: (id, number-as-word) pairs for 1..9.
data = [
    (1, 'one'),
    (2, 'two'),
    (3, 'three'),
    (4, 'four'),
    (5, 'five'),
    (6, 'six'),
    (7, 'seven'),
    (8, 'eight'),
    (9, 'nine'),
]

mix = spark.createDataFrame(data, ['id', 'number'])

# Inner self-join on the id column.
mix.join(mix, on="id", how="inner").show()

In [None]:
# Same inner self-join, with the right-hand side explicitly marked for broadcast.
mix.join(F.broadcast(mix), "id", "inner").show()

In [None]:
# Every row except the one whose number is "six".
filtered = mix.where(F.col("number") != "six")

# Anti join: keep the rows of mix whose id has no match in filtered.
mix.join(filtered, on="id", how="anti").show()

### Cache | Persist

In [205]:
df = spark.read.csv(
    'data/final_partitioned_repart',
    sep=';',
    header=True,
)

# Mark df for caching; count() is the action that actually runs the read.
df.cache().count()

                                                                                

27936

In [208]:
# Release whatever was cached/persisted for df; the call returns the DataFrame itself.
df.unpersist()

DataFrame[id: string, brand: string, actual_price: string, date: string, load_date: date]

In [207]:
from pyspark.storagelevel import StorageLevel

df = spark.read.csv(
    'data/final_partitioned_repart',
    sep=';',
    header=True,
)

# Persist with an explicit storage level (disk only); count() triggers the job.
df.persist(StorageLevel.DISK_ONLY).count()

                                                                                

27936

### Functions

In [None]:
# NOTE(review): 'not_york_one_file' is not written anywhere in the visible cells - confirm the path.
df = spark.read.csv('not_york_one_file', header=True)

# Distinct brand values in descending order, shown untruncated.
(
    df
    .select("brand")
    .distinct()
    .orderBy(F.desc("brand"))
    .show(truncate=False)
)

In [None]:
# Lower-case brand values, de-duplicated and sorted ascending; show the first one.
# The sort key must be the alias test_col: after the projection and distinct()
# the original brand column is no longer available to orderBy.
(
    df
    .select(F.lower("brand").alias("test_col"))
    .distinct()
    .orderBy(F.col("test_col").asc())
    .show(1, truncate=False)
)

In [None]:
# One-row DataFrame holding the smallest lower-cased brand value.
lowered_brand = F.lower("brand").alias("test_col")
one_row = (
    df
    .select(lowered_brand)
    .distinct()
    .orderBy(F.asc("test_col"))
    .limit(1)
)

In [None]:
# Import only the types that are used instead of a wildcard import, so the
# notebook namespace is not flooded with every name from pyspark.sql.types.
from pyspark.sql.types import ArrayType, StringType, StructField, StructType

# One-row DataFrame holding the smallest lower-cased brand value.
one_row = df\
            .select(F.lower("brand").alias("test_col"))\
            .distinct()\
            .orderBy(F.col("test_col").asc())\
            .limit(1)


# Target schema for parsing: an array of {key, value} string pairs.
schema = ArrayType(
    StructType([
        StructField("key", StringType()),
        StructField("value", StringType())
    ])
)


# Parse test_col as JSON using the schema above, then emit one row per array element.
# NOTE(review): assumes test_col holds a JSON array of key/value objects - confirm against the data.
new_df = one_row\
            .withColumn("test", F.from_json("test_col", schema))\
            .withColumn("element", F.explode(F.col("test")))



new_df.show(truncate=False)

### Repartition & Coalesce

In [209]:
# Demo dataset: the numbers 1..9 paired with their English names.
number_words = ['one', 'two', 'three', 'four', 'five',
                'six', 'seven', 'eight', 'nine']
data = [(idx, word) for idx, word in enumerate(number_words, start=1)]

df = spark.createDataFrame(data, ['id', 'number'])

df.show()

+---+------+
| id|number|
+---+------+
|  1|   one|
|  2|   two|
|  3| three|
|  4|  four|
|  5|  five|
|  6|   six|
|  7| seven|
|  8| eight|
|  9|  nine|
+---+------+



In [210]:
# Намеренно перемешаем и поделим на 8 разделов
mix = df.repartition(8)
mix.rdd.glom().collect()

[[Row(id=3, number='three'), Row(id=6, number='six')],
 [],
 [Row(id=1, number='one'), Row(id=2, number='two')],
 [],
 [Row(id=8, number='eight')],
 [Row(id=5, number='five'), Row(id=9, number='nine')],
 [Row(id=4, number='four')],
 [Row(id=7, number='seven')]]

In [211]:
# Repartition the 8-partition frame into 3 partitions and list the rows held by each.
mix.repartition(3).rdd.glom().collect()

[[Row(id=1, number='one'),
  Row(id=4, number='four'),
  Row(id=7, number='seven'),
  Row(id=9, number='nine')],
 [Row(id=2, number='two')],
 [Row(id=3, number='three'),
  Row(id=5, number='five'),
  Row(id=6, number='six'),
  Row(id=8, number='eight')]]

In [212]:
# Reduce to 3 partitions with coalesce and list the rows held by each; in the recorded
# output the rows of each original partition stay together in the merged partitions.
mix.coalesce(3).rdd.glom().collect()

[[Row(id=3, number='three'), Row(id=6, number='six')],
 [Row(id=7, number='seven'),
  Row(id=1, number='one'),
  Row(id=2, number='two'),
  Row(id=5, number='five'),
  Row(id=9, number='nine')],
 [Row(id=8, number='eight'), Row(id=4, number='four')]]

In [213]:
# Convert the Spark DataFrame to a pandas DataFrame and preview its first rows.
mix.toPandas().head()

Unnamed: 0,id,number
0,3,three
1,6,six
2,1,one
3,2,two
4,8,eight
