### Start SparkSession

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
import pyspark.sql.functions as F


# Create the notebook's SparkSession (or reuse one that already exists)
# under a fixed application name.
spark = (
    SparkSession.builder
    .appName("spark_halltape")
    .getOrCreate()
)

# Print the web UI address exposed by this session's SparkContext.
print("Активные Spark сессии:", spark.sparkContext.uiWebUrl)

In [31]:
# Base directory for absolute dataset paths. The trailing slash is required
# because the value is concatenated directly with a relative file name.
PATH = "/Users/halltape/Desktop/CODE/"

In [None]:
# Read the CSV with the reader's default options and print the first rows.
(
    spark.read
    .format("csv")
    .load("datasets/output.csv")
    .show()
)

In [None]:
# Same default-option read, but return only the first row.
spark.read.csv(path="datasets/output.csv").first()

In [None]:
# Re-read the file using ';' as the field separator and the first line as
# the header, then print the first rows.
(
    spark.read
    .options(sep=";", header=True)
    .csv("datasets/output.csv")
    .show()
)

In [None]:
# Print two rows, untruncated, one "column: value" line per field.
spark.read.csv("datasets/output.csv", sep=";", header=True).show(
    n=2,
    truncate=False,
    vertical=True,
)

### PrintSchema

In [None]:
# Load the ';'-separated file (first line is the header) and discard the
# "_c0" column.
df = (
    spark.read
    .csv("datasets/output.csv", sep=";", header=True)
    .drop("_c0")
)

# Print the column names and types of the resulting DataFrame.
df.printSchema()

In [None]:
# Normalise the two awkward header names: one has a leading space, the other
# contains a space and mixed case.
df = (
    df
    .withColumnRenamed(" id", "id")
    .withColumnRenamed("actual PRICE", "actual_price")
)

df.columns

In [None]:
# Full set of column names, in positional order, used to rename every column
# of the DataFrame at once.
list_of_columns = [
    "id",
    "actual_price",
    "average_rating",
    "brand",
    "category",
    "crawled_at",
    "description",
    "discount",
    "image",
    "out_of_stock",
    "pid",
    "product_details",
    "seller",
    "selling_price",
    "sub_category",
    "title",
    "url",
]

# Build a new DataFrame whose columns are renamed positionally to the names
# in list_of_columns, then display the resulting column names.
new_df = df.toDF(*list_of_columns)
new_df.columns

### Select

In [None]:
# Project four columns and print the first four rows without truncation.
new_df.select(
    "id", "actual_price", "average_rating", "brand"
).show(n=4, truncate=False)

### Filter

In [None]:
# Variant 1: the predicate is given as a SQL expression string.
(
    new_df
    .filter(' brand != "York" ')
    .select("id", "actual_price", "average_rating", "brand")
    .show(n=10, truncate=False)
)

# Variant 2: the same predicate built as a Column expression.
(
    new_df
    .filter(F.col("brand") != "York")
    .select("id", "actual_price", "average_rating", "brand")
    .show(n=10, truncate=False)
)

### JOIN

In [None]:
# (number, English word) pairs for 1 through 9.
data = list(
    enumerate(
        ["one", "two", "three", "four", "five", "six", "seven", "eight", "nine"],
        start=1,
    )
)

# Two-column DataFrame built from the in-memory pairs.
mix = spark.createDataFrame(data, schema=["id", "number"])

# Inner self-join on the shared "id" column.
mix.join(mix, on="id", how="inner").show()

In [None]:
# Same inner self-join, with the right-hand side marked for broadcast.
mix.join(
    F.broadcast(mix),
    on="id",
    how="inner",
).show()

In [None]:
# Every row except the one whose number is "six".
filtered = mix.filter(F.col("number") != "six")

# Anti join: keep the rows of mix whose id has no match in filtered.
mix.join(filtered, on="id", how="anti").show()

### Count

In [None]:
# Count the data rows of the CSV output (its first line is a header).
spark.read.option("header", True).csv("not_york_one_file").count()

In [None]:
# Keep the row count in a variable so it can be reused later.
total_rows = (
    spark.read
    .options(header=True)
    .csv("not_york_one_file")
    .count()
)

total_rows

In [None]:
# Bind the DataFrame first, then count its rows.
df = (
    spark.read
    .format("csv")
    .option("header", True)
    .load("not_york_one_file")
)
df.count()

### Cache | Persist

In [None]:
df = spark.read.csv(path="not_york_one_file", header=True)

# cache() only marks the DataFrame for caching and returns it; the count()
# action is what actually runs the job.
df.cache()
df.count()

In [None]:
# Release the cached data; blocking=False is the default.
df.unpersist(blocking=False)

In [None]:
from pyspark.storagelevel import StorageLevel

df = spark.read.options(header=True).csv("not_york_one_file")

# Request disk-only storage explicitly, then run an action to trigger it.
df.persist(StorageLevel.DISK_ONLY)
df.count()

In [None]:
# Print the first 20 rows with long values truncated (the show() defaults).
df.show(n=20, truncate=True)

### Functions

In [None]:
df = spark.read.options(header=True).csv("not_york_one_file")

# Unique brand names, sorted in descending order, printed untruncated.
(
    df
    .select("brand")
    .distinct()
    .sort(F.desc("brand"))
    .show(truncate=False)
)

In [None]:
# Lower-case the brand names, de-duplicate them and print the first one in
# ascending order.
#
# After select() the only remaining column is the alias "test_col"; the
# original "brand" column no longer exists downstream of distinct(), so the
# sort key must be the alias.
(
    df
    .select(F.lower("brand").alias("test_col"))
    .distinct()
    .orderBy(F.col("test_col").asc())
    .show(1, truncate=False)
)

In [None]:
# Single-row DataFrame: the alphabetically first lower-cased brand name.
one_row = (
    df
    .select(F.lower(F.col("brand")).alias("test_col"))
    .distinct()
    .sort(F.asc("test_col"))
    .limit(1)
)

In [None]:
from pyspark.sql.types import *

# Single-row DataFrame: the alphabetically first lower-cased brand name.
one_row = (
    df
    .select(F.lower("brand").alias("test_col"))
    .distinct()
    .orderBy(F.col("test_col").asc())
    .limit(1)
)

# Target type for from_json: an array of {"key": <string>, "value": <string>}
# structs.
schema = ArrayType(
    StructType([
        StructField("key", StringType()),
        StructField("value", StringType()),
    ])
)

# Parse "test_col" as JSON into column "test", then emit one row per array
# element in column "element".
#
# The result gets its own name instead of rebinding `new_df`: `new_df` holds
# the full product table and is still needed by the cell that filters it on
# "brand" before saving to CSV.
#
# NOTE(review): test_col contains lower-cased brand names; unless those values
# are JSON arrays, from_json presumably yields null and explode emits no rows.
# Confirm the intended input column.
parsed_df = (
    one_row
    .withColumn("test", F.from_json("test_col", schema))
    .withColumn("element", F.explode(F.col("test")))
)

parsed_df.show(truncate=False)

### Save to CSV

In [None]:
# Four columns of every product whose brand is not "York".
not_york_df = (
    new_df
    .filter(' brand != "York" ')
    .select("id", "actual_price", "average_rating", "brand")
)

In [None]:
# Write the result as CSV with a header line, replacing any previous output
# in the target directory.
(
    not_york_df.write
    .option("header", True)
    .mode("overwrite")
    .csv("not_york_csv")
)

In [33]:
# Number of partitions Spark produces when reading this file.
spark.read.csv(PATH + "datasets/customs_data.csv").rdd.getNumPartitions()

In [None]:
# Report how many partitions back not_york_df.
print(
    f'Степень параллелизма в not_york_df: '
    f'{not_york_df.rdd.getNumPartitions()}'
)

In [None]:
ls -l not_york_csv

In [None]:
# Reduce the DataFrame to a single partition before writing it out.
not_york_one_file = not_york_df.coalesce(1)

# Write as CSV with a header line, replacing any previous output.
not_york_one_file.write.csv(
    "not_york_one_file",
    mode="overwrite",
    header=True,
)

In [None]:
ls -l not_york_one_file

### Repartition & Coalesce

In [None]:
# (number, English word) pairs for 1 through 9.
data = list(
    enumerate(
        ["one", "two", "three", "four", "five", "six", "seven", "eight", "nine"],
        start=1,
    )
)

# Two-column DataFrame built from the in-memory pairs, printed in full.
df = spark.createDataFrame(data, schema=["id", "number"])

df.show()

In [None]:
# Deliberately shuffle the rows and split them into 8 partitions.
mix = df.repartition(8)
# glom() groups each partition's rows into a list, so collect() shows which
# rows ended up in which partition.
mix.rdd.glom().collect()

In [None]:
# Redistribute into 3 partitions and list the rows held by each one.
(
    mix
    .repartition(3)
    .rdd
    .glom()
    .collect()
)

In [None]:
# Reduce to 3 partitions with coalesce and list the rows held by each one,
# for comparison with the repartition(3) layout.
(
    mix
    .coalesce(3)
    .rdd
    .glom()
    .collect()
)

In [27]:
# Convert to a pandas DataFrame on the driver and show its first five rows.
mix.toPandas().head(n=5)