In [1]:
import os

# Установите переменную окружения JAVA_HOME
os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-8-openjdk-amd64'
os.environ['PATH'] = os.environ['JAVA_HOME'] + '/bin:' + os.environ['PATH']

# Проверьте версию Java
!java -version

openjdk version "1.8.0_422"
OpenJDK Runtime Environment (build 1.8.0_422-8u422-b05-1~24.04-b05)
OpenJDK 64-Bit Server VM (build 25.422-b05, mixed mode)


In [76]:
import pyspark

from pyspark.sql import SparkSession

spark = SparkSession.builder \
        .appName('less_pysqprk_DF') \
        .master('local') \
        .config('spark.executor.memory', "512m") \
        .config("spark.executor.cores", "4") \
        .config("spark.executor.memory", "8g") \
        .config("spark.sql.shuffle.partitions", "12") \
        .config("spark.default.parallelism", "12") \
        .config("spark.hadoop.fs.defaultFS", "hdfs://localhost:9000") \
        .config("spark.hadoop.fs.default.name", "hdfs://localhost:9000") \
        .getOrCreate()




In [77]:
spark

In [78]:
files = [
    'Sales_April_2019.csv',
    'Sales_August_2019.csv',
    'Sales_December_2019.csv',
    'Sales_February_2019.csv',
    'Sales_January_2019.csv',
    'Sales_July_2019.csv',
    'Sales_June_2019.csv',
    'Sales_March_2019.csv',
    'Sales_May_2019.csv',
    'Sales_November_2019.csv',
    'Sales_October_2019.csv',
    'Sales_September_2019.csv'
]


In [79]:
df = spark.read.csv(f'hdfs://localhost:9000/data/{files[0]}', header=True, inferSchema=True)


In [80]:
df.count()

18383

In [81]:
df.show()

+--------+--------------------+----------------+----------+--------------+--------------------+
|Order ID|             Product|Quantity Ordered|Price Each|    Order Date|    Purchase Address|
+--------+--------------------+----------------+----------+--------------+--------------------+
|  176558|USB-C Charging Cable|               2|     11.95|04/19/19 08:46|917 1st St, Dalla...|
|    null|                null|            null|      null|          null|                null|
|  176559|Bose SoundSport H...|               1|     99.99|04/07/19 22:30|682 Chestnut St, ...|
|  176560|        Google Phone|               1|     600.0|04/12/19 14:38|669 Spruce St, Lo...|
|  176560|    Wired Headphones|               1|     11.99|04/12/19 14:38|669 Spruce St, Lo...|
|  176561|    Wired Headphones|               1|     11.99|04/30/19 09:27|333 8th St, Los A...|
|  176562|USB-C Charging Cable|               1|     11.95|04/29/19 13:03|381 Wilson St, Sa...|
|  176563|Bose SoundSport H...|         

In [82]:
for name in range(1, len(files)-1):
    df2 = spark.read.csv(f'hdfs://localhost:9000/data/{files[name+1]}', header=True, inferSchema=True)
    df = df.union(df2)
    df.count()    

In [83]:

df.count()    

174839

In [84]:
df.printSchema()

root
 |-- Order ID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Quantity Ordered: integer (nullable = true)
 |-- Price Each: double (nullable = true)
 |-- Order Date: string (nullable = true)
 |-- Purchase Address: string (nullable = true)



In [85]:
from pyspark.sql.functions import col, sum as _sum

In [86]:
df.groupby('Product')\
    .agg(_sum(col('Quantity Ordered'))\
    .alias('Quantity_sum'))\
    .sort(col('Quantity_sum')\
    .desc())\
    .limit(10)\
    .show()

#df.filter(col('Product') == 'USB-C Charging Cable').show()

+--------------------+------------+
|             Product|Quantity_sum|
+--------------------+------------+
|AAA Batteries (4-...|       28998|
|AA Batteries (4-p...|       25809|
|USB-C Charging Cable|       22513|
|Lightning Chargin...|       21776|
|    Wired Headphones|       19267|
|Apple Airpods Hea...|       14651|
|Bose SoundSport H...|       12577|
|    27in FHD Monitor|        7058|
|              iPhone|        6411|
|34in Ultrawide Mo...|        5818|
+--------------------+------------+



In [87]:
df_replicated = df.repartition(col('Product'))
print(df_replicated.rdd.getNumPartitions())


4


In [93]:
from pyspark.sql import Row

def aggregate_partition(partition):
    total_quantity = 0
    for row in partition:
        if row["Quantity Ordered"] is not None:
            total_quantity += row["Quantity Ordered"]
    yield Row(total_quantity=total_quantity)

aggregated_df = df_replicated.rdd.mapPartitions(aggregate_partition).toDF()

aggregated_df.show()


+--------------+
|total_quantity|
+--------------+
|         35917|
|         35214|
|         64711|
|         59795|
+--------------+



In [97]:
df_replicated_less = df_replicated.coalesce(3)
print(df_replicated_less.rdd.getNumPartitions())


3


In [98]:
df_replicated_less.write.format('csv').option("header", True).save("hdfs://localhost:9000/sales/data_hw/")

In [102]:
%time
df_replicated_less.write.format("json").save("hdfs://localhost:9000/sales/data_hw_json/")


CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 6.91 µs


In [103]:
%time
df_replicated_less.write.format("parquet").save("hdfs://localhost:9000/sales/data_hw_parquet/")


CPU times: user 3 µs, sys: 2 µs, total: 5 µs
Wall time: 8.58 µs


In [111]:
df_replicated_less.select("Purchase Address").show(truncate=False)

+---------------------------------------+
|Purchase Address                       |
+---------------------------------------+
|657 Hill St, Dallas, TX 75001          |
|84 Jackson St, Boston, MA 02215        |
|87 West St, Boston, MA 02215           |
|15 Cherry St, San Francisco, CA 94016  |
|85 North St, San Francisco, CA 94016   |
|997 South St, Boston, MA 02215         |
|92 11th St, Los Angeles, CA 90001      |
|343 Johnson St, New York City, NY 10001|
|209 14th St, New York City, NY 10001   |
|613 12th St, Portland, OR 97035        |
|504 Wilson St, Dallas, TX 75001        |
|570 11th St, Atlanta, GA 30301         |
|638 Park St, Seattle, WA 98101         |
|825 Madison St, Seattle, WA 98101      |
|205 4th St, Atlanta, GA 30301          |
|976 Hickory St, Dallas, TX 75001       |
|772 11th St, Atlanta, GA 30301         |
|445 Jefferson St, Atlanta, GA 30301    |
|125 North St, San Francisco, CA 94016  |
|623 Lincoln St, Portland, OR 97035     |
+---------------------------------

In [151]:
from pyspark.sql.functions import udf 
from pyspark.sql.types import ArrayType, StringType, StructType, StructField

In [183]:
def splter_addrres(address):
    address_splited = address.split(',')
    
    address_splited[2], post = address_splited[2].split()[0], address_splited[2].split()[1]
    address_splited.append(post)
    
    return (address_splited[0].strip(),address_splited[1].strip(),address_splited[2].strip(),address_splited[3].strip())
    

splter_addrres('657 Hill St, Dallas, TX 75001')

('657 Hill St', 'Dallas', 'TX', '75001')

In [184]:
my_udf = udf(splter_addrres, StructType([
    StructField("Street address", StringType()),
    StructField("City", StringType()),
    StructField("State", StringType()),
    StructField("Postal Code", StringType())
]))

In [186]:
df_replicated_less = df_replicated_less.withColumn('Street address', my_udf(col("Purchase Address")).getItem("Street address"))\
                    .withColumn('City', my_udf(col("Purchase Address")).getItem("City"))\
                    .withColumn('State', my_udf(col("Purchase Address")).getItem("State"))\
                    .withColumn('Postal Code', my_udf(col("Purchase Address")).getItem("Postal Code"))

In [187]:
df_replicated_less.show()

+--------+--------------------+----------------+----------+--------------+--------------------+----------------+-------------+-----+-----------+
|Order ID|             Product|Quantity Ordered|Price Each|    Order Date|    Purchase Address|  Street address|         City|State|Postal Code|
+--------+--------------------+----------------+----------+--------------+--------------------+----------------+-------------+-----+-----------+
|  176569|27in 4K Gaming Mo...|               1|    389.99|04/16/19 19:23|657 Hill St, Dall...|     657 Hill St|       Dallas|   TX|      75001|
|  176581|              iPhone|               1|     700.0|04/09/19 21:38|84 Jackson St, Bo...|   84 Jackson St|       Boston|   MA|      02215|
|  176600|27in 4K Gaming Mo...|               1|    389.99|04/30/19 15:54|87 West St, Bosto...|      87 West St|       Boston|   MA|      02215|
|  176608|              iPhone|               1|     700.0|04/11/19 12:01|15 Cherry St, San...|    15 Cherry St|San Francisco|   C

In [188]:
df_replicated_less = df_replicated_less.cache()

In [190]:
df_replicated_less = df_replicated_less.withColumn('profit', col('Quantity Ordered') * col('Price Each'))

In [191]:
df_replicated_less.show()

+--------+--------------------+----------------+----------+--------------+--------------------+----------------+-------------+-----+-----------+------+
|Order ID|             Product|Quantity Ordered|Price Each|    Order Date|    Purchase Address|  Street address|         City|State|Postal Code|profit|
+--------+--------------------+----------------+----------+--------------+--------------------+----------------+-------------+-----+-----------+------+
|  176569|27in 4K Gaming Mo...|               1|    389.99|04/16/19 19:23|657 Hill St, Dall...|     657 Hill St|       Dallas|   TX|      75001|389.99|
|  176581|              iPhone|               1|     700.0|04/09/19 21:38|84 Jackson St, Bo...|   84 Jackson St|       Boston|   MA|      02215| 700.0|
|  176600|27in 4K Gaming Mo...|               1|    389.99|04/30/19 15:54|87 West St, Bosto...|      87 West St|       Boston|   MA|      02215|389.99|
|  176608|              iPhone|               1|     700.0|04/11/19 12:01|15 Cherry St, 

In [192]:
spark.stop()