In [29]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").appName("Ebay Auction").getOrCreate()
auctions = spark.read.csv('../data/auction.csv',inferSchema=True, header= True)
auctions.count()

10681

In [30]:
auctions.show(1)

+----------+-----+--------+------------+----------+-------+-----+------------------+-------------+
| auctionid|  bid| bidtime|      bidder|bidderrate|openbid|price|              item| auction_type|
+----------+-----+--------+------------+----------+-------+-----+------------------+-------------+
|1638893549|175.0|2.230949|schadenfreud|         0|   99.0|177.5|Cartier wristwatch|3 day auction|
+----------+-----+--------+------------+----------+-------+-----+------------------+-------------+
only showing top 1 row



In [31]:
auctions.printSchema()

root
 |-- auctionid: long (nullable = true)
 |-- bid: double (nullable = true)
 |-- bidtime: double (nullable = true)
 |-- bidder: string (nullable = true)
 |-- bidderrate: string (nullable = true)
 |-- openbid: double (nullable = true)
 |-- price: double (nullable = true)
 |-- item: string (nullable = true)
 |-- auction_type: string (nullable = true)



In [32]:
auctions.select("auctionid", "bid", "price").show(2)

+----------+-----+-----+
| auctionid|  bid|price|
+----------+-----+-----+
|1638893549|175.0|177.5|
|1638893549|100.0|177.5|
+----------+-----+-----+
only showing top 2 rows



In [33]:
#unique auctions
auctions.select("auctionid").distinct().count()

628

In [34]:
#bids per auction
auctions.groupBy("auctionid","bid").count().show(10)

+----------+-------+-----+
| auctionid|    bid|count|
+----------+-------+-----+
|1640809333|1201.69|    1|
|1646988233| 348.88|    1|
|3015694920|  240.0|    1|
|3016458866|  155.0|    1|
|3022785804|  242.5|    1|
|8212711136|  26.99|    1|
|1647870862|  850.0|    1|
|1646448593|  200.0|    1|
|1650406935|  455.0|    1|
|1644109746| 3000.0|    1|
+----------+-------+-----+
only showing top 10 rows



In [37]:
#auctions greater than 100
auctions.filter("price > 100").show()

+----------+-----+--------+--------------------+----------+-------+-----+------------------+-------------+
| auctionid|  bid| bidtime|              bidder|bidderrate|openbid|price|              item| auction_type|
+----------+-----+--------+--------------------+----------+-------+-----+------------------+-------------+
|1638893549|175.0|2.230949|        schadenfreud|         0|   99.0|177.5|Cartier wristwatch|3 day auction|
|1638893549|100.0|2.600116|               chuik|         0|   99.0|177.5|Cartier wristwatch|3 day auction|
|1638893549|120.0| 2.60081|          kiwisstuff|         2|   99.0|177.5|Cartier wristwatch|3 day auction|
|1638893549|150.0|2.601076|          kiwisstuff|         2|   99.0|177.5|Cartier wristwatch|3 day auction|
|1638893549|177.5|2.909826|eli.flint@flights...|         4|   99.0|177.5|Cartier wristwatch|3 day auction|
|1639453840|  1.0|0.355856|            bfalconb|         2|    1.0|355.0|Cartier wristwatch|3 day auction|
|1639453840| 1.25|0.484757|          

In [72]:
auctions.registerTempTable("auction")
results = spark.sql(
  """SELECT auctionid, MAX(price) as price, item FROM auction
    GROUP BY item,auctionid"""
  )
results.show()
results
auctionDetails = results.rdd.map(lambda p:"Auction ID- {}".format(p.auctionid)).collect()

for name in auctionDetails:
    print(name)

+----------+------+-------------------+
| auctionid| price|               item|
+----------+------+-------------------+
|1642243766| 355.0| Cartier wristwatch|
|3018091954| 232.5|Palm Pilot M515 PDA|
|3019696199| 247.5|Palm Pilot M515 PDA|
|3013912877| 232.5|Palm Pilot M515 PDA|
|3023653463|204.03|Palm Pilot M515 PDA|
|8212116757| 167.5|  Xbox game console|
|3015053455| 260.0|Palm Pilot M515 PDA|
|3020684186| 202.5|Palm Pilot M515 PDA|
|8215001975| 127.5|  Xbox game console|
|1639333116|501.62| Cartier wristwatch|
|3023885982|239.01|Palm Pilot M515 PDA|
|1648706567| 202.5| Cartier wristwatch|
|3018740612| 255.0|Palm Pilot M515 PDA|
|3023647851| 255.5|Palm Pilot M515 PDA|
|3014834745| 213.5|Palm Pilot M515 PDA|
|3014844738| 213.5|Palm Pilot M515 PDA|
|3021003299| 245.0|Palm Pilot M515 PDA|
|3014834982| 217.5|Palm Pilot M515 PDA|
|3017676972| 255.0|Palm Pilot M515 PDA|
|3023876273| 242.5|Palm Pilot M515 PDA|
+----------+------+-------------------+
only showing top 20 rows

Auction ID- 16