In [1]:
import findspark
findspark.init("/opt/manual/spark")
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import functions as F

In [2]:
spark = SparkSession \
.builder \
.master("local[2]") \
.appName("PySpark Dataframe Homework") \
.config("spark.executer.memory","2g") \
.config("spark.driver.memory","1g") \
.enableHiveSupport() \
.getOrCreate()

# .config("spark.storage.memoryFraction","0.1") \
# .config("park.shuffle.memoryFraction","0.0") \

In [3]:
spark.version

'3.1.1'

In [4]:
data_path = "file:///home/train/datasets/retail_db"
output_path = "file:///home/train/pyspark_output_data"

In [5]:
categories = spark.read.option("inferSchema",True).csv(data_path+"/categories.csv", header=True)
categories.show(3)

+----------+--------------------+-------------------+
|categoryId|categoryDepartmentId|       categoryName|
+----------+--------------------+-------------------+
|         1|                   2|           Football|
|         2|                   2|             Soccer|
|         3|                   2|Baseball & Softball|
+----------+--------------------+-------------------+
only showing top 3 rows



In [6]:
categories.printSchema()

root
 |-- categoryId: integer (nullable = true)
 |-- categoryDepartmentId: integer (nullable = true)
 |-- categoryName: string (nullable = true)



In [7]:
departments = spark.read.option("inferSchema",True).csv(data_path+"/departments.csv", header=True)
departments.show(3)

+------------+--------------+
|departmentId|departmentName|
+------------+--------------+
|           2|       Fitness|
|           3|      Footwear|
|           4|       Apparel|
+------------+--------------+
only showing top 3 rows



In [8]:
departments.printSchema()

root
 |-- departmentId: integer (nullable = true)
 |-- departmentName: string (nullable = true)



In [9]:
products = spark.read.option("inferSchema",True).csv(data_path+"/products.csv", header=True)
products.show(3)

+---------+-----------------+--------------------+------------------+------------+--------------------+
|productId|productCategoryId|         productName|productDescription|productPrice|        productImage|
+---------+-----------------+--------------------+------------------+------------+--------------------+
|        1|                2|Quest Q64 10 FT. ...|              null|       59.98|http://images.acm...|
|        2|                2|Under Armour Men'...|              null|      129.99|http://images.acm...|
|        3|                2|Under Armour Men'...|              null|       89.99|http://images.acm...|
+---------+-----------------+--------------------+------------------+------------+--------------------+
only showing top 3 rows



In [10]:
products.printSchema()

root
 |-- productId: integer (nullable = true)
 |-- productCategoryId: integer (nullable = true)
 |-- productName: string (nullable = true)
 |-- productDescription: string (nullable = true)
 |-- productPrice: double (nullable = true)
 |-- productImage: string (nullable = true)



In [11]:
customers = spark.read.option("inferSchema",True).csv(data_path+"/customers.csv", header=True)
customers.limit(3).toPandas()

Unnamed: 0,customerId,customerFName,customerLName,customerEmail,customerPassword,customerStreet,customerCity,customerState,customerZipcode
0,1,Richard,Hernandez,XXXXXXXXX,XXXXXXXXX,6303 Heather Plaza,Brownsville,TX,78521
1,2,Mary,Barrett,XXXXXXXXX,XXXXXXXXX,9526 Noble Embers Ridge,Littleton,CO,80126
2,3,Ann,Smith,XXXXXXXXX,XXXXXXXXX,3422 Blue Pioneer Bend,Caguas,PR,725


In [12]:
customers.printSchema()

root
 |-- customerId: integer (nullable = true)
 |-- customerFName: string (nullable = true)
 |-- customerLName: string (nullable = true)
 |-- customerEmail: string (nullable = true)
 |-- customerPassword: string (nullable = true)
 |-- customerStreet: string (nullable = true)
 |-- customerCity: string (nullable = true)
 |-- customerState: string (nullable = true)
 |-- customerZipcode: integer (nullable = true)



In [13]:
orders = spark.read.option("inferSchema",True).csv(data_path+"/orders.csv", header=True)
orders.show(3)

+-------+--------------------+---------------+---------------+
|orderId|           orderDate|orderCustomerId|    orderStatus|
+-------+--------------------+---------------+---------------+
|      1|2013-07-25 00:00:...|          11599|         CLOSED|
|      2|2013-07-25 00:00:...|            256|PENDING_PAYMENT|
|      3|2013-07-25 00:00:...|          12111|       COMPLETE|
+-------+--------------------+---------------+---------------+
only showing top 3 rows



In [14]:
orders.printSchema()

root
 |-- orderId: integer (nullable = true)
 |-- orderDate: string (nullable = true)
 |-- orderCustomerId: integer (nullable = true)
 |-- orderStatus: string (nullable = true)



In [15]:
order_items = spark.read.option("inferSchema",True).csv(data_path+"/order_items.csv", header=True)
order_items.show(3)

+-------------+----------------+------------------+-----------------+-----------------+---------------------+
|orderItemName|orderItemOrderId|orderItemProductId|orderItemQuantity|orderItemSubTotal|orderItemProductPrice|
+-------------+----------------+------------------+-----------------+-----------------+---------------------+
|            1|               1|               957|                1|           299.98|               299.98|
|            2|               2|              1073|                1|           199.99|               199.99|
|            3|               2|               502|                5|            250.0|                 50.0|
+-------------+----------------+------------------+-----------------+-----------------+---------------------+
only showing top 3 rows



In [16]:
order_items.printSchema()

root
 |-- orderItemName: integer (nullable = true)
 |-- orderItemOrderId: integer (nullable = true)
 |-- orderItemProductId: integer (nullable = true)
 |-- orderItemQuantity: integer (nullable = true)
 |-- orderItemSubTotal: double (nullable = true)
 |-- orderItemProductPrice: double (nullable = true)



# CEVAPLAR

# 1.1. `order_items` tablosunda kaç tane tekil `orderItemOrderId` vardır sayısını bulunuz.

In [17]:
order_items.select("orderItemOrderId").distinct().count()

57431

# 1.2. `orders` ve `order_items` tablolarında kaç satır vardır bulunuz.

In [18]:
orders.count()

68883

In [19]:
order_items.count()

172198

# # 1.3. Toplam satış tutarı bakımından en çok iptal edilen (azalan sıra) ürünleri lokal diske `parquet` formatında yazınız.

In [20]:
# Yukarıdaki rakamlardan orders'ın siparişler olduğunu ve her siparişte birden fazla ürün olduğu için 
# hangi siparişten hangi ürünlerin bulunduğunu order_items tablosundan anlıyoruz. 
# Şimdi bizden istenen iş en çok iptal edilen ürün ve kategorileri bulmak olduğu için öncelikle siparişler
# ile sipariş detaylarını birleştirmeliyiz. 
# Daha sonra bu birleşimden iptal olan siparişleri filtrelemeliyiz.

# Joins

In [21]:
# Products ve Categories birleştirme. Sadece ihtiyacımız olanları seçiyoruz.
cat_products = products.join(categories, products['productCategoryId'] == categories['categoryId']) \
.select("productId","productName", "categoryName")
cat_products.limit(5).toPandas().head()

Unnamed: 0,productId,productName,categoryName
0,1,Quest Q64 10 FT. x 10 FT. Slant Leg Instant U,Soccer
1,2,Under Armour Men's Highlight MC Football Clea,Soccer
2,3,Under Armour Men's Renegade D Mid Football Cl,Soccer
3,4,Under Armour Men's Renegade D Mid Football Cl,Soccer
4,5,Riddell Youth Revolution Speed Custom Footbal,Soccer


In [22]:
# orders ve order_items birleştirme
orders_and_items = order_items.join(orders, order_items['orderItemOrderId'] == orders['orderId']) \
.select("orderId","orderItemProductId","orderItemSubTotal","orderStatus")
orders_and_items.limit(5).toPandas().head()

Unnamed: 0,orderId,orderItemProductId,orderItemSubTotal,orderStatus
0,1,957,299.98,CLOSED
1,2,1073,199.99,PENDING_PAYMENT
2,2,502,250.0,PENDING_PAYMENT
3,2,403,129.99,PENDING_PAYMENT
4,4,897,49.98,CLOSED


In [23]:
# Ceategory-Products birleşimini iptal edilmiş siparişlerle ürün id üzerinden birleştirelim
final_table = orders_and_items.join(cat_products, orders_and_items['orderItemProductId'] == cat_products['productId'])
final_table.limit(25).toPandas().head(25)

Unnamed: 0,orderId,orderItemProductId,orderItemSubTotal,orderStatus,productId,productName,categoryName
0,57760,858,199.99,PENDING_PAYMENT,858,GolfBuddy VT3 GPS Watch,Kids' Golf Clubs
1,57847,858,199.99,COMPLETE,858,GolfBuddy VT3 GPS Watch,Kids' Golf Clubs
2,58071,858,199.99,PENDING,858,GolfBuddy VT3 GPS Watch,Kids' Golf Clubs
3,58170,858,199.99,PENDING,858,GolfBuddy VT3 GPS Watch,Kids' Golf Clubs
4,58585,858,199.99,CANCELED,858,GolfBuddy VT3 GPS Watch,Kids' Golf Clubs
5,58589,858,199.99,COMPLETE,858,GolfBuddy VT3 GPS Watch,Kids' Golf Clubs
6,58695,858,199.99,COMPLETE,858,GolfBuddy VT3 GPS Watch,Kids' Golf Clubs
7,58774,858,199.99,PENDING,858,GolfBuddy VT3 GPS Watch,Kids' Golf Clubs
8,58797,858,199.99,COMPLETE,858,GolfBuddy VT3 GPS Watch,Kids' Golf Clubs
9,58926,858,199.99,PENDING_PAYMENT,858,GolfBuddy VT3 GPS Watch,Kids' Golf Clubs


## Toplam satış tutarı bakımından en çok iptal edilen (azalan sıra) ürünleri lokal diske parquet formatında yazma

In [24]:
#İş ihtiyacı-1: Toplam satış tutarı bakımından en çok iptal edilen ürünleri diske parquet formatında yazınız
most_cancelled_products = final_table.filter("orderStatus == 'CANCELED'") \
.groupBy("productName").agg(F.sum("orderItemSubTotal").alias("Totalprice")) \
.orderBy(F.desc("TotalPrice"))

most_cancelled_products.limit(10).toPandas().head(10)

Unnamed: 0,productName,Totalprice
0,Field & Stream Sportsman 16 Gun Fire Safe,134393.28
1,Perfect Fitness Perfect Rip Deck,85785.7
2,Nike Men's Free 5.0+ Running Shoe,80691.93
3,Diamondback Women's Serene Classic Comfort Bi,80094.66
4,Pelican Sunstream 100 Kayak,66196.69


In [25]:
# coalesce parçaları birleştirir tek dosya oluşturur.
most_cancelled_products.coalesce(1) \
.write.mode("overwrite").parquet(output_path+"/most_cancelled_products_parquet")

In [26]:
# Kontrol
spark.read.parquet(output_path+"/most_cancelled_products_parquet") \
.limit(10).toPandas().head(10)

Unnamed: 0,productName,Totalprice
0,Field & Stream Sportsman 16 Gun Fire Safe,134393.28
1,Perfect Fitness Perfect Rip Deck,85785.7
2,Nike Men's Free 5.0+ Running Shoe,80691.93
3,Diamondback Women's Serene Classic Comfort Bi,80094.66
4,Pelican Sunstream 100 Kayak,66196.69
5,Nike Men's Dri-FIT Victory Golf Polo,65750.0
6,Nike Men's CJ Elite 2 TD Football Cleat,60705.33
7,O'Brien Men's Neoprene Life Vest,58126.74
8,Under Armour Girls' Toddler Spine Surge Runni,26153.46
9,LIJA Women's Eyelet Sleeveless Golf Polo,2145.0


# 1.4. Toplam satış tutarı bakımından en çok iptal edilen (azalan sıra) kategorileri local diske parquet formatında yazınız.

In [27]:
#İş ihtiyacı-2: Toplam satış tutarı bakımından en çok iptal edilen kategorileri diske parquet formatında yazınız
most_cancelled_cats = final_table.filter("orderStatus == 'CANCELED'") \
.groupBy("categoryName").agg(F.sum("orderItemSubTotal").alias("Totalprice")) \
.orderBy(F.desc("TotalPrice"))
most_cancelled_cats.limit(10).toPandas().head(10)

Unnamed: 0,categoryName,Totalprice
0,Fishing,134393.28
1,Cleats,85785.7
2,Cardio Equipment,81351.93
3,Camping & Hiking,80094.66
4,Water Sports,66196.69
5,Women's Apparel,65750.0
6,Men's Footwear,60705.33
7,Indoor/Outdoor Games,58126.74
8,Shop By Sport,27423.44
9,Electronics,5685.5


In [28]:
# coalesce parçaları birleştirir tek dosya oluşturur.
most_cancelled_cats.coalesce(1) \
.write.mode("overwrite").parquet(output_path+"/most_cancelled_cats_parquet")

In [29]:
# Kontrol
spark.read.parquet(output_path+"/most_cancelled_cats_parquet") \
.limit(10).toPandas().head(10)

Unnamed: 0,categoryName,Totalprice
0,Fishing,134393.28
1,Cleats,85785.7
2,Cardio Equipment,81351.93
3,Camping & Hiking,80094.66
4,Water Sports,66196.69
5,Women's Apparel,65750.0
6,Men's Footwear,60705.33
7,Indoor/Outdoor Games,58126.74
8,Shop By Sport,27423.44
9,Electronics,5685.5


# 1.5. En yüksek toplam satış hangi yılın hangi ayında (Türkçe) olmuştur?

In [30]:
orders_and_items_date = order_items.join(orders, order_items['orderItemOrderId'] == orders['orderId']) \
.select("orderId","orderItemProductId","orderItemSubTotal","orderStatus","orderDate")
orders_and_items_date.limit(5).toPandas().head()

Unnamed: 0,orderId,orderItemProductId,orderItemSubTotal,orderStatus,orderDate
0,1,957,299.98,CLOSED,2013-07-25 00:00:00.0
1,2,1073,199.99,PENDING_PAYMENT,2013-07-25 00:00:00.0
2,2,502,250.0,PENDING_PAYMENT,2013-07-25 00:00:00.0
3,2,403,129.99,PENDING_PAYMENT,2013-07-25 00:00:00.0
4,4,897,49.98,CLOSED,2013-07-25 00:00:00.0


In [31]:
df_year_and_month = orders_and_items_date.withColumn("orderDate", 
                                F.to_timestamp(F.col("orderDate"), "yyyy-MM-dd HH:mm:ss.S")) \
.withColumn("Year", F.year(F.col("orderDate"))) \
.withColumn("Month", F.month(F.col("orderDate")))

df_year_and_month.limit(5).toPandas()

Unnamed: 0,orderId,orderItemProductId,orderItemSubTotal,orderStatus,orderDate,Year,Month
0,1,957,299.98,CLOSED,2013-07-25,2013,7
1,2,1073,199.99,PENDING_PAYMENT,2013-07-25,2013,7
2,2,502,250.0,PENDING_PAYMENT,2013-07-25,2013,7
3,2,403,129.99,PENDING_PAYMENT,2013-07-25,2013,7
4,4,897,49.98,CLOSED,2013-07-25,2013,7


In [32]:
df_year_and_month.filter("orderStatus not in ('CANCELED')") \
.groupBy('Year','Month').agg(F.sum("orderItemSubTotal").alias("Sum_SubTotal")) \
.orderBy(F.desc("Sum_SubTotal")) \
.limit(10).toPandas()

Unnamed: 0,Year,Month,Sum_SubTotal
0,2013,11,3105843.27
1,2014,1,2870834.18
2,2013,12,2869997.88
3,2013,9,2866553.33
4,2014,3,2805006.32
5,2013,8,2769236.03
6,2014,4,2758912.47
7,2014,2,2712838.58
8,2014,5,2695699.48
9,2014,6,2657013.04


In [33]:
# convert month TR with udf
def month_to_tr(input_month):
    month_converter = {
        1: "Ocak",
        2: "Şubat",
        3: "Mart",
        4: "Nisan",
        5: "Mayıs",
        6: "Haziran",
        7: "Temmuz",
        8: "Ağustos",
        9: "Eylül",
        10: "Ekim",
        11: "Kasım",
        12: "Aralık"
    }
    
    return month_converter.get(input_month)

In [34]:
month_to_tr(2)

'Şubat'

In [35]:
month_to_tr_udf = spark.udf.register("month_to_tr_udf", month_to_tr, StringType())

In [36]:
df_year_and_month.filter("orderStatus not in ('CANCELED')") \
.groupBy('Year','Month').agg(F.sum("orderItemSubTotal").alias("Sum_SubTotal")) \
.orderBy(F.desc("Sum_SubTotal")) \
.withColumn("Month_TR", month_to_tr_udf(F.col("Month"))) \
.limit(10).toPandas()

Unnamed: 0,Year,Month,Sum_SubTotal,Month_TR
0,2013,11,3105843.27,Kasım
1,2014,1,2870834.18,Ocak
2,2013,12,2869997.88,Aralık
3,2013,9,2866553.33,Eylül
4,2014,3,2805006.32,Mart
5,2013,8,2769236.03,Ağustos
6,2014,4,2758912.47,Nisan
7,2014,2,2712838.58,Şubat
8,2014,5,2695699.48,Mayıs
9,2014,6,2657013.04,Haziran


# 1.6. En yüksek toplam satış haftanın hangi gününde (Türkçe) olmuştur?

In [37]:
df_day_of_week = orders_and_items_date.withColumn("orderDate", 
                                F.to_timestamp(F.col("orderDate"), "yyyy-MM-dd HH:mm:ss.S")) \
.withColumn("Day", F.dayofweek(F.col("orderDate")))

df_day_of_week.limit(5).toPandas()

Unnamed: 0,orderId,orderItemProductId,orderItemSubTotal,orderStatus,orderDate,Day
0,1,957,299.98,CLOSED,2013-07-25,5
1,2,1073,199.99,PENDING_PAYMENT,2013-07-25,5
2,2,502,250.0,PENDING_PAYMENT,2013-07-25,5
3,2,403,129.99,PENDING_PAYMENT,2013-07-25,5
4,4,897,49.98,CLOSED,2013-07-25,5


In [38]:
df_day_of_week.filter("orderStatus not in ('CANCELED')") \
.groupBy('Day').agg(F.sum("orderItemSubTotal").alias("Sum_SubTotal")) \
.orderBy(F.desc("Sum_SubTotal")) \
.limit(10).toPandas()

Unnamed: 0,Day,Sum_SubTotal
0,6,5065099.0
1,5,4878165.0
2,7,4862228.0
3,3,4809500.0
4,4,4805157.0
5,1,4750555.0
6,2,4455886.0


In [39]:
# convert day to TR with udf
def day_to_tr(input_day):
    day_converter = {
        1: "Pazartesi",
        2: "Salı",
        3: "Çarşamba",
        4: "Perşembe",
        5: "Cuma",
        6: "Cumartesi",
        7: "Pazar"
    }
    
    return day_converter.get(input_day)

In [40]:
day_to_tr(2)

'Salı'

In [41]:
day_to_tr_udf = spark.udf.register("day_to_tr_udf", day_to_tr, StringType())

In [42]:
df_day_of_week.filter("orderStatus not in ('CANCELED')") \
.groupBy('Day').agg(F.sum("orderItemSubTotal").alias("Sum_SubTotal")) \
.orderBy(F.desc("Sum_SubTotal")) \
.withColumn("Day_TR", day_to_tr_udf(F.col("Day"))) \
.limit(10).toPandas()

Unnamed: 0,Day,Sum_SubTotal,Day_TR
0,6,5065099.0,Cumartesi
1,5,4878165.0,Cuma
2,7,4862228.0,Pazar
3,3,4809500.0,Çarşamba
4,4,4805157.0,Perşembe
5,1,4750555.0,Pazartesi
6,2,4455886.0,Salı


# 1.7. Bütün bu tablolardan mümkün olan en büyük tabloyu oluşturup hive test1 veri tabanına retail_all adında bir tabloya yazınız.

In [43]:
! ls /home/train/datasets/retail_db/

categories.csv	departments.csv  orders.csv
customers.csv	order_items.csv  products.csv


In [44]:
categories.filter("categoryDepartmentId = 8").limit(8).toPandas().head(8)

Unnamed: 0,categoryId,categoryDepartmentId,categoryName
0,49,8,MLB
1,50,8,NFL
2,51,8,NHL
3,52,8,NBA
4,53,8,NCAA
5,54,8,MLS
6,55,8,International Soccer
7,56,8,World Cup Shop


In [45]:
departments.limit(8).toPandas().head(8)

Unnamed: 0,departmentId,departmentName
0,2,Fitness
1,3,Footwear
2,4,Apparel
3,5,Golf
4,6,Outdoors
5,7,Fan Shop
6,8,Others


In [46]:
orders.limit(1).toPandas()

Unnamed: 0,orderId,orderDate,orderCustomerId,orderStatus
0,1,2013-07-25 00:00:00.0,11599,CLOSED


In [47]:
customers.limit(1).toPandas()

Unnamed: 0,customerId,customerFName,customerLName,customerEmail,customerPassword,customerStreet,customerCity,customerState,customerZipcode
0,1,Richard,Hernandez,XXXXXXXXX,XXXXXXXXX,6303 Heather Plaza,Brownsville,TX,78521


In [48]:
order_items.limit(1).toPandas()

Unnamed: 0,orderItemName,orderItemOrderId,orderItemProductId,orderItemQuantity,orderItemSubTotal,orderItemProductPrice
0,1,1,957,1,299.98,299.98


In [49]:
products.limit(1).toPandas()

Unnamed: 0,productId,productCategoryId,productName,productDescription,productPrice,productImage
0,1,2,Quest Q64 10 FT. x 10 FT. Slant Leg Instant U,,59.98,http://images.acmesports.sports/Quest+Q64+10+F...


In [50]:
categories.select("categoryDepartmentId").distinct().show()

+--------------------+
|categoryDepartmentId|
+--------------------+
|                   6|
|                   3|
|                   5|
|                   4|
|                   8|
|                   7|
|                   2|
+--------------------+



In [51]:
departments.select("departmentId").distinct().show()

+------------+
|departmentId|
+------------+
|           6|
|           3|
|           5|
|           4|
|           8|
|           7|
|           2|
+------------+



In [52]:
print(departments.count())
print(categories.count())

7
58


In [53]:
cat_dep_prod = products.join(categories, products.productCategoryId == categories.categoryId, "left") \
.join(departments, categories.categoryDepartmentId == departments.departmentId, "left")


cat_dep_prod.limit(1).toPandas()

Unnamed: 0,productId,productCategoryId,productName,productDescription,productPrice,productImage,categoryId,categoryDepartmentId,categoryName,departmentId,departmentName
0,1,2,Quest Q64 10 FT. x 10 FT. Slant Leg Instant U,,59.98,http://images.acmesports.sports/Quest+Q64+10+F...,2,2,Soccer,2,Fitness


In [54]:
cat_dep_prod.count()

1345

In [55]:
print(departments.count())
print(categories.count())
print(products.count())
print(cat_dep_prod.count())

7
58
1345
1345


In [56]:
orders_and_items_cust = order_items.join(orders, order_items['orderItemOrderId'] == orders['orderId']) \
.join(customers, orders.orderCustomerId == customers.customerId)

In [57]:
orders_and_items_cust.limit(1).toPandas()

Unnamed: 0,orderItemName,orderItemOrderId,orderItemProductId,orderItemQuantity,orderItemSubTotal,orderItemProductPrice,orderId,orderDate,orderCustomerId,orderStatus,customerId,customerFName,customerLName,customerEmail,customerPassword,customerStreet,customerCity,customerState,customerZipcode
0,1,1,957,1,299.98,299.98,1,2013-07-25 00:00:00.0,11599,CLOSED,11599,Mary,Malone,XXXXXXXXX,XXXXXXXXX,8708 Indian Horse Highway,Hickory,NC,28601


In [58]:
orders_and_items_cust.count()

172198

In [59]:
all_in_one = orders_and_items_cust.join(cat_dep_prod, orders_and_items_cust.orderItemProductId == cat_dep_prod.productId)

In [60]:
all_in_one.count()

172198

In [61]:
all_in_one.limit(2).toPandas()

Unnamed: 0,orderItemName,orderItemOrderId,orderItemProductId,orderItemQuantity,orderItemSubTotal,orderItemProductPrice,orderId,orderDate,orderCustomerId,orderStatus,...,productCategoryId,productName,productDescription,productPrice,productImage,categoryId,categoryDepartmentId,categoryName,departmentId,departmentName
0,144513,57760,858,1,199.99,199.99,57760,2013-07-25 00:00:00.0,8330,PENDING_PAYMENT,...,38,GolfBuddy VT3 GPS Watch,,199.99,http://images.acmesports.sports/GolfBuddy+VT3+...,38,6,Kids' Golf Clubs,6,Outdoors
1,144738,57847,858,1,199.99,199.99,57847,2013-07-28 00:00:00.0,9548,COMPLETE,...,38,GolfBuddy VT3 GPS Watch,,199.99,http://images.acmesports.sports/GolfBuddy+VT3+...,38,6,Kids' Golf Clubs,6,Outdoors


In [62]:
all_in_one.write.format("orc") \
.mode("overwrite") \
.saveAsTable("test1.retail_all")

In [63]:
spark.stop()