### Import libraries

In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, SQLContext
import os

### Set Java home

In [2]:
# Point PySpark at the local JDK installation.
# A raw string is used so the backslashes in the Windows path are taken literally
# instead of being parsed as (invalid) escape sequences such as "\P" and "\J".
os.environ["JAVA_HOME"] = r"C:\Program Files\Java\jdk-18.0.2.1"

### Start Spark Session

In [3]:
# Spark configuration for a local run that uses every available core ("local[*]").
# Both the driver and the executors get G:/pyspark/* on their classpath
# (presumably the folder holding the SQL Server JDBC driver jar - confirm locally).
conf = (
    SparkConf()
    .setAppName("Example")
    .setMaster("local[*]")
    .set("spark.driver.extraClassPath", "G:/pyspark/*")
    .set("spark.executor.extraClassPath", "G:/pyspark/*")
)

In [4]:
# Create (or reuse) the session through the public builder API rather than calling the
# SparkSession constructor directly; getOrCreate() makes this cell safe to re-run.
spark = SparkSession.builder.config(conf=conf).getOrCreate()
# Keep the `sc` name available for any cell that needs the underlying SparkContext.
sc = spark.sparkContext

In [5]:
# Display the SparkSession object as the cell output to confirm it was created.
spark

### Read data

In [6]:
# Connection parameters for the source database.
database = "AdventureWorksDW2019"
table = "DimProduct"
# Credentials are read from environment variables so they never appear in the notebook;
# a missing variable raises KeyError here.
# NOTE(review): the PG* names suggest PostgreSQL, but they are used for SQL Server below.
password = os.environ['PGPASS']
user = os.environ['PGUID']
schema  = "dbo"

In [7]:
# JDBC connection string for the local SQL Server instance on port 1433.
# trustServerCertificate=true accepts the server certificate without validating it.
jdbc_url = (
    "jdbc:sqlserver://localhost:1433;"
    f"database={database};"
    "encrypt=true;"
    "trustServerCertificate=true;"
)

In [8]:
# Read {schema}.{table} (dbo.DimProduct) into a Spark DataFrame over JDBC,
# using the Microsoft SQL Server driver and the credentials defined above.
df = (
    spark.read
    .format("jdbc")
    .option("url", jdbc_url)
    .option("dbtable", f"{schema}.{table}")
    .option("user", user)
    .option("password", password)
    .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")
    .load()
)

In [9]:
# Print the DataFrame as a text table; every column is included, so the rows are very wide.
df.show()

+----------+-------------------+---------------------+---------------------+-------------------+--------------------+------------------+-----------------+------------+-----------------+------+----------------+------------+---------+----+---------+------+-----------------+-----------+-----------+-----+-----+---------+--------------------+------------------+-----------------+------------------+-----------------+-----------------+---------------+-----------------+-------------------+------------------+-------------------+-------+-------+
|ProductKey|ProductAlternateKey|ProductSubcategoryKey|WeightUnitMeasureCode|SizeUnitMeasureCode|  EnglishProductName|SpanishProductName|FrenchProductName|StandardCost|FinishedGoodsFlag| Color|SafetyStockLevel|ReorderPoint|ListPrice|Size|SizeRange|Weight|DaysToManufacture|ProductLine|DealerPrice|Class|Style|ModelName|          LargePhoto|EnglishDescription|FrenchDescription|ChineseDescription|ArabicDescription|HebrewDescription|ThaiDescription|GermanDescri

In [10]:
# Convert only the first 10 rows to pandas so the notebook renders them as a table.
df.limit(10).toPandas()

Unnamed: 0,ProductKey,ProductAlternateKey,ProductSubcategoryKey,WeightUnitMeasureCode,SizeUnitMeasureCode,EnglishProductName,SpanishProductName,FrenchProductName,StandardCost,FinishedGoodsFlag,...,ChineseDescription,ArabicDescription,HebrewDescription,ThaiDescription,GermanDescription,JapaneseDescription,TurkishDescription,StartDate,EndDate,Status
0,1,AR-5381,,,,Adjustable Race,,,,False,...,,,,,,,,2003-07-01,NaT,Current
1,2,BA-8327,,,,Bearing Ball,,,,False,...,,,,,,,,2003-07-01,NaT,Current
2,3,BE-2349,,,,BB Ball Bearing,,,,False,...,,,,,,,,2003-07-01,NaT,Current
3,4,BE-2908,,,,Headset Ball Bearings,,,,False,...,,,,,,,,2003-07-01,NaT,Current
4,5,BL-2036,,,,Blade,,,,False,...,,,,,,,,2003-07-01,NaT,Current
5,6,CA-5965,,,,LL Crankarm,,,,False,...,,,,,,,,2003-07-01,NaT,Current
6,7,CA-6738,,,,ML Crankarm,,,,False,...,,,,,,,,2003-07-01,NaT,Current
7,8,CA-7457,,,,HL Crankarm,,,,False,...,,,,,,,,2003-07-01,NaT,Current
8,9,CB-2903,,,,Chainring Bolts,,,,False,...,,,,,,,,2003-07-01,NaT,Current
9,10,CN-6137,,,,Chainring Nut,,,,False,...,,,,,,,,2003-07-01,NaT,Current


###  PySpark DataFrame API

In [11]:
# Rename EnglishProductName to ProductName. `df` is rebound, so every later cell
# sees the new column name.
df = df.withColumnRenamed("EnglishProductName","ProductName")
df.show()

+----------+-------------------+---------------------+---------------------+-------------------+--------------------+------------------+-----------------+------------+-----------------+------+----------------+------------+---------+----+---------+------+-----------------+-----------+-----------+-----+-----+---------+--------------------+------------------+-----------------+------------------+-----------------+-----------------+---------------+-----------------+-------------------+------------------+-------------------+-------+-------+
|ProductKey|ProductAlternateKey|ProductSubcategoryKey|WeightUnitMeasureCode|SizeUnitMeasureCode|         ProductName|SpanishProductName|FrenchProductName|StandardCost|FinishedGoodsFlag| Color|SafetyStockLevel|ReorderPoint|ListPrice|Size|SizeRange|Weight|DaysToManufacture|ProductLine|DealerPrice|Class|Style|ModelName|          LargePhoto|EnglishDescription|FrenchDescription|ChineseDescription|ArabicDescription|HebrewDescription|ThaiDescription|GermanDescri

In [12]:
# Row count of the product DataFrame.
df.count()

606

In [13]:
# Select a subset of columns.
# NOTE: `df` is overwritten, so the remaining columns are no longer available to later
# cells without re-reading the table.
df = df.select("ProductKey", "ProductName", "Color")
df.show()

+----------+--------------------+------+
|ProductKey|         ProductName| Color|
+----------+--------------------+------+
|         1|     Adjustable Race|    NA|
|         2|        Bearing Ball|    NA|
|         3|     BB Ball Bearing|    NA|
|         4|Headset Ball Bear...|    NA|
|         5|               Blade|    NA|
|         6|         LL Crankarm| Black|
|         7|         ML Crankarm| Black|
|         8|         HL Crankarm| Black|
|         9|     Chainring Bolts|Silver|
|        10|       Chainring Nut|Silver|
|        11|           Chainring| Black|
|        12|          Crown Race|    NA|
|        13|         Chain Stays|    NA|
|        14|             Decal 1|    NA|
|        15|             Decal 2|    NA|
|        16|           Down Tube|    NA|
|        17|   Mountain End Caps|    NA|
|        18|       Road End Caps|    NA|
|        19|    Touring End Caps|    NA|
|        20|            Fork End|    NA|
+----------+--------------------+------+
only showing top

### DataFrame sort operation

In [14]:
# Sort by ProductName (ascending) and print the result. In the output below, upper-case
# letters order before lower-case ones ("AWC ..." precedes "Adjustable ...").
df.sort("ProductName").show()

+----------+--------------------+------+
|ProductKey|         ProductName| Color|
+----------+--------------------+------+
|       225|        AWC Logo Cap| Multi|
|       224|        AWC Logo Cap| Multi|
|       223|        AWC Logo Cap| Multi|
|         1|     Adjustable Race|    NA|
|       486|All-Purpose Bike ...|    NA|
|         3|     BB Ball Bearing|    NA|
|         2|        Bearing Ball|    NA|
|       484|Bike Wash - Disso...|    NA|
|         5|               Blade|    NA|
|       447|          Cable Lock|    NA|
|       559|               Chain|Silver|
|        13|         Chain Stays|    NA|
|        11|           Chainring| Black|
|         9|     Chainring Bolts|Silver|
|        10|       Chainring Nut|Silver|
|       473|     Classic Vest, L|  Blue|
|       472|     Classic Vest, M|  Blue|
|       471|     Classic Vest, S|  Blue|
|       178|    Cone-Shaped Race|    NA|
|        12|          Crown Race|    NA|
+----------+--------------------+------+
only showing top

In [15]:
# Sort by ProductName in descending order using functions.desc.
# The `F` alias imported here is also used by later cells.
from pyspark.sql import functions as F
df.sort(F.desc("ProductName")).show()

+----------+--------------------+------+
|ProductKey|         ProductName| Color|
+----------+--------------------+------+
|       456|   Women's Tights, S| Black|
|       457|   Women's Tights, M| Black|
|       458|   Women's Tights, L| Black|
|       474|Women's Mountain ...| Black|
|       475|Women's Mountain ...| Black|
|       476|Women's Mountain ...| Black|
|       477|Water Bottle - 30...|    NA|
|       446|Touring-Panniers,...|  Grey|
|       572|Touring-3000 Yell...|Yellow|
|       571|Touring-3000 Yell...|Yellow|
|       570|Touring-3000 Yell...|Yellow|
|       569|Touring-3000 Yell...|Yellow|
|       568|Touring-3000 Yell...|Yellow|
|       567|Touring-3000 Blue...|  Blue|
|       566|Touring-3000 Blue...|  Blue|
|       565|Touring-3000 Blue...|  Blue|
|       586|Touring-3000 Blue...|  Blue|
|       585|Touring-3000 Blue...|  Blue|
|       560|Touring-2000 Blue...|  Blue|
|       579|Touring-2000 Blue...|  Blue|
+----------+--------------------+------+
only showing top

In [16]:
# Print the column names, types and nullability of the current `df`.
df.printSchema()

root
 |-- ProductKey: integer (nullable = true)
 |-- ProductName: string (nullable = true)
 |-- Color: string (nullable = true)



### DataFrame filter operation

In [17]:
# Project two columns by name, then filter with a SQL expression string.
df.select("ProductKey", "ProductName").filter("ProductKey = 22").show()

+----------+-------------+
|ProductKey|  ProductName|
+----------+-------------+
|        22|Flat Washer 1|
+----------+-------------+



In [18]:
# Same query, selecting the columns through attribute access (df.ProductKey) instead of names.
df.select(df.ProductKey, df.ProductName).filter("ProductKey = 22").show()

+----------+-------------+
|ProductKey|  ProductName|
+----------+-------------+
|        22|Flat Washer 1|
+----------+-------------+



In [19]:
# Substring filter on the product name.
# Spark evaluates LIKE case-sensitively, so the lower-case pattern '%helmet%' matches
# "Helmet" only when the predicate is pushed down to a case-insensitive database
# collation. Lower-casing the column makes the case-insensitive match explicit and
# independent of the source database (the filter is then evaluated by Spark).
df.select("ProductKey", "ProductName").filter("lower(ProductName) like '%helmet%'").show()

+----------+--------------------+
|ProductKey|         ProductName|
+----------+--------------------+
|       212|Sport-100 Helmet,...|
|       213|Sport-100 Helmet,...|
|       214|Sport-100 Helmet,...|
|       215|Sport-100 Helmet,...|
|       216|Sport-100 Helmet,...|
|       217|Sport-100 Helmet,...|
|       220|Sport-100 Helmet,...|
|       221|Sport-100 Helmet,...|
|       222|Sport-100 Helmet,...|
+----------+--------------------+



In [20]:
# Combine two Column conditions with `&` (each side must be parenthesised).
# The product name is lower-cased before LIKE because Spark's LIKE is case-sensitive;
# without it, '%helmet%' matches "Helmet" only if the predicate is pushed down to a
# case-insensitive database collation.
df.filter((F.lower(df.ProductName).like('%helmet%')) & (df.Color=='Black')).show()

+----------+--------------------+-----+
|ProductKey|         ProductName|Color|
+----------+--------------------+-----+
|       215|Sport-100 Helmet,...|Black|
|       216|Sport-100 Helmet,...|Black|
|       217|Sport-100 Helmet,...|Black|
+----------+--------------------+-----+



### Spark SQL filter operations

In [21]:
# Register `df` as the temporary view "Product" so it can be queried with Spark SQL,
# then count its rows through SQL.
df.createOrReplaceTempView("Product")
spark.sql("select count(1) from Product").show()


+--------+
|count(1)|
+--------+
|     606|
+--------+



In [22]:
# Filter on ProductKey with Spark SQL against the "Product" temp view.
spark.sql("select ProductKey, ProductName from Product where ProductKey = 22").show()

+----------+-------------+
|ProductKey|  ProductName|
+----------+-------------+
|        22|Flat Washer 1|
+----------+-------------+



In [23]:
# Substring filter in Spark SQL.
# LIKE is case-sensitive in Spark; '%helmet%' matches "Helmet" only when the predicate
# is pushed down to a case-insensitive database collation. lower() makes the
# case-insensitive match explicit and independent of the source database.
spark.sql("select ProductKey, ProductName from Product where lower(ProductName) like '%helmet%'").show()

+----------+--------------------+
|ProductKey|         ProductName|
+----------+--------------------+
|       212|Sport-100 Helmet,...|
|       213|Sport-100 Helmet,...|
|       214|Sport-100 Helmet,...|
|       215|Sport-100 Helmet,...|
|       216|Sport-100 Helmet,...|
|       217|Sport-100 Helmet,...|
|       220|Sport-100 Helmet,...|
|       221|Sport-100 Helmet,...|
|       222|Sport-100 Helmet,...|
+----------+--------------------+



### Create a sales view

In [24]:
# Load the fact table with product sales transactions over the same JDBC connection.
# The schema comes from the `schema` variable (as in the DimProduct read) instead of a
# hard-coded "dbo" inside an f-string that had no placeholders.
sales = (
    spark.read
    .format("jdbc")
    .option("url", jdbc_url)
    .option("dbtable", f"{schema}.FactInternetSales")
    .option("user", user)
    .option("password", password)
    .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")
    .load()
)

In [25]:
# Expose the fact table to Spark SQL as the temporary view "sales".
sales.createOrReplaceTempView("sales")
# Mark the DataFrame for caching; cache() returns the DataFrame, which is what the cell displays.
sales.cache()

DataFrame[ProductKey: int, OrderDateKey: int, DueDateKey: int, ShipDateKey: int, CustomerKey: int, PromotionKey: int, CurrencyKey: int, SalesTerritoryKey: int, SalesOrderNumber: string, SalesOrderLineNumber: int, RevisionNumber: int, OrderQuantity: smallint, UnitPrice: decimal(19,4), ExtendedAmount: decimal(19,4), UnitPriceDiscountPct: double, DiscountAmount: double, ProductStandardCost: decimal(19,4), TotalProductCost: decimal(19,4), SalesAmount: decimal(19,4), TaxAmt: decimal(19,4), Freight: decimal(19,4), CarrierTrackingNumber: string, CustomerPONumber: string, OrderDate: timestamp, DueDate: timestamp, ShipDate: timestamp]

### Join the Product and Sales with SQL

In [26]:
# Total sales per helmet product: join the Product and sales views on ProductKey,
# aggregate SalesAmount per product name and order by the total, largest first.
# The name filter is qualified (p.ProductName) and lower-cased: Spark's LIKE is
# case-sensitive, so '%helmet%' would otherwise match "Helmet" only when the predicate
# is pushed down to a case-insensitive database collation.
spark.sql("""
SELECT
    p.ProductName,
    SUM(s.SalesAmount) AS SalesAmount
FROM Product p
    INNER JOIN sales s ON p.ProductKey = s.ProductKey
WHERE lower(p.ProductName) LIKE '%helmet%'
GROUP BY
    p.ProductName
ORDER BY
    SUM(s.SalesAmount) DESC"""
).show()

+--------------------+-----------+
|         ProductName|SalesAmount|
+--------------------+-----------+
|Sport-100 Helmet,...| 78027.7000|
|Sport-100 Helmet,...| 74353.7500|
|Sport-100 Helmet,...| 72954.1500|
+--------------------+-----------+



### Join two DataFrames

In [27]:
# Inner join of the sales facts with the product DataFrame on the shared ProductKey column.
# Passing the key as a list of names yields a single ProductKey column in the result
# (see the output below). Only the first 10 rows are converted to pandas for display.
salesjoined = sales.join(df, ['ProductKey'],how='inner')
salesjoined.limit(10).toPandas()

Unnamed: 0,ProductKey,OrderDateKey,DueDateKey,ShipDateKey,CustomerKey,PromotionKey,CurrencyKey,SalesTerritoryKey,SalesOrderNumber,SalesOrderLineNumber,...,SalesAmount,TaxAmt,Freight,CarrierTrackingNumber,CustomerPONumber,OrderDate,DueDate,ShipDate,ProductName,Color
0,214,20140128,20140209,20140204,15251,1,100,6,SO75121,3,...,34.99,2.7992,0.8748,,,2014-01-28,2014-02-09,2014-02-04,"Sport-100 Helmet, Red",Red
1,214,20140128,20140209,20140204,15160,1,100,4,SO75105,2,...,34.99,2.7992,0.8748,,,2014-01-28,2014-02-09,2014-02-04,"Sport-100 Helmet, Red",Red
2,214,20140128,20140209,20140204,11794,1,100,4,SO75087,1,...,34.99,2.7992,0.8748,,,2014-01-28,2014-02-09,2014-02-04,"Sport-100 Helmet, Red",Red
3,214,20140126,20140207,20140202,16175,1,100,9,SO75033,2,...,34.99,2.7992,0.8748,,,2014-01-26,2014-02-07,2014-02-02,"Sport-100 Helmet, Red",Red
4,214,20140125,20140206,20140201,20169,1,100,8,SO75020,3,...,34.99,2.7992,0.8748,,,2014-01-25,2014-02-06,2014-02-01,"Sport-100 Helmet, Red",Red
5,214,20140125,20140206,20140201,14168,1,100,7,SO75019,4,...,34.99,2.7992,0.8748,,,2014-01-25,2014-02-06,2014-02-01,"Sport-100 Helmet, Red",Red
6,214,20140125,20140206,20140201,17686,1,100,6,SO75016,4,...,34.99,2.7992,0.8748,,,2014-01-25,2014-02-06,2014-02-01,"Sport-100 Helmet, Red",Red
7,214,20140125,20140206,20140201,24635,1,100,4,SO75009,3,...,34.99,2.7992,0.8748,,,2014-01-25,2014-02-06,2014-02-01,"Sport-100 Helmet, Red",Red
8,214,20140125,20140206,20140201,11229,1,100,4,SO75004,2,...,34.99,2.7992,0.8748,,,2014-01-25,2014-02-06,2014-02-01,"Sport-100 Helmet, Red",Red
9,214,20140125,20140206,20140201,17305,1,100,9,SO75002,1,...,34.99,2.7992,0.8748,,,2014-01-25,2014-02-06,2014-02-01,"Sport-100 Helmet, Red",Red


### Group by with column alias

In [28]:
# Aggregate sales per (ProductName, Color): total and maximum SalesAmount,
# each exposed under an explicit column alias.
(
    salesjoined
    .groupBy(["ProductName", "Color"])
    .agg(
        F.sum("SalesAmount").alias("TotalSalesAmounted"),
        F.max("SalesAmount").alias("MaxSalesAmount"),
    )
    .show()
)

+--------------------+------+------------------+--------------+
|         ProductName| Color|TotalSalesAmounted|MaxSalesAmount|
+--------------------+------+------------------+--------------+
|Half-Finger Glove...| Black|        12220.5100|       24.4900|
|Touring-3000 Blue...|  Blue|        42313.9500|      742.3500|
|Touring-3000 Yell...|Yellow|        43798.6500|      742.3500|
|  Road-250 Black, 52| Black|       734401.2000|     2443.3500|
|Women's Mountain ...| Black|        24636.4800|       69.9900|
|  Road-650 Black, 48| Black|        45553.2394|      782.9900|
|    Road-650 Red, 58|   Red|        56347.3158|      782.9900|
|Mountain-200 Blac...| Black|      1363142.0934|     2294.9900|
|    Road-250 Red, 52|   Red|       324965.5500|     2443.3500|
|  Road-250 Black, 48| Black|       691206.2625|     2443.3500|
| Hitch Rack - 4-Bike|    NA|        39360.0000|      120.0000|
|  Road-650 Black, 52| Black|        66917.6806|      782.9900|
|    Road-250 Red, 48|   Red|       3958

### Create a new column based on calculation

In [29]:
# Add a derived column: NetSales = SalesAmount - TaxAmt.
saleswithNet = salesjoined.withColumn("NetSales", F.col("SalesAmount") - F.col("TaxAmt"))
saleswithNet.limit(10).toPandas()

Unnamed: 0,ProductKey,OrderDateKey,DueDateKey,ShipDateKey,CustomerKey,PromotionKey,CurrencyKey,SalesTerritoryKey,SalesOrderNumber,SalesOrderLineNumber,...,TaxAmt,Freight,CarrierTrackingNumber,CustomerPONumber,OrderDate,DueDate,ShipDate,ProductName,Color,NetSales
0,214,20140128,20140209,20140204,15251,1,100,6,SO75121,3,...,2.7992,0.8748,,,2014-01-28,2014-02-09,2014-02-04,"Sport-100 Helmet, Red",Red,32.1908
1,214,20140128,20140209,20140204,15160,1,100,4,SO75105,2,...,2.7992,0.8748,,,2014-01-28,2014-02-09,2014-02-04,"Sport-100 Helmet, Red",Red,32.1908
2,214,20140128,20140209,20140204,11794,1,100,4,SO75087,1,...,2.7992,0.8748,,,2014-01-28,2014-02-09,2014-02-04,"Sport-100 Helmet, Red",Red,32.1908
3,214,20140126,20140207,20140202,16175,1,100,9,SO75033,2,...,2.7992,0.8748,,,2014-01-26,2014-02-07,2014-02-02,"Sport-100 Helmet, Red",Red,32.1908
4,214,20140125,20140206,20140201,20169,1,100,8,SO75020,3,...,2.7992,0.8748,,,2014-01-25,2014-02-06,2014-02-01,"Sport-100 Helmet, Red",Red,32.1908
5,214,20140125,20140206,20140201,14168,1,100,7,SO75019,4,...,2.7992,0.8748,,,2014-01-25,2014-02-06,2014-02-01,"Sport-100 Helmet, Red",Red,32.1908
6,214,20140125,20140206,20140201,17686,1,100,6,SO75016,4,...,2.7992,0.8748,,,2014-01-25,2014-02-06,2014-02-01,"Sport-100 Helmet, Red",Red,32.1908
7,214,20140125,20140206,20140201,24635,1,100,4,SO75009,3,...,2.7992,0.8748,,,2014-01-25,2014-02-06,2014-02-01,"Sport-100 Helmet, Red",Red,32.1908
8,214,20140125,20140206,20140201,11229,1,100,4,SO75004,2,...,2.7992,0.8748,,,2014-01-25,2014-02-06,2014-02-01,"Sport-100 Helmet, Red",Red,32.1908
9,214,20140125,20140206,20140201,17305,1,100,9,SO75002,1,...,2.7992,0.8748,,,2014-01-25,2014-02-06,2014-02-01,"Sport-100 Helmet, Red",Red,32.1908


### Create a new column based on a condition

In [30]:
# Derive a Region label from SalesTerritoryKey:
#   7, 8, 10 -> "Europe", 9 -> "Pacific", anything else (including null) -> "Americas".
# (Presumably these keys follow the DimSalesTerritory table - confirm against it.)
saleswithNet = saleswithNet.withColumn(
    "Region",
    F.when(
        (F.col("SalesTerritoryKey") == 7)
        | (F.col("SalesTerritoryKey") == 8)
        | (F.col("SalesTerritoryKey") == 10),
        "Europe",
    )
    .when(F.col("SalesTerritoryKey") == 9, "Pacific")
    .otherwise("Americas"),
)
saleswithNet.limit(10).toPandas()

Unnamed: 0,ProductKey,OrderDateKey,DueDateKey,ShipDateKey,CustomerKey,PromotionKey,CurrencyKey,SalesTerritoryKey,SalesOrderNumber,SalesOrderLineNumber,...,Freight,CarrierTrackingNumber,CustomerPONumber,OrderDate,DueDate,ShipDate,ProductName,Color,NetSales,Region
0,214,20140128,20140209,20140204,15251,1,100,6,SO75121,3,...,0.8748,,,2014-01-28,2014-02-09,2014-02-04,"Sport-100 Helmet, Red",Red,32.1908,Americas
1,214,20140128,20140209,20140204,15160,1,100,4,SO75105,2,...,0.8748,,,2014-01-28,2014-02-09,2014-02-04,"Sport-100 Helmet, Red",Red,32.1908,Americas
2,214,20140128,20140209,20140204,11794,1,100,4,SO75087,1,...,0.8748,,,2014-01-28,2014-02-09,2014-02-04,"Sport-100 Helmet, Red",Red,32.1908,Americas
3,214,20140126,20140207,20140202,16175,1,100,9,SO75033,2,...,0.8748,,,2014-01-26,2014-02-07,2014-02-02,"Sport-100 Helmet, Red",Red,32.1908,Pacific
4,214,20140125,20140206,20140201,20169,1,100,8,SO75020,3,...,0.8748,,,2014-01-25,2014-02-06,2014-02-01,"Sport-100 Helmet, Red",Red,32.1908,Europe
5,214,20140125,20140206,20140201,14168,1,100,7,SO75019,4,...,0.8748,,,2014-01-25,2014-02-06,2014-02-01,"Sport-100 Helmet, Red",Red,32.1908,Europe
6,214,20140125,20140206,20140201,17686,1,100,6,SO75016,4,...,0.8748,,,2014-01-25,2014-02-06,2014-02-01,"Sport-100 Helmet, Red",Red,32.1908,Americas
7,214,20140125,20140206,20140201,24635,1,100,4,SO75009,3,...,0.8748,,,2014-01-25,2014-02-06,2014-02-01,"Sport-100 Helmet, Red",Red,32.1908,Americas
8,214,20140125,20140206,20140201,11229,1,100,4,SO75004,2,...,0.8748,,,2014-01-25,2014-02-06,2014-02-01,"Sport-100 Helmet, Red",Red,32.1908,Americas
9,214,20140125,20140206,20140201,17305,1,100,9,SO75002,1,...,0.8748,,,2014-01-25,2014-02-06,2014-02-01,"Sport-100 Helmet, Red",Red,32.1908,Pacific


In [31]:
# Check the distinct Region values by collecting them to the driver as a list of Rows.
saleswithNet.select('Region').distinct().collect()

[Row(Region='Europe'), Row(Region='Pacific'), Row(Region='Americas')]

### Replace null with zero

In [32]:
#Replace null with 0
from pyspark.sql.functions import when, lit
# Replace nulls in CarrierTrackingNumber with the string '0' using the built-in fillna
# (restricted to that one column via `subset`) instead of a hand-written
# when(isNull)/otherwise expression. The column is a string, hence the string value.
saleswithNet = saleswithNet.fillna('0', subset=['CarrierTrackingNumber'])
saleswithNet.limit(10).toPandas()

Unnamed: 0,ProductKey,OrderDateKey,DueDateKey,ShipDateKey,CustomerKey,PromotionKey,CurrencyKey,SalesTerritoryKey,SalesOrderNumber,SalesOrderLineNumber,...,Freight,CarrierTrackingNumber,CustomerPONumber,OrderDate,DueDate,ShipDate,ProductName,Color,NetSales,Region
0,214,20140128,20140209,20140204,15251,1,100,6,SO75121,3,...,0.8748,0,,2014-01-28,2014-02-09,2014-02-04,"Sport-100 Helmet, Red",Red,32.1908,Americas
1,214,20140128,20140209,20140204,15160,1,100,4,SO75105,2,...,0.8748,0,,2014-01-28,2014-02-09,2014-02-04,"Sport-100 Helmet, Red",Red,32.1908,Americas
2,214,20140128,20140209,20140204,11794,1,100,4,SO75087,1,...,0.8748,0,,2014-01-28,2014-02-09,2014-02-04,"Sport-100 Helmet, Red",Red,32.1908,Americas
3,214,20140126,20140207,20140202,16175,1,100,9,SO75033,2,...,0.8748,0,,2014-01-26,2014-02-07,2014-02-02,"Sport-100 Helmet, Red",Red,32.1908,Pacific
4,214,20140125,20140206,20140201,20169,1,100,8,SO75020,3,...,0.8748,0,,2014-01-25,2014-02-06,2014-02-01,"Sport-100 Helmet, Red",Red,32.1908,Europe
5,214,20140125,20140206,20140201,14168,1,100,7,SO75019,4,...,0.8748,0,,2014-01-25,2014-02-06,2014-02-01,"Sport-100 Helmet, Red",Red,32.1908,Europe
6,214,20140125,20140206,20140201,17686,1,100,6,SO75016,4,...,0.8748,0,,2014-01-25,2014-02-06,2014-02-01,"Sport-100 Helmet, Red",Red,32.1908,Americas
7,214,20140125,20140206,20140201,24635,1,100,4,SO75009,3,...,0.8748,0,,2014-01-25,2014-02-06,2014-02-01,"Sport-100 Helmet, Red",Red,32.1908,Americas
8,214,20140125,20140206,20140201,11229,1,100,4,SO75004,2,...,0.8748,0,,2014-01-25,2014-02-06,2014-02-01,"Sport-100 Helmet, Red",Red,32.1908,Americas
9,214,20140125,20140206,20140201,17305,1,100,9,SO75002,1,...,0.8748,0,,2014-01-25,2014-02-06,2014-02-01,"Sport-100 Helmet, Red",Red,32.1908,Pacific


In [33]:
# Drop a column: CustomerPONumber is removed from the DataFrame.
saleswithNet=saleswithNet.drop("CustomerPONumber")
saleswithNet.limit(10).toPandas()

Unnamed: 0,ProductKey,OrderDateKey,DueDateKey,ShipDateKey,CustomerKey,PromotionKey,CurrencyKey,SalesTerritoryKey,SalesOrderNumber,SalesOrderLineNumber,...,TaxAmt,Freight,CarrierTrackingNumber,OrderDate,DueDate,ShipDate,ProductName,Color,NetSales,Region
0,214,20140128,20140209,20140204,15251,1,100,6,SO75121,3,...,2.7992,0.8748,0,2014-01-28,2014-02-09,2014-02-04,"Sport-100 Helmet, Red",Red,32.1908,Americas
1,214,20140128,20140209,20140204,15160,1,100,4,SO75105,2,...,2.7992,0.8748,0,2014-01-28,2014-02-09,2014-02-04,"Sport-100 Helmet, Red",Red,32.1908,Americas
2,214,20140128,20140209,20140204,11794,1,100,4,SO75087,1,...,2.7992,0.8748,0,2014-01-28,2014-02-09,2014-02-04,"Sport-100 Helmet, Red",Red,32.1908,Americas
3,214,20140126,20140207,20140202,16175,1,100,9,SO75033,2,...,2.7992,0.8748,0,2014-01-26,2014-02-07,2014-02-02,"Sport-100 Helmet, Red",Red,32.1908,Pacific
4,214,20140125,20140206,20140201,20169,1,100,8,SO75020,3,...,2.7992,0.8748,0,2014-01-25,2014-02-06,2014-02-01,"Sport-100 Helmet, Red",Red,32.1908,Europe
5,214,20140125,20140206,20140201,14168,1,100,7,SO75019,4,...,2.7992,0.8748,0,2014-01-25,2014-02-06,2014-02-01,"Sport-100 Helmet, Red",Red,32.1908,Europe
6,214,20140125,20140206,20140201,17686,1,100,6,SO75016,4,...,2.7992,0.8748,0,2014-01-25,2014-02-06,2014-02-01,"Sport-100 Helmet, Red",Red,32.1908,Americas
7,214,20140125,20140206,20140201,24635,1,100,4,SO75009,3,...,2.7992,0.8748,0,2014-01-25,2014-02-06,2014-02-01,"Sport-100 Helmet, Red",Red,32.1908,Americas
8,214,20140125,20140206,20140201,11229,1,100,4,SO75004,2,...,2.7992,0.8748,0,2014-01-25,2014-02-06,2014-02-01,"Sport-100 Helmet, Red",Red,32.1908,Americas
9,214,20140125,20140206,20140201,17305,1,100,9,SO75002,1,...,2.7992,0.8748,0,2014-01-25,2014-02-06,2014-02-01,"Sport-100 Helmet, Red",Red,32.1908,Pacific
