### 1. Important Links:

a. https://sparkbyexamples.com/pyspark-tutorial/

b. https://www.youtube.com/watch?v=3kX5ry0RCOQ&list=PL3N9eeOlCrP7MKqbOG3WL_zSJrEmJXLPx

c. https://www.sqlite.org/windowfunctions.html#:~:text=A%20window%20function%20is%20an,it%20is%20a%20window%20function.

### 2. Initialize Spark

In [None]:
import findspark
findspark.init()
import pyspark


from pyspark.sql import SparkSession
# Build the session exactly once. Calling a bare getOrCreate() first creates the
# session with default settings, after which the app name, master and the static
# warehouse-dir option given to a second builder are not reliably applied.
spark = (
    SparkSession.builder
    .appName("001")
    .master("local")
    .config("spark.sql.warehouse.dir", "file:///C:/tmp/hive")
    .getOrCreate()
)

In [2]:
from pyspark.sql import Row
from pyspark.sql.functions import instr,col,column, expr, coalesce, broadcast
from pyspark.sql.functions import count, first, last,min, max, sum, avg, sumDistinct,sum_distinct, pow, desc, countDistinct, approx_count_distinct, dense_rank, rank
from pyspark.sql.functions import corr, round, bround, monotonically_increasing_id, mean, format_number, var_pop, stddev_pop,var_samp, stddev_samp, skewness, kurtosis, covar_pop, covar_samp
from pyspark.sql.functions import initcap , lower, upper, lit, ltrim, rtrim, rpad, lpad, trim, regexp_replace, regexp_extract, translate, collect_set, collect_list
from pyspark.sql.functions import when,current_date, current_timestamp, date_add, date_sub, datediff, to_date, months_between, dayofmonth,hour,dayofyear,month,year,weekofyear,date_format
from pyspark.sql.window import Window

### 3. Python Complex DataTypes

List

In [None]:
# A list is an ordered, mutable sequence that may mix element types.
a = [5,10,15,20,25,30,'xyz',40]
print("a[2] = ", a[2])      # a[2] =  15
print("a[0:3] = ", a[0:3])  # a[0:3] =  [5, 10, 15]
print("a[5:] = ", a[5:])    # a[5:] =  [30, 'xyz', 40]
print("a[:5] = ", a[:5])    # a[:5] =  [5, 10, 15, 20, 25]
# Lists are mutable: the value of an element can be changed in place.
a[2] = 4
print(a)                    # [5, 10, 4, 20, 25, 30, 'xyz', 40]

Tuple (Immutable List)

In [None]:
# A tuple supports indexing and slicing like a list but cannot be modified.
t = (5, 'program', 1 + 3j)
second_item = t[1]
print(second_item)   # program
first_three = t[:3]
print(first_three)   # (5, 'program', (1+3j))
# t[0] = 10  # raises TypeError: tuples are immutable

In [None]:
# Python strings: single-line and triple-quoted (multi-line) literals.
s = "This is a string"
# The triple-quoted literal below replaces the value assigned above.
s = '''A multiline
string'''
fifth_char = s[4]
print(fifth_char)    # l
middle_part = s[6:11]
print(middle_part)   # iline
# s[5] = 'd'  # raises TypeError: strings are immutable in Python

Set (a set is an unordered collection of unique elements, so indexing has no meaning and the slicing operator [] does not work.)

In [None]:
# Building a set from values with repeats keeps each distinct value once.
a = set([5, 2, 3, 1, 1, 1, 1, 1, 1, 1, 4])
print(a)         # {1, 2, 3, 4, 5}
print(type(a))   # <class 'set'>

Dictionary (a collection of key-value pairs; values are looked up by key, not by position.)

In [None]:
# Keys may be of different types; each value is retrieved through its key.
d = {1: 'value', 'key': 2}
print(type(d))      # <class 'dict'>
print(d[1])         # value
print(d['key'])     # 2
# print("d[2] = ", d[2])  # raises KeyError: 2 is a value here, not a key

### Create DataFrame
A DataFrame can be created from:
(1) a list, (2) an RDD, (3) data read from disk

In [11]:
# spark.range(1000) builds a single-column DataFrame of consecutive numbers
# starting at 0; toDF renames that column to "Column1".
df = spark.range(1000).toDF("Column1") 
# take(3) returns the first three rows as a list of Row objects.
df.take(3)

[Row(Column1=0), Row(Column1=1), Row(Column1=2)]

In [19]:
# col() and column() each build a Column expression from a name. The results are
# discarded here; the two calls only illustrate the syntax.
col("someColumnName")
column("someColumnName")
# Sample inputs reused by the DataFrame-creation cells below: the column names,
# the records as a list of tuples, and the same records as a list of dicts.
columns = ["language","users_count"] #list
data = [("Java", "20000"), ("Python", "100000")]  #list
ddata = [{"language":"Java", "users_count":"20000"}, {"language":"Python","users_count": "100000"}]  #list

# Distribute the list of tuples as an RDD.
rdd = spark.sparkContext.parallelize(data) #RDD

# Row('language', 'users_count') creates a Row "class" with those field names;
# mapping it over the RDD turns every tuple into a named Row.
Person = Row('language', 'users_count') # Row
person = rdd.map(lambda r: Person(*r)) #PipelinedRDD

Creating DataFrame from List

In [None]:
# Without column names Spark falls back to _1, _2, ...; names can be passed as a
# list, and a list of dicts supplies the names through its keys.
spark.createDataFrame(data).collect() # [Row(_1='Java', _2='20000'), Row(_1='Python', _2='100000')]
spark.createDataFrame(data, ['language', 'users_count']).collect() #[Row(language='Java', users_count='20000'), Row(langu..)]
spark.createDataFrame(data, columns).collect() #[Row(language='Java', users_count='20000'), Row(language='Python', users_..)]
spark.createDataFrame(ddata).collect()#[Row(language='Java', users_count='20000'), Row(language='Python', users_count='100000')]

Creating DataFrame from RDD

In [None]:
# An RDD of tuples converts with toDF() or createDataFrame(); column names are
# optional in both cases.
rdd.toDF().collect()

spark.createDataFrame(rdd).collect()                  #[Row(_1='Java', _2='20000'), Row(_1='Python', _2='100000')]
spark.createDataFrame(rdd, ["language","users_count"]).collect()  #[Row(language='Java', users_count='20000'), Row(langu..)]

# `person` is an RDD of named Rows, so the column names come from the Row fields.
spark.createDataFrame(person).collect() #[Row(language='Java', users_count='20000'), Row(language='Python', users_count='100000')]

In [None]:
# toDF() with and without explicit column names.
rdd.toDF().collect() #[Row(_1='Java', _2='20000'), Row(_1='Python', _2='100000')]
rdd.toDF(columns).collect() #[Row(language='Java', users_count='20000'), Row(language='Python', users_count='100000')]

Creating DataFrame from File

In [13]:
# Load the sample people file; the schema is inferred from the JSON records.
df = spark.read.json("data/people.json")
from pyspark.sql import Row
df.printSchema()
# The default save mode is errorIfExists, so a plain save() fails as soon as this
# cell is run a second time. Overwriting keeps the cell re-runnable.
df.write.format("json").mode("overwrite").save("data/people2.json")

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [18]:
type(df.head())             # pyspark.sql.types.Row
type(df.head(3))            # list of Row
#type(df.take())             # error: take() requires the number of rows
type(df.take(3))            # list of Row
type(df.collect())          # list of Row
type(df.columns)            # list of column names
type(df['age'])             # pyspark.sql.column.Column
type(df.select('age'))      # pyspark.sql.dataframe.DataFrame


df.columns         # ['age', 'name']
df.count()         # 3
df.describe()      # DataFrame[summary: string, age: string, name: string]
df.select('age')   # DataFrame[age: bigint]
df.first()         # returns the first Row
df.collect()       # list with every Row of the DataFrame, e.g. [Row(age=None, name='Michael'), ...]
df.head(3)          # top 3 rows as a list of Row
df.take(3)         # same rows as head(3), also as a list
# df.head() returns the top Row itself, whereas df.head(1) and df.take(1)
# both return a one-element list containing that Row.


# show() prints the selected columns as a text table.
df.select('age').show()
df.select('age','name').show()
df.select(['age','name']).show()
# withColumn adds a derived column; a null age stays null after the multiplication.
df.withColumn('newage',df['age']*10).show()


+----+
| age|
+----+
|null|
|  30|
|  19|
+----+

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+

+----+-------+------+
| age|   name|newage|
+----+-------+------+
|null|Michael|  null|
|  30|   Andy|   300|
|  19| Justin|   190|
+----+-------+------+



#### Col, Column, Expr

In [None]:
from pyspark.sql.functions import expr, col, column, lit

# Load the flight summary once. The equivalent long form is
# spark.read.format("json").load('data/2015-summary.json').
df = spark.read.json('data/2015-summary.json')

# Four interchangeable ways to refer to a column in select().
df.select("DEST_COUNTRY_NAME").show(2)
df.select(expr("DEST_COUNTRY_NAME")).show(2)
df.select(col("DEST_COUNTRY_NAME")).show(2)
df.select(column("DEST_COUNTRY_NAME")).show(2)

# The reference styles can also be mixed inside a single select().
df.select(
    expr("DEST_COUNTRY_NAME"),
    col("DEST_COUNTRY_NAME"),
    column("DEST_COUNTRY_NAME")).show(2)

# expr() accepts SQL, including "AS" aliases; a later .alias() renames the result again.
df.select(expr("DEST_COUNTRY_NAME as destination")).show(2)
df.select(expr("DEST_COUNTRY_NAME as destination").alias("DEST_COUNTRY_NAME")).show(2)

# selectExpr(...) is shorthand for select(expr(...), ...).
df.selectExpr("DEST_COUNTRY_NAME as newColumnName", "DEST_COUNTRY_NAME").show(2)

df.selectExpr("*", "(DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME) as withinCountry").show(2)
# SQL equivalent:
# SELECT *, (DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME) as withinCountry FROM dfTable LIMIT 2

# Aggregations over the whole DataFrame can be written as expressions as well.
df.selectExpr("avg(count)", "count(distinct(DEST_COUNTRY_NAME))").show(2)

# lit() turns a Python value into a constant column.
df.select(expr("*"), lit(1).alias("One")).show(2)

#### Adding ,Renaming, Dropping, Casting Columns

In [None]:
# Add a constant column, then a boolean column computed from two existing columns
# (once with withColumn, once with selectExpr).
df.withColumn("numberOne", lit(1)).show(2)
df.withColumn("withinCountry", expr("ORIGIN_COUNTRY_NAME == DEST_COUNTRY_NAME")).show(2)
df.selectExpr( "*" ,"ORIGIN_COUNTRY_NAME == DEST_COUNTRY_NAME").show(2)

# Rename a column.
df.withColumnRenamed("DEST_COUNTRY_NAME", "dest").show(2)

# Drop a column.
df.drop("ORIGIN_COUNTRY_NAME").show(2)

# Cast `count` to long under a new name and drop the original column.
df.withColumn("countLong", col("count").cast("long")).drop("count").show(2)

#### Filter, Where 

In [None]:
from pyspark.sql.functions import expr, col, column
# NOTE: this rebinds `df` to the Apple stock CSV; the flight data loaded earlier
# is no longer reachable through `df` after this line.
df = spark.read.csv('data/appl_stock.csv',inferSchema=True,header=True)

# filter() accepts a SQL string or a Column condition. Column conditions are
# combined with & / | and each side must be wrapped in parentheses.
df.filter("Close<500").show(2)
df.filter('Close<500').select('open','close').show(2)
df.filter(df['close']<500).select(['open','close']).show(2)
df.filter((df['Open'] <200) & (df['close'] > 200)).show(2)
df.filter((col('Close')<500) & (col('Close')>200) ).show(2)

# where() is used here exactly like filter(); chained calls apply both conditions.
df.where("Open < 213").show(2)
df.where("Open < 213").where(col('High')> 215).show(2)
df.where(col("Open")< 213).where('High > 215').select('Open','Close','High','Low').show(2)
df.where((col('Close')<500) & (col('Close')>200) ).show(2)

# Equality and the two spellings of "not equal" inside SQL strings.
df.where('High = 215.23').show(2)
df.where('High != 215.23').show(2)
df.where('High <> 215.23').show(2)

In [None]:
from pyspark.sql.functions import instr, expr

dff = spark.read.csv('data/online-retail-dataset.csv', inferSchema=True, header=True)

# Boolean Column expressions can be named and combined before they are used.
DOTCodeFilter = col("StockCode") == "DOT"
priceFilter = col("UnitPrice") > 600
descripFilter = instr(col("Description"), "POSTAGE") >= 1

# Materialise the combined condition as a column and filter on it.
(
    dff.withColumn("isExpensive", DOTCodeFilter & (priceFilter | descripFilter))
    .where("isExpensive")
    .select("unitPrice", "isExpensive")
    .show(5)
)

# The same idea with the condition written as a SQL expression.
(
    dff.withColumn("isExpensive", expr("NOT UnitPrice <= 250"))
    .where("isExpensive")
    .select("Description", "UnitPrice")
    .show(5)
)

# Null-safe equality test against a literal.
dff.where(col("Description").eqNullSafe("hello")).show()

# instr() returns the 1-based position of the substring (0 when absent).
containsBlack = instr(col("Description"), "BLACK") >= 1
containsWhite = instr(col("Description"), "WHITE") >= 1

(
    dff.withColumn("hasSimpleColor", containsBlack | containsWhite)
    .where("hasSimpleColor")
    .select("Description")
    .show(3, False)
)

#### Distinct

In [None]:
# The Filter/Where cell rebinds `df` to the Apple stock data, so when the notebook
# is run top to bottom the country columns used below no longer exist on `df`.
# Reload the flight summary so this cell (and the flight-based cells that follow
# it) work from a fresh kernel.
df = spark.read.json('data/2015-summary.json')

# Distinct values of one column, and how many there are.
df.select("ORIGIN_COUNTRY_NAME").distinct().show()
df.select("ORIGIN_COUNTRY_NAME").distinct().count()                          #125 unique Country

df.show()
# Distinct combinations of two columns.
df.select("ORIGIN_COUNTRY_NAME", "DEST_COUNTRY_NAME").distinct().show()
df.select("ORIGIN_COUNTRY_NAME", "DEST_COUNTRY_NAME").distinct().count()     #125 unique combination

#### Random Samples & Splits

In [None]:
# Draw a random sample without replacement; the fixed seed makes it repeatable.
seed = 5
withReplacement = False
fraction = 0.5
df.sample(withReplacement, fraction, seed).count()

# randomSplit returns one DataFrame per weight in the list.
dataFrames = df.randomSplit([0.25, 0.75], seed)# Splits into array of df
dataFrames[0].count() > dataFrames[1].count() # False

#### Union

In [None]:
from pyspark.sql import Row
# Reuse the schema of `df` so the new rows line up with it for the union.
schema = df.schema
# Positional Rows: values are presumably given in the schema's column order — verify.
newRows = [Row("New Country", "Other Country", 5),Row("New Country 2", "Other Country 3", 1)]
parallelizedRows = spark.sparkContext.parallelize(newRows)
newDF = spark.createDataFrame(parallelizedRows, schema)

# Append the new rows, then filter the combined result.
df.union(newDF).where("count = 1").where(col("ORIGIN_COUNTRY_NAME") != "United States").show()

#### desc,    asc    ,asc_nulls_last ,    asc_nulls_first,      desc_nulls_last,     desc_nulls_first

In [None]:
from pyspark.sql.functions import desc, asc

# Equivalent ways to sort by `count`, largest first.
df.sort(desc('count')).show(2)
df.orderBy('count',  ascending=False).show(2)
df.orderBy(col("count").desc()).show(2)
# expr("count desc") is parsed as the column `count` aliased to "desc", which sorts
# ascending. Apply .desc() to the parsed column to get a descending sort.
df.orderBy(expr("count").desc()).show(2)

# Several sort keys, each with its own direction and null placement.
df.orderBy(col("count").desc(), col("DEST_COUNTRY_NAME").asc()).show(2)
df.orderBy(col("count").desc_nulls_first(), col("DEST_COUNTRY_NAME").asc()).show(2)

#### Repartition and Coalesce
If you know that you’re going to be filtering by a certain column often, it can be worth repartitioning based on that column

In [None]:
# Current number of partitions.
df.rdd.getNumPartitions()
# repartition() returns a new DataFrame; the results of the next three calls are
# discarded, they only show the accepted argument forms (count, column, or both).
df.repartition(5)
df.repartition(col("DEST_COUNTRY_NAME"))
df.repartition(5, col("DEST_COUNTRY_NAME"))
df.repartition(col("DEST_COUNTRY_NAME")).rdd.getNumPartitions()
# Shuffle the data into five partitions based on the destination country name,
# then coalesce them into two (without a full shuffle).
df.repartition(5, col("DEST_COUNTRY_NAME")).coalesce(2).rdd.getNumPartitions()

#### Collecting Rows to the Driver

In [None]:
# Work on a small slice so that collecting to the driver stays cheap.
collectDF = df.limit(10)
collectDF.take(5) # take works with an Integer count
collectDF.show() # this prints it out nicely
# The second argument of show() controls whether long values are truncated.
collectDF.show(5, False)
collectDF.show(5, True)
collectDF.collect()

# With toLocalIterator() the partitions are not all sent to the driver at the same
# time; they are fetched one after another.
collectDF.toLocalIterator() #generator

### Spark Datatypes

### Booleans

In [None]:
# These examples use retail columns (InvoiceNo, Description, UnitPrice), but `df`
# was last bound to a different dataset by the cells above. Load the retail data
# explicitly so the cell works when the notebook is run top to bottom.
# NOTE(review): assumes data/online-retail-dataset.csv also carries InvoiceNo,
# Quantity and CustomerId, which the following cells use — confirm, or switch to
# data/2010-12-01.csv.
df = spark.read.csv('data/online-retail-dataset.csv', inferSchema=True, header=True)

# "Not equal" as a Column expression and as SQL strings.
df.where(col("InvoiceNo") != 536365).select("InvoiceNo", "Description").show(5, True)
df.where("InvoiceNo = 536365").show(2,False)
df.where("InvoiceNo <> 536365").show(2,True)
# A boolean column can be created first and then used as the filter.
df.withColumn("isExpensive", expr("NOT UnitPrice <= 250")).where("isExpensive").show(5)

In [None]:
# Named boolean conditions, combined with & (and) and | (or).
DOTCodeFilter = col("StockCode") == "DOT"
priceFilter = col("UnitPrice") > 600
descripFilter = instr(col("Description"), "POSTAGE") >= 1  # instr(...) >= 1 acts as "contains"

(
    df.withColumn("isExpensive", DOTCodeFilter & (priceFilter | descripFilter))
    .where("isExpensive")
    .show(5)
)

#### Numbers

In [None]:
# The same computation twice, (Quantity * UnitPrice)^2 + 5: first built from
# Column functions, then written as a SQL expression string.
fabricatedQuantity = pow(col("Quantity") * col("UnitPrice"), 2) + 5
df.select(col("CustomerId"), fabricatedQuantity.alias("realQuantity")).show(2)
df.selectExpr("CustomerId","(POWER((Quantity * UnitPrice), 2.0) + 5) as realQuantity").show(2)

In [None]:
# Correlation of two columns: through df.stat (returns a number) and as an
# aggregate function inside select() (returns a DataFrame).
df.stat.corr("Quantity", "UnitPrice")
df.select(corr("Quantity", "UnitPrice")).show()

In [None]:
# Show UnitPrice rounded to one decimal place next to the original value.
df.select(
    round(col("UnitPrice"), 1).alias("rounded"),
    col("UnitPrice"),
).show(2)

Row Number / Rownum

In [None]:
# Adds a generated id column. NOTE(review): per the Spark documentation the ids
# from monotonically_increasing_id() are unique and increasing but not guaranteed
# to be consecutive, so this is not a gap-free row number.
df.withColumn('RowNum',monotonically_increasing_id()).show(5)

#### String

In [None]:
# Original description next to its capitalised, lower-case and upper-case forms.
df.select(
    col("Description"),
    initcap(col("Description")),
    lower(col("Description")),
    upper(col("Description")),
).show(2)

In [None]:
# Trimming and padding applied to literal strings.
df.select(
    ltrim(lit(" HELLO ")).alias("ltrim"),
    rtrim(lit(" HELLO ")).alias("rtrim"),
    trim(lit(" HELLO ")).alias("trim"),
    lpad(lit("HELLO"), 10, " ").alias("lp"),
    rpad(lit("HELLO"), 10, " ").alias("rp"),
).show(2)

##### Regex

In [None]:

# Alternation pattern matching any of the listed colour names.
regex_string = "BLACK|WHITE|RED|GREEN|BLUE"   

# Replace every match of the pattern with "COLOR"
df.select(regexp_replace(col("Description"), regex_string, "COLOR").alias("color_clean"),col("Description")).show(2)

# Character-by-character replacement: L -> 1, E -> 3, T -> 7
df.select(translate(col("Description"), "LET", "137"),col("Description")).show(2) 

# Pull out the first occurrence; the parentheses form capture group 1
extract_str = "(BLACK|WHITE|RED|GREEN|BLUE)"
df.select(regexp_extract(col("Description"), extract_str, 1).alias("color_clean"),col("Description")).show(2)


In [None]:
# "Contains" check via instr: the comparison is true when the substring is found
containsBlack = instr(col("Description"), "BLACK") >= 1
containsWhite = instr(col("Description"), "WHITE") >= 1
df.withColumn("hasSimpleColor", containsBlack | containsWhite).select("Description", "hasSimpleColor").show(3, False)

#### Dates and Timestamps

In [None]:
# Ten rows carrying the current date and the current timestamp.
dateDF = spark.range(10).withColumn("today", current_date()).withColumn("now", current_timestamp())
dateDF.show(2)

# Date arithmetic: shift by a number of days, and the difference between two dates.
dateDF.select(date_sub(col("today"), 5), date_add(col("today"), 5)).show(1)
dateDF.withColumn("week_ago", date_sub(col("today"), 7)).select(datediff(col("week_ago"), col("today"))).show(1)


#to_date function allows you to convert a string to a date
dateDF.select(
to_date(lit("2016-01-01")).alias("start"),
to_date(lit("2017-05-22")).alias("end"))\
.select(months_between(col("start"), col("end"))).show(1)

# coalesce() returns the first non-null value among its arguments (unrelated to dates).
df.select(coalesce(col("Description"), col("CustomerId"))).show(2)

In [None]:
# Rebind `df` to the Apple stock CSV for the date-part examples in the next cell.
df = spark.read.csv("data/appl_stock.csv",header=True,inferSchema=True) 
df.show()

In [None]:
# Extract parts of the Date column.
df.select(dayofmonth(df['Date'])).show(2)
df.select(dayofmonth(col('Date')).alias('DayOfMonth')).show(2)

df.select(hour(df['Date'])).show(2)
df.withColumn("Year",year(df['Date'])).show(2)

# Average closing price per year: derive Year, group by it, keep two of the averages.
newdf = df.withColumn("Year",year(df['Date']))
newdf.groupBy("Year").mean()[['avg(Year)','avg(Close)']].show()

# Same result with a cleaner column name and the mean formatted to two decimals.
newdf.groupBy("Year").mean()[['avg(Year)','avg(Close)']].withColumnRenamed("avg(Year)","Year").select('Year',format_number('avg(Close)',2).alias("Mean Close")).show()

# Replace one specific value in the Date column.
df.na.replace(["2010-01-04"], ["UNKNOWN"], "Date").show(2)

#### NULLs

#### Drop the missing data 
You can use the .na functions for missing data. The drop command has the following parameters: 
**df.na.drop(how='any', thresh=None, subset=None)**   
* param how: 'any' or 'all'.    
     If 'any', drop a row if it contains any nulls.
     If 'all', drop a row only if all its values are null.    
* param thresh: int, default None    
     If specified, drop rows that have less than `thresh` non-null values.
     This overwrites the `how` parameter.        
* param subset: 
     optional list of column names to consider.

In [None]:
# Rebind `df` to a small CSV that contains missing values.
df = spark.read.csv('data/ContainsNull.csv',inferSchema=True,header=True)
df.show()

# Drop any row that contains missing data
df.na.drop().show()

# Keep only rows with at least 2 non-null values (rows with fewer are dropped)
df.na.drop(thresh=2).show()

# Drop the rows that have a null in any of the columns named in the subset list.
df.na.drop(subset=["Sales"]).show()

# how='any' drops a row with at least one null; how='all' only when every value is null.
df.na.drop(how='any').show()
df.na.drop(how='all').show()

##### Fill the missing values

We can also fill the missing values with new values. If you have multiple nulls across multiple data types, Spark is actually smart enough to match up the data types. For example:

In [None]:
# The fill value is matched to column types: a string value for string columns,
# a number for numeric columns.
df.na.fill('Null was here').show()
df.na.fill(0).show()
df.na.fill('No Name',subset=['Name']).show() #specify what columns 
# Fill missing Sales with the column mean (computed first and collected to the driver).
df.na.fill(df.select(mean(df['Sales'])).collect()[0][0],['Sales']).show()

#### Complex DataTypes

**Structs** DataFrames within DataFrames

In [None]:
# Rebind `df` to one day of retail data for the complex-type examples.
df=spark.read.format("csv").option("header", "true").option("inferSchema", "true").load("data/2010-12-01.csv")

# Two SQL spellings that combine two columns into a single struct column.
df.selectExpr("(Description, InvoiceNo) as complex", "*").show(2)
df.selectExpr("struct(Description, InvoiceNo) as complex", "*").show(2)

# The same struct built with the struct() function.
from pyspark.sql.functions import struct
df.select(struct("Description", "InvoiceNo").alias("complex")).show(2)

#### Array
**Split, length, contains**

In [None]:
from pyspark.sql.functions import split,col
# split() turns a string column into an array column (here: words separated by spaces).
df.select(split("Description", " ")).show(2)
df.select(split(col("Description"), " ")).show(2)

# size() gives the number of elements in the array.
from pyspark.sql.functions import size
df.select(size(split(col("Description"), " "))).show(2)

# array_contains() tests whether the array holds a given value.
from pyspark.sql.functions import array_contains
df.select(array_contains(split(col("Description"), " "), "WHITE")).show(2)

**Explode**

In [None]:
from pyspark.sql.functions import split, explode

# Split each description into words, then emit one output row per word.
(
    df.withColumn("splitted", split(col("Description"), " "))
    .withColumn("exploded", explode(col("splitted")))
    .select("Description", "InvoiceNo", "splitted", "exploded")
    .show(10)
)

# The same explode applied directly inside select().
df.select(explode(split("Description", " "))).show(2)

**Maps**

In [None]:
from pyspark.sql.functions import create_map ,split,col
# Build a map column with Description as the key and InvoiceNo as the value.
df.select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map")).show(2)
# Look a key up in the map column.
df.select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map")).selectExpr("complex_map['WHITE METAL LANTERN']").show(2)

**JSON**

In [None]:
# One-row DataFrame whose only column holds a JSON document as a string.
jsonDF = spark.range(1).selectExpr("""'{"myJSONKey" : {"myJSONValue" : [1, 2, 3]}}' as jsonString""")
jsonDF.show()

In [None]:
# get_json_object extracts a value by JSON path; json_tuple extracts top-level keys
from pyspark.sql.functions import get_json_object, json_tuple
jsonDF.select(get_json_object(col("jsonString"), "$.myJSONKey.myJSONValue[1]").alias('Column') ,
json_tuple(col("jsonString"), "myJSONKey").alias('Tuple')).show(2)

# to_json (from Struct to Json)
from pyspark.sql.functions import to_json
df.selectExpr("(InvoiceNo, Description) as myStruct")\
.select(to_json(col("myStruct"))).show(2)

# from_json (from Json to specified schema)
from pyspark.sql.functions import from_json
from pyspark.sql.types import *
# Schema used to parse the JSON string back into a struct.
parseSchema = StructType((
StructField("InvoiceNo",StringType(),True),
StructField("Description",StringType(),True)))
# Round trip: struct -> JSON string -> struct again, shown next to the JSON string.
df.selectExpr("(InvoiceNo, Description) as myStruct")\
.select(to_json(col("myStruct")).alias("newJSON"))\
.select(from_json(col("newJSON"), parseSchema), col("newJSON")).show(2)

### UDF

In [None]:
# Five-row DataFrame with a single column named "num".
udfExampleDF = spark.range(5).toDF("num") 
# A plain Python function; at this point it is not yet a Spark UDF.
def power3(double_value): return double_value ** 3 # Define UDF
# Called directly as ordinary Python: 2.0 ** 3.
power3(2.0)   

#### When you use the function, there are essentially two different things that occur. 
If the function is written in Scala or Java, you can use it within the Java Virtual Machine (JVM). This means that there will be little performance penalty aside from the fact that you can’t take advantage of code generation capabilities that Spark has for built-in functions. There can be performance issues if you create or use a lot of objects
If the function is written in Python, something quite different happens. Spark starts a Python process on the worker, serializes all of the data to a format that Python can understand (remember, it was in the JVM earlier), executes the function row by row on that data in the Python process, and then finally returns the results of the row operations to the JVM and Spark.
#### That's why a Python UDF is usually not a good choice in Spark when a built-in function can do the job.

In [None]:
from pyspark.sql.functions import col, udf
from pyspark.sql.types import LongType

udfExampleDF = spark.range(5).toDF("num")


def power3(double_value):
    """Return ``double_value`` raised to the third power."""
    return double_value ** 3


power3(2.0)  # plain Python call: 8.0

# Wrap the Python function as a Spark UDF. udf() defaults to a string return type,
# so declare LongType to keep the cubes of the integer `num` column numeric.
power3udf = udf(power3, LongType())
udfExampleDF.select(power3udf(col("num"))).show(2)     # apply the UDF to a column

## Aggregate functions 

In [None]:
# Rebind `df` to every CSV under data/all/, reduced to 5 partitions.
df=spark.read.format("csv").option("header", "true").option("inferSchema", "true").load("data/all/*.csv").coalesce(5)

# Cache the DataFrame and expose it to SQL under the name dfTable.
df.cache()
df.createOrReplaceTempView("dfTable")
df.printSchema()
df.show(2)
# Row count as an aggregate expression, and through the DataFrame method.
df.select(count("*")).show()
df.count()

#### min, max, avg, sum, mean, Variance, Standard Deviation, Skewness, Kurtosis, Covariance and Correlation

In [None]:
# NOTE: min, max, sum, etc. used here are the pyspark.sql.functions imported at
# the top of the notebook, not the Python built-ins.
df.select(first("StockCode"), last("StockCode")).show()

df.select(min("Quantity"), max("Quantity")).show()
df.select(sum("Quantity")).show()
df.select(sum_distinct("Quantity")).show()

# Distinct counts: via distinct().count(), exactly, and approximately.
df.select("StockCode").distinct().count()
df.select(countDistinct("StockCode")).show()
df.select(approx_count_distinct("StockCode", 0.1)).show()

# The average computed three ways: sum / count, avg(), and the SQL mean().
df.select(count("Quantity").alias("total_t"),sum("Quantity").alias("total_p"),avg("Quantity").alias("avg"),expr("mean(Quantity)").alias("mean_p"))\
.selectExpr("total_p/total_t","avg","mean_p").show()

# Population and sample variance / standard deviation, then skewness and kurtosis.
df.select(var_pop("Quantity"), var_samp("Quantity"),stddev_pop("Quantity"), stddev_samp("Quantity")).show()
df.select(skewness("Quantity"), kurtosis("Quantity")).show()

# Correlation and sample / population covariance between two columns.
df.select(corr("InvoiceNo", "Quantity"), covar_samp("InvoiceNo", "Quantity"),covar_pop("InvoiceNo", "Quantity")).show()

# Collect a column's values into a set (distinct values) and into a list.
df.agg(collect_set("Country"), collect_list("Country")).show()



In [None]:
# groupBy(...).count() returns a DataFrame with one row per group; nothing is
# printed until show() is called on it.
df.groupBy("InvoiceNo", "CustomerId").count().show(2)
df.groupBy("InvoiceNo", "CustomerId").count().sort(desc("CustomerId"),desc("InvoiceNo")).show(10)   
# Inside agg(), count is an aggregate expression, given as a function or as SQL.
df.groupBy("InvoiceNo").agg(count("Quantity").alias("quan"),expr("count(Quantity)")).show(2)
df.groupBy("InvoiceNo").agg(expr("avg(Quantity)"),expr("stddev_pop(Quantity)")).show(2)

#### Join

In [None]:
# Three small lookup tables. NOTE: `person` is rebound here; earlier in the
# notebook the same name referred to an RDD of Rows.
person          = spark.createDataFrame([    
    (0, "Bill Chambers", 0, [100]),
    (1, "Matei Zaharia", 1, [500, 250, 100]),
    (2, "Michael Armbrust", 1, [250, 100])
     ]).toDF("id", "name", "graduate_program", "spark_status")

graduateProgram = spark.createDataFrame([
    (0, "Masters", "School of Information", "UC Berkeley"),
    (2, "Masters", "EECS", "UC Berkeley"),
    (1, "Ph.D.", "EECS", "UC Berkeley")
     ]).toDF("id", "degree", "department", "school")

sparkStatus     = spark.createDataFrame([
    (500, "Vice President"),
    (250, "PMC Member"),
    (100, "Contributor")
     ]).toDF("id", "status")

# Make the three tables available to SQL as well.
person.createOrReplaceTempView("person")
graduateProgram.createOrReplaceTempView("graduateProgram")
sparkStatus.createOrReplaceTempView("sparkStatus")

#join the graduateProgram DataFrame with the person DataFrame to create a new DataFrame:
joinExpression = person["graduate_program"] == graduateProgram['id']

# by default = inner, others are outer, left_outer, right_outer
person.join(graduateProgram, joinExpression).show()
person.join(graduateProgram, joinExpression, 'right_outer').show()

# Joins on Complex Types

# Person.spark_status[].contains(sparkStatus.id)
# `id` on person is renamed first so that `id` in the expression refers to sparkStatus.id.
person.withColumnRenamed("id", "personId").join(sparkStatus, expr("array_contains(spark_status, id)")).show()


In [None]:
# The join key must compare person.graduate_program with graduateProgram.id.
# Comparing two columns of `person` leaves the join without any condition that
# links the two tables.
joinExpr = person["graduate_program"] == graduateProgram["id"]
# broadcast() marks graduateProgram as the broadcast side; explain() prints the
# resulting physical plan.
person.join(broadcast(graduateProgram), joinExpr).explain()

**SparkSQL**

In [46]:
# Run a SQL statement; the result is a DataFrame.
spark.sql("SELECT 1 + 1").show()

# Register a DataFrame as a view, query it with SQL, then keep filtering the SQL
# result with DataFrame methods.
spark.read.json("data/2015-summary.json").createOrReplaceTempView("some_sql_view") # DF => SQL
spark.sql(""" SELECT DEST_COUNTRY_NAME, sum(count) FROM some_sql_view GROUP BY DEST_COUNTRY_NAME """).where("DEST_COUNTRY_NAME like 'S%'").where("`sum(count)` > 10").count() # SQL => DF

# List the tables and temporary views visible in the default database.
spark.sql(""" show tables IN default """).show()
#spark.sql("""SELECT user_id, department, first_name FROM professors WHERE department IN (SELECT name FROM department WHERE created_date >= '2016-01-01')""")

+-------+
|(1 + 1)|
+-------+
|      2|
+-------+

+---------+-------------+-----------+
|namespace|    tableName|isTemporary|
+---------+-------------+-----------+
|         |     dfnonull|       true|
|         |some_sql_view|       true|
+---------+-------------+-----------+



In [54]:
# Show the catalog before creating the tables.
spark.sql(""" show databases """).show()
spark.sql(""" show tables IN default """).show()
# Define tables over existing files; "if not exists" keeps the statements re-runnable.
# NOTE(review): the CSV table uses an absolute /data/... path while the rest of the
# notebook uses relative data/... paths — confirm this is intended.
spark.sql(""" CREATE TABLE if not exists flights (DEST_COUNTRY_NAME STRING, ORIGIN_COUNTRY_NAME STRING, count LONG) USING JSON OPTIONS (path "data/2015-summary.json")""")
spark.sql(""" CREATE TABLE if not exists flights_csv (DEST_COUNTRY_NAME STRING,ORIGIN_COUNTRY_NAME STRING COMMENT "remember, the US will be most prevalent", count LONG)USING csv OPTIONS (header true, path '/data/flight-data/csv/2015-summary.csv')""")
# Show the catalog again, now including the new tables.
spark.sql(""" show tables IN default """).show()

+---------+
|namespace|
+---------+
|  default|
+---------+

+---------+-------------+-----------+
|namespace|    tableName|isTemporary|
+---------+-------------+-----------+
|  default|      flights|      false|
|  default|  flights_csv|      false|
|         |     dfnonull|       true|
|         |some_sql_view|       true|
+---------+-------------+-----------+

+---------+-------------+-----------+
|namespace|    tableName|isTemporary|
+---------+-------------+-----------+
|  default|      flights|      false|
|  default|  flights_csv|      false|
|         |     dfnonull|       true|
|         |some_sql_view|       true|
+---------+-------------+-----------+



#### Tuning

In [None]:
# Query pushdown: only the rows matching a predicate are read from the database,
# and each predicate in the list produces its own partition.
# NOTE(review): `url` and `tablename` were used without being defined anywhere in
# the notebook. The two values below are placeholders — replace them with the real
# JDBC URL and table name.
url = "jdbc:sqlite:data/my-sqlite.db"
tablename = "flight_info"
props = {"driver":"org.sqlite.JDBC"}
predicates = [
    "DEST_COUNTRY_NAME = 'Sweden' OR ORIGIN_COUNTRY_NAME = 'Sweden'",
    "DEST_COUNTRY_NAME = 'Anguilla' OR ORIGIN_COUNTRY_NAME = 'Anguilla'"]
spark.read.jdbc(url, tablename, predicates=predicates, properties=props).show()
spark.read.jdbc(url,tablename,predicates=predicates,properties=props).rdd.getNumPartitions() # 2

In [None]:
# Spark can read from a database directly, with options for parallelism and
# boundaries (similar to Sqoop): the range lowerBound..upperBound of `column` is
# divided across numPartitions parallel reads.
colName = "count"
# Python 3 has no "L" suffix for integers; `0L` / `348113L` are syntax errors.
lowerBound = 0
upperBound = 348113  # this is the max count in our database
numPartitions = 10
spark.read.jdbc(url, tablename, column=colName, properties=props,
                lowerBound=lowerBound, upperBound=upperBound,
                numPartitions=numPartitions).count()

In [None]:
# Managing File Size
# To prevent creating files that are too big or too small, cap the records per file.
# NOTE: this line only configures a writer; no save()/format call follows it here,
# so nothing is written by this cell.
df.write.option("maxRecordsPerFile", 5000)

**Dataframe and SQL in same query .**

In [None]:
# Aggregate with SQL, then continue filtering the result with DataFrame methods.
(
    spark.sql("""SELECT DEST_COUNTRY_NAME, sum(count)FROM some_sql_view GROUP BY DEST_COUNTRY_NAME""")
    .where("DEST_COUNTRY_NAME like 'S%'")
    .where("`sum(count)` > 10")
    .count()
)

####  Broadcast Variables 
Immutable

In [None]:
# Words of a sentence, distributed as an RDD with 2 partitions.
my_collection = "Spark The Definitive Guide : Big Data Processing Made Simple".split(" ") #list
words = spark.sparkContext.parallelize(my_collection, 2)                                  # RDD

# A small lookup table, wrapped in a broadcast variable and read through .value.
supplementalData = {"Spark":1000, "Definitive":200,"Big":-300, "Simple":100}              # Dictionary / Map
suppBroadcast = spark.sparkContext.broadcast(supplementalData)                            # Broadcast (of the map)
suppBroadcast.value                                                                       #{'Spark':1000,'Definitive':200,'Big':-300,'Simple':100}
# Pair every word with its value from the broadcast map, using 0 when the word is absent.
words.map(lambda word: (word, suppBroadcast.value.get(word, 0))).collect()

##### Accumulators 
Mutable

In [None]:
# Flight summary data, plus an accumulator that starts at 0.
flights = spark.read.parquet("data/2010-summary.parquet")
accChina = spark.sparkContext.accumulator(0)

def accChinaFunc(flight_row):
    """Add the row's ``count`` to ``accChina`` once for each endpoint that is China.

    A row whose destination and origin are both "China" is therefore added twice;
    a row with neither leaves the accumulator unchanged.
    """
    dest_is_china = flight_row["DEST_COUNTRY_NAME"] == "China"
    origin_is_china = flight_row["ORIGIN_COUNTRY_NAME"] == "China"
    for endpoint_matches in (dest_is_china, origin_is_china):
        if endpoint_matches:
            accChina.add(flight_row["count"])


# Apply the accumulator update to every row, then read the total on the driver.
flights.foreach(accChinaFunc)
accChina.value    # 953

In [None]:
# Two ranges with different steps, repartitioned differently, then joined on id.
df1 = spark.range(2, 10000000, 2)
df2 = spark.range(2, 10000000, 4)
step1 = df1.repartition(5)
step12 = df2.repartition(6)
step2 = step1.selectExpr("id * 5 as id")
step3 = step2.join(step12, ["id"])
step4 = step3.selectExpr("sum(id)")
# collect() runs the job; explain() prints the physical plan behind it.
step4.collect()
step4.explain()

In [None]:
# Count the retail rows whose description does / does not contain "GLASS".
(
    spark.read
    .option("header", "true")
    .csv("data/online-retail-dataset.csv")
    .repartition(2)
    .selectExpr("instr(Description, 'GLASS') >= 1 as is_glass")
    .groupBy("is_glass")
    .count()
    .collect()
)

In [None]:
# Address of the Spark web UI for this session.
spark.sparkContext.uiWebUrl

#### Read mode  
permissive(**default**)    ==>> Sets all fields to null when it encounters a corrupted record and places all corrupted records in a string column called _corrupt_record\
dropMalformed ==>> Drops the row that contains malformed records\
failFast       ==>> Fails immediately upon encountering malformed records

In [None]:
# Reader templates. The "filePath/fileName.*" paths look like placeholders to be
# replaced with real locations.
spark.read.csv("filePath/fileName.csv", inferSchema=True, header=True)
spark.read.csv("filePath/fileName.csv", inferSchema=True, header=True, mode='FAILFAST')

spark.read.csv("filePath/fileName.csv")
# Read raw text lines and split them manually (the stray trailing "." that made
# this line a syntax error has been removed).
spark.read.text("filePath/fileName.txt").selectExpr("split(value, ',') as rows")
spark.read.json("filePath/fileName.json")
spark.read.parquet("filePath/fileName.parquet")
spark.read.format("avro").load("filePath/fileName.avro")

# CSV through the generic format/option/load API.
(
    spark.read.format("csv")
    .option("header", "true")
    .option("mode", "FAILFAST")
    .option("inferSchema", "true")
    .load("filePath/fileName.csv")
)

# JDBC source. NOTE(review): the credentials below look like placeholders; real
# secrets should come from the environment, not from the notebook.
(
    spark.read.format("jdbc")
    .option("driver", "org.postgresql.Driver")
    .option("url", "jdbc:postgresql://database_server")
    .option("dbtable", "schema.tablename")
    .option("user", "username")
    .option("password", "my-secret-password")
    .load()
)

# Streaming read from a Kafka topic, starting at the earliest offsets.
(
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", "192.168.1.100:9092")
    .option("subscribe", "json_topic")
    .option("startingOffsets", "earliest")
    .load()
)

# With an explicit schema (no inference pass over the data).
from pyspark.sql.types import StructField, StructType, StringType, LongType

# StructField(name, type, nullable, metadata): `count` is declared non-nullable
# and carries an example metadata entry.
myManualSchema = StructType([
    StructField("DEST_COUNTRY_NAME", StringType(), True),
    StructField("ORIGIN_COUNTRY_NAME", StringType(), True),
    StructField("count", LongType(), False, metadata={"hello": "world"}),
])
# The reader format is JSON, so the placeholder path now names a .json file
# instead of a .csv file.
df = spark.read.format("json").schema(myManualSchema).load("filePath/fileName.json")

#### Save mode
append ==>> Appends the output files to the list of files that already exist at that location\
overwrite ==>> Will completely overwrite any data that already exists there\
errorIfExists(**default**) ==>> Throws an error and fails the write if data or files already exist at the specified location\
ignore ==>> If data or files exist at the location, do nothing with the current DataFrame

In [None]:
# JSON output: first with the writer's default save mode, then replacing
# whatever already exists at the target path.
df.write.json("filePath/fileName.json")
df.write.json("filePath/fileName.json", mode="overwrite")

# Parquet output: add to existing files, then replace them.
df.write.format("parquet").mode("append").save("filePath/fileName.parquet")
df.write.parquet("filePath/fileName.parquet", mode="overwrite")

# Tab-separated output produced through the CSV writer.
df.write.csv("filePath/fileName.tsv", mode="overwrite", sep="\t")

#### Partitioning

In [None]:
# Write Parquet laid out in one directory level per partition column.
df.write.parquet("filePath/fileName.parquet", partitionBy=["gender", "salary"])

# Same idea for Avro, using the generic format(...).save(...) form.
(
    df.write
    .format("avro")
    .partitionBy("dob_year", "dob_month")
    .save("person_partition.avro")
)

### DataFrame available methods

In [None]:
# Quick reference of commonly used DataFrame methods.
# The calls reference `name`/`age` columns on `df`, `name`/`height` on `df2`,
# and a user-supplied callback `f`, so this cell is a lookup list rather than
# something to execute top to bottom against the DataFrames loaded above.
df.collect()
df.coalesce(1).rdd.getNumPartitions()
df.count()
df.createGlobalTempView("TestTable")
df.distinct().count()
df.drop('age').collect() 
df.join(df2, df.name == df2.name, 'inner').drop(df2.name).collect() 
df.dropDuplicates().show()
df.dtypes
df.explain()
df.filter(df.age>3).collect()
df.where(df.age==2).collect()
df.first()
df.foreach(f)
df.foreachPartition(f)
df.head()
df.join(df2,'name','outer').select('name','height').collect()
df.limit(1).collect()
df.orderBy(df.age.desc()).collect()
# StorageLevel is reached through the `pyspark` module imported at the top of
# the notebook; the bare name was never imported and raised NameError.
df.persist(pyspark.StorageLevel.DISK_ONLY_2).storageLevel
df.printSchema()
# createOrReplaceTempView is the supported replacement for the deprecated registerTempTable.
df.createOrReplaceTempView("people")
df.repartition(10).rdd.getNumPartitions()
df.rollup("name",df.age).count().orderBy("name","age").show()
df.sample(False,0.5,42).count()
df.schema
# '*' selects every column (an empty column name is not resolvable).
df.select('*').collect()
# SQL expressions are allowed here; "age * 2" computes a derived column.
df.selectExpr("age * 2","abs(age)").collect()
df.sort(df.age.desc()).collect()
df.sortWithinPartitions("age",ascending=False).show()
df.storageLevel,df.cache().storageLevel
df.take(2)
df.toDF('f1','f2').collect()
df.toJSON().first()
df.union(df).repartition("age")
df.withColumn('age2',df.age+2).collect()

In [None]:
# Load the 2010 summary CSV with a header row and inferred types,
# failing fast if any record is malformed.
csvFile = (
    spark.read
    .format("csv")
    .options(header="true", mode="FAILFAST", inferSchema="true")
    .load("data/2010-summary.csv")
)

###  Advanced 

In [None]:
from pyspark.sql.functions import struct

# Retail data for a single day, with header row and inferred column types.
df = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true")
    .load("data/2010-12-01.csv")
)

# Two SQL spellings that pack Description and InvoiceNo into one `complex`
# column; "*" keeps all the original columns alongside it.
df.selectExpr("(Description, InvoiceNo) as complex", "*").show(2)
df.selectExpr("struct(Description, InvoiceNo) as complex", "*").show(2)

# The same struct built with the DataFrame API (only the struct is selected).
complex_col = struct("Description", "InvoiceNo").alias("complex")
df.select(complex_col).show(2)

#### Window Functions

In [None]:
# Choose the legacy date parser BEFORE any action evaluates to_date(...).
# Setting it after the first show() made the cell behave differently on a
# fresh kernel than on a re-run.
spark.conf.set("spark.sql.legacy.timeParserPolicy","LEGACY")

# Derive a `date` column from InvoiceDate using the given pattern.
dfWithDate = df.withColumn("date", to_date(col("InvoiceDate"), "MM/d/yyyy H:mm"))
dfWithDate.show(2)

# Window: per customer and day, ordered by quantity (largest first), with a
# frame running from the start of the partition up to the current row.
windowSpec = (
    Window
    .partitionBy("CustomerId", "date")
    .orderBy(desc("Quantity"))
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# Column expressions evaluated over that window.
maxPurchaseQuantity = max(col("Quantity")).over(windowSpec)
purchaseDenseRank = dense_rank().over(windowSpec)
purchaseRank = rank().over(windowSpec)

# Show rank, dense rank and running max side by side for rows with a customer id.
(
    dfWithDate
    .where("CustomerId IS NOT NULL")
    .orderBy("CustomerId")
    .select(
        col("CustomerId"),
        col("date"),
        col("Quantity"),
        purchaseRank.alias("quantityRank"),
        purchaseDenseRank.alias("quantityDenseRank"),
        maxPurchaseQuantity.alias("maxPurchaseQuantity"),
    )
    .show()
)

#### Grouping Sets

In [None]:
# Remove rows whose `date` is null, then register the result for SQL queries.
dfNoNull = dfWithDate.dropna(subset=["date"])
dfNoNull.createOrReplaceTempView("dfNoNull")

# Grouping sets are only available in SQL. Baseline first: a plain GROUP BY.
spark.sql(
    "SELECT CustomerId, stockCode, sum(Quantity) FROM dfNoNull "
    "          GROUP BY customerId, stockCode ORDER BY CustomerId DESC, stockCode DESC"
).show(2)

# The same aggregation expressed as a single grouping set; this form is
# useful when complex or multi-level grouping is needed.
spark.sql(
    "SELECT CustomerId, stockCode, sum(Quantity) FROM dfNoNull GROUP BY customerId, stockCode "
    "          GROUPING SETS((customerId, stockCode)) ORDER BY CustomerId DESC, stockCode DESC"
).show(2)

# More sets can be listed: here the empty set () is added next to the pair.
spark.sql(
    "SELECT CustomerId, stockCode, sum(Quantity) FROM dfNoNull GROUP BY customerId, stockCode "
    "GROUPING SETS((customerId, stockCode),()) ORDER BY CustomerId DESC, stockCode DESC "
).show(2)

#### Rollups

In [None]:
# Roll up quantity by Date, then Country, and give the aggregate a friendlier name.
rolledUpDF = (
    dfNoNull
    .rollup("Date", "Country")
    .agg(sum("Quantity"))
    .selectExpr("date", "Country", "`sum(Quantity)` as total_quantity")
    .orderBy("Date")
)
rolledUpDF.show(5)

#### Cube

In [None]:
from pyspark.sql.functions import sum

# A cube over (Date, Country) yields, in one result:
#   - the total across all dates and countries
#   - the total for each date across all countries
#   - the total for each country on each date
#   - the total for each country across all dates
(
    dfNoNull
    .cube("Date", "Country")
    .agg(sum(col("Quantity")))
    .select("Date", "Country", "sum(Quantity)")
    .orderBy("sum(Quantity)", ascending=False)
    .show(5)
)