In [None]:
# Spark entry points used by every later cell in this notebook.
# (A stray token after `SparkSession` on the first import made this cell a SyntaxError.)
from pyspark.sql import SparkSession
from pyspark import SparkContext,SparkConf
from pyspark.sql import Row
if __name__ == "__main__":
    # getOrCreate() returns the already-active session if there is one,
    # so re-running this cell does not start a second session.
    spark = SparkSession.builder.appName("StackOverFlowSurvey").getOrCreate()
    # SparkContext underlying the session, for the RDD-level API.
    sc = spark.sparkContext
    

In [None]:
# Columns, rows, and collecting Row objects.
# DataFrames are the untyped API (runtime); Datasets are the strongly typed
# API (compile time).

# One-column DataFrame called "num", built from spark.range(100).
df = spark.range(100).toDF("num")

# Project a single derived column: "num" plus 10.
df10 = df.select(
    df.num + 10
)

# Print the source DataFrame.
df.show()

# Last expression of the cell: the notebook displays the collected Row objects.
df10.collect()

In [None]:
# Read a DataFrame from a data source.

from pyspark.sql.types import *
from pyspark.sql.functions import col,expr

# Directory holding the flight-data JSON files.
# NOTE(review): machine-specific absolute path - point this at your local copy
# of the data before running the notebook elsewhere.
FLIGHT_DATA_JSON_DIR = "file:///Users/jackshalu/Documents/Spark-The-Definitive-Guide-master/data/flight-data/json"

# Alternative: read without supplying a schema.
#dfjson = spark.read.format("json").load(FLIGHT_DATA_JSON_DIR + "/2011-summary.json")
#dfjson.printSchema()
#dfjson.show(10)

# Alternative: schema inference - look only at the schema Spark inferred.
#spark.read.format("json").load(FLIGHT_DATA_JSON_DIR + "/2015-summary.json").schema

# Manual schema: each StructField is (column name, data type, nullable flag).
mySchema = StructType([
    StructField("DEST_COUNTRY_NAME", StringType(), False),
    StructField("ORIGIN_COUNTRY_NAME", StringType(), False),
    StructField("count", LongType(), True),
])
dfjson = spark.read.format("json").schema(mySchema).load(FLIGHT_DATA_JSON_DIR + "/2015-summary.json")
dfjson.printSchema()
dfjson.show()

# col and expr - two ways to write the same column expression.
# (Kept as comments: a bare string literal at the end of a cell is echoed as
# the cell's output.)
#
#   col("count") + 10
#   expr("(count) + 10")


In [None]:
# Programmatic access to the column names.
# Printed explicitly: only the LAST expression of a cell is displayed
# automatically, so a bare `dfjson.columns` here would be silently discarded.
print(dfjson.columns)

# Create Rows and build a DataFrame from them.

from pyspark.sql import Row
from pyspark.sql.types import *
# `expr` is imported here as well so this cell does not depend on another
# cell's imports.
from pyspark.sql.functions import col,expr,lit,current_timestamp

exschema = StructType([
    StructField("Fname", StringType(), True),
    StructField("Lname", StringType(), True),
    StructField("Age", LongType(), True),
])

# Positional Rows; the values line up with the fields of exschema in order.
myRow = [Row("Jaga","Jaya",32),Row("Salini","Venu",25)]

exDF = spark.createDataFrame(myRow,exschema)

exDF.show()

# select and selectExpr

dfjson.select("DEST_COUNTRY_NAME","ORIGIN_COUNTRY_NAME").show()
dfjson.select(col("count") + 10).show()
dfjson.selectExpr("count + 10 as expr10","DEST_COUNTRY_NAME").show()

# Same flag expressed with the Column API instead of a SQL string:
#dfjson.withColumn("WithinCountry", col("DEST_COUNTRY_NAME") == col("ORIGIN_COUNTRY_NAME")).show()
dfjson.selectExpr("*","DEST_COUNTRY_NAME in (ORIGIN_COUNTRY_NAME) as winthinCountry").show()
# Fixed: `.show` without parentheses only referenced the method and displayed nothing.
dfjson.selectExpr("*","DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME as winthinCountry").show()
# Aggregate expressions in selectExpr are evaluated over the whole DataFrame.
dfjson.selectExpr("avg(count)","count(distinct(DEST_COUNTRY_NAME))").show()

# Literals

# Fixed: the select was built but never shown; .show() materialises it so the
# added "Date" column is actually visible.
dfjson.select(expr("*"),lit(current_timestamp()).alias("Date")).show()




In [None]:
# Adding and renaming a column

dfjson.withColumn("count-10",expr("count - 10")).show()
dfjson.withColumnRenamed("DEST_COUNTRY_NAME","destination").show()
dfwithLongname = dfjson.withColumn("This is Long Column-name",expr("DEST_COUNTRY_NAME"))
# Inside a SQL expression string the name must be back-quoted because it
# contains spaces and a dash.
dfwithLongname.selectExpr("`This is Long Column-name`").show()

# Removing a column
# Printed explicitly: a bare mid-cell expression is not displayed by the notebook.
print(dfwithLongname.drop("This is Long Column-name").columns)

# Changing a column type
dfnew = dfjson.withColumn("count-5",expr("count - 5").cast(LongType()))
dfnew.printSchema()

# Filter
# filter() and where() are used interchangeably here; chained where() calls
# combine their conditions.

dfnew.filter((col("count") > 300) & (col("count-5") > 9)).show()
dfnew.where(col("count") > 300).where(col("DEST_COUNTRY_NAME") == "United States").show()

# DataFrame => SQL
# Fixed: the original used backslash continuations followed by trailing
# whitespace, which is a SyntaxError. Parenthesised chains avoid that.
# NOTE(review): this path differs from the absolute path dfjson is loaded
# from earlier in the notebook - confirm it exists on this machine.
(
    spark.read.json("/data/flight-data/json/2015-summary.json")
    .createOrReplaceTempView("some_sql_view")
)

# SQL => DataFrame; the aggregate column is literally named `sum(count)`,
# hence the backticks. The final count() is the cell's displayed result.
(
    spark.sql("""SELECT DEST_COUNTRY_NAME, sum(count) FROM some_sql_view GROUP BY DEST_COUNTRY_NAME""")
    .where("DEST_COUNTRY_NAME like 'S%'")
    .where("`sum(count)` > 10")
    .count()
)

In [None]:
# Sample data
# Counts are printed explicitly: mid-cell expressions are not displayed by
# the notebook, so the original results were computed and then thrown away.

print(dfjson.sample(withReplacement = True, fraction = 0.5,seed = 5).count())

# Split data
# A fixed seed is passed so re-running the cell gives a repeatable split
# (the original call was unseeded).

splitData = dfjson.randomSplit([0.25,0.75], seed = 5)

print(splitData[0].count())
print(splitData[1].count())

# Sorting rows

from pyspark.sql.functions import desc, asc

# Fixed: expr("count desc") is parsed as the column `count` aliased to
# "desc", so it sorted ASCENDING. desc("count") gives the intended
# descending order.
dfjson.orderBy(desc("count")).show()
dfjson.orderBy(col("DEST_COUNTRY_NAME").asc(),col("count").desc()).show()

In [None]:
# Repartition and coalesce
#
# repartition - change the number of partitions (up or down) with a full shuffle
# coalesce    - only reduce the number of partitions, avoiding a full shuffle

# Fixed: DataFrames are immutable, so repartition() returns a NEW DataFrame.
# The original discarded that result and then inspected dfjson, which still
# had its original partitioning.
dfrepartitioned = dfjson.repartition(3)
dfrepartitioned.rdd.getNumPartitions()