# SQL functions

###  Step1: Create SparkContext and SparkSession

In [0]:
from pyspark import SparkContext

from pyspark.sql import SparkSession
# Build the SparkSession used by every cell below; getOrCreate() hands back
# an already-running session when one exists instead of starting a new one.
# NOTE(review): "spark.some.config.option" looks like a placeholder setting —
# confirm whether it is actually needed.
spark = (
    SparkSession.builder
    .appName("Python Spark SQL basic example")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

###  Step2: Load the data files

Upload **``iris.csv``** and **``prostate.csv``** into the Databricks tables folder

###  Step3: Import both of the above datasets

In [0]:
# Read the iris CSV from the DBFS tables folder: the first row supplies the
# column names and Spark is asked to infer the column types.
iris = spark.read.csv(
    path='dbfs:/FileStore/tables/iris.csv',
    header=True,
    inferSchema=True,
)
# Preview the first five rows.
iris.show(n=5)

In [0]:
# Read the prostate CSV the same way: header row for column names, inferred
# column types.
prostate = spark.read.csv(
    path='dbfs:/FileStore/tables/prostate.csv',
    header=True,
    inferSchema=True,
)
# Preview the first five rows.
prostate.show(n=5)

###  Step4: Functions
#### import functions and types

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

## `abs`

In [0]:
# Show lpsa next to its absolute value. `abs` here is the Spark SQL function
# brought in by the star import from pyspark.sql.functions.
prostate.select(
    'lpsa',
    abs(prostate['lpsa']).alias('abs(lpsa)'),
).show(n=5)

## `array`

In [0]:
# Reminder of what the iris columns look like before building an array column.
iris.show(n=5)

In [0]:
# Pack the four measurement columns into a single array column `features`,
# keeping `species` alongside it.
df_arr = iris.select(
    'species',
    array(
        ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
    ).alias('features'),
)
df_arr.show(n=5)

## `array_contains`

In [0]:
# Add a boolean column that is true when the `features` array holds 1.4.
df = df_arr.select(
    'species',
    'features',
    array_contains(df_arr['features'], 1.4).alias('new_features'),
)
df.show(n=5)

In [0]:
# Keep only the rows whose boolean `new_features` flag is true
# (`where` is an alias of `filter`).
df.where(df['new_features']).show(n=5)

## `asc`

`asc` returns a **sort expression**, which can be used as argument of sort functions such as `pyspark.sql.DataFrame.sort` and `pyspark.sql.DataFrame.orderBy`

In [0]:
# Ascending sort on lpsa using DataFrame.sort with an asc() sort expression.
prostate.sort(
    prostate['lpsa'].asc()
).show(n=5)

In [0]:
# Same ascending ordering on lpsa, this time through DataFrame.orderBy.
prostate.orderBy(
    prostate['lpsa'].asc()
).show(n=5)

## `avg`

In [0]:
# Aggregate the whole DataFrame down to the average of lpsa.
prostate.select(
    avg(prostate['lpsa'])
).show()

## `ceil`

In [0]:
# Show lpsa next to its ceiling.
prostate.select(
    'lpsa',
    ceil(prostate['lpsa']),
).show(n=5)

## `col`

Returns a **Column** based on the given column name. It can save you some typing when the DataFrame's variable name is very long.

In [0]:
# Reminder of the prostate columns before selecting some of them with col().
prostate.show(n=5)

In [0]:
# Select two columns by name through col() rather than through the DataFrame.
prostate.select(
    [col('lcavol'), col('age')]
).show(n=5)

## `concat`

In [0]:
# Two-row, two-column DataFrame of strings used by the next few examples.
df = spark.createDataFrame(
    data=[['a', '1'], ['b', '2']],
    schema=['x', 'y'],
)
df.show()

# Concatenate x and y directly, with no separator.
df.select(
    'x',
    'y',
    concat(df['x'], df['y']).alias('concat(x,y)'),
).show()

## `collect_set`

In [0]:
# Aggregate column x into a single set of its values.
df.select(
    collect_set(df['x'])
).show()

## `concat_ws`

In [0]:
# Join x and y with '_' as the separator. The result column is labelled
# 'concat_ws(x,y)' so it cannot be mistaken for the output of plain concat.
df.select(
    'x',
    'y',
    concat_ws('_', df.x, df.y).alias('concat_ws(x,y)'),
).show()

## `corr`

In [0]:
# Correlation coefficient between two prostate columns, computed with the
# `corr` aggregate function that this section is meant to demonstrate.
prostate.select(corr(prostate.lcavol, prostate.lpsa)).show()

## `count`

In [0]:
# Aggregate the DataFrame down to the count of lpsa values.
prostate.select(
    count(prostate['lpsa'])
).show()

## `countDistinct`

In [0]:
# Number of distinct values in the species column.
iris.select(
    countDistinct(iris['species'])
).show()

## `create_map`

In [0]:
# Reminder of the iris columns before building a map column from two of them.
iris.show(n=5)

In [0]:
# Build a single map column per row with species as the key and sepal_length
# as the value. Note that this rebinds `df`.
df = iris.select(
    create_map('species', 'sepal_length')
)
df.show(n=5)

In [0]:
# Inspect the column name/type pairs of `df`; as the last expression in the
# cell, the value is echoed by the notebook.
df.dtypes

## `current_date`

In [0]:
# Small single-column DataFrame (x = 1..4) used by the date examples below.
# Note that this rebinds `df`.
df = spark.createDataFrame(
    data=[[1], [2], [3], [4]],
    schema=['x'],
)
df.show()

In [0]:
# Attach the current date to every row.
df.select(
    ['x', current_date()]
).show()

## `current_timestamp`

In [0]:
# Attach the current timestamp to every row; truncate=False keeps long cell
# values from being cut off in the printed table.
df.select(
    'x',
    current_timestamp(),
).show(truncate=False)

## `date_add`

In [0]:
# Keep x and add today's date under an explicit `current_date` column name so
# later cells can refer to it.
df2 = df.select(
    'x',
    current_date().alias('current_date'),
)
df2.show()

In [0]:
# Show each date next to the date ten days after it.
df2.select(
    'x',
    'current_date',
    date_add(df2['current_date'], 10),
).show()

## `date_format`

In [0]:
# Render the date as an MM/dd/yyyy string. The result column is named
# 'new_date' since it holds a reformatted date.
df2.select(
    'x',
    'current_date',
    date_format('current_date', 'MM/dd/yyyy').alias('new_date'),
).show()