## Spark SQL in simple examples

This section introduces the basic syntax of Spark SQL operations through simple examples. For details on each method, see the [Spark SQL manual](http://spark.apache.org/docs/2.4.0/api/python/pyspark.sql.html).

### How to start Pyspark

`export PYSPARK_PYTHON=python3`  
`pyspark --master yarn --num-executors 2 --executor-memory 4G --conf spark.ui.port=1<ddmm>`, where `<ddmm>` is your day and month of birth, e.g. `spark.ui.port=10811`

In [None]:
# helpful import
from pyspark.sql import functions as F

In [None]:
### Load a DataFrame from the Hive database and cache it
# The query result is marked for caching because the same DataFrame is
# reused by all of the examples that follow.
Tep_DF = (
    spark
    .sql('select * from fel_bigdata.trips')
    .cache()
)

### Basic information about DataFrame

In [None]:
# Print the first rows as a formatted table (show() does the printing itself).
Tep_DF.show()

# take(5) returns a list of Row objects (the RDD-style view of the data).
# It is printed explicitly so the result is visible even when this is not
# the last expression of a notebook cell.
print(Tep_DF.take(5))

# Number of rows; printed explicitly for the same reason.
print(Tep_DF.count())

# Print the schema - column names and types.
Tep_DF.printSchema()

### Columns and rows selection

In [None]:
### column selection
Tep_DF2 = Tep_DF.select('route_id', 'service_id', 'bikes_allowed')
Tep_DF2.show()

### row selection (different ways of writing the condition)
# 1) Column expression
Tep_DF2 = Tep_DF.filter(Tep_DF['bikes_allowed'] == '1')
Tep_DF2.show()

# 2) Combined column expressions. Each comparison must be wrapped in
#    parentheses because & binds more tightly than == in Python.
Tep_DF2 = Tep_DF.filter((Tep_DF['bikes_allowed'] == '1') & (Tep_DF['route_id'] == 'L991'))
Tep_DF2.show()

# 3) SQL expression passed as a string
Tep_DF2 = Tep_DF.filter('bikes_allowed="1"')
Tep_DF2.show()

Tep_DF2 = Tep_DF.filter('bikes_allowed="1" and route_id="L991"')
Tep_DF2.show()

# 4) RDD syntax. The filter runs on the unfiltered Tep_DF and addresses the
#    Row fields by name, so it does not depend on the column order of the table.
#    The result is printed explicitly so it is visible inside a notebook cell.
# NOTE(review): the lambda is a plain Python comparison with no implicit
# casting - it assumes both columns hold strings; confirm against the schema.
print(Tep_DF.rdd.filter(lambda r: r['bikes_allowed'] == '1' and r['route_id'] == 'L991').take(5))

### unique rows (no duplicates)
Tep_DF2 = Tep_DF.select('route_id', 'service_id', 'bikes_allowed').distinct()
Tep_DF2.show()

### Column transformation

In [None]:
# Add a new column computed from an existing one.
Tep_DF2 = Tep_DF.withColumn(
    'bikes_allowed_1',
    F.col('bikes_allowed') + 100,
)
Tep_DF2.show()

# Overwrite a column: reusing an existing column name replaces its values.
Tep_DF2 = Tep_DF.withColumn(
    'bikes_allowed',
    F.col('bikes_allowed') + 100,
)
Tep_DF2.show()

# Spark functions live in a separate module that has to be imported first:
#   from pyspark.sql import functions as F
# Reference:
# http://spark.apache.org/docs/2.4.0/api/python/pyspark.sql.html#module-pyspark.sql.functions

# Lower-case a string column.
Tep_DF2 = Tep_DF.withColumn(
    'trip_name',
    F.lower(F.col('trip_short_name')),
)
Tep_DF2.show()

# Split on a space and keep the number of resulting parts.
Tep_DF2 = Tep_DF.withColumn(
    'trip_short_name_split',
    F.size(F.split(F.col('trip_short_name'), " ")),
)
Tep_DF2.show()

# Regular-expression replacement inside a string column.
Tep_DF2 = Tep_DF.withColumn(
    'route_id2',
    F.regexp_replace(F.col('route_id'), 'L', 'Linka'),
)
Tep_DF2.show()

# Conditional value: 'yes' where bikes_allowed equals '1', otherwise 'no'.
Tep_DF2 = Tep_DF.withColumn(
    'bikes_yes_no',
    F.when(F.col('bikes_allowed') == '1', 'yes').otherwise('no'),
)
Tep_DF2.show()

### Working with missing values
# Drop every row that contains a null value.
Tep_DF2 = Tep_DF.dropna()
Tep_DF2.show()

# Replace nulls with 0, restricted to the 'exceptional' column.
Tep_DF2 = Tep_DF.fillna(0, subset=['exceptional'])

### Renaming columns
# A single column.
Tep_DF2 = Tep_DF.withColumnRenamed('bikes_allowed', 'are_bikes_allowed')
Tep_DF2.show()

# All columns of a DataFrame at once: toDF takes the new names in column order.
Tep_DF2 = (
    Tep_DF
    .select('route_id', 'service_id', 'bikes_allowed')
    .toDF('linka', 'route_uniq_id', 'are_bikes_allowed')
)
Tep_DF2.show()

### Sorting and aggregation

In [None]:
### sorting
# Descending order requested through the keyword argument ...
Tep_DF2 = Tep_DF.orderBy('route_id', ascending=False)
Tep_DF2.show()

# ... or through the column's desc() method.
Tep_DF2 = Tep_DF.orderBy(Tep_DF['route_id'].desc())
Tep_DF2.show()

### aggregation
# NOTE(review): the original examples referenced the columns 'mesic',
# 'teplota' and 'stat', which are not used anywhere else with the trips
# table; they were replaced by columns the other examples already use.
# Confirm against the real schema (Tep_DF.printSchema()).

# Number of rows per route.
Tep_DF2 = Tep_DF.groupBy('route_id').count()
Tep_DF2.show()

# Dictionary syntax: {column name: aggregate function}.
Tep_DF2 = Tep_DF.groupBy('route_id').agg({'bikes_allowed': 'avg'})
Tep_DF2.show()

# Number of rows per service.
Tep_DF2 = Tep_DF.groupBy('service_id').count()
Tep_DF2.show()

# Aggregation across the whole DataFrame.
# A DataFrame has no max() method, so the call below raises an error; it is
# left commented out so that the rest of the cell still runs:
#   Tep_DF2 = Tep_DF.max('bikes_allowed')
# Calling groupBy() without columns puts all rows into a single group.
# max() on grouped data accepts numeric columns only, hence the cast.
# NOTE(review): assumes bikes_allowed holds integer-like values - confirm.
Tep_DF2 = (
    Tep_DF
    .withColumn('bikes_allowed', Tep_DF['bikes_allowed'].cast('int'))
    .groupBy()
    .max('bikes_allowed')
)
Tep_DF2.show()

### Join - joining tables

In [None]:
# Second table, built as a DataFrame from local data:
# one (route_id, route_name) pair per line.
Lines = spark.createDataFrame(
    [
        ('L991', 'Linka 991'),
        ('L332', 'Linka 332'),
        ('L333', 'Linka 333'),
    ],
    ('route_id', 'route_name'),
)
Lines.show()

# Join on a column name that exists in both DataFrames.
Tep_DF2 = Tep_DF.join(Lines, on='route_id')
Tep_DF2.show()

# Join on an explicit condition - the result keeps all columns from both
# DataFrames.
Tep_DF2 = Tep_DF.join(Lines, on=Tep_DF['route_id'] == Lines['route_id'])
Tep_DF2.show()