In [None]:
# importing necessary libraries
from pyspark.sql import SparkSession

Any PySpark application starts with creating the Spark session, which is the entry point for that application.

In [None]:
spark=SparkSession.builder.appName('LearningApplication').getOrCreate()
spark

# Creating Spark DataFrame
A Spark DataFrame can be created with the createDataFrame() function of SparkSession. The data passed to this function can be a list of lists, tuples, dictionaries, or pyspark.sql.Row objects, or a pandas DataFrame. You can specify the schema yourself; by default, Spark infers the schema from the data.

The following code illustrates:
* creating spark dataframe with list of tuples
* creating spark dataframe with pyspark.sql.Row: with and without explicit schema
* creating spark dataframe with pandas dataframe

In [None]:
# list of tuples: one tuple per person, in the same order as `columns` below.
# The raw rows get their own name so that `df1` only ever refers to the
# Spark DataFrame (re-running the cell never sees `df1` as a plain list).
people_rows = [('James','','Smith','1991-04-01','M',3000),
  ('Michael','Rose','','2000-05-19','M',4000),
  ('Robert','','Williams','1978-09-05','M',4000),
  ('Maria','Anne','Jones','1967-12-01','F',4000),
  ('Jen','Mary','Brown','1980-02-17','F',-1)
]

# column names only; the column types are inferred from the data
columns = ["firstname","middlename","lastname","dob","gender","salary"]
df1 = spark.createDataFrame(data=people_rows, schema=columns)

# viewing dataframe
df1.show()



+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  3000|
|  Michael|      Rose|        |2000-05-19|     M|  4000|
|   Robert|          |Williams|1978-09-05|     M|  4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|
+---------+----------+--------+----------+------+------+



In [None]:
# creating spark dataframe using pyspark.sql.Row
from pyspark.sql import Row

df2 = spark.createDataFrame([
    Row(firstname='James',middlename='',lastname='Smith',dob='1991-04-01',gender='M',salary=3000),
    Row(firstname='Robert',middlename='',lastname='Williams',dob='1978-09-05',gender='M',salary=4000),
    Row(firstname='Satish',middlename='',lastname='Kandel',dob='1998-04-01',gender='M',salary=5000)
])
df2

DataFrame[firstname: string, middlename: string, lastname: string, dob: string, gender: string, salary: bigint]

In [None]:
# creating spark dataframe using explicit schema
from datetime import date,datetime
df2 = spark.createDataFrame([
    Row(firstname='James',middlename='',lastname='Smith',dob=datetime.strptime('1991-04-01','%Y-%m-%d'),gender='M',salary=3000),
    Row(firstname='Robert',middlename='',lastname='Williams',dob=datetime.strptime('1978-09-05','%Y-%m-%d'),gender='M',salary=4000),
    Row(firstname='Satish',middlename='',lastname='Kandel',dob=datetime.strptime('1998-04-01','%Y-%m-%d'),gender='M',salary=5000)
],schema='firstname string,middlename string,lastname string,dob date,gender string,salary int')
df2.show()


+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  3000|
|   Robert|          |Williams|1978-09-05|     M|  4000|
|   Satish|          |  Kandel|1998-04-01|     M|  5000|
+---------+----------+--------+----------+------+------+



In [None]:
# create a spark dataframe with pandas dataframe
import pandas as pd
df=pd.DataFrame({'firstname':['James','Robert','Satish'],
                 'middlename':['','',''],
                 'lastname':['Smith','Williams','Kandel'],
                 'dob':['1991-04-01','1978-09-05','1998-04-01'],
                 'gender':['M','M','M'],
                 'salary':[3000,4000,5000]
                 })

df3=spark.createDataFrame(df)
df3.show()


+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  3000|
|   Robert|          |Williams|1978-09-05|     M|  4000|
|   Satish|          |  Kandel|1998-04-01|     M|  5000|
+---------+----------+--------+----------+------+------+



In [None]:
# to see data in vertical manner
df3.show(vertical=True)

-RECORD 0----------------
 firstname  | James      
 middlename |            
 lastname   | Smith      
 dob        | 1991-04-01 
 gender     | M          
 salary     | 3000       
-RECORD 1----------------
 firstname  | Robert     
 middlename |            
 lastname   | Williams   
 dob        | 1978-09-05 
 gender     | M          
 salary     | 4000       
-RECORD 2----------------
 firstname  | Satish     
 middlename |            
 lastname   | Kandel     
 dob        | 1998-04-01 
 gender     | M          
 salary     | 5000       



In [None]:
# check for the schema
df3.printSchema()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)



In [None]:
df3.columns

['firstname', 'middlename', 'lastname', 'dob', 'gender', 'salary']

In [None]:
# show summary statistics of the data for only specific columns of the dataframe
df3.select('firstname','lastname').describe().show()

+-------+---------+--------+
|summary|firstname|lastname|
+-------+---------+--------+
|  count|        3|       3|
|   mean|     NULL|    NULL|
| stddev|     NULL|    NULL|
|    min|    James|  Kandel|
|    max|   Satish|Williams|
+-------+---------+--------+



In [None]:
# select only few columns from the dataframe
df3.select('firstname').show()

+---------+
|firstname|
+---------+
|    James|
|   Robert|
|   Satish|
+---------+

<class 'pyspark.sql.dataframe.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


In [None]:
print(type(df3))
print(type(df))

<class 'pyspark.sql.dataframe.DataFrame'>
<class 'pandas.core.frame.DataFrame'>


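Spark DataFrames are built on top of RDDs. By default, Spark DataFrames are lazily evaluated: when Spark transforms the data, it does not compute the transformation immediately; instead, it plans how to compute it later, and the computation only runs when an action such as show() or collect() is called. In notebooks, eager evaluation of DataFrames for display can be enabled with the following Spark configuration, so that a DataFrame is rendered when it is the last expression of a cell.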

In [None]:
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)
spark.conf.set('spark.sql.repl.eagerEval.maxNumRows', 10)  #controls the maximum number of the rows shown at a time.

In PySpark, the collect() function is used to retrieve all the elements (rows) of a DataFrame (or RDD) from the distributed cluster to the local driver(driver node). It collects the data from the Spark workers, gathers it into a list, and returns it to the driver.

In [None]:
df1.collect()

[Row(firstname='James', middlename='', lastname='Smith', dob='1991-04-01', gender='M', salary=3000),
 Row(firstname='Michael', middlename='Rose', lastname='', dob='2000-05-19', gender='M', salary=4000),
 Row(firstname='Robert', middlename='', lastname='Williams', dob='1978-09-05', gender='M', salary=4000),
 Row(firstname='Maria', middlename='Anne', lastname='Jones', dob='1967-12-01', gender='F', salary=4000),
 Row(firstname='Jen', middlename='Mary', lastname='Brown', dob='1980-02-17', gender='F', salary=-1)]

In [None]:
# to avoid out-of-memory errors on the driver, fetch only a few rows with take(2) or tail(1) instead of collect()
df1.take(2)

[Row(firstname='James', middlename='', lastname='Smith', dob='1991-04-01', gender='M', salary=3000),
 Row(firstname='Michael', middlename='Rose', lastname='', dob='2000-05-19', gender='M', salary=4000)]

In [None]:
df1.tail(1)

[Row(firstname='Jen', middlename='Mary', lastname='Brown', dob='1980-02-17', gender='F', salary=-1)]

In [None]:
# converting back to pandas dataframe
df3.toPandas()

Unnamed: 0,firstname
0,James
1,Robert
2,Satish


In [None]:
# this returns the firstname column as a pandas Series
df3.toPandas().firstname

Unnamed: 0,firstname
0,James
1,Robert
2,Satish


# Selecting and Accessing Data


In [None]:
# since spark data frame are lazy evaluated, when you access the column it returns the column instance
# this will return column instance
df2.firstname

Column<'firstname'>

In [None]:
from pyspark.sql import Column
from pyspark.sql.functions import upper

In [None]:
type(df3.firstname) == type(upper(df3.firstname)) == type(df3.firstname.isNull())


True

In [None]:
# different ways to access the columns
df3.select(df3.firstname).show()
df3.select("firstname").show()

+---------+
|firstname|
+---------+
|    James|
|   Robert|
|   Satish|
+---------+

+---------+
|firstname|
+---------+
|    James|
|   Robert|
|   Satish|
+---------+



In [None]:
# adding a new column to the spark dataframe
df3.withColumn('new_salary',df3.salary*2).show()

+---------+----------+--------+----------+------+------+----------+
|firstname|middlename|lastname|       dob|gender|salary|new_salary|
+---------+----------+--------+----------+------+------+----------+
|    James|          |   Smith|1991-04-01|     M|  3000|      6000|
|   Robert|          |Williams|1978-09-05|     M|  4000|      8000|
|   Satish|          |  Kandel|1998-04-01|     M|  5000|     10000|
+---------+----------+--------+----------+------+------+----------+



In [None]:
# converting data of the column to upper case as a new column
df3.withColumn('upper_firstname',upper(df3.firstname)).show()

+---------+----------+--------+----------+------+------+---------------+
|firstname|middlename|lastname|       dob|gender|salary|upper_firstname|
+---------+----------+--------+----------+------+------+---------------+
|    James|          |   Smith|1991-04-01|     M|  3000|          JAMES|
|   Robert|          |Williams|1978-09-05|     M|  4000|         ROBERT|
|   Satish|          |  Kandel|1998-04-01|     M|  5000|         SATISH|
+---------+----------+--------+----------+------+------+---------------+



In [None]:
# filtering the rows
df3.filter(df3.firstname=='Satish').show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|   Satish|          |  Kandel|1998-04-01|     M|  5000|
+---------+----------+--------+----------+------+------+



In [None]:
from pyspark.sql.functions import lower
df3.filter(lower(df3.firstname)=='satish').show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|   Satish|          |  Kandel|1998-04-01|     M|  5000|
+---------+----------+--------+----------+------+------+



# Applying a Function
In PySpark, we can apply different kinds of functions to a DataFrame: user-defined functions (UDFs), Python built-in functions, or pandas UDFs. These functions allow us to transform, manipulate, and perform calculations on the data.

In [None]:
# define function to concatenate every column to the single column
def concat_columns(firstname,middlename,lastname):
    """Join first, middle and last name into one space-separated full name.

    Works for plain Python strings and for Spark ``Column`` expressions.

    The ``+`` operator cannot be used for Spark columns: on ``Column``
    objects it means arithmetic addition, so "adding" string columns
    yields NULL instead of the concatenated text. For columns the name is
    therefore built with ``pyspark.sql.functions.concat``.

    Args:
        firstname: first name, as a ``str`` or a Spark ``Column``.
        middlename: middle name, as a ``str`` or a Spark ``Column``.
        lastname: last name, as a ``str`` or a Spark ``Column``.

    Returns:
        A ``str`` when all three arguments are strings; otherwise a Spark
        ``Column`` holding ``firstname + ' ' + middlename + ' ' + lastname``.
        Any non-string argument is assumed to be a Spark ``Column``.
    """
    parts = (firstname, middlename, lastname)
    if all(isinstance(part, str) for part in parts):
        # plain Python strings: ordinary string concatenation
        return firstname + ' ' + middlename + ' ' + lastname

    # Spark path. Imported here so the plain-string path has no Spark dependency.
    from pyspark.sql.functions import concat, lit

    # a bare str mixed in with columns is a literal value, not a column name
    first, middle, last = (
        lit(part) if isinstance(part, str) else part for part in parts
    )
    return concat(first, lit(' '), middle, lit(' '), last)

# function to increase the salary
def increase_salary(salary):
    """Return ``salary`` multiplied by 2.

    The argument only needs to support ``* 2``, so the same function works
    for a plain number and for a Spark ``Column`` expression.
    """
    doubled = salary * 2
    return doubled

In [None]:
# applying both of the functions to spark dataframe
df3.withColumn("fullname",concat_columns(df3.firstname,df3.middlename,df3.lastname)).withColumn('new_salary',increase_salary(df3.salary)).show()


+---------+----------+--------+----------+------+------+--------+----------+
|firstname|middlename|lastname|       dob|gender|salary|fullname|new_salary|
+---------+----------+--------+----------+------+------+--------+----------+
|    James|          |   Smith|1991-04-01|     M|  3000|    NULL|      6000|
|   Robert|          |Williams|1978-09-05|     M|  4000|    NULL|      8000|
|   Satish|          |  Kandel|1998-04-01|     M|  5000|    NULL|     10000|
+---------+----------+--------+----------+------+------+--------+----------+



In [None]:
from pyspark.sql.functions import pandas_udf

@pandas_udf('long')
def pandas_plus_one(series: pd.Series) -> pd.Series:
    """Pandas UDF (declared return type ``long``) that adds 1 to each value.

    Takes a pandas Series and returns a Series of the same length, with
    every element incremented by one.
    """
    incremented = series + 1
    return incremented

# apply the pandas UDF to the salary column and display the result
df3.select(pandas_plus_one(df3.salary)).show() # does not change df3 itself
# display df3 again: it still holds the original salary values
df3

+-----------------------+
|pandas_plus_one(salary)|
+-----------------------+
|                   3001|
|                   4001|
|                   5001|
+-----------------------+



firstname,middlename,lastname,dob,gender,salary
James,,Smith,1991-04-01,M,3000
Robert,,Williams,1978-09-05,M,4000
Satish,,Kandel,1998-04-01,M,5000


In [None]:
# selecting columns by different ways
df3.select("firstname","lastname").show()
df3.select(df3.firstname,df3.lastname).show()
df3.select(df3["firstname"],df3["lastname"]).show()

+---------+--------+
|firstname|lastname|
+---------+--------+
|    James|   Smith|
|   Robert|Williams|
|   Satish|  Kandel|
+---------+--------+

+---------+--------+
|firstname|lastname|
+---------+--------+
|    James|   Smith|
|   Robert|Williams|
|   Satish|  Kandel|
+---------+--------+

+---------+--------+
|firstname|lastname|
+---------+--------+
|    James|   Smith|
|   Robert|Williams|
|   Satish|  Kandel|
+---------+--------+



In [None]:
# By using col() function
from pyspark.sql.functions import col
df3.select(col("firstname"),col("lastname")).show()

+---------+--------+
|firstname|lastname|
+---------+--------+
|    James|   Smith|
|   Robert|Williams|
|   Satish|  Kandel|
+---------+--------+



In [None]:
# select columns by regular expression: every column whose name contains "name".
# colRegex expects the pattern wrapped in backticks. The earlier pattern ended
# in `name*`, which means "nam" followed by zero or more "e" and only matched
# these columns by accident; `.*name.*` states the intent explicitly.
df3.select(df3.colRegex("`^.*name.*$`")).show()

+---------+----------+--------+
|firstname|middlename|lastname|
+---------+----------+--------+
|    James|          |   Smith|
|   Robert|          |Williams|
|   Satish|          |  Kandel|
+---------+----------+--------+



In [None]:
# selecting all the columns from the spark dataframe
# use df3's own column list rather than the `columns` variable that was
# defined for df1 in another cell, so this cell does not depend on the two
# dataframes happening to share the same column names
df3.select(*df3.columns).show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  3000|
|   Robert|          |Williams|1978-09-05|     M|  4000|
|   Satish|          |  Kandel|1998-04-01|     M|  5000|
+---------+----------+--------+----------+------+------+



In [None]:
# select() also accepts the column names as one list instead of unpacked arguments
df3.select(list(df3.columns)).show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  3000|
|   Robert|          |Williams|1978-09-05|     M|  4000|
|   Satish|          |  Kandel|1998-04-01|     M|  5000|
+---------+----------+--------+----------+------+------+



In [None]:
df3.select("*").show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  3000|
|   Robert|          |Williams|1978-09-05|     M|  4000|
|   Satish|          |  Kandel|1998-04-01|     M|  5000|
+---------+----------+--------+----------+------+------+



selecting columns by index

In [None]:
#selecting first 3 columns and top 3 rows
df3.select(df3.columns[:3]).show(3)

+---------+----------+--------+
|firstname|middlename|lastname|
+---------+----------+--------+
|    James|          |   Smith|
|   Robert|          |Williams|
|   Satish|          |  Kandel|
+---------+----------+--------+



In [None]:
#selecting the 3rd and 4th columns (indices 2 and 3; the slice end is exclusive) and top 3 rows
df3.select(df3.columns[2:4]).show(3)

+--------+----------+
|lastname|       dob|
+--------+----------+
|   Smith|1991-04-01|
|Williams|1978-09-05|
|  Kandel|1998-04-01|
+--------+----------+



# Grouping Data

In [None]:
df = spark.createDataFrame([
    ['red', 'banana', 1, 10],
    ['blue', 'banana', 2, 20],
    ['red', 'carrot', 3, 30],
    ['blue', 'grape', 4, 40],
    ['red', 'carrot', 5, 50],
    ['black', 'carrot', 6, 60],
    ['red', 'banana', 7, 70],
    ['red', 'grape', 8, 80]],
    schema=['color', 'fruit', 'v1', 'v2'])
df.show()

+-----+------+---+---+
|color| fruit| v1| v2|
+-----+------+---+---+
|  red|banana|  1| 10|
| blue|banana|  2| 20|
|  red|carrot|  3| 30|
| blue| grape|  4| 40|
|  red|carrot|  5| 50|
|black|carrot|  6| 60|
|  red|banana|  7| 70|
|  red| grape|  8| 80|
+-----+------+---+---+



In [None]:
# applying avg() function
df.groupby('color').avg().show()
# this function applies avg function to each of the numerical columns as we can see average of  categorical column is not calculated

+-----+-------+-------+
|color|avg(v1)|avg(v2)|
+-----+-------+-------+
|  red|    4.8|   48.0|
| blue|    3.0|   30.0|
|black|    6.0|   60.0|
+-----+-------+-------+



In [None]:
'''
problem:
add or subtract value with the mean of the column for V1 and V2
approach:
write a function which will accept dataframe and perform operation on the dataframe
'''
import pyspark.sql.functions as F

# average of v1 per color.
# alias() must be applied to the aggregate column inside agg(); calling
# .alias() on the result of agg() only names the DataFrame and leaves the
# column called "avg(v1)".
df.groupby('color').agg(F.avg('v1').alias("avg_v1")).show()

# several named aggregates at once, grouped by both color and fruit
df_multiple_gb=df.groupby(['color','fruit']).agg(
    F.avg('v1').alias("avg_v1"),
    F.avg('v2').alias("avg_v2"),
    F.sum('v1').alias("sum_v1"),
    F.sum('v2').alias("sum_v2")
  )

df_multiple_gb.show()

# single named aggregate with the same two grouping keys
df.groupby(['color','fruit']).agg(F.avg('v1').alias("avg_v1")).show()



+-----+-------+
|color|avg(v1)|
+-----+-------+
|  red|    4.8|
| blue|    3.0|
|black|    6.0|
+-----+-------+

+-----+------+------+------+------+------+
|color| fruit|avg_v1|avg_v2|sum_v1|sum_v2|
+-----+------+------+------+------+------+
| blue| grape|   4.0|  40.0|     4|    40|
|  red|banana|   4.0|  40.0|     8|    80|
|  red|carrot|   4.0|  40.0|     8|    80|
| blue|banana|   2.0|  20.0|     2|    20|
|black|carrot|   6.0|  60.0|     6|    60|
|  red| grape|   8.0|  80.0|     8|    80|
+-----+------+------+------+------+------+

+-----+------+-------+
|color| fruit|avg(v1)|
+-----+------+-------+
| blue| grape|    4.0|
|  red|banana|    4.0|
|  red|carrot|    4.0|
| blue|banana|    2.0|
|black|carrot|    6.0|
|  red| grape|    8.0|
+-----+------+-------+



# Merge and group by two dataframe

In [None]:
df1 = spark.createDataFrame(
    [(20000101, 1, 1.0), (20000101, 2, 2.0), (20000102, 1, 3.0), (20000102, 2, 4.0)],
    ('time', 'id', 'v1'))

df2 = spark.createDataFrame(
    [(20000101, 1, 'x'), (20000101, 2, 'y')],
    ('time', 'id', 'v2'))


In [None]:
def merge_ordered(l, r):
    """Combine two pandas DataFrames with ``pd.merge_ordered``.

    ``l`` is used as the left frame and ``r`` as the right frame. No join
    keys or other options are passed, so pandas' defaults apply.
    """
    merged = pd.merge_ordered(left=l, right=r)
    return merged

df1.groupby('id').cogroup(df2.groupby('id')).applyInPandas(
    merge_ordered, schema='time int, id int, v1 double, v2 string').show()