<a href="https://colab.research.google.com/github/harenlin/PySpark-Learning/blob/main/PySpark_First_Eye.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pyspark



# PySpark Hands-on Learning

Before doing anything else, initialize the Spark session.

In [None]:
import pyspark
from pyspark.sql import SparkSession

# Obtain the session named 'PySpark' through the builder, then display it
spark = (
    SparkSession.builder
    .appName('PySpark')
    .getOrCreate()
)
spark

Create a Spark DataFrame with `spark.createDataFrame`

In [None]:
# Rows as a plain Python list of [name, age] pairs
people = [['Haren', 22], ['Jimmy', 30], ['Andy', 30]]
# Build the Spark DataFrame; the second argument supplies the column names.
# The raw rows keep their own name so `df` only ever refers to a DataFrame.
df = spark.createDataFrame(people, ['Name', 'Age'])

In [None]:
# Render the rows as a text table (20 is show()'s default row limit)
df.show(n=20)

+-----+---+
| Name|Age|
+-----+---+
|Haren| 22|
|Jimmy| 30|
| Andy| 30|
+-----+---+



In [None]:
# Convert to a pandas DataFrame to get the familiar pandas rendering
as_pandas = df.toPandas()
as_pandas

Unnamed: 0,Name,Age
0,Haren,22
1,Jimmy,30
2,Andy,30


In [None]:
# Column names of the DataFrame
column_names = df.columns
column_names

['Name', 'Age']

In [None]:
# Number of rows in the DataFrame
row_total = df.count()
row_total

3

# Read in data

In [None]:
path = "./students.csv"
# header=True takes the column names from the first line of the file.
# inferSchema=True makes Spark scan the data to detect column types; without
# it every column is read as a string, and min/max on the score columns would
# compare text ('100' sorts before '99') instead of numbers.
df = spark.read.csv(path, header=True, inferSchema=True)
df

DataFrame[gender: string, race/ethnicity: string, parental level of education: string, lunch: string, test preparation course: string, math score: string, reading score: string, writing score: string]

In [None]:
# Print each column's name, data type, and nullability as a tree
df.printSchema()

root
 |-- gender: string (nullable = true)
 |-- race/ethnicity: string (nullable = true)
 |-- parental level of education: string (nullable = true)
 |-- lunch: string (nullable = true)
 |-- test preparation course: string (nullable = true)
 |-- math score: string (nullable = true)
 |-- reading score: string (nullable = true)
 |-- writing score: string (nullable = true)



In [None]:
# Preview only the first five rows
df.show(n=5)

+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|gender|race/ethnicity|parental level of education|       lunch|test preparation course|math score|reading score|writing score|
+------+--------------+---------------------------+------------+-----------------------+----------+-------------+-------------+
|female|       group B|          bachelor's degree|    standard|                   none|        72|           72|           74|
|female|       group C|               some college|    standard|              completed|        69|           90|           88|
|female|       group B|            master's degree|    standard|                   none|        90|           95|           93|
|  male|       group A|         associate's degree|free/reduced|                   none|        47|           57|           44|
|  male|       group C|               some college|    standard|                   none|        76|     

# Aggregate Data

In [None]:
# Similar to pandas. In the dict form each key is a column and each value is
# the name of one aggregate, so a column gets a single metric per call.
(
    df.groupBy('gender')
    .agg({'math score': 'mean'})
    .show()
)

+------+------------------+
|gender|   avg(math score)|
+------+------------------+
|female|63.633204633204635|
|  male| 68.72821576763485|
+------+------------------+



In [None]:
# Cast the score to int before aggregating: min on a string column compares
# text, so '100' would sort below '47' and be reported as the minimum.
# Replacing the column in a derived frame keeps the output header
# 'min(math score)' and leaves df itself untouched.
(
    df.withColumn('math score', df['math score'].cast('int'))
    .groupBy('gender')
    .agg({'math score': 'min'})
    .show()
)

+------+---------------+
|gender|min(math score)|
+------+---------------+
|female|              0|
|  male|            100|
+------+---------------+



In [None]:
from pyspark.sql import functions as F

# With pyspark.sql.functions several metrics can be computed in one agg call.
# Cast the score to int first: on a string column min/max compare text
# ('100' < '99'), which yields a "min" larger than the "max".
(
    df.withColumn('math score', F.col('math score').cast('int'))
    .groupBy('gender')
    .agg(
        F.min('math score'),
        F.max('math score'),
        F.avg('math score'),
    )
    .show()
)

+------+---------------+---------------+------------------+
|gender|min(math score)|max(math score)|   avg(math score)|
+------+---------------+---------------+------------------+
|female|              0|             99|63.633204633204635|
|  male|            100|             99| 68.72821576763485|
+------+---------------+---------------+------------------+



# Spark Immutability 
Spark DataFrames are immutable. Any change — such as adding a column or overwriting values, even when the result is assigned back to the same variable name — produces a new DataFrame (with a new unique ID) instead of updating the existing one.

In [None]:
# Look up the id of the RDD that currently backs df
current_rdd = df.rdd
current_rdd.id()

163

In [None]:
# Plain assignment only binds a second name to the same DataFrame object,
# so the id stays the same
df2 = df
aliased_rdd = df2.rdd
aliased_rdd.id()

163

In [None]:
# withColumn hands back a new DataFrame; once df is rebound to it, the id
# reported for its RDD is different
df = df.withColumn(
    'new_col',
    df['math score'] * 2,
)
df.rdd.id()

169

In [None]:
# Printing a DataFrame shows its column names and types, not its rows
print(df)

DataFrame[gender: string, race/ethnicity: string, parental level of education: string, lunch: string, test preparation course: string, math score: string, reading score: string, writing score: string, new_col: double]


# Spark's Lazy Computation
Lazy evaluation in Spark means that execution does not start until it absolutely has to: transformations are only recorded, and nothing runs until a result is actually requested.

In [None]:
# A command of this kind is not actually run immediately
df = df.withColumn(
    colName='new_col',
    col=df['math score'] * 2,
)

In [None]:
# Neither of these two lines runs the pending withColumn work: print(df) only
# shows the schema, and df.rdd.id() only reads an identifier.
print(df)
print(df.rdd.id())
# An action such as show() requests actual rows, which is what finally makes
# Spark execute the recorded transformations.
df.show(5)

DataFrame[gender: string, race/ethnicity: string, parental level of education: string, lunch: string, test preparation course: string, math score: string, reading score: string, writing score: string, new_col: double]
175
