# Imports

In [43]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
from pyspark.sql.functions import col, lit, sum, avg, max, min, mean, count, udf
from pyspark import SparkConf, SparkContext

In [2]:
# Obtain the SparkSession for this notebook (reuses an existing session if one is active).
spark = (
    SparkSession.builder
    .appName("SparkDF")
    .getOrCreate()
)

In [0]:
# Load the students CSV: first line is the header, column types are inferred by Spark.
df_students = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("data/students.csv")
)

In [4]:
# Preview the first five student rows.
df_students.show(n=5)

+---+------+----------------+------+-----+-----+--------------------+
|age|gender|            name|course| roll|marks|               email|
+---+------+----------------+------+-----+-----+--------------------+
| 28|Female| Hubert Oliveras|    DB| 2984|   59|Annika Hoffman_Na...|
| 29|Female|Toshiko Hillyard| Cloud|12899|   62|Margene Moores_Ma...|
| 28|  Male|  Celeste Lollis|    PF|21267|   45|Jeannetta Golden_...|
| 29|Female|    Elenore Choy|    DB|32877|   29|Billi Clore_Mitzi...|
| 28|  Male|  Sheryll Towler|   DSA|41487|   41|Claude Panos_Judi...|
+---+------+----------------+------+-----+-----+--------------------+
only showing top 5 rows



# select, withColumn, filter

In [6]:
# withColumn + lit: attach a constant `total_marks` column (120 on every row).
total_marks_col = lit(120)
df_students = df_students.withColumn("total_marks", total_marks_col)
df_students.show(n=5)

+---+------+----------------+------+-----+-----+--------------------+-----------+
|age|gender|            name|course| roll|marks|               email|total_marks|
+---+------+----------------+------+-----+-----+--------------------+-----------+
| 28|Female| Hubert Oliveras|    DB| 2984|   59|Annika Hoffman_Na...|        120|
| 29|Female|Toshiko Hillyard| Cloud|12899|   62|Margene Moores_Ma...|        120|
| 28|  Male|  Celeste Lollis|    PF|21267|   45|Jeannetta Golden_...|        120|
| 29|Female|    Elenore Choy|    DB|32877|   29|Billi Clore_Mitzi...|        120|
| 28|  Male|  Sheryll Towler|   DSA|41487|   41|Claude Panos_Judi...|        120|
+---+------+----------------+------+-----+-----+--------------------+-----------+
only showing top 5 rows



In [7]:
# Derive `average` as marks expressed as a percentage of total_marks.
percentage_expr = (col("marks") / col("total_marks")) * 100
df_students = df_students.withColumn("average", percentage_expr)
df_students.show(n=5)

+---+------+----------------+------+-----+-----+--------------------+-----------+------------------+
|age|gender|            name|course| roll|marks|               email|total_marks|           average|
+---+------+----------------+------+-----+-----+--------------------+-----------+------------------+
| 28|Female| Hubert Oliveras|    DB| 2984|   59|Annika Hoffman_Na...|        120|49.166666666666664|
| 29|Female|Toshiko Hillyard| Cloud|12899|   62|Margene Moores_Ma...|        120| 51.66666666666667|
| 28|  Male|  Celeste Lollis|    PF|21267|   45|Jeannetta Golden_...|        120|              37.5|
| 29|Female|    Elenore Choy|    DB|32877|   29|Billi Clore_Mitzi...|        120|24.166666666666668|
| 28|  Male|  Sheryll Towler|   DSA|41487|   41|Claude Panos_Judi...|        120|34.166666666666664|
+---+------+----------------+------+-----+-----+--------------------+-----------+------------------+
only showing top 5 rows



In [10]:
# Students of the OOP course whose `average` is at least 80.
is_oop = col("course") == "OOP"
average_at_least_80 = col("average") >= 80
df_oop_b = df_students.where(is_oop & average_at_least_80)
df_oop_b.show(n=5)

+---+------+------------------+------+-------+-----+--------------------+-----------+-----------------+
|age|gender|              name|course|   roll|marks|               email|total_marks|          average|
+---+------+------------------+------+-------+-----+--------------------+-----------+-----------------+
| 28|Female|     Kizzy Brenner|   OOP|2542257|   96|Eda Neathery_Lawa...|        120|             80.0|
| 28|  Male|    Jenna Montague|   OOP|3331161|   98|Leontine Phillips...|        120|81.66666666666667|
| 29|Female|Priscila Tavernier|   OOP|3902993|   99|Celeste Lollis_Bi...|        120|             82.5|
| 29|  Male| Michelle Ruggiero|   OOP|4222829|   96|Kena Wild_Jalisa ...|        120|             80.0|
| 28|Female|      Judie Chipps|   OOP|5451977|   99|Tamera Blakley_Mi...|        120|             82.5|
+---+------+------------------+------+-------+-----+--------------------+-----------+-----------------+
only showing top 5 rows



In [11]:
# Students of the Cloud course whose `average` is at least 60.
is_cloud = col("course") == "Cloud"
average_at_least_60 = col("average") >= 60
df_cloud_d = df_students.where(is_cloud & average_at_least_60)
df_cloud_d.show(n=5)

+---+------+--------------+------+-------+-----+--------------------+-----------+-----------------+
|age|gender|          name|course|   roll|marks|               email|total_marks|          average|
+---+------+--------------+------+-------+-----+--------------------+-----------+-----------------+
| 28|Female|  Claude Panos| Cloud|  72409|   85|Sheryll Towler_Al...|        120|70.83333333333334|
| 29|  Male|   Billi Clore| Cloud| 512047|   76|Taryn Brownlee_Ju...|        120|63.33333333333333|
| 28|Female|Somer Stoecker| Cloud| 612490|   82|Sebrina Maresca_G...|        120|68.33333333333333|
| 29|Female|  Judie Chipps| Cloud| 632793|   75|Tijuana Kropf_Ele...|        120|             62.5|
| 29|Female|  Eda Neathery| Cloud|1011971|   91|Margene Moores_El...|        120|75.83333333333333|
+---+------+--------------+------+-------+-----+--------------------+-----------+-----------------+
only showing top 5 rows



In [12]:
# Project only the name and marks columns of the OOP subset.
df_oop_b.select(col("name"), col("marks")).show(n=5)

+------------------+-----+
|              name|marks|
+------------------+-----+
|     Kizzy Brenner|   96|
|    Jenna Montague|   98|
|Priscila Tavernier|   99|
| Michelle Ruggiero|   96|
|      Judie Chipps|   99|
+------------------+-----+
only showing top 5 rows



In [13]:
# Project only the name and marks columns of the Cloud subset.
df_cloud_d.select(col("name"), col("marks")).show(n=5)

+--------------+-----+
|          name|marks|
+--------------+-----+
|  Claude Panos|   85|
|   Billi Clore|   76|
|Somer Stoecker|   82|
|  Judie Chipps|   75|
|  Eda Neathery|   91|
+--------------+-----+
only showing top 5 rows



# distinct / dropDuplicates

In [16]:
# Total number of rows, followed by a short preview of the frame.
student_row_count = df_students.count()
display(student_row_count)
df_students.show(n=4)

1000

+---+------+----------------+------+-----+-----+--------------------+-----------+------------------+
|age|gender|            name|course| roll|marks|               email|total_marks|           average|
+---+------+----------------+------+-----+-----+--------------------+-----------+------------------+
| 28|Female| Hubert Oliveras|    DB| 2984|   59|Annika Hoffman_Na...|        120|49.166666666666664|
| 29|Female|Toshiko Hillyard| Cloud|12899|   62|Margene Moores_Ma...|        120| 51.66666666666667|
| 28|  Male|  Celeste Lollis|    PF|21267|   45|Jeannetta Golden_...|        120|              37.5|
| 29|Female|    Elenore Choy|    DB|32877|   29|Billi Clore_Mitzi...|        120|24.166666666666668|
+---+------+----------------+------+-----+-----+--------------------+-----------+------------------+
only showing top 4 rows



In [19]:
# Distinct (age, gender, course) combinations in df_students:
# first how many there are, then the combinations themselves.
unique_combos = df_students.select("age", "gender", "course").distinct()
display(unique_combos.count())
unique_combos.show()

24

+---+------+------+
|age|gender|course|
+---+------+------+
| 28|  Male| Cloud|
| 28|Female|    DB|
| 28|Female|   MVC|
| 29|  Male|    PF|
| 28|Female|   OOP|
| 29|Female| Cloud|
| 29|  Male| Cloud|
| 29|Female|   DSA|
| 29|  Male|   OOP|
| 29|  Male|    DB|
| 28|Female|    PF|
| 29|Female|   MVC|
| 29|Female|    DB|
| 28|  Male|    PF|
| 29|  Male|   DSA|
| 28|Female| Cloud|
| 29|Female|   OOP|
| 29|  Male|   MVC|
| 28|  Male|   OOP|
| 28|  Male|   DSA|
+---+------+------+
only showing top 20 rows



In [21]:
# dropDuplicates keeps whole rows (all columns), one per (age, gender, course) combination.
dedup_keys = ["age", "gender", "course"]
students_deduped = df_students.dropDuplicates(dedup_keys)
display(students_deduped.count())
students_deduped.show()

24

+---+------+----------------+------+------+-----+--------------------+-----------+------------------+
|age|gender|            name|course|  roll|marks|               email|total_marks|           average|
+---+------+----------------+------+------+-----+--------------------+-----------+------------------+
| 28|Female|    Claude Panos| Cloud| 72409|   85|Sheryll Towler_Al...|        120| 70.83333333333334|
| 28|Female| Hubert Oliveras|    DB|  2984|   59|Annika Hoffman_Na...|        120|49.166666666666664|
| 28|Female|    Jc Andrepont|   DSA|232060|   58|Billi Clore_Abram...|        120|48.333333333333336|
| 28|Female|    Cheri Kenney|   MVC|321816|   24|Kena Wild_Michell...|        120|              20.0|
| 28|Female|  Alberta Freund|   OOP|251805|   83|Annika Hoffman_Sh...|        120| 69.16666666666667|
| 28|Female|  Loris Crossett|    PF|332739|   62|Michelle Ruggiero...|        120| 51.66666666666667|
| 28|  Male|  Annika Hoffman| Cloud|722193|   55|Taryn Brownlee_El...|        120|

# sort/orderBy

In [22]:
# Load the office CSV: first line is the header, column types are inferred by Spark.
df_office = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("data/office.csv")
)
df_office.show()

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        James|     Sales|   NY| 90000| 34|10000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|       Robert|     Sales|   CA| 81000| 30|23000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|        Raman|   Finance|   CA| 99000| 40|24000|
|        Scott|   Finance|   NY| 83000| 36|19000|
|          Jen|   Finance|   NY| 79000| 53|15000|
|         Jeff| Marketing|   CA| 80000| 25|18000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
+-------------+----------+-----+------+---+-----+



In [23]:
# Sort by bonus in ascending order.
# Keep the sorted DataFrame in df_office2 and display it separately:
# DataFrame.show() returns None, so chaining it onto the assignment would
# bind df_office2 to None instead of the sorted frame.
df_office2 = df_office.sort("bonus")
df_office2.show()

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        James|     Sales|   NY| 90000| 34|10000|
|          Jen|   Finance|   NY| 79000| 53|15000|
|         Jeff| Marketing|   CA| 80000| 25|18000|
|        Scott|   Finance|   NY| 83000| 36|19000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
|       Robert|     Sales|   CA| 81000| 30|23000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|        Raman|   Finance|   CA| 99000| 40|24000|
+-------------+----------+-----+------+---+-----+



In [25]:
# Order df_office by age (descending), breaking ties by salary (ascending).
df_office3 = df_office.orderBy(["age", "salary"], ascending=[False, True])
df_office3.show()

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|      Michael|     Sales|   NY| 86000| 56|20000|
|          Jen|   Finance|   NY| 79000| 53|15000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
|        Raman|   Finance|   CA| 99000| 40|24000|
|        Scott|   Finance|   NY| 83000| 36|19000|
|        James|     Sales|   NY| 90000| 34|10000|
|       Robert|     Sales|   CA| 81000| 30|23000|
|         Jeff| Marketing|   CA| 80000| 25|18000|
|        Maria|   Finance|   CA| 90000| 24|23000|
+-------------+----------+-----+------+---+-----+



In [26]:
# Order df_office by age (desc), then bonus (desc), then salary (asc).
sort_keys = ["age", "bonus", "salary"]
sort_ascending = [False, False, True]
df_office.orderBy(sort_keys, ascending=sort_ascending).show()

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|      Michael|     Sales|   NY| 86000| 56|20000|
|          Jen|   Finance|   NY| 79000| 53|15000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
|        Raman|   Finance|   CA| 99000| 40|24000|
|        Scott|   Finance|   NY| 83000| 36|19000|
|        James|     Sales|   NY| 90000| 34|10000|
|       Robert|     Sales|   CA| 81000| 30|23000|
|         Jeff| Marketing|   CA| 80000| 25|18000|
|        Maria|   Finance|   CA| 90000| 24|23000|
+-------------+----------+-----+------+---+-----+



# groupBy

In [27]:
# Reminder of the df_students columns before grouping.
df_students.show(n=4)

+---+------+----------------+------+-----+-----+--------------------+-----------+------------------+
|age|gender|            name|course| roll|marks|               email|total_marks|           average|
+---+------+----------------+------+-----+-----+--------------------+-----------+------------------+
| 28|Female| Hubert Oliveras|    DB| 2984|   59|Annika Hoffman_Na...|        120|49.166666666666664|
| 29|Female|Toshiko Hillyard| Cloud|12899|   62|Margene Moores_Ma...|        120| 51.66666666666667|
| 28|  Male|  Celeste Lollis|    PF|21267|   45|Jeannetta Golden_...|        120|              37.5|
| 29|Female|    Elenore Choy|    DB|32877|   29|Billi Clore_Mitzi...|        120|24.166666666666668|
+---+------+----------------+------+-----+-----+--------------------+-----------+------------------+
only showing top 4 rows



In [34]:
# Group df_students by course and gender, then use .agg() to count the rows
# and sum the marks in each group.
# The sum is aliased `grade_sum` (not `total_marks`): df_students already has a
# `total_marks` column holding the constant 120, so reusing that name for a
# per-group sum of marks would be misleading.
# `count` and `sum` here are the pyspark.sql.functions versions imported at the
# top of the notebook, not the Python builtins.
df_students.groupBy("course", "gender").agg(
    count('*').alias('total_enrollment'), sum('marks').alias('grade_sum')) \
    .sort('course').show()

+------+------+----------------+---------+
|course|gender|total_enrollment|grade_sum|
+------+------+----------------+---------+
| Cloud|  Male|              86|     5127|
| Cloud|Female|             106|     6316|
|    DB|  Male|              82|     5073|
|    DB|Female|              75|     4197|
|   DSA|Female|              98|     6124|
|   DSA|  Male|              78|     4826|
|   MVC|Female|              71|     4344|
|   MVC|  Male|              86|     5241|
|   OOP|  Male|              70|     4234|
|   OOP|Female|              82|     4682|
|    PF|  Male|              97|     5960|
|    PF|Female|              69|     3973|
+------+------+----------------+---------+



In [35]:
# Per (course, age) group: minimum, maximum, mean and total of marks.
# min/max/avg/sum are the pyspark.sql.functions aggregates imported above.
marks_stats = [
    min('marks').alias('min_marks'),
    max('marks').alias('max_marks'),
    avg('marks').alias('avg_marks'),
    sum('marks').alias('sum_marks'),
]
df_students.groupBy("course", "age").agg(*marks_stats).sort('course').show()

+------+---+---------+---------+------------------+---------+
|course|age|min_marks|max_marks|         avg_marks|sum_marks|
+------+---+---------+---------+------------------+---------+
| Cloud| 29|       21|       98|             61.25|     5635|
| Cloud| 28|       20|       99|             58.08|     5808|
|    DB| 28|       21|       98| 58.76829268292683|     4819|
|    DB| 29|       20|       98|59.346666666666664|     4451|
|   DSA| 28|       20|       99|  64.6867469879518|     5369|
|   DSA| 29|       20|       99| 60.01075268817204|     5581|
|   MVC| 29|       22|       99| 61.56470588235294|     5233|
|   MVC| 28|       23|       99| 60.44444444444444|     4352|
|   OOP| 29|       20|       99|59.729729729729726|     4420|
|   OOP| 28|       23|       99| 57.64102564102564|     4496|
|    PF| 29|       20|       99|56.275862068965516|     4896|
|    PF| 28|       20|       98| 63.75949367088607|     5037|
+------+---+---------+---------+------------------+---------+



# Word Count

In [38]:
# Read the words file as text; the preview below shows a single `value` column.
words_path = "data/words.txt"
df_words = spark.read.text(words_path)
df_words.show(n=4)

+-----+
|value|
+-----+
|Apple|
|  Mic|
|  Mic|
|Apple|
+-----+
only showing top 4 rows



In [39]:
# Occurrences of each word in df_words, most frequent first.
word_counts = df_words.groupBy("value").count()
word_counts.orderBy("count", ascending=False).show()

+------+-----+
| value|count|
+------+-----+
| Chair|   15|
|   Mic|   10|
| Apple|   10|
|  Book|    5|
|Laptop|    5|
|   Bag|    5|
|Mobile|    5|
+------+-----+



# UDFs

In [40]:
# Reminder of the df_office columns used by the UDF below.
df_office.show(n=4)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        James|     Sales|   NY| 90000| 34|10000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|       Robert|     Sales|   CA| 81000| 30|23000|
|        Maria|   Finance|   CA| 90000| 24|23000|
+-------------+----------+-----+------+---+-----+
only showing top 4 rows



In [45]:
def get_inc(state, salary, bonus):
    """Compute an employee's increment from state, salary and bonus.

    Rates applied:
      * "CA": 12% of salary plus 3% of bonus
      * "NY": 10% of salary plus 5% of bonus
      * any other state (including None): 0.0

    Args:
        state: State code of the employee.
        salary: Numeric salary, or None when the value is missing.
        bonus: Numeric bonus, or None when the value is missing.

    Returns:
        The increment as a float, or None when the state has a rate but
        salary or bonus is missing.
    """
    # (salary rate, bonus rate) per state.
    rates_by_state = {
        "CA": (0.12, 0.03),
        "NY": (0.10, 0.05),
    }

    rates = rates_by_state.get(state)
    if rates is None:
        # Always return a float: this function is wrapped in a UDF declared as
        # DoubleType, and a Python int result can come back from Spark as null.
        return 0.0

    if salary is None or bonus is None:
        # A missing input yields a missing result instead of a TypeError
        # raised from inside the UDF.
        return None

    salary_rate, bonus_rate = rates
    return float(salary * salary_rate + bonus * bonus_rate)


# Register get_inc as a Spark UDF that produces a double column.
get_inc_udf = udf(get_inc, DoubleType())

# Apply the UDF row by row to (state, salary, bonus) and show the result as `increment`.
increment_col = get_inc_udf(col("state"), col("salary"), col("bonus"))
df_office.withColumn("increment", increment_col).show()

+-------------+----------+-----+------+---+-----+---------+
|employee_name|department|state|salary|age|bonus|increment|
+-------------+----------+-----+------+---+-----+---------+
|        James|     Sales|   NY| 90000| 34|10000|   9500.0|
|      Michael|     Sales|   NY| 86000| 56|20000|   9600.0|
|       Robert|     Sales|   CA| 81000| 30|23000|  10410.0|
|        Maria|   Finance|   CA| 90000| 24|23000|  11490.0|
|        Raman|   Finance|   CA| 99000| 40|24000|  12600.0|
|        Scott|   Finance|   NY| 83000| 36|19000|   9250.0|
|          Jen|   Finance|   NY| 79000| 53|15000|   8650.0|
|         Jeff| Marketing|   CA| 80000| 25|18000|  10140.0|
|        Kumar| Marketing|   NY| 91000| 50|21000|  10150.0|
+-------------+----------+-----+------+---+-----+---------+

