# Imports

In [74]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, \
    IntegerType, FloatType
from pyspark.sql.dataframe import DataFrame
from pyspark.sql.functions import col, lit, sum, avg, max, min, mean, \
    count, udf
from pyspark import SparkConf, SparkContext

In [3]:
student_data = 'data/students.csv'

In [4]:
spark = SparkSession.builder.appName('PracticeDataframe').getOrCreate()

# Schema

In [5]:
schema = StructType([
    StructField('age', IntegerType(), True),
    StructField('gender', StringType(), True),
    StructField('name', StringType(), True),
    StructField('course', StringType(), True),
    StructField('roll', StringType(), True),
    StructField('marks', IntegerType(), True),
    StructField('email', StringType(), True),
])

In [6]:
df = spark.read.options(header=True).schema(schema).csv('data/students.csv')
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- name: string (nullable = true)
 |-- course: string (nullable = true)
 |-- roll: string (nullable = true)
 |-- marks: integer (nullable = true)
 |-- email: string (nullable = true)



In [7]:
df.show()

+---+------+----------------+------+------+-----+--------------------+
|age|gender|            name|course|  roll|marks|               email|
+---+------+----------------+------+------+-----+--------------------+
| 28|Female| Hubert Oliveras|    DB| 02984|   59|Annika Hoffman_Na...|
| 29|Female|Toshiko Hillyard| Cloud| 12899|   62|Margene Moores_Ma...|
| 28|  Male|  Celeste Lollis|    PF| 21267|   45|Jeannetta Golden_...|
| 29|Female|    Elenore Choy|    DB| 32877|   29|Billi Clore_Mitzi...|
| 28|  Male|  Sheryll Towler|   DSA| 41487|   41|Claude Panos_Judi...|
| 28|  Male|  Margene Moores|   MVC| 52771|   32|Toshiko Hillyard_...|
| 28|  Male|     Neda Briski|   OOP| 61973|   69|Alberta Freund_El...|
| 28|Female|    Claude Panos| Cloud| 72409|   85|Sheryll Towler_Al...|
| 28|  Male|  Celeste Lollis|   MVC| 81492|   64|Nicole Harwood_Cl...|
| 29|  Male|  Cordie Harnois|   OOP| 92882|   51|Judie Chipps_Clem...|
| 29|Female|       Kena Wild|   DSA|102285|   35|Dustin Feagins_Ma...|
| 29| 

# RDD to DF

In [8]:
conf = SparkConf().setAppName("RDD")
sc = SparkContext.getOrCreate(conf=conf)
rdd_students = sc.textFile(student_data)
rdd_students_header = rdd_students.first()
rdd_students = rdd_students \
    .filter(lambda row: row != rdd_students_header) \
    .map(lambda elem: elem.split(',')) \
    .map(lambda row: [int(row[0]), row[1], row[2], row[3], row[4], int(row[5]), row[6]])

rdd_students.collect()[0:2]

[[28,
  'Female',
  'Hubert Oliveras',
  'DB',
  '02984',
  59,
  'Annika Hoffman_Naoma Fritts@OOP.com'],
 [29,
  'Female',
  'Toshiko Hillyard',
  'Cloud',
  '12899',
  62,
  'Margene Moores_Marylee Capasso@DB.com']]

In [9]:
header_row = rdd_students_header.split(',')
df_from_rdd = rdd_students.toDF(header_row)
df_from_rdd.show()

+---+------+----------------+------+------+-----+--------------------+
|age|gender|            name|course|  roll|marks|               email|
+---+------+----------------+------+------+-----+--------------------+
| 28|Female| Hubert Oliveras|    DB| 02984|   59|Annika Hoffman_Na...|
| 29|Female|Toshiko Hillyard| Cloud| 12899|   62|Margene Moores_Ma...|
| 28|  Male|  Celeste Lollis|    PF| 21267|   45|Jeannetta Golden_...|
| 29|Female|    Elenore Choy|    DB| 32877|   29|Billi Clore_Mitzi...|
| 28|  Male|  Sheryll Towler|   DSA| 41487|   41|Claude Panos_Judi...|
| 28|  Male|  Margene Moores|   MVC| 52771|   32|Toshiko Hillyard_...|
| 28|  Male|     Neda Briski|   OOP| 61973|   69|Alberta Freund_El...|
| 28|Female|    Claude Panos| Cloud| 72409|   85|Sheryll Towler_Al...|
| 28|  Male|  Celeste Lollis|   MVC| 81492|   64|Nicole Harwood_Cl...|
| 29|  Male|  Cordie Harnois|   OOP| 92882|   51|Judie Chipps_Clem...|
| 29|Female|       Kena Wild|   DSA|102285|   35|Dustin Feagins_Ma...|
| 29| 

## RDD to DF with schema

In [10]:
df_from_rdd_from_schema = spark.createDataFrame(rdd_students, schema=schema)
df_from_rdd_from_schema.printSchema()

root
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- name: string (nullable = true)
 |-- course: string (nullable = true)
 |-- roll: string (nullable = true)
 |-- marks: integer (nullable = true)
 |-- email: string (nullable = true)



In [11]:
df_from_rdd_from_schema.show()

+---+------+----------------+------+------+-----+--------------------+
|age|gender|            name|course|  roll|marks|               email|
+---+------+----------------+------+------+-----+--------------------+
| 28|Female| Hubert Oliveras|    DB| 02984|   59|Annika Hoffman_Na...|
| 29|Female|Toshiko Hillyard| Cloud| 12899|   62|Margene Moores_Ma...|
| 28|  Male|  Celeste Lollis|    PF| 21267|   45|Jeannetta Golden_...|
| 29|Female|    Elenore Choy|    DB| 32877|   29|Billi Clore_Mitzi...|
| 28|  Male|  Sheryll Towler|   DSA| 41487|   41|Claude Panos_Judi...|
| 28|  Male|  Margene Moores|   MVC| 52771|   32|Toshiko Hillyard_...|
| 28|  Male|     Neda Briski|   OOP| 61973|   69|Alberta Freund_El...|
| 28|Female|    Claude Panos| Cloud| 72409|   85|Sheryll Towler_Al...|
| 28|  Male|  Celeste Lollis|   MVC| 81492|   64|Nicole Harwood_Cl...|
| 29|  Male|  Cordie Harnois|   OOP| 92882|   51|Judie Chipps_Clem...|
| 29|Female|       Kena Wild|   DSA|102285|   35|Dustin Feagins_Ma...|
| 29| 

# Selecting DataFrame Columns

In [12]:
df_students = df_from_rdd_from_schema
df_students.select("name", "gender").show(5)

+----------------+------+
|            name|gender|
+----------------+------+
| Hubert Oliveras|Female|
|Toshiko Hillyard|Female|
|  Celeste Lollis|  Male|
|    Elenore Choy|Female|
|  Sheryll Towler|  Male|
+----------------+------+
only showing top 5 rows



In [13]:
df_students.select(df_students.name, df_students.email).show(5)

+----------------+--------------------+
|            name|               email|
+----------------+--------------------+
| Hubert Oliveras|Annika Hoffman_Na...|
|Toshiko Hillyard|Margene Moores_Ma...|
|  Celeste Lollis|Jeannetta Golden_...|
|    Elenore Choy|Billi Clore_Mitzi...|
|  Sheryll Towler|Claude Panos_Judi...|
+----------------+--------------------+
only showing top 5 rows



In [14]:
df.select(col("roll"), col("name")).show(5)

+-----+----------------+
| roll|            name|
+-----+----------------+
|02984| Hubert Oliveras|
|12899|Toshiko Hillyard|
|21267|  Celeste Lollis|
|32877|    Elenore Choy|
|41487|  Sheryll Towler|
+-----+----------------+
only showing top 5 rows



## Select by column index

In [15]:
df_students.select(df_students.columns[0:3]).show(5)

+---+------+----------------+
|age|gender|            name|
+---+------+----------------+
| 28|Female| Hubert Oliveras|
| 29|Female|Toshiko Hillyard|
| 28|  Male|  Celeste Lollis|
| 29|Female|    Elenore Choy|
| 28|  Male|  Sheryll Towler|
+---+------+----------------+
only showing top 5 rows



In [16]:
df_students.select(df_students.columns[3], df_students.columns[4]).show(5)

+------+-----+
|course| roll|
+------+-----+
|    DB|02984|
| Cloud|12899|
|    PF|21267|
|    DB|32877|
|   DSA|41487|
+------+-----+
only showing top 5 rows



# .withColumn()

In [17]:
df_students2 = df_students.withColumn("roll", col("roll").cast("Int"))
df_students2.printSchema()

root
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- name: string (nullable = true)
 |-- course: string (nullable = true)
 |-- roll: integer (nullable = true)
 |-- marks: integer (nullable = true)
 |-- email: string (nullable = true)



In [18]:
df_students2 = df_students2.withColumn("marks_10", col("marks") + 10)
df_students2.show(5)

+---+------+----------------+------+-----+-----+--------------------+--------+
|age|gender|            name|course| roll|marks|               email|marks_10|
+---+------+----------------+------+-----+-----+--------------------+--------+
| 28|Female| Hubert Oliveras|    DB| 2984|   59|Annika Hoffman_Na...|      69|
| 29|Female|Toshiko Hillyard| Cloud|12899|   62|Margene Moores_Ma...|      72|
| 28|  Male|  Celeste Lollis|    PF|21267|   45|Jeannetta Golden_...|      55|
| 29|Female|    Elenore Choy|    DB|32877|   29|Billi Clore_Mitzi...|      39|
| 28|  Male|  Sheryll Towler|   DSA|41487|   41|Claude Panos_Judi...|      51|
+---+------+----------------+------+-----+-----+--------------------+--------+
only showing top 5 rows



In [19]:
df_students2 = df_students2.withColumn("country", lit('USA'))
df_students2.show(5)

+---+------+----------------+------+-----+-----+--------------------+--------+-------+
|age|gender|            name|course| roll|marks|               email|marks_10|country|
+---+------+----------------+------+-----+-----+--------------------+--------+-------+
| 28|Female| Hubert Oliveras|    DB| 2984|   59|Annika Hoffman_Na...|      69|    USA|
| 29|Female|Toshiko Hillyard| Cloud|12899|   62|Margene Moores_Ma...|      72|    USA|
| 28|  Male|  Celeste Lollis|    PF|21267|   45|Jeannetta Golden_...|      55|    USA|
| 29|Female|    Elenore Choy|    DB|32877|   29|Billi Clore_Mitzi...|      39|    USA|
| 28|  Male|  Sheryll Towler|   DSA|41487|   41|Claude Panos_Judi...|      51|    USA|
+---+------+----------------+------+-----+-----+--------------------+--------+-------+
only showing top 5 rows



In [20]:
# adding multiple columns
df_students2.withColumn("course_eng", col('course') + 'Engineering') \
    .withColumn('mark_pct', col('marks') / 100) \
    .withColumn('age_old', col('age') * 2.5).show(5)

+---+------+----------------+------+-----+-----+--------------------+--------+-------+----------+--------+-------+
|age|gender|            name|course| roll|marks|               email|marks_10|country|course_eng|mark_pct|age_old|
+---+------+----------------+------+-----+-----+--------------------+--------+-------+----------+--------+-------+
| 28|Female| Hubert Oliveras|    DB| 2984|   59|Annika Hoffman_Na...|      69|    USA|      null|    0.59|   70.0|
| 29|Female|Toshiko Hillyard| Cloud|12899|   62|Margene Moores_Ma...|      72|    USA|      null|    0.62|   72.5|
| 28|  Male|  Celeste Lollis|    PF|21267|   45|Jeannetta Golden_...|      55|    USA|      null|    0.45|   70.0|
| 29|Female|    Elenore Choy|    DB|32877|   29|Billi Clore_Mitzi...|      39|    USA|      null|    0.29|   72.5|
| 28|  Male|  Sheryll Towler|   DSA|41487|   41|Claude Panos_Judi...|      51|    USA|      null|    0.41|   70.0|
+---+------+----------------+------+-----+-----+--------------------+--------+--

# .withColumnRenamed()

In [23]:
df_students = df_students.withColumnRenamed("marks", "grades").withColumnRenamed("roll", "roll_num")
df_students.show(5)

+---+------+----------------+------+--------+------+--------------------+
|age|gender|            name|course|roll_num|grades|               email|
+---+------+----------------+------+--------+------+--------------------+
| 28|Female| Hubert Oliveras|    DB|   02984|    59|Annika Hoffman_Na...|
| 29|Female|Toshiko Hillyard| Cloud|   12899|    62|Margene Moores_Ma...|
| 28|  Male|  Celeste Lollis|    PF|   21267|    45|Jeannetta Golden_...|
| 29|Female|    Elenore Choy|    DB|   32877|    29|Billi Clore_Mitzi...|
| 28|  Male|  Sheryll Towler|   DSA|   41487|    41|Claude Panos_Judi...|
+---+------+----------------+------+--------+------+--------------------+
only showing top 5 rows



In [26]:
# .alias()
df_students.select(col('name').alias('full_name')).show(5)

+----------------+
|       full_name|
+----------------+
| Hubert Oliveras|
|Toshiko Hillyard|
|  Celeste Lollis|
|    Elenore Choy|
|  Sheryll Towler|
+----------------+
only showing top 5 rows



# .filter() .isin() .like() .startswith() .endswith() .contains()

In [30]:
df_students.filter(df_students.course == 'DB').show(5)

+---+------+---------------+------+--------+------+--------------------+
|age|gender|           name|course|roll_num|grades|               email|
+---+------+---------------+------+--------+------+--------------------+
| 28|Female|Hubert Oliveras|    DB|   02984|    59|Annika Hoffman_Na...|
| 29|Female|   Elenore Choy|    DB|   32877|    29|Billi Clore_Mitzi...|
| 29|  Male|Ernest Rossbach|    DB|  111449|    53|Maybell Duguay_Ab...|
| 28|Female| Latia Vanhoose|    DB|  122502|    27|Latia Vanhoose_Mi...|
| 29|Female| Latia Vanhoose|    DB|  152159|    27|Claude Panos_Sant...|
+---+------+---------------+------+--------+------+--------------------+
only showing top 5 rows



In [31]:
df_students.filter(col('course') == 'DB').show(5)

+---+------+---------------+------+--------+------+--------------------+
|age|gender|           name|course|roll_num|grades|               email|
+---+------+---------------+------+--------+------+--------------------+
| 28|Female|Hubert Oliveras|    DB|   02984|    59|Annika Hoffman_Na...|
| 29|Female|   Elenore Choy|    DB|   32877|    29|Billi Clore_Mitzi...|
| 29|  Male|Ernest Rossbach|    DB|  111449|    53|Maybell Duguay_Ab...|
| 28|Female| Latia Vanhoose|    DB|  122502|    27|Latia Vanhoose_Mi...|
| 29|Female| Latia Vanhoose|    DB|  152159|    27|Claude Panos_Sant...|
+---+------+---------------+------+--------+------+--------------------+
only showing top 5 rows



In [33]:
# & and | or
df_students.filter((df_students.course == 'DB') & (df_students.grades > 80)).show(5)

+---+------+------------------+------+--------+------+--------------------+
|age|gender|              name|course|roll_num|grades|               email|
+---+------+------------------+------+--------+------+--------------------+
| 28|  Male|   Marylee Capasso|    DB| 1611411|    96|Annika Hoffman_Lo...|
| 29|  Male|    Alberta Freund|    DB| 1671638|    98|Clementina Menke_...|
| 28|Female|Priscila Tavernier|    DB| 1722388|    86|Lawanda Wohlwend_...|
| 29|  Male|    Dustin Feagins|    DB| 1782045|    81|Cordie Harnois_To...|
| 29|  Male|     Tijuana Kropf|    DB| 1882795|    92|Hubert Oliveras_E...|
+---+------+------------------+------+--------+------+--------------------+
only showing top 5 rows



In [34]:
# .isin()
courses = ['DB', 'Cloud', 'OOP']
df_students.filter(df_students.course.isin(courses)).show(5)

+---+------+----------------+------+--------+------+--------------------+
|age|gender|            name|course|roll_num|grades|               email|
+---+------+----------------+------+--------+------+--------------------+
| 28|Female| Hubert Oliveras|    DB|   02984|    59|Annika Hoffman_Na...|
| 29|Female|Toshiko Hillyard| Cloud|   12899|    62|Margene Moores_Ma...|
| 29|Female|    Elenore Choy|    DB|   32877|    29|Billi Clore_Mitzi...|
| 28|  Male|     Neda Briski|   OOP|   61973|    69|Alberta Freund_El...|
| 28|Female|    Claude Panos| Cloud|   72409|    85|Sheryll Towler_Al...|
+---+------+----------------+------+--------+------+--------------------+
only showing top 5 rows



In [35]:
# also .like() .startswith() .endswith()
df_students.filter(df_students.course.startswith('D')).show(5)

+---+------+---------------+------+--------+------+--------------------+
|age|gender|           name|course|roll_num|grades|               email|
+---+------+---------------+------+--------+------+--------------------+
| 28|Female|Hubert Oliveras|    DB|   02984|    59|Annika Hoffman_Na...|
| 29|Female|   Elenore Choy|    DB|   32877|    29|Billi Clore_Mitzi...|
| 28|  Male| Sheryll Towler|   DSA|   41487|    41|Claude Panos_Judi...|
| 29|Female|      Kena Wild|   DSA|  102285|    35|Dustin Feagins_Ma...|
| 29|  Male|Ernest Rossbach|    DB|  111449|    53|Maybell Duguay_Ab...|
+---+------+---------------+------+--------+------+--------------------+
only showing top 5 rows



In [36]:
# .contains()
df_students.filter(df_students.name.contains('se')).show(5)

+---+------+--------------+------+--------+------+--------------------+
|age|gender|          name|course|roll_num|grades|               email|
+---+------+--------------+------+--------+------+--------------------+
| 28|Female|Latia Vanhoose|    DB|  122502|    27|Latia Vanhoose_Mi...|
| 29|Female|Latia Vanhoose|   MVC|  132110|    55|Eda Neathery_Nico...|
| 29|Female|Latia Vanhoose|    DB|  152159|    27|Claude Panos_Sant...|
| 29|  Male|Loris Crossett|   MVC|  161771|    36|Mitzi Seldon_Jenn...|
| 29|Female|Loris Crossett|    PF|  201487|    96|Elenore Choy_Lati...|
+---+------+--------------+------+--------+------+--------------------+
only showing top 5 rows



In [38]:
# like the SQL like operator
# "_" matches any single character and "%" matches any sequence of characters
df_students.filter(df_students.name.like('%s_e%')).show(5)

+---+------+--------------+------+--------+------+--------------------+
|age|gender|          name|course|roll_num|grades|               email|
+---+------+--------------+------+--------+------+--------------------+
| 28|  Male|Celeste Lollis|    PF|   21267|    45|Jeannetta Golden_...|
| 28|  Male|Celeste Lollis|   MVC|   81492|    64|Nicole Harwood_Cl...|
| 29|  Male|Loris Crossett|   MVC|  161771|    36|Mitzi Seldon_Jenn...|
| 29|Female|Loris Crossett|    PF|  201487|    96|Elenore Choy_Lati...|
| 28|Female|Loris Crossett|    PF|  332739|    62|Michelle Ruggiero...|
+---+------+--------------+------+--------+------+--------------------+
only showing top 5 rows



# .count() .distinct() .dropDuplicates()

In [40]:
# .count()
df_students.filter(df_students.course == 'DB').count()

157

In [41]:
# how many distinct rows are there?
df_students.distinct().count()

1000

In [44]:
# distinct will return the selected columns
display(df_students.select('gender', 'age').distinct().count())
df_students.select('gender', 'age').distinct().show()

4

+------+---+
|gender|age|
+------+---+
|Female| 29|
|Female| 28|
|  Male| 28|
|  Male| 29|
+------+---+



In [45]:
# get the rows with the first combination
df_students.drop_duplicates(['gender', 'course']).show()

+---+------+----------------+------+--------+------+--------------------+
|age|gender|            name|course|roll_num|grades|               email|
+---+------+----------------+------+--------+------+--------------------+
| 29|Female|Toshiko Hillyard| Cloud|   12899|    62|Margene Moores_Ma...|
| 28|Female| Hubert Oliveras|    DB|   02984|    59|Annika Hoffman_Na...|
| 29|Female|       Kena Wild|   DSA|  102285|    35|Dustin Feagins_Ma...|
| 29|Female|  Latia Vanhoose|   MVC|  132110|    55|Eda Neathery_Nico...|
| 28|Female|  Alberta Freund|   OOP|  251805|    83|Annika Hoffman_Sh...|
| 29|Female|  Loris Crossett|    PF|  201487|    96|Elenore Choy_Lati...|
| 29|  Male|     Billi Clore| Cloud|  512047|    76|Taryn Brownlee_Ju...|
| 29|  Male| Ernest Rossbach|    DB|  111449|    53|Maybell Duguay_Ab...|
| 28|  Male|  Sheryll Towler|   DSA|   41487|    41|Claude Panos_Judi...|
| 28|  Male|  Margene Moores|   MVC|   52771|    32|Toshiko Hillyard_...|
| 28|  Male|     Neda Briski|   OOP|  

# .orderBy() .sort()

In [46]:
df_students.sort("grades").show(5)

+---+------+-----------------+------+--------+------+--------------------+
|age|gender|             name|course|roll_num|grades|               email|
+---+------+-----------------+------+--------+------+--------------------+
| 29|Female|   Tamera Blakley|   DSA| 3911247|    20|Donna Yerby_Bonit...|
| 29|Female|Michelle Ruggiero|    DB| 9232210|    20|Donna Yerby_Latia...|
| 28|  Male|  Marylee Capasso|   DSA| 2081560|    20|Sheryll Towler_Do...|
| 29|Female|  Gonzalo Ferebee|   DSA| 5631172|    20|Jeannetta Golden_...|
| 29|  Male|Michelle Ruggiero|    PF| 6001585|    20|Paris Hutton_Marg...|
+---+------+-----------------+------+--------+------+--------------------+
only showing top 5 rows



In [47]:
df_students.sort("grades", "age").show(5)

+---+------+---------------+------+--------+------+--------------------+
|age|gender|           name|course|roll_num|grades|               email|
+---+------+---------------+------+--------+------+--------------------+
| 28|Female| Maybell Duguay| Cloud|  261439|    20|Nicole Harwood_Ju...|
| 28|Female|   Jc Andrepont|    PF|  972733|    20|Eda Neathery_Eda ...|
| 28|  Male|Marylee Capasso|   DSA| 2081560|    20|Sheryll Towler_Do...|
| 29|  Male|   Elenore Choy|    DB| 3652057|    20|Jc Andrepont_Gonz...|
| 29|Female| Tamera Blakley|   DSA| 3911247|    20|Donna Yerby_Bonit...|
+---+------+---------------+------+--------+------+--------------------+
only showing top 5 rows



In [48]:
df_students.orderBy("grades").show(5)

+---+------+---------------+------+--------+------+--------------------+
|age|gender|           name|course|roll_num|grades|               email|
+---+------+---------------+------+--------+------+--------------------+
| 28|  Male|Marylee Capasso|   DSA| 2081560|    20|Sheryll Towler_Do...|
| 29|Female| Tamera Blakley|   DSA| 3911247|    20|Donna Yerby_Bonit...|
| 28|Female| Maybell Duguay| Cloud|  261439|    20|Nicole Harwood_Ju...|
| 28|Female|   Jc Andrepont|    PF|  972733|    20|Eda Neathery_Eda ...|
| 29|  Male|   Elenore Choy|    DB| 3652057|    20|Jc Andrepont_Gonz...|
+---+------+---------------+------+--------+------+--------------------+
only showing top 5 rows



In [49]:
df_students.sort(col("grades").desc()).show(4)

+---+------+-----------------+------+--------+------+--------------------+
|age|gender|             name|course|roll_num|grades|               email|
+---+------+-----------------+------+--------+------+--------------------+
| 28|Female|   Melani Engberg| Cloud| 1872667|    99|Alberta Freund_Ni...|
| 29|  Male|   Maybell Duguay|    PF|  701486|    99|Clementina Menke_...|
| 29|Female|     Paris Hutton|   DSA|  271472|    99|Sheryll Towler_Al...|
| 29|  Male|Michelle Ruggiero|   DSA| 1022971|    99|Cordie Harnois_Cl...|
+---+------+-----------------+------+--------+------+--------------------+
only showing top 4 rows



In [51]:
# sort grades by ascending and age by descending
df_students.sort(df_students.grades.asc(), df_students.age.desc()).show(5)

+---+------+-----------------+------+--------+------+--------------------+
|age|gender|             name|course|roll_num|grades|               email|
+---+------+-----------------+------+--------+------+--------------------+
| 29|Female|   Tamera Blakley|   DSA| 3911247|    20|Donna Yerby_Bonit...|
| 29|Female|Michelle Ruggiero|    DB| 9232210|    20|Donna Yerby_Latia...|
| 29|  Male|     Elenore Choy|    DB| 3652057|    20|Jc Andrepont_Gonz...|
| 29|Female|  Gonzalo Ferebee|   DSA| 5631172|    20|Jeannetta Golden_...|
| 29|  Male|Michelle Ruggiero|    PF| 6001585|    20|Paris Hutton_Marg...|
+---+------+-----------------+------+--------+------+--------------------+
only showing top 5 rows



# .groupBy() .agg()

In [52]:
df_students.show(4)

+---+------+----------------+------+--------+------+--------------------+
|age|gender|            name|course|roll_num|grades|               email|
+---+------+----------------+------+--------+------+--------------------+
| 28|Female| Hubert Oliveras|    DB|   02984|    59|Annika Hoffman_Na...|
| 29|Female|Toshiko Hillyard| Cloud|   12899|    62|Margene Moores_Ma...|
| 28|  Male|  Celeste Lollis|    PF|   21267|    45|Jeannetta Golden_...|
| 29|Female|    Elenore Choy|    DB|   32877|    29|Billi Clore_Mitzi...|
+---+------+----------------+------+--------+------+--------------------+
only showing top 4 rows



In [56]:
df_students.groupBy("gender").sum("grades").show()

+------+-----------+
|gender|sum(grades)|
+------+-----------+
|Female|      29636|
|  Male|      30461|
+------+-----------+



In [54]:
# group df_students by course and count
df_students.groupBy("course").count().show()

+------+-----+
|course|count|
+------+-----+
|    PF|  166|
|    DB|  157|
|   MVC|  157|
|   DSA|  176|
| Cloud|  192|
|   OOP|  152|
+------+-----+



In [57]:
df_students.groupBy('gender').max('grades').show()

+------+-----------+
|gender|max(grades)|
+------+-----------+
|Female|         99|
|  Male|         99|
+------+-----------+



In [58]:
df_students.groupBy('gender').min('grades').show()

+------+-----------+
|gender|min(grades)|
+------+-----------+
|Female|         20|
|  Male|         20|
+------+-----------+



In [59]:
df_students.groupBy('age').avg('grades').show()

+---+------------------+
|age|       avg(grades)|
+---+------------------+
| 28|60.487854251012145|
| 29|59.715415019762844|
+---+------------------+



In [60]:
df_students.groupBy('age', 'gender').avg('grades').show()

+---+------+------------------+
|age|gender|       avg(grades)|
+---+------+------------------+
| 28|Female|       59.44140625|
| 29|  Male|60.524904214559385|
| 29|Female| 58.85306122448979|
| 28|  Male| 61.61344537815126|
+---+------+------------------+



In [67]:
df_students.groupBy('age', 'gender', 'course') \
    .agg(count("*").alias('student_count'),
         sum("grades").alias("grade_sum"),
         max("grades").alias("max_grade"),
         min("grades").alias("min_grade")).show()

+---+------+------+-------------+---------+---------+---------+
|age|gender|course|student_count|grade_sum|max_grade|min_grade|
+---+------+------+-------------+---------+---------+---------+
| 28|  Male| Cloud|           43|     2641|       97|       21|
| 28|Female|    DB|           40|     2261|       88|       24|
| 28|Female|   MVC|           34|     2094|       99|       24|
| 29|  Male|    PF|           53|     3055|       99|       20|
| 28|Female|   OOP|           43|     2408|       99|       24|
| 29|Female| Cloud|           49|     3149|       98|       24|
| 29|  Male| Cloud|           43|     2486|       88|       21|
| 29|Female|   DSA|           51|     2969|       99|       20|
| 29|  Male|   OOP|           35|     2146|       97|       20|
| 29|  Male|    DB|           40|     2515|       98|       20|
| 28|Female|    PF|           35|     2132|       98|       20|
| 29|Female|   MVC|           37|     2250|       97|       22|
| 29|Female|    DB|           35|     19

In [68]:
df_office = spark.read.options(header=True).csv('data/office.csv')
df_office.show(4)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        James|     Sales|   NY| 90000| 34|10000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|       Robert|     Sales|   CA| 81000| 30|23000|
|        Maria|   Finance|   CA| 90000| 24|23000|
+-------------+----------+-----+------+---+-----+
only showing top 4 rows



In [69]:
# count df_office by department
df_office.groupBy('department').count().show()

+----------+-----+
|department|count|
+----------+-----+
|     Sales|    3|
|   Finance|    4|
| Marketing|    2|
+----------+-----+



In [70]:
df_students.filter(df_students.gender == 'Male').groupBy('gender').count().show()

+------+-----+
|gender|count|
+------+-----+
|  Male|  499|
+------+-----+



In [72]:
# filter > group by > having
df_students.filter(df_students.gender == 'Male').groupBy('course', 'gender') \
    .agg(count('*').alias('total_enrollments')) \
    .where(col('total_enrollments') > 85).show()

+------+------+-----------------+
|course|gender|total_enrollments|
+------+------+-----------------+
|   MVC|  Male|               86|
|    PF|  Male|               97|
| Cloud|  Male|               86|
+------+------+-----------------+



# UDFs

In [77]:
df_office.printSchema()

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- age: string (nullable = true)
 |-- bonus: string (nullable = true)



In [79]:
# convert the df_office salary and bonus columns to integer
df_office = df_office.withColumn('salary', col('salary').cast('Int')) \
    .withColumn('bonus', col('bonus').cast('Int'))
df_office.printSchema()

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- age: string (nullable = true)
 |-- bonus: integer (nullable = true)



## get_total_salary() example

In [80]:
def get_total_salary(salary, bonus):
    return salary + bonus


get_total_salary_udf = udf(
    lambda salary, bonus: get_total_salary(salary, bonus), IntegerType())

df_office.withColumn('total_salary', get_total_salary_udf(col('salary'), col('bonus'))).show(4)

+-------------+----------+-----+------+---+-----+------------+
|employee_name|department|state|salary|age|bonus|total_salary|
+-------------+----------+-----+------+---+-----+------------+
|        James|     Sales|   NY| 90000| 34|10000|      100000|
|      Michael|     Sales|   NY| 86000| 56|20000|      106000|
|       Robert|     Sales|   CA| 81000| 30|23000|      104000|
|        Maria|   Finance|   CA| 90000| 24|23000|      113000|
+-------------+----------+-----+------+---+-----+------------+
only showing top 4 rows



# DF to RDD

In [81]:
type(df_students)

pyspark.sql.dataframe.DataFrame

In [82]:
rdd_students = df_students.rdd
type(rdd_students)

pyspark.rdd.RDD

In [87]:
# rdd_students.collect()[0:5]
rdd_students.take(5)

[Row(age=28, gender='Female', name='Hubert Oliveras', course='DB', roll_num='02984', grades=59, email='Annika Hoffman_Naoma Fritts@OOP.com'),
 Row(age=29, gender='Female', name='Toshiko Hillyard', course='Cloud', roll_num='12899', grades=62, email='Margene Moores_Marylee Capasso@DB.com'),
 Row(age=28, gender='Male', name='Celeste Lollis', course='PF', roll_num='21267', grades=45, email='Jeannetta Golden_Jenna Montague@DSA.com'),
 Row(age=29, gender='Female', name='Elenore Choy', course='DB', roll_num='32877', grades=29, email='Billi Clore_Mitzi Seldon@DB.com'),
 Row(age=28, gender='Male', name='Sheryll Towler', course='DSA', roll_num='41487', grades=41, email='Claude Panos_Judie Chipps@OOP.com')]

In [90]:
# filter rdd_students for female students
# rdd_students.filter(lambda row: (row[1] == 'Female' and row[3] == 'DB')).take(5)
rdd_students.filter(lambda row: (row['gender'] == 'Female' and row['course'] == 'DB')).take(5)

[Row(age=28, gender='Female', name='Hubert Oliveras', course='DB', roll_num='02984', grades=59, email='Annika Hoffman_Naoma Fritts@OOP.com'),
 Row(age=29, gender='Female', name='Elenore Choy', course='DB', roll_num='32877', grades=29, email='Billi Clore_Mitzi Seldon@DB.com'),
 Row(age=28, gender='Female', name='Latia Vanhoose', course='DB', roll_num='122502', grades=27, email='Latia Vanhoose_Mitzi Seldon@OOP.com'),
 Row(age=29, gender='Female', name='Latia Vanhoose', course='DB', roll_num='152159', grades=27, email='Claude Panos_Santa Kerfien@DB.com'),
 Row(age=28, gender='Female', name='Mickey Cortright', course='DB', roll_num='192537', grades=62, email='Ernest Rossbach_Marylee Capasso@Cloud.com')]

# Spark SQL

In [91]:
df_students.createOrReplaceTempView('students')

In [97]:
# Spark SQL version
spark.sql('select gender, course from students where age > 28').show(4)

+------+------+
|gender|course|
+------+------+
|Female| Cloud|
|Female|    DB|
|  Male|   OOP|
|Female|   DSA|
+------+------+
only showing top 4 rows



In [98]:
# Spark DataFrame version
df_students.select("gender", "course").filter(col("age") > 28).show(4)

+------+------+
|gender|course|
+------+------+
|Female| Cloud|
|Female|    DB|
|  Male|   OOP|
|Female|   DSA|
+------+------+
only showing top 4 rows



In [102]:
df_students_agg = spark.sql('select course, gender, count(*) as total_enrollments from students group by course, gender order by course')
df_students_agg.show()

+------+------+-----------------+
|course|gender|total_enrollments|
+------+------+-----------------+
| Cloud|  Male|               86|
| Cloud|Female|              106|
|    DB|  Male|               82|
|    DB|Female|               75|
|   DSA|Female|               98|
|   DSA|  Male|               78|
|   MVC|Female|               71|
|   MVC|  Male|               86|
|   OOP|  Male|               70|
|   OOP|Female|               82|
|    PF|  Male|               97|
|    PF|Female|               69|
+------+------+-----------------+



# Write to an output file

In [103]:
df_students_agg.write.csv('data/students_agg.csv', header=True)

In [105]:
df_students.write.csv('data/df_students.csv', header=True)

In [106]:
df_students.rdd.getNumPartitions()

2

In [107]:
# read the df_students.csv output back
df_re_students = spark.read.options(header=True).csv('data/df_students.csv/')
display(df_re_students.count())
# count df_re_students by course
df_re_students.groupBy('course').count().show()

1000

+------+-----+
|course|count|
+------+-----+
|    PF|  166|
|    DB|  157|
|   MVC|  157|
|   DSA|  176|
| Cloud|  192|
|   OOP|  152|
+------+-----+



In [108]:
'''
The write modes
- append
- overwrite
- error
- errorifexists
- ignore
'''
df_students.write.csv('data/df_students.csv', header=True, mode='overwrite')