In [1]:
# Initialise findspark before the pyspark imports in the cells below.
# NOTE(review): findspark.init() is expected to locate the local Spark
# installation and make `pyspark` importable — confirm SPARK_HOME is set
# in the environment this notebook runs in.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Obtain the notebook's Spark session under the application name
# 'DataFrameQuiz' (getOrCreate returns an existing session if there is one).
spark = (
    SparkSession.builder
    .appName('DataFrameQuiz')
    .getOrCreate()
)

In [4]:
# Load the student CSV: the first row is treated as the header and column
# types are inferred from the data.
df = (
    spark.read
    .options(header='True', inferSchema='True')
    .csv('data/Student_Data.csv')
)

# Show the inferred schema, then a preview of the rows.
df.printSchema()
df.show()

root
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- name: string (nullable = true)
 |-- course: string (nullable = true)
 |-- roll: integer (nullable = true)
 |-- marks: integer (nullable = true)
 |-- email: string (nullable = true)

+---+------+----------------+------+------+-----+--------------------+
|age|gender|            name|course|  roll|marks|               email|
+---+------+----------------+------+------+-----+--------------------+
| 28|Female| Hubert Oliveras|    DB|  2984|   59|Annika Hoffman_Na...|
| 29|Female|Toshiko Hillyard| Cloud| 12899|   62|Margene Moores_Ma...|
| 28|  Male|  Celeste Lollis|    PF| 21267|   45|Jeannetta Golden_...|
| 29|Female|    Elenore Choy|    DB| 32877|   29|Billi Clore_Mitzi...|
| 28|  Male|  Sheryll Towler|   DSA| 41487|   41|Claude Panos_Judi...|
| 28|  Male|  Margene Moores|   MVC| 52771|   32|Toshiko Hillyard_...|
| 28|  Male|     Neda Briski|   OOP| 61973|   69|Alberta Freund_El...|
| 28|Female|    Claude P

#### Adding New Column - Total_Marks

In [5]:
from pyspark.sql.functions import col, lit

# Attach a constant column holding the maximum attainable marks (120)
# to every row.
df = df.withColumn(colName='total_marks', col=lit(120))

# Confirm the new column in the schema and in the data preview.
df.printSchema()
df.show()

root
 |-- age: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- name: string (nullable = true)
 |-- course: string (nullable = true)
 |-- roll: integer (nullable = true)
 |-- marks: integer (nullable = true)
 |-- email: string (nullable = true)
 |-- total_marks: integer (nullable = false)

+---+------+----------------+------+------+-----+--------------------+-----------+
|age|gender|            name|course|  roll|marks|               email|total_marks|
+---+------+----------------+------+------+-----+--------------------+-----------+
| 28|Female| Hubert Oliveras|    DB|  2984|   59|Annika Hoffman_Na...|        120|
| 29|Female|Toshiko Hillyard| Cloud| 12899|   62|Margene Moores_Ma...|        120|
| 28|  Male|  Celeste Lollis|    PF| 21267|   45|Jeannetta Golden_...|        120|
| 29|Female|    Elenore Choy|    DB| 32877|   29|Billi Clore_Mitzi...|        120|
| 28|  Male|  Sheryll Towler|   DSA| 41487|   41|Claude Panos_Judi...|        120|
| 28|  Male|  Margene Moo

#### Adding New Column - Percentage

In [7]:
# Import Spark's rounding function under an alias so it does not shadow
# Python's builtin round() for the rest of the notebook.
from pyspark.sql.functions import col, round as spark_round

# percentage = marks / total_marks * 100, rounded to 2 decimal places.
# Dividing by the total_marks column (instead of repeating the literal 120)
# keeps this calculation in sync with the column added in the previous step.
df = df.withColumn(
    'percentage',
    spark_round((col('marks') / col('total_marks')) * 100, 2),
)
df.show()

+---+------+----------------+------+------+-----+--------------------+-----------+----------+
|age|gender|            name|course|  roll|marks|               email|total_marks|percentage|
+---+------+----------------+------+------+-----+--------------------+-----------+----------+
| 28|Female| Hubert Oliveras|    DB|  2984|   59|Annika Hoffman_Na...|        120|     49.17|
| 29|Female|Toshiko Hillyard| Cloud| 12899|   62|Margene Moores_Ma...|        120|     51.67|
| 28|  Male|  Celeste Lollis|    PF| 21267|   45|Jeannetta Golden_...|        120|      37.5|
| 29|Female|    Elenore Choy|    DB| 32877|   29|Billi Clore_Mitzi...|        120|     24.17|
| 28|  Male|  Sheryll Towler|   DSA| 41487|   41|Claude Panos_Judi...|        120|     34.17|
| 28|  Male|  Margene Moores|   MVC| 52771|   32|Toshiko Hillyard_...|        120|     26.67|
| 28|  Male|     Neda Briski|   OOP| 61973|   69|Alberta Freund_El...|        120|      57.5|
| 28|Female|    Claude Panos| Cloud| 72409|   85|Sheryll Tow

#### OOP students with percentage > 80%

In [10]:
# Keep only OOP students whose percentage is strictly above 80.
above_eighty_pct = df['percentage'] > 80
in_oop_course = df['course'] == 'OOP'

studentsAboveEighty = df.where(above_eighty_pct & in_oop_course)
studentsAboveEighty.show()

+---+------+------------------+------+-------+-----+--------------------+-----------+----------+
|age|gender|              name|course|   roll|marks|               email|total_marks|percentage|
+---+------+------------------+------+-------+-----+--------------------+-----------+----------+
| 28|  Male|    Jenna Montague|   OOP|3331161|   98|Leontine Phillips...|        120|     81.67|
| 29|Female|Priscila Tavernier|   OOP|3902993|   99|Celeste Lollis_Bi...|        120|      82.5|
| 28|Female|      Judie Chipps|   OOP|5451977|   99|Tamera Blakley_Mi...|        120|      82.5|
| 29|  Male|    Margene Moores|   OOP|5621072|   97|Sheryll Towler_Ma...|        120|     80.83|
| 29|  Male|      Jc Andrepont|   OOP|8022618|   97|Cordie Harnois_Ja...|        120|     80.83|
| 28|  Male|    Loris Crossett|   OOP|8172914|   98|Paris Hutton_Pari...|        120|     81.67|
| 28|  Male|    Loris Crossett|   OOP|9692316|   99|Judie Chipps_Mich...|        120|      82.5|
+---+------+------------------

#### Cloud students with percentage above 60%

In [11]:
# Keep only Cloud students whose percentage is strictly above 60.
above_sixty_pct = df['percentage'] > 60
in_cloud_course = df['course'] == 'Cloud'

studentsAboveSixty = df.where(above_sixty_pct & in_cloud_course)
studentsAboveSixty.show()

+---+------+-----------------+------+-------+-----+--------------------+-----------+----------+
|age|gender|             name|course|   roll|marks|               email|total_marks|percentage|
+---+------+-----------------+------+-------+-----+--------------------+-----------+----------+
| 28|Female|     Claude Panos| Cloud|  72409|   85|Sheryll Towler_Al...|        120|     70.83|
| 29|  Male|      Billi Clore| Cloud| 512047|   76|Taryn Brownlee_Ju...|        120|     63.33|
| 28|Female|   Somer Stoecker| Cloud| 612490|   82|Sebrina Maresca_G...|        120|     68.33|
| 29|Female|     Judie Chipps| Cloud| 632793|   75|Tijuana Kropf_Ele...|        120|      62.5|
| 29|Female|     Eda Neathery| Cloud|1011971|   91|Margene Moores_El...|        120|     75.83|
| 28|  Male|   Bonita Higuera| Cloud|1312294|   94|Eda Neathery_Pris...|        120|     78.33|
| 29|Female|  Hubert Oliveras| Cloud|1392791|   94|Anna Santos_Alber...|        120|     78.33|
| 28|Female|      Neda Briski| Cloud|165

In [12]:
# Display just the name and marks of the students in studentsAboveEighty.
(
    studentsAboveEighty
    .select(['name', 'marks'])
    .show()
)

+------------------+-----+
|              name|marks|
+------------------+-----+
|    Jenna Montague|   98|
|Priscila Tavernier|   99|
|      Judie Chipps|   99|
|    Margene Moores|   97|
|      Jc Andrepont|   97|
|    Loris Crossett|   98|
|    Loris Crossett|   99|
+------------------+-----+



In [14]:
# Display just the name and marks of the students in studentsAboveSixty.
# Columns are selected by name on the filtered frame itself rather than via
# `df.name` / `df.marks`: references bound to the parent DataFrame only work
# by lineage and become fragile if `df` is reassigned or joined. This also
# matches how the studentsAboveEighty selection is written.
studentsAboveSixty.select('name', 'marks').show()

+-----------------+-----+
|             name|marks|
+-----------------+-----+
|     Claude Panos|   85|
|      Billi Clore|   76|
|   Somer Stoecker|   82|
|     Judie Chipps|   75|
|     Eda Neathery|   91|
|   Bonita Higuera|   94|
|  Hubert Oliveras|   94|
|      Neda Briski|   74|
|   Melani Engberg|   99|
|     Paris Hutton|   79|
|     Eda Neathery|   95|
|      Neda Briski|   81|
|    Tijuana Kropf|   78|
|   Jenna Montague|   96|
|   Dustin Feagins|   89|
|  Ernest Rossbach|   83|
|Leontine Phillips|   76|
|  Sebrina Maresca|   97|
| Clementina Menke|   95|
|    Kizzy Brenner|   80|
+-----------------+-----+
only showing top 20 rows



In [15]:
# Stop the Spark session created at the top of the notebook; cells that use
# `spark` or DataFrames derived from it cannot be run after this point
# without re-creating the session.
spark.stop()