In [6]:
!pip install pyspark



In [7]:
from pyspark.sql import SparkSession

In [8]:
# Start a Spark session named 'Pyspark', or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('Pyspark')
    .getOrCreate()
)

In [48]:
# Display the SparkSession object to confirm the session is available.
spark

In [12]:
# Load the CSV: the first line is the header, and column types are inferred
# from the data instead of defaulting every column to string.
df = spark.read.csv(
    "/content/data_m1.csv",
    header=True,
    inferSchema=True,
)

In [13]:
# Print the loaded DataFrame as a text table.
df.show()

+-------+-----+-------------+
|   Name|Marks|       Grades|
+-------+-----+-------------+
|Priyang|   98|not available|
| Aadhya|   89|           AB|
| Krisha|   99|           AA|
| Vedant|   87|           NA|
| Parshv|   90|           AC|
| Mittal|   36|           BA|
|Archana|   82|           BB|
+-------+-----+-------------+



## ***1. Display Top 3 Rows of the Dataset***

In [14]:
# Print only the first 3 rows.
df.show(3)

+-------+-----+-------------+
|   Name|Marks|       Grades|
+-------+-----+-------------+
|Priyang|   98|not available|
| Aadhya|   89|           AB|
| Krisha|   99|           AA|
+-------+-----+-------------+
only showing top 3 rows



## ***2. Display DataTypes of Each column***

In [15]:
# Print each column's name, data type and nullability as a tree.
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Marks: integer (nullable = true)
 |-- Grades: string (nullable = true)



## ***3. Display Column Names***

In [16]:
# Column names as a plain Python list.
df.columns

['Name', 'Marks', 'Grades']

## ***4. Count Number of Rows and Columns of the Dataset***

In [17]:
# Number of rows in the DataFrame.
df.count() # 7 rows for this dataset

7

In [18]:
# Number of columns: the length of the column-name list.
len(df.columns)

3

## ***5. Get Overall Statistics About the Dataset***

In [19]:
# Summary statistics (count, mean, stddev, min, max) for every column.
# mean/stddev are NULL for the string columns; their min/max are lexicographic.
df.describe().show()

+-------+------+------------------+-------------+
|summary|  Name|             Marks|       Grades|
+-------+------+------------------+-------------+
|  count|     7|                 7|            7|
|   mean|  NULL|              83.0|         NULL|
| stddev|  NULL|21.571586249817912|         NULL|
|    min|Aadhya|                36|           AA|
|    max|Vedant|                99|not available|
+-------+------+------------------+-------------+



## ***6. Find Unique Values Available in the Grades column***

In [20]:
# Re-check the column names to locate the Grades column.
df.columns

['Name', 'Marks', 'Grades']

In [21]:

# Convert to a pandas DataFrame for display.
# NOTE: toPandas() collects every row to the driver; fine for this tiny
# dataset, but avoid it on large data.
df.toPandas()

Unnamed: 0,Name,Marks,Grades
0,Priyang,98,not available
1,Aadhya,89,AB
2,Krisha,99,AA
3,Vedant,87,
4,Parshv,90,AC
5,Mittal,36,BA
6,Archana,82,BB


In [22]:

# Distinct Grades values, computed in pandas after collecting the whole
# DataFrame to the driver. A Spark-native alternative that avoids the
# collect is df.select('Grades').distinct().
df.toPandas()['Grades'].unique()

array(['not available', 'AB', 'AA', 'NA', 'AC', 'BA', 'BB'], dtype=object)

## ***7. How to Select Single Column?***

In [23]:

# select() returns a new DataFrame containing only the requested column.
df.select('Name').show()

+-------+
|   Name|
+-------+
|Priyang|
| Aadhya|
| Krisha|
| Vedant|
| Parshv|
| Mittal|
|Archana|
+-------+



## ***8. How to Select Multiple Columns?***

In [24]:
# Project two columns; select() accepts the names as separate arguments.
df.select('Name', 'Grades').show()

+-------+-------------+
|   Name|       Grades|
+-------+-------------+
|Priyang|not available|
| Aadhya|           AB|
| Krisha|           AA|
| Vedant|           NA|
| Parshv|           AC|
| Mittal|           BA|
|Archana|           BB|
+-------+-------------+



## ***9. Create New Column With Marks + 1 And Also Update Existing DataFrame***

In [26]:
# Add a derived column New_Marks = Marks + 1 and rebind df to the result
# (DataFrames are immutable; withColumn returns a new one).
df = df.withColumn('New_Marks', df['Marks'] + 1)

In [27]:
# Verify that the New_Marks column was added.
df.show()

+-------+-----+-------------+---------+
|   Name|Marks|       Grades|New_Marks|
+-------+-----+-------------+---------+
|Priyang|   98|not available|       99|
| Aadhya|   89|           AB|       90|
| Krisha|   99|           AA|      100|
| Vedant|   87|           NA|       88|
| Parshv|   90|           AC|       91|
| Mittal|   36|           BA|       37|
|Archana|   82|           BB|       83|
+-------+-----+-------------+---------+



## ***10. Rename Name Column And Give New Name "Student_Name"***

In [28]:
# Column names before the rename.
df.columns

['Name', 'Marks', 'Grades', 'New_Marks']

In [29]:
# Rename the Name column to Student_Name and rebind df to the result.
df = df.withColumnRenamed(existing='Name', new='Student_Name')

In [30]:
# Verify that the column is now called Student_Name.
df.show()

+------------+-----+-------------+---------+
|Student_Name|Marks|       Grades|New_Marks|
+------------+-----+-------------+---------+
|     Priyang|   98|not available|       99|
|      Aadhya|   89|           AB|       90|
|      Krisha|   99|           AA|      100|
|      Vedant|   87|           NA|       88|
|      Parshv|   90|           AC|       91|
|      Mittal|   36|           BA|       37|
|     Archana|   82|           BB|       83|
+------------+-----+-------------+---------+



## ***11. Display Name of The Students Having Marks Greater Than 90***

In [34]:
# Keep only the rows whose Marks are strictly greater than 90 (all columns).
df.where(df.Marks > 90).show()

+------------+-----+-------------+---------+
|Student_Name|Marks|       Grades|New_Marks|
+------------+-----+-------------+---------+
|     Priyang|   98|not available|       99|
|      Krisha|   99|           AA|      100|
+------------+-----+-------------+---------+



In [36]:
# Same filter, written as a SQL expression string, then keep only the names.
df.filter('Marks > 90').select('Student_Name').show()

+------------+
|Student_Name|
+------------+
|     Priyang|
|      Krisha|
+------------+



## ***12. Display Name and Grades Columns for Students Having Marks Greater Than 90***

In [38]:
# Rows with Marks strictly above 90, projected to the name and grade columns.
df.where(df.Marks > 90).select('Student_Name', 'Grades').show()

+------------+-------------+
|Student_Name|       Grades|
+------------+-------------+
|     Priyang|not available|
|      Krisha|           AA|
+------------+-------------+



## ***13. Sort the Rows of the Dataset by Marks in Descending Order***

In [39]:
# Column names, to pick the column to sort by.
df.columns

['Student_Name', 'Marks', 'Grades', 'New_Marks']

In [40]:
# For comparison: orderBy sorts ascending by default (lowest Marks first).
df.orderBy('Marks').show()

+------------+-----+-------------+---------+
|Student_Name|Marks|       Grades|New_Marks|
+------------+-----+-------------+---------+
|      Mittal|   36|           BA|       37|
|     Archana|   82|           BB|       83|
|      Vedant|   87|           NA|       88|
|      Aadhya|   89|           AB|       90|
|      Parshv|   90|           AC|       91|
|     Priyang|   98|not available|       99|
|      Krisha|   99|           AA|      100|
+------------+-----+-------------+---------+



In [41]:
# Descending sort on Marks (highest first); sort() is an alias of orderBy().
df.sort(df.Marks.desc()).show()

+------------+-----+-------------+---------+
|Student_Name|Marks|       Grades|New_Marks|
+------------+-----+-------------+---------+
|      Krisha|   99|           AA|      100|
|     Priyang|   98|not available|       99|
|      Parshv|   90|           AC|       91|
|      Aadhya|   89|           AB|       90|
|      Vedant|   87|           NA|       88|
|     Archana|   82|           BB|       83|
|      Mittal|   36|           BA|       37|
+------------+-----+-------------+---------+



## ***14. Handling Missing Values***

In [42]:
# Inspect the data before cleaning: Grades contains the placeholder
# strings 'not available' and 'NA' rather than real NULLs.
df.show()

+------------+-----+-------------+---------+
|Student_Name|Marks|       Grades|New_Marks|
+------------+-----+-------------+---------+
|     Priyang|   98|not available|       99|
|      Aadhya|   89|           AB|       90|
|      Krisha|   99|           AA|      100|
|      Vedant|   87|           NA|       88|
|      Parshv|   90|           AC|       91|
|      Mittal|   36|           BA|       37|
|     Archana|   82|           BB|       83|
+------------+-----+-------------+---------+



In [46]:
from pyspark.sql import functions as F

# Turn the placeholder strings in Grades into real NULLs so that Spark's
# missing-value helpers (fillna / dropna) can act on them.
df_cleaned = df.replace(
    to_replace=['NA', 'not available'],
    value=None,
    subset=['Grades'],
)

# Display the cleaned DataFrame
df_cleaned.show()


+------------+-----+------+---------+
|Student_Name|Marks|Grades|New_Marks|
+------------+-----+------+---------+
|     Priyang|   98|  NULL|       99|
|      Aadhya|   89|    AB|       90|
|      Krisha|   99|    AA|      100|
|      Vedant|   87|  NULL|       88|
|      Parshv|   90|    AC|       91|
|      Mittal|   36|    BA|       37|
|     Archana|   82|    BB|       83|
+------------+-----+------+---------+



In [47]:
# Fill NULL values in the 'Grades' column with 'Unknown'
df_filled = df.fillna(value='Unknown', subset=['Grades'])

# Show the DataFrame after filling NULL values
df_filled.show()


+------------+-----+-------------+---------+
|Student_Name|Marks|       Grades|New_Marks|
+------------+-----+-------------+---------+
|     Priyang|   98|not available|       99|
|      Aadhya|   89|           AB|       90|
|      Krisha|   99|           AA|      100|
|      Vedant|   87|           NA|       88|
|      Parshv|   90|           AC|       91|
|      Mittal|   36|           BA|       37|
|     Archana|   82|           BB|       83|
+------------+-----+-------------+---------+

