### TASK 1: USE PYSPARK TO CLEAN AND PREPROCESS A LARGE DATASET, HANDLING MISSING VALUES AND DUPLICATES.
### DELIVERABLE: A PYTHON SCRIPT OR NOTEBOOK SHOWCASING THE DATA CLEANING PROCESS.

In [None]:
# Install PySpark
!pip install pyspark



In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, count, isnan

In [None]:
# Start a Spark session named "DataCleaning", or reuse the one already running.
# getOrCreate() returns the existing session if this cell is executed again.
spark = (
    SparkSession.builder
    .appName("DataCleaning")
    .getOrCreate()
)

In [None]:
# Read the raw CSV into a DataFrame.
#   header=True      -> the first line supplies the column names
#   inferSchema=True -> Spark samples the data to pick a type for each column
# NOTE(review): the schema printed below reports almost every column as string,
# including Longitude/Latitude. Presumably some rows are malformed or contain
# embedded line breaks -- TODO confirm before relying on the inferred types.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/content/Dataset .csv")
)

# Inspect what was loaded: inferred column types, then the first five rows.
df.printSchema()
df.show(5)

root
 |-- Restaurant ID: string (nullable = true)
 |-- Restaurant Name: string (nullable = true)
 |-- Country Code: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Locality: string (nullable = true)
 |-- Locality Verbose: string (nullable = true)
 |-- Longitude: string (nullable = true)
 |-- Latitude: string (nullable = true)
 |-- Cuisines: string (nullable = true)
 |-- Average Cost for two: string (nullable = true)
 |-- Currency: string (nullable = true)
 |-- Has Table booking: string (nullable = true)
 |-- Has Online delivery: string (nullable = true)
 |-- Is delivering now: string (nullable = true)
 |-- Switch to order menu: string (nullable = true)
 |-- Price range: string (nullable = true)
 |-- Aggregate rating: string (nullable = true)
 |-- Rating color: string (nullable = true)
 |-- Rating text: string (nullable = true)
 |-- Votes: integer (nullable = true)

+-------------+--------------------+------------+----------------

In [None]:
# Count of missing values in each column (one output row, one count per column).
#
# NaN only exists for float/double columns, so isnan() is applied to those alone.
# Calling isnan() on every column relies on an implicit cast to double, which is
# meaningless for text columns, can raise under ANSI SQL mode when a string is
# not numeric, and is rejected outright for types such as date/timestamp.
# All other column types are checked for NULL only.
nan_capable_cols = {name for name, dtype in df.dtypes if dtype in ("float", "double")}

df.select([
    count(
        when(
            (col(c).isNull() | isnan(c)) if c in nan_capable_cols else col(c).isNull(),
            c,  # any non-null literal works: count() tallies rows where the condition held
        )
    ).alias(c)
    for c in df.columns
]).show()

+-------------+---------------+------------+----+-------+--------+----------------+---------+--------+--------+--------------------+--------+-----------------+-------------------+-----------------+--------------------+-----------+----------------+------------+-----------+-----+
|Restaurant ID|Restaurant Name|Country Code|City|Address|Locality|Locality Verbose|Longitude|Latitude|Cuisines|Average Cost for two|Currency|Has Table booking|Has Online delivery|Is delivering now|Switch to order menu|Price range|Aggregate rating|Rating color|Rating text|Votes|
+-------------+---------------+------------+----+-------+--------+----------------+---------+--------+--------+--------------------+--------+-----------------+-------------------+-----------------+--------------------+-----------+----------------+------------+-----------+-----+
|            0|              0|           0|   0|      0|       6|               6|        6|       6|      15|                   6|       6|                6|    

In [None]:
# Keep only fully populated rows: a row is discarded as soon as
# any one of its columns is null.
df_clean = df.na.drop(how="any")

In [None]:
# Collapse rows that are identical across every column into a single row.
df_clean = df_clean.distinct()

In [None]:
# Preview the cleaned data: first 10 records.
df_clean.show(10)

# Optionally save the cleaned data as CSV.
# - coalesce(1) merges the data into one partition so a single part file is
#   produced; Spark still creates "cleaned_dataset.csv" as a directory holding it.
# - mode("overwrite") replaces output from a previous run. With the default
#   save mode, re-running this cell fails because the path already exists.
(
    df_clean.coalesce(1)
    .write
    .mode("overwrite")
    .csv("cleaned_dataset.csv", header=True)
)

+-------------+-----------------+------------+---------+--------------------+--------------------+--------------------+-----------+-----------+--------------------+--------------------+------------------+-----------------+-------------------+-----------------+--------------------+-----------+----------------+------------+-----------+-----+
|Restaurant ID|  Restaurant Name|Country Code|     City|             Address|            Locality|    Locality Verbose|  Longitude|   Latitude|            Cuisines|Average Cost for two|          Currency|Has Table booking|Has Online delivery|Is delivering now|Switch to order menu|Price range|Aggregate rating|Rating color|Rating text|Votes|
+-------------+-----------------+------------+---------+--------------------+--------------------+--------------------+-----------+-----------+--------------------+--------------------+------------------+-----------------+-------------------+-----------------+--------------------+-----------+----------------+------