In [5]:
from pyspark.sql import SparkSession

# Create (or reuse) the SparkSession for this notebook.
spark = (
    SparkSession.builder
    .appName("KrishnaSparkSession")
    .getOrCreate()
)

# Confirm Spark is working. A bare `spark` expression is only rendered when it
# is the last line of a cell, so report the running version explicitly.
print(f"Spark version: {spark.version}")

file_path = "Files/employees.csv"

# Load CSV into Spark DataFrame: the first row is treated as the header and
# column types are inferred from the data.
df = spark.read.csv(file_path, header=True, inferSchema=True)

# Show top records
df.show()


+----------+-------+---+----------+------+
|EmployeeID|   Name|Age|Department|Salary|
+----------+-------+---+----------+------+
|       101|   Anil| 29|        IT| 60000|
|       102|  Sneha| 32|        HR| 58000|
|       103|  Rahul| 28|   Finance| 62000|
|       104|  Priya| 35|        IT| 75000|
|       105|Karthik| 30| Marketing| 50000|
+----------+-------+---+----------+------+



In [6]:
def display(spark_df, n=20):
    """
    Preview a Spark DataFrame as a pandas DataFrame so Jupyter renders it
    as a table (intended for small datasets only).

    Note: this definition shadows IPython's built-in ``display`` for the
    rest of the notebook session.

    Args:
        spark_df: Spark DataFrame to preview.
        n: Number of rows to display (default=20).

    Returns:
        The result of ``toPandas()`` on the DataFrame limited to ``n`` rows.
    """
    # Limit first so that only the previewed rows are converted to pandas.
    preview = spark_df.limit(n)
    return preview.toPandas()

In [7]:
# Preview df with the notebook's own display() helper (defined in the cell
# above); with the default n=20 it returns up to 20 rows as a pandas table.
display(df)

Unnamed: 0,EmployeeID,Name,Age,Department,Salary
0,101,Anil,29,IT,60000
1,102,Sneha,32,HR,58000
2,103,Rahul,28,Finance,62000
3,104,Priya,35,IT,75000
4,105,Karthik,30,Marketing,50000


In [8]:
 # Install ydata-profiling, which the profiling cell further down imports.
 # pip notes that a kernel restart may be needed before the package is usable.
 %pip install ydata-profiling

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [9]:
# 3. Sample the DataFrame: draw roughly `approx_sample_size` rows at random,
#    then cap the result at `approx_sample_size` rows.
approx_sample_size = 5
total_rows = df.count()

# The fraction is the share of rows needed to reach the target size, capped at
# 1.0 (keep everything) when the DataFrame has fewer rows than the target.
# An empty DataFrame would otherwise raise ZeroDivisionError here, so fall back
# to 1.0 -- sampling zero rows yields zero rows regardless of the fraction.
if total_rows > 0:
    sample_fraction = min(1.0, approx_sample_size / total_rows)
else:
    sample_fraction = 1.0

# NOTE: sample() is probabilistic, so the result is not guaranteed to contain
# exactly `fraction * total_rows` rows; it may hold fewer than
# `approx_sample_size`. The fixed seed keeps the draw repeatable across runs.
sampled_df = df.sample(withReplacement=False, fraction=sample_fraction, seed=42)

# Cap the sample in case the random draw returned more rows than requested.
sampled_df = sampled_df.limit(approx_sample_size)

In [10]:
# 4. Convert to pandas DataFrame
# sampled_df was capped with limit(approx_sample_size) in the sampling cell,
# so at most that many rows are converted here.
pandas_df = sampled_df.toPandas()

In [11]:
# 5. Profile with ydata-profiling
# (requires the ydata-profiling package installed by the %pip cell above)
from ydata_profiling import ProfileReport

# Build the report from the sampled pandas DataFrame using the library's
# "minimal" mode, then write it as HTML to output.html (a relative path, so it
# lands in the notebook's current working directory).
profile = ProfileReport(pandas_df, minimal=True)
profile.to_file("output.html")

# 6. To display the report in Databricks notebook (optional)
# displayHTML is left commented out; enable it only where that function exists.
# displayHTML(open("output.html", "r").read())

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 174.11it/s][A


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]