In [0]:
from pyspark.sql import SparkSession

# Build (or reuse, via getOrCreate) the Spark session used by every Python cell below
session_builder = SparkSession.builder.appName("DataQualityCheck")
spark = session_builder.getOrCreate()



In [0]:
%sql
-- Preview: up to 10 rows of the raw table
SELECT *
FROM workspace.default.raw_data
LIMIT 10;


In [0]:
%sql
-- Completeness profile of the raw table: total row count plus the number of
-- NULL values in each of the listed columns.
SELECT
    COUNT(*)                           AS total_records,
    SUM(IF(name IS NULL, 1, 0))        AS missing_name,
    SUM(IF(speciality IS NULL, 1, 0))  AS missing_speciality,
    SUM(IF(education IS NULL, 1, 0))   AS missing_education,
    SUM(IF(experience IS NULL, 1, 0))  AS missing_experience,
    SUM(IF(city IS NULL, 1, 0))        AS missing_city,
    SUM(IF(state IS NULL, 1, 0))       AS missing_state,
    SUM(IF(pincode IS NULL, 1, 0))     AS missing_pincode
FROM workspace.default.raw_data;


In [0]:
%sql
-- Duplicate candidates: (name, speciality, education, experience) combinations
-- that occur more than once in the raw table, with their occurrence count.
SELECT
    name,
    speciality,
    education,
    experience,
    COUNT(*) AS count
FROM workspace.default.raw_data
GROUP BY 1, 2, 3, 4
HAVING COUNT(*) > 1;


In [0]:
%sql
-- Rows whose pincode is present but cannot be cast to an integer
SELECT *
FROM workspace.default.raw_data
WHERE pincode IS NOT NULL
  AND TRY_CAST(pincode AS INT) IS NULL;


In [0]:
%sql
-- Rows whose email is missing or does not match the loose pattern '%@%.%'.
-- Written as the negation of "email is present and matches the pattern".
SELECT *
FROM workspace.default.raw_data
WHERE NOT (email IS NOT NULL AND email LIKE '%@%.%');


In [0]:
%sql
-- Replace missing specialities with the placeholder 'Unknown'.
-- NOTE(review): this statement modifies the raw table in place — confirm that is intended.
UPDATE workspace.default.raw_data
   SET speciality = 'Unknown'
 WHERE ISNULL(speciality);


In [0]:
%sql
-- Names that are associated with more than one distinct clinic_name
SELECT
    name,
    COUNT(DISTINCT clinic_name) AS clinic_count
FROM workspace.default.raw_data
GROUP BY 1
HAVING COUNT(DISTINCT clinic_name) > 1;


In [0]:
%sql
-- Rows with no usable phone value: NULL, empty, or only spaces.
-- COALESCE maps a NULL phone to '' so one comparison covers both cases.
SELECT *
FROM workspace.default.raw_data
WHERE COALESCE(TRIM(phone), '') = '';


In [0]:
%sql
-- Build cleaned_data from raw_data, keeping one row per
-- (name, speciality, education, experience) combination: the row with the
-- lowest record_id in each group.
-- `Languages spoken` is renamed to languages_spoken; all other columns keep
-- their names. Because the outer query is SELECT *, the helper column row_num
-- (always 1) is also stored in the resulting table.
CREATE OR REPLACE TABLE workspace.default.cleaned_data AS
WITH numbered AS (
    SELECT
        record_id, url, sheet_doc_name, sheet_speciality,
        name, clinic_name, education, experience, speciality,
        address, mci, passing_year, memberships, fees, timing, awards,
        specializations, full_education, full_experience, services,
        name_ratio, speciality_ratio, clinic_Locationn, doctor_location,
        `Languages spoken` AS languages_spoken,
        clinic__name1, address1, fee1,
        clinic__name2, address2, fee2,
        clinic__name3, address3, fee3,
        phone, alternate_email, email,
        locality, city, pincode, state,
        clinic__name4, address4, fee4,
        ROW_NUMBER() OVER (
            PARTITION BY name, speciality, education, experience
            ORDER BY record_id
        ) AS row_num
    FROM workspace.default.raw_data
)
SELECT *
FROM numbered
WHERE row_num = 1;

In [0]:
%sql
-- Preview: up to 10 rows of the de-duplicated table
SELECT *
FROM workspace.default.cleaned_data
LIMIT 10;

In [0]:
%sql
-- Row counts before and after de-duplication.
-- raw_data is the source table and cleaned_data is the de-duplicated table, so
-- each count is labelled with the table it actually comes from (the raw count
-- was previously labelled 'dedupe'). Both columns are given explicit names.
SELECT 'raw' AS dataset, COUNT(*) AS record_count
FROM workspace.default.raw_data
UNION ALL
SELECT 'cleaned' AS dataset, COUNT(*) AS record_count
FROM workspace.default.cleaned_data;

In [0]:
%sql
-- Replace missing specialities in the cleaned table with the placeholder 'Unknown'
UPDATE workspace.default.cleaned_data
   SET speciality = 'Unknown'
 WHERE ISNULL(speciality);




In [0]:
%sql
-- Replace missing education values with the placeholder 'Not Available'
UPDATE workspace.default.cleaned_data
   SET education = 'Not Available'
 WHERE ISNULL(education);

In [0]:
%sql
-- Remove rows whose email is present but does not match the loose pattern '%@%.%'.
-- Rows with a NULL email are kept.
DELETE FROM workspace.default.cleaned_data
WHERE email NOT LIKE '%@%.%'
  AND email IS NOT NULL;

In [0]:
%sql
-- Keep only the digits 0-9 in every non-NULL phone value
UPDATE workspace.default.cleaned_data
   SET phone = REGEXP_REPLACE(phone, '[^0-9]', '')
 WHERE ISNOTNULL(phone);


In [0]:
%sql
-- Remove rows whose pincode is present but cannot be cast to an integer.
-- Rows with a NULL pincode are kept.
DELETE FROM workspace.default.cleaned_data
WHERE pincode IS NOT NULL
  AND TRY_CAST(pincode AS INT) IS NULL;


In [0]:
%sql
-- Replace missing contact fields with the placeholder 'Unknown'.
-- A field counts as missing when it is NULL, empty, or only spaces
-- (COALESCE turns the NULL case into '' so one comparison covers all three);
-- any other value is written back unchanged.
UPDATE workspace.default.cleaned_data
SET phone           = IF(COALESCE(TRIM(phone), '') = '', 'Unknown', phone),
    email           = IF(COALESCE(TRIM(email), '') = '', 'Unknown', email),
    alternate_email = IF(COALESCE(TRIM(alternate_email), '') = '', 'Unknown', alternate_email);


In [0]:
%sql
-- Number of rows currently in the cleaned table
SELECT COUNT(*)
FROM workspace.default.cleaned_data;

In [0]:
%sql
-- Reduce the free-text experience value to the first number it contains.
-- REGEXP_EXTRACT returns the first run of digits, so a value that holds several
-- numbers, e.g. '5 Years Experience Overall (3 years as specialist)', yields '5'.
-- Stripping every non-digit and keeping the first two characters would glue the
-- numbers together and yield '53' for that input.
-- REGEXP_EXTRACT returns '' when there is no match; NULLIF turns that into NULL
-- so "no experience information" is represented consistently as NULL.
UPDATE workspace.default.cleaned_data
SET experience = NULLIF(REGEXP_EXTRACT(experience, '([0-9]+)', 1), '');






In [0]:
%sql
-- Strip leading and trailing spaces from the main text columns.
-- The assignments are independent of each other; listed alphabetically.
UPDATE workspace.default.cleaned_data
SET address     = TRIM(address),
    city        = TRIM(city),
    clinic_name = TRIM(clinic_name),
    education   = TRIM(education),
    experience  = TRIM(experience),
    name        = TRIM(name),
    speciality  = TRIM(speciality),
    state       = TRIM(state);


In [0]:
%sql
-- Copy the current contents of cleaned_data into final_cleaned_data.
-- NOTE(review): later cells keep updating cleaned_data (phone, pincode,
-- locality/city); those changes are not part of this copy — confirm the cell
-- order is intended.
CREATE OR REPLACE TABLE workspace.default.final_cleaned_data
AS
SELECT *
FROM workspace.default.cleaned_data;


In [0]:
%sql
-- Show every row and column of the cleaned table
SELECT *
FROM workspace.default.cleaned_data;

In [0]:
%sql
-- Set phone to NULL wherever the value has fewer than 10 characters.
-- NOTE(review): the 'Unknown' placeholder written to phone by an earlier cell is
-- 7 characters long, so it is reset to NULL here as well — confirm intended.
UPDATE workspace.default.cleaned_data
   SET phone = NULL
 WHERE LENGTH(phone) <= 9;


In [0]:
%sql
-- Check: list any phone values that still have fewer than 10 characters
SELECT phone
FROM workspace.default.cleaned_data
WHERE LENGTH(phone) <= 9;


In [0]:
%sql
-- Set pincode to NULL wherever the value has fewer than 6 characters
UPDATE workspace.default.cleaned_data
   SET pincode = NULL
 WHERE LENGTH(pincode) <= 5;


In [0]:
%sql
-- Number of rows per city, largest first
SELECT
    city,
    COUNT(*) AS doctor_count
FROM workspace.default.cleaned_data
GROUP BY 1
ORDER BY doctor_count DESC;


In [0]:
%sql
-- Title-case locality and derive city from it.
-- City candidate: the text before the first comma when locality contains a
-- comma, otherwise the first space-separated word of locality.
-- Fixes over the plain overwrite:
--   * the candidate is TRIMmed, so 'Delhi , India' gives 'Delhi' rather than 'Delhi ',
--     and a leading space in locality no longer produces an empty first word;
--   * when locality is NULL/blank (candidate NULL or ''), the existing city is
--     kept instead of being overwritten with NULL or ''.
-- All references to locality on the right-hand side see the pre-update value.
UPDATE workspace.default.cleaned_data
SET locality = INITCAP(locality),  -- INITCAP: first letter of each word upper-case, the rest lower-case
    city = COALESCE(
               NULLIF(
                   INITCAP(TRIM(
                       CASE
                           WHEN locality LIKE '%,%'
                           THEN SPLIT(locality, ',')[0]        -- text before the first comma
                           ELSE SPLIT(TRIM(locality), ' ')[0]  -- first word when there is no comma
                       END
                   )),
                   ''
               ),
               city
           );




In [0]:
%sql
-- Show every row and column of the cleaned table after the locality/city update
SELECT *
FROM workspace.default.cleaned_data;

In [0]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Row counts per (city, speciality) pair, collected into a pandas DataFrame
df = spark.sql("""
    SELECT city, speciality, COUNT(*) as doctor_count
    FROM workspace.default.cleaned_data
    GROUP BY city, speciality
""").toPandas()

# Reshape to a grid: one row per city, one column per speciality
df_pivot = df.pivot(index="city", columns="speciality", values="doctor_count")

# Annotated heatmap of the grid, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(df_pivot, cmap="coolwarm", annot=True, fmt=".0f", linewidths=0.5, ax=ax)
ax.set_title("Doctor Distribution by City and Specialization")
ax.set_xlabel("Specialization")
ax.set_ylabel("City")
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()


In [0]:
# Row counts per city, collected into a pandas DataFrame
city_counts_sql = """
    SELECT city, COUNT(*) as doctor_count
    FROM workspace.default.cleaned_data
    GROUP BY city
"""
df = spark.sql(city_counts_sql).toPandas()

# One bar per city
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=df, x="city", y="doctor_count", palette="viridis", ax=ax)
plt.setp(ax.get_xticklabels(), rotation=45)
ax.set_xlabel("City")
ax.set_ylabel("Number of Doctors")
ax.set_title("Distribution of Doctors by City")
plt.show()


In [0]:
# Row counts per (city, speciality) pair
city_speciality_sql = """
    SELECT city, speciality, COUNT(*) as doctor_count
    FROM workspace.default.cleaned_data
    GROUP BY city, speciality
"""
df = spark.sql(city_speciality_sql).toPandas()

# Cities as rows, specialities as columns
df_pivot = df.pivot(index="city", columns="speciality", values="doctor_count")

# Heatmap styling collected in one place
heatmap_options = dict(cmap="coolwarm", annot=True, fmt=".0f", linewidths=0.5)

plt.figure(figsize=(12, 8))
sns.heatmap(df_pivot, **heatmap_options)
plt.xticks(rotation=45)
plt.ylabel("City")
plt.xlabel("Specialization")
plt.title("Doctor Distribution by City and Specialization")
plt.show()


In [0]:
# Row counts per speciality, collected into a pandas DataFrame
speciality_counts_sql = """
    SELECT speciality, COUNT(*) as doctor_count
    FROM workspace.default.cleaned_data
    GROUP BY speciality
"""
df = spark.sql(speciality_counts_sql).toPandas()

# Pie chart: one wedge per speciality, labelled with its percentage share
fig, ax = plt.subplots(figsize=(10, 10))
ax.pie(
    df["doctor_count"],
    labels=df["speciality"],
    autopct="%1.1f%%",
    colors=sns.color_palette("pastel"),
)
ax.set_title("Distribution of Doctors by Specialization")
plt.show()


In [0]:
# Row counts per state, collected into a pandas DataFrame
state_counts_sql = """
    SELECT state, COUNT(*) as doctor_count
    FROM workspace.default.cleaned_data
    GROUP BY state
"""
df = spark.sql(state_counts_sql).toPandas()

# Pie chart: one wedge per state, labelled with its percentage share
fig, ax = plt.subplots(figsize=(10, 10))
ax.pie(
    df["doctor_count"],
    labels=df["state"],
    autopct="%1.1f%%",
    colors=sns.color_palette("coolwarm"),
)
ax.set_title("Distribution of Doctors by State")
plt.show()


In [0]:
%sql
-- Inspect the experience column of the cleaned table
SELECT experience
FROM workspace.default.cleaned_data;

In [0]:
%sql
-- Row counts per (speciality, experience) combination
SELECT
    COUNT(*),
    speciality,
    experience
FROM workspace.default.cleaned_data
GROUP BY speciality, experience;

In [0]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Row counts per (speciality, years of experience).
# experience is cast to INT in SQL so the x axis is ordered numerically; when it
# is left as text the bars appear in whatever order Spark returns the groups.
# NOTE(review): assumes experience is stored as a string (earlier cells apply
# REGEXP_REPLACE/TRIM to it) — confirm.
# Values that are not a valid integer (including '') become NULL under TRY_CAST
# and are excluded, which also keeps the pandas column integer-typed.
df = spark.sql("""
    SELECT COUNT(*) AS doctor_count, speciality, TRY_CAST(experience AS INT) AS experience
    FROM workspace.default.cleaned_data
    WHERE TRY_CAST(experience AS INT) IS NOT NULL
    GROUP BY speciality, TRY_CAST(experience AS INT)
""").toPandas()

# Grouped bar chart: one group per experience value, one bar per speciality
plt.figure(figsize=(14, 8))
sns.barplot(data=df, x="experience", y="doctor_count", hue="speciality", dodge=True)

plt.title("Doctor Count by Speciality and Experience")
plt.xlabel("Years of Experience")
plt.ylabel("Number of Doctors")
plt.xticks(rotation=45)
plt.legend(title="Speciality")
plt.show()


In [0]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Row counts per (speciality, years of experience).
# experience is cast to INT in SQL so the heatmap rows sort numerically
# (1, 2, ..., 10); as text the pivot index sorts as '1', '10', '2', ...
# NOTE(review): assumes experience is stored as a string (earlier cells apply
# REGEXP_REPLACE/TRIM to it) — confirm.
# TRY_CAST yields NULL for values that are not a valid integer (including ''),
# so the NULL filter also removes empty strings that "IS NOT NULL" let through.
# Grouping by the cast value keeps (experience, speciality) pairs unique, which
# DataFrame.pivot requires.
df = spark.sql("""
    SELECT COUNT(*) AS doctor_count, speciality, TRY_CAST(experience AS INT) AS experience
    FROM workspace.default.cleaned_data
    WHERE TRY_CAST(experience AS INT) IS NOT NULL AND speciality IS NOT NULL
    GROUP BY speciality, TRY_CAST(experience AS INT)
""").toPandas()

# Experience values as rows, specialities as columns
df_pivot = df.pivot(index="experience", columns="speciality", values="doctor_count")

# Annotated heatmap of the counts
plt.figure(figsize=(14, 8))
sns.heatmap(df_pivot, cmap="coolwarm", annot=True, fmt=".0f", linewidths=0.5)

plt.title("Doctor Count by Speciality and Experience")
plt.xlabel("Speciality")
plt.ylabel("Years of Experience")
plt.xticks(rotation=45)
plt.yticks(rotation=0)
plt.show()


In [0]:
%python
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load data from Databricks table
df = spark.sql("""
    SELECT city, speciality, experience, COUNT(*) as doctor_count
    FROM workspace.default.cleaned_data
    WHERE city IS NOT NULL AND speciality IS NOT NULL AND experience IS NOT NULL
    GROUP BY city, speciality, experience
    HAVING COUNT(*) > 0
""").toPandas()

# Pivot for Heatmap
df_pivot = df.pivot_table(index=["city", "experience"], columns="speciality", values="doctor_count", aggfunc="sum")

# Check if the pivot table is empty
if not df_pivot.empty:
    # Plot Heatmap
    plt.figure(figsize=(16, 10))
    sns.heatmap(df_pivot, cmap="coolwarm", annot=True, fmt=".0f", linewidths=0.5)

    plt.title("Doctor Count by City, Speciality, and Experience")
    plt.xlabel("Speciality")
    plt.ylabel("City & Experience")
    plt.xticks(rotation=45)
    plt.yticks(rotation=0)
    plt.show()
else:
    print("No data available to plot the heatmap.")

In [0]:
%sql
-- Row counts per (city, speciality, experience), restricted to rows where all
-- three columns are non-NULL
SELECT
    city,
    speciality,
    experience,
    COUNT(*) AS doctor_count
FROM workspace.default.cleaned_data
WHERE city IS NOT NULL
  AND speciality IS NOT NULL
  AND experience IS NOT NULL
GROUP BY city, speciality, experience
HAVING COUNT(*) > 0;