In [1]:
import pandas as pd
import numpy as np
import re
# Load the raw job-postings export.
df = pd.read_csv("Uncleaned_DS_jobs.csv")

# Basic checks: size, a sample of rows, dtypes, and summary statistics.
print("Dataset Shape:", df.shape)
print("\nFirst 5 Rows:")
print(df.head())

print("\nDataset Info:")
df.info()

print("\nBasic Statistics:")
print(df.describe(include="all"))

# The export carries a row-counter column (e.g. "index" / "Unnamed: 0") whose
# value differs on every row. Comparing on it would make every row look
# unique and turn duplicate removal into a no-op, so duplicates are detected
# on the remaining (content) columns only. The counter column itself is kept.
row_id_names = {"index", "unnamed: 0"}
content_cols = [
    c for c in df.columns if str(c).strip().lower() not in row_id_names
]
# Fall back to "all columns" (subset=None) if nothing but a counter is left.
dedupe_subset = content_cols or None

print("\nDuplicate rows before:", df.duplicated(subset=dedupe_subset).sum())

df = df.drop_duplicates(subset=dedupe_subset)

print("Duplicate rows after:", df.duplicated(subset=dedupe_subset).sum())
# Normalize headers: lower-case, trim surrounding whitespace, then turn
# spaces into underscores so columns can be referenced by predictable names.
trimmed_headers = df.columns.str.lower().str.strip()
df.columns = trimmed_headers.str.replace(" ", "_")

print("\nColumns after standardization:")
print(df.columns)

# Columns that carry no value for the cleaned output; each one is dropped
# only if it is actually present in this dataset.
irrelevant_cols = ["job_description_html", "company_profile", "unnamed:_0"]
cols_to_drop = [c for c in irrelevant_cols if c in df.columns]

df = df.drop(columns=cols_to_drop)
print("\nMissing values per column:")
print(df.isna().sum())

# Critical columns: a row missing any of these (among those that exist in
# the dataset) is removed.
critical_cols = [
    c for c in ("job_title", "company_name", "location") if c in df.columns
]
df = df.dropna(subset=critical_cols)

# Every remaining object-dtype column gets the placeholder "unknown" in
# place of missing values.
object_cols = df.select_dtypes(include="object").columns
placeholders = dict.fromkeys(object_cols, "unknown")
if placeholders:
    df = df.fillna(value=placeholders)
# Convert salary columns safely
salary_cols = ["min_salary", "max_salary", "avg_salary"]

for col in salary_cols:
    if col not in df.columns:
        continue

    # Only text columns need scrubbing. Round-tripping an already numeric
    # column through str would turn 154.0 into "154.0" and then, once the
    # dot is stripped, into 1540.
    if not pd.api.types.is_numeric_dtype(df[col]):
        df[col] = (
            df[col]
            .astype(str)
            # Drop currency symbols, separators and unit suffixes, but keep
            # the decimal point so "75.5" stays 75.5 instead of becoming 755.
            .str.replace(r"[^\d.]", "", regex=True)
        )
    # Anything that still is not a number (e.g. "", "unknown") becomes NaN.
    df[col] = pd.to_numeric(df[col], errors="coerce")

# Remove extreme salary outliers using the 1.5 * IQR fences.
if "avg_salary" in df.columns:
    q1 = df["avg_salary"].quantile(0.25)
    q3 = df["avg_salary"].quantile(0.75)
    iqr = q3 - q1

    within_fences = df["avg_salary"].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)
    # A missing salary is not an outlier: comparisons with NaN are False, so
    # without this term every row lacking a salary would be silently dropped.
    salary_missing = df["avg_salary"].isna()

    # .copy() gives an independent frame so later column assignments do not
    # raise SettingWithCopyWarning.
    df = df[within_fences | salary_missing].copy()
if "location" in df.columns:
    df["location"] = df["location"].str.lower().str.strip()

    # Extract city & state
    location_parts = df["location"].str.split(",")
    df["city"] = location_parts.str[0].str.strip()

    # The state is the LAST comma-separated segment, so a three-part value
    # such as "city, county, st" yields "st" rather than the middle segment.
    # A location without any comma has no state component at all.
    has_state = location_parts.str.len() > 1
    state = location_parts.str[-1].str.strip().where(has_state)

    # Use the same "unknown" placeholder the script applies to other missing
    # categorical values, instead of leaving fresh NaNs in the output.
    df["state"] = state.fillna("unknown")
if "job_description" in df.columns:
    # Normalize the free-text description to lower-case ASCII letters
    # separated by single spaces.
    df["job_description"] = (
        df["job_description"]
        .str.lower()
        # Remove HTML tags. Replace with a space (not "") so the words on
        # either side of a tag are not fused together; the negated class also
        # lets a tag span a line break, which "." would not match.
        .str.replace(r"<[^<>]+>", " ", regex=True)
        # Everything that is not a letter or a space becomes a space.
        .str.replace(r"[^a-zA-Z ]", " ", regex=True)
        # Collapse the whitespace runs produced by the two steps above.
        .str.replace(r"\s+", " ", regex=True)
        .str.strip()
    )
if "job_title" in df.columns:
    # Case- and whitespace-normalize titles before matching them.
    lowered_titles = df["job_title"].str.lower()
    df["job_title"] = lowered_titles.str.strip()

    # Exact-match rewrites for known title variants; titles not listed here
    # are left as they are.
    title_map = dict(
        [
            ("data scientist ii", "data scientist"),
            ("sr data scientist", "senior data scientist"),
            ("jr data scientist", "junior data scientist"),
        ]
    )

    df["job_title"] = df["job_title"].replace(to_replace=title_map)
# Final summary of the cleaned frame.
print("\nFinal Dataset Shape:", df.shape)

print("\nFinal Missing Values:")
print(df.isna().sum())

# Sanity check before writing the file. Raised explicitly (same exception
# type and message as before) because a bare `assert` is stripped when
# Python runs with -O, which would silently skip the check.
remaining_duplicates = int(df.duplicated().sum())
if remaining_duplicates:
    raise AssertionError("Duplicates still exist!")

print("\nFinal Dataset Info:")
df.info()

# index=False: do not write the DataFrame's positional index as a column.
df.to_csv("cleaned_ds_job_postings.csv", index=False)

# The emoji below were stored as mis-decoded UTF-8 bytes; restored here.
print("\n✅ Data Science Job Postings cleaned successfully!")
print("📁 Saved as: cleaned_ds_job_postings.csv")


Dataset Shape: (672, 15)

First 5 Rows:
   index          Job Title               Salary Estimate  \
0      0  Sr Data Scientist  $137K-$171K (Glassdoor est.)   
1      1     Data Scientist  $137K-$171K (Glassdoor est.)   
2      2     Data Scientist  $137K-$171K (Glassdoor est.)   
3      3     Data Scientist  $137K-$171K (Glassdoor est.)   
4      4     Data Scientist  $137K-$171K (Glassdoor est.)   

                                     Job Description  Rating  \
0  Description\n\nThe Senior Data Scientist is re...     3.1   
1  Secure our Nation, Ignite your Future\n\nJoin ...     4.2   
2  Overview\n\n\nAnalysis Group is one of the lar...     3.8   
3  JOB DESCRIPTION:\n\nDo you have a passion for ...     3.5   
4  Data Scientist\nAffinity Solutions / Marketing...     2.9   

              Company Name       Location            Headquarters  \
0         Healthfirst\n3.1   New York, NY            New York, NY   
1             ManTech\n4.2  Chantilly, VA             Herndon, VA   
2