In [5]:
import pandas as pd
import numpy as np

# Read the raw job-postings CSV into a DataFrame.
# The path is relative, so it resolves against the kernel's working directory.
file_path = "dataset/job_descriptions.csv"
df = pd.read_csv(filepath_or_buffer=file_path)



In [7]:
# Print the column labels of df as they currently stand, to check their
# spelling, casing and spacing.
print(df.columns)




Index(['job id', 'experience', 'qualifications', 'salary range', 'Location',
       'country', 'Latitude', 'Longitude', 'work type', 'company size',
       'job posting date', 'preference', 'contact person', 'contact',
       'job title', 'role', 'job portal', 'job description', 'benefits',
       'Skills', 'responsibilities', 'Company Name', 'company profile'],
      dtype='object')


In [8]:
# --- Column names -----------------------------------------------------------
# Normalize headers: trim surrounding whitespace and lowercase, so that e.g.
# 'Location', 'Skills' and 'Company Name' become 'location', 'skills' and
# 'company name'. No further renaming is needed after this step.
df.columns = df.columns.str.strip().str.lower()

# --- Missing values ---------------------------------------------------------
# Replace NaN in selected text columns with an explicit placeholder.
# Columns not listed here (e.g. 'job title') keep their NaN values.
df = df.fillna({
    'experience': 'Not Specified',
    'qualifications': 'Not Specified',
    'salary range': 'Not Specified',
    'location': 'Unknown',
    'company name': 'Unknown',
    'work type': 'Unknown',
    'job description': 'Not Available',
    'skills': 'None',
    'responsibilities': 'Not Available'
})

# --- Text normalization -----------------------------------------------------
# Title-case job titles and company names.
# NOTE(review): str.title() lowercases everything after the first letter of
# each word, so acronyms change too ("IBM" -> "Ibm"); confirm that is wanted.
df['job title'] = df['job title'].str.title()
df['company name'] = df['company name'].str.title()

# --- Salary range -----------------------------------------------------------
# Convert "$59K-$99K" -> "59000-99000".
# Step 1 drops '$' and thousands separators. Step 2 expands a 'K' that directly
# follows a digit into '000'. Simply deleting the 'K' would give "59-99" and
# silently lose the thousands multiplier. The look-behind keeps placeholder
# text such as 'Not Specified' and any other non-numeric 'K' untouched.
df['salary range'] = (
    df['salary range']
    .str.replace(r'[\$,]', '', regex=True)
    .str.replace(r'(?<=\d)K', '000', regex=True)
)

# --- Posting date -----------------------------------------------------------
# Parse to datetime; unparseable values become NaT instead of raising.
df['job posting date'] = pd.to_datetime(df['job posting date'], errors='coerce')

# --- Experience bounds ------------------------------------------------------
# Pull the two integers out of strings shaped like "<min> to <max> ...".
# Rows that do not match (including the 'Not Specified' placeholder) yield NaN
# from the extraction and end up as 0 after the fill below.
df[['min experience', 'max experience']] = df['experience'].str.extract(r'(\d+)\s*to\s*(\d+)')
df['min experience'] = pd.to_numeric(df['min experience'], errors='coerce').fillna(0)
df['max experience'] = pd.to_numeric(df['max experience'], errors='coerce').fillna(0)

In [9]:
# Write the cleaned frame to a new CSV under dataset/.
# index=False keeps the DataFrame's row index out of the written file.
cleaned_file_path = "dataset/job_descriptions_cleaned.csv"
df.to_csv(path_or_buf=cleaned_file_path, index=False)

print(f"✅ Dataset cleaned and saved to {cleaned_file_path}")

✅ Dataset cleaned and saved to dataset/job_descriptions_cleaned.csv
