In [1]:
import pandas as pd
import numpy as np
import re

In [None]:
# Define parsing functions 
def parse_salary(salary_str):
    """Turn a salary string into an integer amount.

    Every '$' and 'K' character is removed, the first run of digits in the
    remaining text is read as an integer and multiplied by 1000. Only that
    first number is used, so '$59K-$99K' yields 59000.

    Args:
        salary_str: Raw salary text, e.g. '$59K-$99K'.

    Returns:
        The first number times 1000 as an int, or np.nan when the input is
        not a str or contains no digits.
    """
    if isinstance(salary_str, str):
        # Strip the currency/thousands markers before looking for digits.
        stripped = salary_str.replace('$', '').replace('K', '')
        first_number = re.search(r'\d+', stripped)
        if first_number is not None:
            return 1000 * int(first_number.group())
    return np.nan

def parse_experience(experience_str):
    """Read the first integer found in an experience string.

    Only the first run of digits is used, so '5 to 15 Years' yields 5.

    Args:
        experience_str: Raw experience text, e.g. '5 to 15 Years'.

    Returns:
        The first number as an int, or np.nan when the input is not a str
        or contains no digits.
    """
    if isinstance(experience_str, str):
        first_number = re.search(r'\d+', experience_str)
        if first_number is not None:
            return int(first_number.group())
    return np.nan

In [None]:
# Load the raw job postings; the following cells read and modify `df`.
data_file = 'job_descriptions.csv'
try:
    df = pd.read_csv(data_file)
except FileNotFoundError as exc:
    # Name the file that was actually requested (the previous message
    # mentioned a different file) and re-raise rather than calling exit(),
    # so the failure is reported as an error with a traceback.
    raise FileNotFoundError(
        f"Error: The file '{data_file}' was not found. Please check the file path."
    ) from exc
print(f"Original dataset size: {len(df)} rows")

In [None]:
# Data cleaning and parsing: convert the text columns to numeric / datetime
# values, then drop every row where any of the three could not be converted.
# NOTE: the columns are overwritten in place. The parsers return NaN for
# non-string input, so re-running this cell without reloading `df` would turn
# the already-parsed numbers into NaN and drop those rows.
required_columns = ['Experience', 'Salary Range', 'Job Posting Date']

experience_years = df['Experience'].apply(parse_experience)
salary_amount = df['Salary Range'].apply(parse_salary)
# errors='coerce' turns unparseable dates into NaT so dropna removes them.
posting_date = pd.to_datetime(df['Job Posting Date'], errors='coerce')

df['Experience'] = experience_years
df['Salary Range'] = salary_amount
df['Job Posting Date'] = posting_date

df.dropna(subset=required_columns, inplace=True)
print(f"Dataset size after cleaning: {len(df)} rows")


In [None]:
# Apply the Task 5 filters one at a time, printing the number of rows that
# remain after each step so it is visible where rows are lost.

# Filter for Qualification 'B.tech'
# regex=False makes this a literal substring match; str.contains treats the
# pattern as a regular expression by default, where '.' matches any character.
df_filtered = df[df['Qualifications'].str.contains('B.tech', case=False, na=False, regex=False)]
print(f"After 'Qualification' filter: {len(df_filtered)} rows")

# Filter for work type 'Full-Time' (case-insensitive literal match)
df_filtered = df_filtered[df_filtered['Work Type'].str.contains('Full-Time', case=False, na=False, regex=False)]
print(f"After 'Work Type' filter: {len(df_filtered)} rows")

# Filter for countries 'India' and 'Germany'
df_filtered = df_filtered[df_filtered['Country'].isin(['India', 'Germany'])]
print(f"After 'Country' filter: {len(df_filtered)} rows")

# Filter for experience more than 2 years
df_filtered = df_filtered[df_filtered['Experience'] > 2]
print(f"After 'Experience' filter: {len(df_filtered)} rows")

# Filter for job titles
job_titles = ['Data Scientist', 'Art Teacher', 'Aerospace Engineer']
df_filtered = df_filtered[df_filtered['Job Title'].isin(job_titles)]
print(f"After 'Job Title' filter: {len(df_filtered)} rows")

# Filter for salary range > $10k
df_filtered = df_filtered[df_filtered['Salary Range'] > 10000]
print(f"After 'Salary Range' filter: {len(df_filtered)} rows")

# Filter for job posting date: keep postings strictly before 2023-08-01
end_date = pd.to_datetime('2023-08-01')
df_filtered = df_filtered[df_filtered['Job Posting Date'] < end_date]
print(f"After 'Job Posting Date' filter: {len(df_filtered)} rows")

# Filter for job portal 'indeed' (case-insensitive literal match)
df_filtered = df_filtered[df_filtered['Job Portal'].str.contains('indeed', case=False, na=False, regex=False)]
print(f"After 'Job Portal' filter: {len(df_filtered)} rows")

# Filter for preference 'Female' (exact, case-sensitive match)
df_filtered = df_filtered[df_filtered['Preference'] == 'Female']
print(f"After 'Preference' filter: {len(df_filtered)} rows")


In [None]:
# Write the filtered rows to a CSV file; index=False leaves the DataFrame
# index out of the file.
output_file = 'task5_filtered_data.csv'
df_filtered.to_csv(path_or_buf=output_file, index=False)

# Confirm where the result was written.
print(f"Filtered data for Task 5 has been saved to {output_file}")