In [27]:
import pandas as pd
import numpy as np
import re

def parse_salary(salary_str):
    """Return the first number found in a salary string, multiplied by 1000.

    The '$' and 'K' characters are stripped before searching, so a value such
    as '$59K-$99K' yields 59000 (the first figure in the string).

    Parameters
    ----------
    salary_str : object
        Raw cell value. Anything that is not a ``str`` is treated as missing.

    Returns
    -------
    int or float
        The first run of digits times 1000, or ``np.nan`` when the input is
        not a string or contains no digits.
    """
    if isinstance(salary_str, str):
        cleaned = salary_str.replace('$', '').replace('K', '')
        # Only the first run of digits matters; later figures are ignored.
        first_number = re.search(r'\d+', cleaned)
        if first_number is not None:
            return int(first_number.group()) * 1000
    return np.nan

def parse_experience(experience_str):
    """Return the first integer that appears in an experience string.

    For a value such as '5 to 15 Years' this is 5, i.e. the first figure in
    the string is used as the minimum experience.

    Parameters
    ----------
    experience_str : object
        Raw cell value. Anything that is not a ``str`` is treated as missing.

    Returns
    -------
    int or float
        The first run of digits as an ``int``, or ``np.nan`` when the input is
        not a string or contains no digits.
    """
    if isinstance(experience_str, str):
        # Take the first digit run lazily; None signals "no number present".
        leading_number = next(re.finditer(r'\d+', experience_str), None)
        if leading_number is not None:
            return int(leading_number.group())
    return np.nan


In [28]:
# Load the raw job postings.
# A missing file is re-raised with a readable message instead of calling
# exit(): inside an IPython/Jupyter kernel exit() asks the kernel to shut down,
# which discards all notebook state, and the `exit` builtin is intended for the
# interactive shell rather than for program flow. Raising still stops
# "Run All" at this cell.
try:
    df = pd.read_csv('job_descriptions.csv')
except FileNotFoundError as err:
    raise FileNotFoundError(
        "Error: The file 'job_descriptions.csv' was not found. Please check the file path."
    ) from err

In [29]:
# Parse the raw 'Salary Range' and 'Experience' text columns into numbers.
# Only values that are still strings are parsed. Both parsers return NaN for
# any non-string input, so without this guard re-running the cell would
# overwrite the already-parsed numbers with NaN.
column_parsers = {'Salary Range': parse_salary, 'Experience': parse_experience}
for column_name, parser in column_parsers.items():
    df[column_name] = df[column_name].map(
        # Bind the parser as a default argument so each lambda keeps its own.
        lambda value, parse=parser: parse(value) if isinstance(value, str) else value
    )

In [30]:
# Coerce 'Company Size' to a numeric dtype (unparseable entries become NaN),
# then keep only the rows that have a value in every column used by the filter.
required_columns = ['Experience', 'Salary Range', 'Company Size']
df['Company Size'] = pd.to_numeric(df['Company Size'], errors='coerce')
df = df.dropna(subset=required_columns)

In [31]:
# Values accepted by the membership-based criteria below.
target_countries = [
    'China', 'India', 'Japan', 'South Korea', 'Singapore',
    'Malaysia', 'Thailand', 'Indonesia', 'Vietnam', 'Philippines',
]
accepted_work_types = ['Part-time', 'Full-time']

# One boolean Series per criterion; a row is kept only if all of them hold.
criteria = [
    df['Company Size'] < 50000,
    df['Job Title'] == 'Mechanical Engineer',
    df['Experience'] > 5,
    df['Country'].isin(target_countries),
    df['Salary Range'] > 50000,
    df['Work Type'].isin(accepted_work_types),
    df['Preference'] == 'Male',
    df['Job Portal'] == 'idealist',
]

# Fold the criteria together with element-wise AND.
row_mask = criteria[0]
for criterion in criteria[1:]:
    row_mask = row_mask & criterion

df_filtered = df[row_mask]

In [32]:
# Write the filtered rows to CSV without the index column, then confirm
# where the file was saved.
output_file = 'task2_filtered_data.csv'
df_filtered.to_csv(path_or_buf=output_file, index=False)
print(f"Filtered data for Task 2 has been saved to {output_file}")

Filtered data for Task 2 has been saved to task2_filtered_data.csv
