In [1]:
import pandas as pd
import numpy as np
import re


In [2]:
def parse_salary(salary_str):
    """Turn a salary string into a number of currency units.

    Every '$' and 'K' character is removed, the first run of digits in what
    remains is read as an integer, and that integer is multiplied by 1000.

    Returns:
        int: the first number times 1000, when a digit run is found.
        float: ``np.nan`` when the input is not a ``str`` or has no digits.
    """
    if isinstance(salary_str, str):
        # Strip the currency/thousands markers before looking for digits.
        stripped = re.sub(r'[$K]', '', salary_str)
        first_number = re.search(r'\d+', stripped)
        if first_number is not None:
            return 1000 * int(first_number.group())
    return np.nan

def parse_experience(experience_str):
    """Read the first integer that appears in an experience string.

    Returns:
        int: the first run of digits in the string, as an integer.
        float: ``np.nan`` when the input is not a ``str`` or has no digits.
    """
    leading = re.search(r'\d+', experience_str) if isinstance(experience_str, str) else None
    return np.nan if leading is None else int(leading.group())


In [3]:
# Load the job postings into `df`.
# The path lives in one place so the error message can never name a different
# file than the one that was actually requested.
DATA_PATH = 'job_descriptions.csv'

try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError as err:
    # Raise instead of print + exit(): exit() ends the session rather than
    # surfacing the problem as a normal error in this cell.
    raise FileNotFoundError(
        f"Error: The file '{DATA_PATH}' was not found. Please check the file path."
    ) from err

In [None]:
# data cleaning and parsing
# Numeric columns: values that cannot be parsed become NaN and are dropped below.
df['latitude'] = pd.to_numeric(df['latitude'], errors='coerce')
df['Company Size'] = pd.to_numeric(df['Company Size'], errors='coerce')

# Text columns: parse only while the column is still non-numeric.
# parse_salary / parse_experience return NaN for anything that is not a str, so
# re-running this cell on already-parsed columns would turn every value into
# NaN and the dropna below would then remove every row.
if not pd.api.types.is_numeric_dtype(df['Salary Range']):
    df['Salary Range'] = df['Salary Range'].apply(parse_salary)
if not pd.api.types.is_numeric_dtype(df['Experience']):
    df['Experience'] = df['Experience'].apply(parse_experience)

# Unparseable dates become NaT and are dropped below.
df['Job Posting Date'] = pd.to_datetime(df['Job Posting Date'], errors='coerce')

# Keep only rows where every cleaned column has a value.
required_columns = ['Experience', 'Salary Range', 'Company Size', 'latitude', 'Job Posting Date']
df = df.dropna(subset=required_columns)

In [5]:
# Names of African countries, kept in the order they were originally listed.
african_countries = [
    "Algeria", "Angola",
    "Benin", "Botswana", "Burkina Faso", "Burundi",
    "Cabo Verde", "Cameroon", "Central African Republic", "Chad", "Comoros", "Congo",
    "Democratic Republic of the Congo", "Cote d'Ivoire",
    "Djibouti",
    "Egypt", "Equatorial Guinea", "Eritrea", "Eswatini", "Ethiopia",
    "Gabon", "Gambia", "Ghana", "Guinea", "Guinea-Bissau",
    "Kenya",
    "Lesotho", "Liberia", "Libya",
    "Madagascar", "Malawi", "Mali", "Mauritania", "Mauritius", "Morocco", "Mozambique",
    "Namibia", "Niger", "Nigeria",
    "Rwanda",
    "Sao Tome and Principe", "Senegal", "Seychelles", "Sierra Leone", "Somalia",
    "South Africa", "South Sudan", "Sudan",
    "Tanzania", "Togo", "Tunisia",
    "Uganda",
    "Zambia", "Zimbabwe",
]


In [None]:
# Task 4: narrow the cleaned postings step by step, then write the result to CSV.
# Each step filters the output of the previous one, so a row must pass all of them.
MIN_COMPANY_SIZE = 80000

# Qualification is B.Tech, M.Tech or PhD (case-insensitive).
# The dots are escaped so they match a literal '.' instead of any character.
df_filtered = df[df['Qualifications'].str.contains(r'B\.Tech|M\.Tech|PhD', case=False, na=False)]

# Filter for work type 'Full-Time'
df_filtered = df_filtered[df_filtered['Work Type'] == 'Full-Time']

# Filter for African countries
df_filtered = df_filtered[df_filtered['Country'].isin(african_countries)]

# Filter for Job title starting with 'D' (either case).
# na=False keeps a missing title from putting NaN into the boolean mask,
# which would make the indexing raise.
df_filtered = df_filtered[df_filtered['Job Title'].str.lower().str.startswith('d', na=False)]

# Filter for preference 'Male'
df_filtered = df_filtered[df_filtered['Preference'] == 'Male']

# Filter for company size above the threshold
df_filtered = df_filtered[df_filtered['Company Size'] > MIN_COMPANY_SIZE]

# Filter for contact person starting with 'A'
df_filtered = df_filtered[df_filtered['Contact Person'].str.startswith('A', na=False)]

# Filter for job portal 'indeed', compared case-insensitively so a value
# stored as 'Indeed' is not silently excluded.
df_filtered = df_filtered[df_filtered['Job Portal'].str.lower() == 'indeed']


output_file = 'task4_filtered_data.csv'
df_filtered.to_csv(output_file, index=False)

print(f"Filtered data for Task 4 has been saved to {output_file}")
print(f"Number of rows in output file: {len(df_filtered)}")