In [3]:
#extract salary from description

import pandas as pd
import re
import numpy as np

file_path = 'ai_ml_jobs_linkedin.csv'
data = pd.read_csv(file_path, encoding='latin1')

def is_valid_salary(description, match, salary, threshold=40000):
    """
    Print words around salary that are less than the threshold.
    
    Parameters:
    description (str): The job description.
    threshold (float): The salary threshold.
    """
    if float(salary) < threshold:
        words = description.split()
        for i, word in enumerate(words):
            if match.group() in word:    

                    return False
    return True


def extract_salary(description):
    """
    Extract salary from job description.
    
    Parameters:
    description (str): The job description.
    
    Returns:
    float: The extracted salary or None if no salary is found.
    """
 
    patterns = [
    r'\$\s?(\d{1,3}(?:,\d{3})?(?:\.\d{1,2})?)\s?to\s?\$\s?(\d{1,3}(?:,\d{3})?(?:\.\d{1,2})?)\s?/hr',  # $80.00 to $86.65/hr

    r'\$\s?(\d{1,3}(?:,\d{3})?(?:\.\d{2})?)\s?-\s?\$\s?(\d{1,3}(?:,\d{3})?(?:\.\d{2})?)\s?per\s?year',  # $140,100—$210,100 per year
    r'\$\s?(\d{1,3}(?:,\d{3})?)k\s?-\s?\$\s?(\d{1,3}(?:,\d{3})?)k',  # $120k - $135k
    r'\$\s?(\d{1,3}(?:,\d{3})?(?:\.\d{2})?)\s?to\s?\$\s?(\d{1,3}(?:,\d{3})?(?:\.\d{2})?)',  # $80.00 to $86.65
    r'\$\s?(\d{1,3}(?:,\d{3})?)\s?-\s?\$\s?(\d{1,3}(?:,\d{3})?)\s?per\s?hour',  # $30-$32 per hour

    r'\$\s?(\d{1,3}(?:,\d{3})?(?:\.\d{2})?)\s?-\s?\$\s?(\d{1,3}(?:,\d{3})?(?:\.\d{2})?)',  # $110,000 - $120,000
    r'between\s?\$\s?(\d{1,3}(?:,\d{3})?)\s?and\s?\$\s?(\d{1,3}(?:,\d{3})?)',  # between $110,000 and $120,000
    r'from\s?\$\s?(\d{1,3}(?:,\d{3})?)\s?to\s?\$\s?(\d{1,3}(?:,\d{3})?)',  # from $110,000 to $120,000
    r'\$\s?(\d{1,3}(?:,\d{3})?(?:\.\d{2})?)k',  # $110k
    r'up\s?to\s?\$\s?(\d{1,3}(?:,\d{3})?)',  # up to $120,000
    r'\$\s?(\d{1,3}(?:,\d{3})?(?:\.\d{2})?)\s?USD',  # $110,000 USD
    
    r'\$\s?(\d{1,3}(?:,\d{3})?(?:\.\d{2})?)\s?per\s?annum',  # $75,000 per annum
    r'\$\s?(\d{1,3}(?:,\d{3})?)\s?-\s?\$\s?(\d{1,3}(?:,\d{3})?)\s?per\s?annum',  # $65,000 - $75,000 per annum
    r'\$\s?(\d{1,3}(?:,\d{3})?(?:\.\d{2})?)\s?per\s?year',  # $100,000 per year
    r'\$\s?(\d{1,3}(?:,\d{3})?(?:\.\d{2})?)k\s?per\s?year',  # $110k per year
    r'\$\s?(\d{1,3}(?:,\d{3})?(?:\.\d{2})?)\s?-\s?\$\s?(\d{1,3}(?:,\d{3})?)\s?USD'  # $140,100—$210,100 USD
    r'\$\s?(\d{1,3}(?:,\d{3})?(?:\.\d{2})?)',  # $110,000
    
]

    
    for pattern in patterns:
        match = re.search(pattern, description, re.IGNORECASE)
        
        if match:
            
            if len(match.groups()) == 1:
                # Single salary value
                salary = match.group(1).replace(',', '')
                if 'k' in pattern:
                    return float(salary.replace('k', '')) * 1000
                if 'hr' in pattern:
                    print("catch hour")
                    return float(salary) * 1040  # Convert hourly rate to annual salary

                if is_valid_salary(description, match, salary):
                    
                    return salary
            elif len(match.groups()) == 2:
                # Salary range
                salary1 = match.group(1).replace(',', '')
                salary2 = match.group(2).replace(',', '')
                if 'k' in pattern:
                    salary1 = float(salary1.replace('k', '')) * 1000
                    salary2 = float(salary2.replace('k', '')) * 1000
                    return np.mean([float(salary1), float(salary2)])
                elif 'hr' in pattern:
                    salary1 = float(salary1) * 1040  # Convert hourly rate to annual salary
                    salary2 = float(salary2) * 1040  # Convert hourly rate to annual salary

                    return np.mean([float(salary1), float(salary2)])
        
                else:
                    reslt = np.mean([float(salary1), float(salary2)])
                    # print_words_around_salary(description, match, reslt)
                    if is_valid_salary(description, match, reslt):
                       
                        return reslt
    return np.nan

data['salary'] = data['description'].apply(extract_salary)
#count number of non-np.nan salaries
print(data['salary'].count() / len(data) * 100, "% of salaries can be extracted")


data



# $110- $120,000 data['description'][0]
#count the number description that contain $ or word dollar
print(data['description'].str.contains(r'\$', case=False).sum() / len(data) * 100, "% of descriptions contain money, which include startup funding raised, equity, etc.")

data.to_csv('ai_ml_jobs_linkedin.csv', index=False)
print("Done")

31.670533642691417 % of salaries can be extracted
42.92343387470998 % of descriptions contain money, which include startup funding raised, equity, etc.
Done
