In [2]:
# Importing necessary libraries
import re

import pandas as pd
import spacy

In [3]:
# Loading the spaCy model for NLP tasks.
# The resulting `nlp` pipeline is what preprocess() calls to tokenize text and
# to read each token's lemma and stop-word / punctuation flags.
# NOTE(review): "en_core_web_sm" must already be installed in the kernel's
# environment for spacy.load to succeed — confirm in the setup instructions.
nlp = spacy.load("en_core_web_sm")

In [4]:
# Read the occupational dataset (CSV) into the working DataFrame `df`.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run elsewhere after pointing `file_path` at a local copy of the file.
file_path = '/Users/kiranmaireddy/Desktop/Semester 4/datasets/combined_table2_data_with_industry_new.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

In [5]:
# Keep only rows that have usable text in 'Tasks' or 'Detailed Work Activities'.
# A cell counts as usable when it is not NaN and is not blank after stripping
# whitespace. astype(str) keeps the blank check working even when pandas has
# inferred a non-string dtype for a column (e.g. a column that is entirely NaN),
# where the bare .str accessor would raise.
has_tasks = df['Tasks'].notna() & df['Tasks'].astype(str).str.strip().ne('')
has_activities = (
    df['Detailed Work Activities'].notna()
    & df['Detailed Work Activities'].astype(str).str.strip().ne('')
)

# .copy() turns the filtered selection into an independent DataFrame, so the
# column assignments made in later cells do not raise SettingWithCopyWarning.
df = df[has_tasks | has_activities].copy()

In [6]:
# Build one free-text field per row: the 'Tasks' text, a single space, then the
# 'Detailed Work Activities' text. Missing values are treated as empty strings
# so a row with only one of the two columns still yields a string.
tasks_text = df['Tasks'].fillna('')
activities_text = df['Detailed Work Activities'].fillna('')
df['combined_description'] = tasks_text + ' ' + activities_text

In [7]:
# Keyword lists for each algorithm function.
# Keys are the function labels written to the output; values are the lowercase
# keywords/phrases whose occurrences are summed into that function's score.
# Key order matters: it decides which function is listed first when two
# functions end up with the same score.
keywords = {
    'Descriptive': [
        'report', 'describe', 'summarize', 'overview', 'aggregate',
        'examine', 'analyze', 'characterize', 'detail', 'record',
        'inspect', 'measure', 'profile', 'interpret',
    ],
    'Assistive': [
        'recommend', 'suggest', 'assist', 'guide', 'advise',
        'support', 'help', 'aid', 'facilitate', 'mentor',
        'provide guidance', 'consult', 'counsel',
    ],
    'Predictive': [
        'predict', 'forecast', 'estimate', 'anticipate', 'project',
        'envision', 'foresee', 'assess future', 'calculate likelihood',
        'trend', 'probability',
    ],
    'Prescriptive': [
        'optimize', 'prescribe', 'decision', 'recommendation', 'determine',
        'select', 'direct', 'define', 'advise action', 'choose',
        'control', 'regulate', 'enforce',
    ],
    'Automation': [
        'automate', 'operate', 'machine', 'mechanize', 'robot',
        'automated', 'self-operating', 'automated process',
        'machine learning', 'run automatically', 'self-manage',
    ],
    'Support EF': [
        'manage', 'support', 'execute', 'coordinate', 'supervise',
        'maintain', 'organize', 'handle', 'administrate', 'direct',
        'conduct', 'facilitate', 'monitor', 'implement', 'oversee',
    ],
}

In [8]:
# Text normalisation: lowercase, tokenize, drop stop words/punctuation, lemmatize
def preprocess(text):
    """Return ``text`` reduced to a space-separated string of lemmas.

    The text is lowercased and run through the module-level spaCy pipeline
    ``nlp``. Tokens flagged as stop words or punctuation are discarded; the
    lemma of every remaining token is kept, in original order.

    Parameters
    ----------
    text : str
        Raw description text.

    Returns
    -------
    str
        The kept lemmas joined by single spaces ("" if nothing is kept).
    """
    kept_lemmas = []
    for tok in nlp(text.lower()):
        if tok.is_stop or tok.is_punct:
            continue
        kept_lemmas.append(tok.lemma_)
    return " ".join(kept_lemmas)

In [9]:
# Run preprocess() on every combined description and store the normalised,
# lemmatized text in a new 'Polished Description' column (one spaCy call per row).
combined_text = df['combined_description']
df['Polished Description'] = combined_text.map(preprocess)

In [12]:
# Relevance threshold: the minimum keyword-match score an algorithm function
# must reach for a description before map_algorithm_function reports it.
RELEVANCE_THRESHOLD: int = 2

In [13]:
# Mapping functions based on whole-word keyword matching with relevance threshold
def map_algorithm_function(description, keyword_map=None, threshold=None, max_functions=3):
    """Return the algorithm functions whose keywords best match a description.

    A function's score is the total number of occurrences of its keywords in
    ``description``. Keywords are matched as whole words / whole phrases, so
    'aid' does not score on "paid" and 'operate' does not score on
    "cooperate" (plain substring counting did). Matching is case-sensitive;
    the description is expected to be the lowercased, lemmatized text produced
    by ``preprocess`` so that it lines up with the lowercase keywords.

    Parameters
    ----------
    description : str
        Text to score.
    keyword_map : dict[str, list[str]], optional
        Mapping of function label -> keywords. Defaults to the module-level
        ``keywords``.
    threshold : int, optional
        Minimum score a function needs to be returned. Defaults to the
        module-level ``RELEVANCE_THRESHOLD``.
    max_functions : int, optional
        Maximum number of labels returned (default 3).

    Returns
    -------
    list[str]
        Qualifying function labels, highest score first (ties keep the order
        of ``keyword_map``), or ``['Unknown']`` when no function qualifies or
        ``description`` is not a non-empty string.
    """
    if keyword_map is None:
        keyword_map = keywords
    if threshold is None:
        threshold = RELEVANCE_THRESHOLD

    # Missing or blank text (e.g. NaN from pandas) cannot match anything.
    if not isinstance(description, str) or not description.strip():
        return ['Unknown']

    # Calculating scores for each function based on whole-word keyword matches.
    # The lookarounds act as word boundaries that also behave sensibly for
    # keywords containing spaces or hyphens.
    scores = {}
    for function, words in keyword_map.items():
        score = sum(
            len(re.findall(r'(?<!\w)' + re.escape(word) + r'(?!\w)', description))
            for word in words
            if word
        )
        if score > 0:
            scores[function] = score

    # Filtering functions that meet the relevance threshold
    relevant_functions = [func for func, score in scores.items() if score >= threshold]

    # Sorting by score, descending; the sort is stable, so equal scores keep
    # their keyword_map order.
    relevant_functions.sort(key=scores.get, reverse=True)

    # Limiting to the top max_functions; fall back to 'Unknown' if none qualify
    return relevant_functions[:max_functions] if relevant_functions else ['Unknown']


In [14]:
# Score every polished description and store the resulting list of function
# labels (or ['Unknown']) in a new 'algorithm_functions' column.
polished_text = df['Polished Description']
df['algorithm_functions'] = polished_text.map(map_algorithm_function)

In [15]:
# Show the first five rows of the source text columns next to the mapped
# algorithm functions as a quick sanity check of the result.
preview_columns = ['Occupation', 'Tasks', 'Detailed Work Activities', 'algorithm_functions']
preview = df[preview_columns].head()
print(preview)

                                          Occupation  \
0                                            Tellers   
1                                New Accounts Clerks   
2                             Insurance Underwriters   
3                  Insurance Appraisers, Auto Damage   
4  Securities, Commodities, and Financial Service...   

                                               Tasks  \
0  Balance currency, coin, and checks in cash dra...   
1  Perform teller duties as required.Related occu...   
2  Examine documents to determine degree of risk ...   
3  Evaluate practicality of repair as opposed to ...   
4  Make bids or offers to buy or sell securities....   

                            Detailed Work Activities  \
0  Verify accuracy of financial or transactional ...   
1  Execute sales or other financial transactions....   
2  Analyze health-related data.Related occupation...   
3  Estimate costs of goods or services.Related oc...   
4  Negotiate prices or other sales terms.Relat

In [20]:
# Save the results to an Excel workbook (.xlsx), leaving out the DataFrame index.
# NOTE(review): this is an absolute, machine-specific path; the target folder
# must already exist on the machine running the notebook.
output_path = '/Users/kiranmaireddy/Desktop/Semester 4/Result/mapped_occupational_data_with_algorithm_functions.xlsx'
df.to_excel(excel_writer=output_path, index=False)

In [21]:
# Print a confirmation message once execution reaches this final cell.
print("Mapping done")

Mapping done
