In [2]:
#Importing necessary libraries
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Loading the Algorithm Data
df = pd.read_excel('/Users/kiranmaireddy/Desktop/Semester 4/input/Algorithm Data.xlsx')

In [4]:
# Loading spaCy model
nlp = spacy.load("en_core_web_sm")

In [6]:
# Preprocessing text by removing stopwords, punctuation, and applying lemmatization
def preprocess(text):
    if isinstance(text, str):
        doc = nlp(text.lower())
        return " ".join([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])
    return ""

df['Polished Description'] = df['Description'].apply(preprocess)

In [7]:
# Defining keywords for each function 
function_keywords = {
    'Descriptive': [
        'report', 'describe', 'summarize', 'overview', 'explain', 'analyze', 
        'characterize', 'detail', 'review', 'monitor', 'examine', 'breakdown', 
        'inspect', 'interpret', 'identify', 'outline', 'list', 'profile', 'survey'
    ],
    'Assistive': [
        'recommend', 'suggest', 'advise', 'help', 'support', 'facilitate', 
        'guide', 'assist', 'recommendation', 'aid', 'ease', 'prompt', 'stimulate',
        'counsel', 'empower', 'enhance', 'contribute', 'promote', 'improve', 'bolster'
    ],
    'Predictive': [
        'predict', 'forecast', 'future', 'estimate', 'anticipate', 'project', 
        'forecasting', 'trend', 'extrapolate', 'infer', 'speculate', 'calculate',
        'predictive analysis', 'projection', 'risk assessment', 'forecasting trends', 
        'predictive model', 'outlook'
    ],
    'Prescriptive': [
        'decision', 'recommendation', 'optimize', 'determine', 'select', 'prescribe', 
        'solution', 'advise', 'recommend', 'guide action', 'best course', 'strategy',
        'advise on action', 'directive', 'prescription', 'strategy optimization', 
        'best decision', 'optimum'
    ],
    'Automation': [
        'automate', 'operate', 'mechanize', 'machine', 'robot', 'routine', 
        'automated', 'automatic', 'streamline', 'process automation', 'systematic',
        'computerize', 'automatic execution', 'self-operating', 'mechanization', 
        'workflow', 'routine task', 'repetition'
    ],
    'Support EF': [
        'manage', 'support', 'execute', 'coordinate', 'supervise', 'maintain', 
        'organize', 'administer', 'oversee', 'implement', 'direct', 'facilitate', 
        'collaborate', 'control', 'oversee implementation', 'ensure execution', 
        'backup', 'manage operation', 'assist'
    ]
}

In [8]:
# Defining a function to map algorithm functions based on keywords and similarity with a high threshold
def map_algorithm_function(text):
    text = text.lower()
    matched_functions = []

    # Keyword matching
    for function, keywords in function_keywords.items():
        if any(keyword in text for keyword in keywords):
            matched_functions.append(function)

    # If no functions are matched via keywords, use similarity-based fallback
    if not matched_functions:
        tfidf = TfidfVectorizer()
        tfidf_matrix = tfidf.fit_transform([text] + list(function_keywords.keys()))
        similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()

        # Setting  a high similarity threshold and find matches above it
        threshold = 0.75
        matched_functions = [list(function_keywords.keys())[i] for i, score in enumerate(similarities) if score > threshold]

    # Limiting to a maximum of 3 functions
    return matched_functions[:3] if matched_functions else []

In [9]:
# Applying mapping function
df['algorithm_functions'] = df['Polished Description'].apply(map_algorithm_function)

In [10]:
# Removing rows with empty lists in 'algorithm_functions'
df = df[df['algorithm_functions'].map(len) > 0]

In [11]:
# Displaying results
print(df[['Description', 'algorithm_functions']])

                                          Description  \
0   The naive Bayes algorithm is based on the Baye...   
1    Linear Discriminant Analysis (LDA) is a linea...   
2   Another common probabilistic based statistical...   
4    In machine learning, another common technique...   
5   Decision tree (DT) is a well known non-paramet...   
..                                                ...   
61  Another common association rule learning techn...   
62  RARM (Rapid Association Rule Mining) is an alg...   
63  This algorithm effectively identifies the redu...   
64  ONE R (One Rule) is a simple classification al...   
65  Zero R (Zero Rule) is a simple classification ...   

                        algorithm_functions  
0                              [Predictive]  
1                [Predictive, Prescriptive]  
2     [Descriptive, Predictive, Automation]  
4       [Assistive, Automation, Support EF]  
5                [Prescriptive, Support EF]  
..                                     

In [18]:
# Saving to Excel
df.to_excel('/Users/kiranmaireddy/Desktop/Semester 4/Result/mapped_algorithm_data_with_AF.xlsx', index=False)

In [19]:
print("Mapping done")

Mapping done
