# Importing Libraries

In [51]:
import pandas as pd
import random
import difflib


# Creating Data Generation Functions

In [8]:

# Introduce spelling mistakes (typos) to mimic human-entered input
def introduce_typo(text):
    """Return *text* with one pair of adjacent characters swapped.

    Strings shorter than four characters are returned unchanged. The swap
    position is chosen at random among positions whose two neighbouring
    characters differ, so the result is guaranteed to differ from the input
    whenever such a position exists (swapping two identical characters, e.g.
    the "00" in "Workflow2000", would otherwise produce no typo at all).

    Args:
        text: The string to corrupt.

    Returns:
        The corrupted string, or *text* itself when it is too short or
        consists of a single repeated character.
    """
    if len(text) < 4:
        return text
    # Only positions where the swap visibly changes the string are eligible.
    swappable = [i for i in range(len(text) - 1) if text[i] != text[i + 1]]
    if not swappable:
        return text
    idx = random.choice(swappable)
    return text[:idx] + text[idx + 1] + text[idx] + text[idx + 2:]



# Generate language dictionary with CEFR levels
def generate_language_dict(typos=False):
    """Build a random ``{language: fluency level}`` mapping.

    Between one and all entries of ``languages_master`` are sampled, and each
    is paired with a random entry of ``fluency_levels``. When *typos* is true,
    every language name has a 50% chance of being passed through
    ``introduce_typo`` before it is used as a key.
    """
    how_many = random.randint(1, len(languages_master))
    chosen = random.sample(languages_master, k=how_many)
    result = {}
    for name in chosen:
        if typos and random.random() < 0.5:
            label = introduce_typo(name)
        else:
            label = name
        result[label] = random.choice(fluency_levels)
    return result



# Generate mock projects data
def generate_projects(n=10):
    """Return a DataFrame of *n* mock projects with noisy, human-like input.

    Each project gets an id ``P1..Pn``, one to three products, a work
    location, a work-flexibility option and a language requirement dict.
    Product names and the location each have a 50% chance of receiving a
    typo; the language dict is generated with typos enabled.
    """
    records = []
    for number in range(1, n + 1):
        picked = random.sample(products_master, k=random.randint(1, 3))
        noisy_products = []
        for name in picked:
            if random.random() < 0.5:
                name = introduce_typo(name)
            noisy_products.append(name)

        # Decide on the typo first, then draw the location.
        misspell_location = random.random() < 0.5
        site = random.choice(locations_master)
        if misspell_location:
            site = introduce_typo(site)

        records.append({
            "ProjectID": f"P{number}",
            "Products": noisy_products,
            "Work Location": site,
            "Work Flexibility": random.choice(work_flexibility_options),
            "Languages Required": generate_language_dict(typos=True),
        })
    return pd.DataFrame(records)

# Generate mock employees data
def generate_employees(n=10):
    """Return a DataFrame of *n* mock employees with clean (typo-free) data.

    Each employee gets an id ``E1..En``, one to three products they have
    experience with, a work location, a work-flexibility option and a dict
    of known languages with fluency levels.
    """
    records = []
    for number in range(1, n + 1):
        experience = random.sample(products_master, k=random.randint(1, 3))
        site = random.choice(locations_master)
        mode = random.choice(work_flexibility_options)
        known = generate_language_dict(typos=False)
        record = {
            "EmployeeID": f"E{number}",
            "Products Experience": experience,
            "Work Location": site,
            "Work Flexibility": mode,
            "Languages Known": known,
        }
        records.append(record)
    return pd.DataFrame(records)


# Predefined Probable Values

In [11]:
# Predefined vocabularies: the clean reference values that the mock-data
# generators sample from.
# Product names drawn for both projects and employees.
products_master = ["AIScan", "Workflow2000", "Print2.0"]
# Possible work locations.
locations_master = ["Berlin", "Vienna", "London"]
# Work arrangement options; location_score compares against these exact strings.
work_flexibility_options = ["onsite", "remote", "hybrid"]
# Languages that may appear in the required/known language dicts.
languages_master = ["English", "French", "German", "Italian"]
# CEFR fluency levels, listed from lowest (A1) to highest (C2).
fluency_levels = ["A1", "A2", "B1", "B2", "C1", "C2"]

# Creating Mock Data

In [14]:
# Build the mock project and employee tables, ten rows each.
projects_df = generate_projects(n=10)
employees_df = generate_employees(n=10)

In [20]:
# Preview the first rows of the generated project table.
projects_df.head()

Unnamed: 0,ProjectID,Products,Work Location,Work Flexibility,Languages Required
0,P1,[Print20.],Vienna,hybrid,"{'French': 'B1', 'nEglish': 'A2', 'tIalian': '..."
1,P2,"[AIcSan, Print2.0]",Vienan,onsite,"{'Geramn': 'A1', 'English': 'A2', 'French': 'A..."
2,P3,[Print2.0],Vienna,remote,"{'French': 'A1', 'Italian': 'C2'}"
3,P4,"[AIScan, oWrkflow2000, Print2.0]",eBrlin,hybrid,"{'English': 'B1', 'Itlaian': 'B2', 'Germna': '..."
4,P5,[Workflow2000],Brelin,remote,"{'French': 'A2', 'Gemran': 'A1', 'Italian': 'A..."


In [22]:
# Preview the first rows of the generated employee table.
employees_df.head()

Unnamed: 0,EmployeeID,Products Experience,Work Location,Work Flexibility,Languages Known
0,E1,[Print2.0],Vienna,hybrid,"{'Italian': 'C1', 'German': 'B1', 'English': '..."
1,E2,"[AIScan, Print2.0, Workflow2000]",Berlin,onsite,{'English': 'A2'}
2,E3,"[Workflow2000, Print2.0, AIScan]",London,onsite,"{'Italian': 'B1', 'French': 'C1', 'English': '..."
3,E4,"[Workflow2000, AIScan]",Berlin,remote,"{'French': 'B1', 'German': 'B2'}"
4,E5,"[Workflow2000, Print2.0]",London,onsite,"{'German': 'B1', 'Italian': 'C1'}"


# Creating Scoring Functions

In [75]:
# Normalizing text and doing fuzzy match
def normalize(text):
    """Lower-case *text* and trim surrounding whitespace for comparison."""
    lowered = text.lower()
    return lowered.strip()

def fuzzy_match(val1, val2, threshold=0.7):
    """Return True when the two strings are similar enough to count as equal.

    Both values are normalized first; similarity is the
    ``difflib.SequenceMatcher`` ratio, compared against *threshold*
    (inclusive).
    """
    matcher = difflib.SequenceMatcher(None, normalize(val1), normalize(val2))
    similarity = matcher.ratio()
    return similarity >= threshold

### Product Matching

In [78]:
def product_score(project_products, employee_products):
    """Return the fraction of project products the employee has experience with.

    A project product counts as covered when it fuzzy-matches at least one of
    the employee's products. An empty project product list scores 0.
    """
    covered = sum(
        1
        for required in project_products
        if any(fuzzy_match(required, known) for known in employee_products)
    )
    if not project_products:
        return 0
    return covered / len(project_products)

### Location Matching with Work Flexibility Logic

In [81]:
def location_score(project_location, project_flex, employee_location, employee_flex):
    """Score how well an employee's location and flexibility fit a project.

    Rules:
      * Remote projects always score 1.0.
      * Otherwise the locations must fuzzy-match, or the score is 0.0.
      * Onsite project: onsite employee 1.0, hybrid employee 0.5, else 0.0.
      * Hybrid project: onsite or hybrid employee 1.0, remote employee 0.5,
        else 0.0.
      * Any other project flexibility value scores 0.0.
    """
    if project_flex == "remote":
        return 1.0

    same_place = fuzzy_match(project_location, employee_location)
    if not same_place:
        # Every non-remote rule requires a location match.
        return 0.0

    if project_flex == "onsite":
        if employee_flex == "onsite":
            return 1.0
        if employee_flex == "hybrid":
            return 0.5
        return 0.0

    if project_flex == "hybrid":
        if employee_flex in ("onsite", "hybrid"):
            return 1.0
        if employee_flex == "remote":
            return 0.5

    return 0.0

### Language Matching and Fluency Scoring

In [84]:
# Numeric rank for each CEFR level (A1 lowest, C2 highest) so that required
# and actual fluency can be compared and subtracted in language_score.
cefr_scale = {"A1": 1, "A2": 2, "B1": 3, "B2": 4, "C1": 5, "C2": 6}

def best_fuzzy_match(input_lang, employee_langs, threshold=0.70):
    """Return the entry of *employee_langs* most similar to *input_lang*.

    Similarity is the ``difflib.SequenceMatcher`` ratio of the normalized
    strings. The first candidate with the highest ratio wins; ``None`` is
    returned when no candidate reaches *threshold*.
    """
    top_name, top_ratio = None, 0
    for candidate in employee_langs:
        matcher = difflib.SequenceMatcher(
            None, normalize(input_lang), normalize(candidate)
        )
        ratio = matcher.ratio()
        if ratio > top_ratio:
            top_name, top_ratio = candidate, ratio
    if top_ratio >= threshold:
        return top_name
    return None

def language_score(project_langs, employee_langs):
    """Score an employee's languages against a project's requirements.

    Each required language is fuzzy-matched to one of the employee's
    languages. A matched language scores 1.0 when the employee's CEFR level
    meets or exceeds the requirement; otherwise one sixth is deducted for
    every level of shortfall. The result is ``coverage * average fit``,
    rounded to two decimals, where coverage is the share of required
    languages that found a match. No match at all yields 0.0.
    """
    fits = []
    for wanted_lang, wanted_level in project_langs.items():
        hit = best_fuzzy_match(wanted_lang, employee_langs)
        if not hit:
            continue
        need = cefr_scale.get(wanted_level, 0)
        have = cefr_scale.get(employee_langs[hit], 0)
        fits.append(1.0 if have >= need else max(0, 1 - (need - have) / 6))

    if not fits:
        return 0.0

    coverage = len(fits) / len(project_langs)
    avg_fit = sum(fits) / len(fits)
    return round(coverage * avg_fit, 2)




# Creating Merged Table For Scoring All Employees by Project

In [87]:
# Pair every project with every employee (a cross join) by merging on a
# constant helper column. The helper is attached with assign(), which returns
# copies, so projects_df and employees_df are not left carrying a stray "key"
# column after this cell runs.
merged_df = pd.merge(
    projects_df.assign(key=1),
    employees_df.assign(key=1),
    on="key",
).drop(columns="key")


In [89]:
# Preview the first project/employee pairings; the _x columns come from the
# project table and the _y columns from the employee table.
merged_df.head()

Unnamed: 0,ProjectID,Products,Work Location_x,Work Flexibility_x,Languages Required,EmployeeID,Products Experience,Work Location_y,Work Flexibility_y,Languages Known
0,P1,[Print20.],Vienna,hybrid,"{'French': 'B1', 'nEglish': 'A2', 'tIalian': '...",E1,[Print2.0],Vienna,hybrid,"{'Italian': 'C1', 'German': 'B1', 'English': '..."
1,P1,[Print20.],Vienna,hybrid,"{'French': 'B1', 'nEglish': 'A2', 'tIalian': '...",E2,"[AIScan, Print2.0, Workflow2000]",Berlin,onsite,{'English': 'A2'}
2,P1,[Print20.],Vienna,hybrid,"{'French': 'B1', 'nEglish': 'A2', 'tIalian': '...",E3,"[Workflow2000, Print2.0, AIScan]",London,onsite,"{'Italian': 'B1', 'French': 'C1', 'English': '..."
3,P1,[Print20.],Vienna,hybrid,"{'French': 'B1', 'nEglish': 'A2', 'tIalian': '...",E4,"[Workflow2000, AIScan]",Berlin,remote,"{'French': 'B1', 'German': 'B2'}"
4,P1,[Print20.],Vienna,hybrid,"{'French': 'B1', 'nEglish': 'A2', 'tIalian': '...",E5,"[Workflow2000, Print2.0]",London,onsite,"{'German': 'B1', 'Italian': 'C1'}"


# Scoring Each Employee per Project

In [92]:
# Compute the three match scores for every project/employee pairing.
# Columns suffixed _x belong to the project, _y to the employee.
scores = []
for _, pair in merged_df.iterrows():
    products_fit = product_score(pair["Products"], pair["Products Experience"])
    location_fit = location_score(
        pair["Work Location_x"],
        pair["Work Flexibility_x"],
        pair["Work Location_y"],
        pair["Work Flexibility_y"],
    )
    # language_score already rounds its result to two decimals.
    languages_fit = language_score(pair["Languages Required"], pair["Languages Known"])

    scores.append({
        "ProjectID": pair["ProjectID"],
        "EmployeeID": pair["EmployeeID"],
        "Product Match Score": round(products_fit, 2),
        "Location Match Score": round(location_fit, 2),
        "Language Match Score": languages_fit,
    })

scored_df = pd.DataFrame(scores)

In [94]:
# Preview the first rows of the per-pair score table.
scored_df.head()

Unnamed: 0,ProjectID,EmployeeID,Product Match Score,Location Match Score,Language Match Score
0,P1,E1,1.0,1.0,0.88
1,P1,E2,1.0,0.0,0.25
2,P1,E3,1.0,0.0,1.0
3,P1,E4,0.0,0.0,0.42
4,P1,E5,1.0,0.0,0.38


In [96]:
# Attach the score columns to each project/employee pairing.
merged_df = merged_df.merge(
    scored_df,
    how="left",
    on=["ProjectID", "EmployeeID"],
)

In [98]:
# Show the first 100 scored pairings. head() already returns a DataFrame, so
# no extra pd.DataFrame(...) wrapper is needed.
merged_df.head(100)

Unnamed: 0,ProjectID,Products,Work Location_x,Work Flexibility_x,Languages Required,EmployeeID,Products Experience,Work Location_y,Work Flexibility_y,Languages Known,Product Match Score,Location Match Score,Language Match Score
0,P1,[Print20.],Vienna,hybrid,"{'French': 'B1', 'nEglish': 'A2', 'tIalian': '...",E1,[Print2.0],Vienna,hybrid,"{'Italian': 'C1', 'German': 'B1', 'English': '...",1.0,1.0,0.88
1,P1,[Print20.],Vienna,hybrid,"{'French': 'B1', 'nEglish': 'A2', 'tIalian': '...",E2,"[AIScan, Print2.0, Workflow2000]",Berlin,onsite,{'English': 'A2'},1.0,0.0,0.25
2,P1,[Print20.],Vienna,hybrid,"{'French': 'B1', 'nEglish': 'A2', 'tIalian': '...",E3,"[Workflow2000, Print2.0, AIScan]",London,onsite,"{'Italian': 'B1', 'French': 'C1', 'English': '...",1.0,0.0,1.00
3,P1,[Print20.],Vienna,hybrid,"{'French': 'B1', 'nEglish': 'A2', 'tIalian': '...",E4,"[Workflow2000, AIScan]",Berlin,remote,"{'French': 'B1', 'German': 'B2'}",0.0,0.0,0.42
4,P1,[Print20.],Vienna,hybrid,"{'French': 'B1', 'nEglish': 'A2', 'tIalian': '...",E5,"[Workflow2000, Print2.0]",London,onsite,"{'German': 'B1', 'Italian': 'C1'}",1.0,0.0,0.38
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,P10,"[Workflow2000, Print2.0]",Vinena,hybrid,"{'nEglish': 'C2', 'German': 'C1', 'Frecnh': 'A...",E6,"[Print2.0, Workflow2000, AIScan]",London,hybrid,{'French': 'B1'},1.0,0.0,0.25
96,P10,"[Workflow2000, Print2.0]",Vinena,hybrid,"{'nEglish': 'C2', 'German': 'C1', 'Frecnh': 'A...",E7,[Workflow2000],Berlin,remote,"{'Italian': 'C2', 'English': 'A1', 'German': '...",0.5,0.0,0.67
97,P10,"[Workflow2000, Print2.0]",Vinena,hybrid,"{'nEglish': 'C2', 'German': 'C1', 'Frecnh': 'A...",E8,"[Print2.0, Workflow2000]",Berlin,onsite,"{'German': 'C1', 'Italian': 'B1'}",1.0,0.0,0.50
98,P10,"[Workflow2000, Print2.0]",Vinena,hybrid,"{'nEglish': 'C2', 'German': 'C1', 'Frecnh': 'A...",E9,"[AIScan, Print2.0, Workflow2000]",Berlin,remote,"{'French': 'B1', 'Italian': 'B2'}",1.0,0.0,0.50


In [100]:
# Write the scored project/employee table to a CSV file; the relative path
# places it in the current working directory.
merged_df.to_csv('MVPScoring2.csv')