<a href="https://colab.research.google.com/github/kav0001/Capstone-Project-Predicting-Career-Domain-and-Seniority-from-LinkedIn-Profiles/blob/main/PDS_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
import unicodedata

def normalize_title(title: str) -> str:
    s = str(title).lower()
    s = unicodedata.normalize("NFKD", s)
    s = "".join(ch for ch in s if not unicodedata.combining(ch))  # remove accents
    s = re.sub(r"[^\w\s]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

# 1) Director (включая managing director / direktor)
DIRECTOR_PAT = re.compile(
    r"\b(director|direktor|directeur|managing director|vertriebsdirektor|direktion)\b"
)

# 2) Management (C-level / VP / President / Geschäftsführer / Vorstand / Founder / Owner)
MANAGEMENT_PAT = re.compile(
    r"\b(ceo|cto|cfo|coo|cio|cdo|chief|svp|evp|vp|vice president|president|"
    r"owner|founder|co founder|gruender|grunder|"
    r"geschaftsfuhrer|geschaftsfuhrerin|geschaftsfuhrung|vorstand|vorsitzender|partner)\b"
)

# 3) Lead (Head, Leiter*, Leitung*, Teamlead*, Projektleiter* … + составные слова)
HEAD_PAT = re.compile(r"\bhead\b")
LEAD_PAT = re.compile(
    r"\b(\w*leiter\w*|\w*leitung\w*|teamlead|team lead|principal|staff)\b"
)
# (слово "lead" отдельно можно добавить, но оно иногда шумит)
LEAD_WORD_PAT = re.compile(r"\blead\b")

# 4) Junior (явные junior-сигналы + analyst и немецкие junior-роли)
JUNIOR_EXPLICIT_PAT = re.compile(
    r"\b(junior|jr|praktikant|werkstudent|trainee|intern|internship|apprentice|entry|graduate)\b"
)
JUNIOR_ROLE_PAT = re.compile(
    r"\b(mitarbeiterin|mitarbeiter|referentin|referent|assistentin|assistent|analyst)\b"
)

SENIOR_CUES_PAT = re.compile(r"\b(senior|sr|manager|engineer|consultant|specialist|expert)\b")

def predict_seniority(position: str) -> str:
    s = normalize_title(position)

    # 1) Director
    if DIRECTOR_PAT.search(s):
        return "Director"

    # 2) Management
    if MANAGEMENT_PAT.search(s):
        return "Management"

    # 3) Lead
    # "Head of ..." чаще Lead, но "Assistant to Head of ..." лучше не относить к Lead
    if HEAD_PAT.search(s):
        if ("assistant" in s) or re.search(r"\bmanager\b", s):
            pass
        else:
            return "Lead"

    if LEAD_PAT.search(s):
        return "Lead"

    # если хочешь, включи слово "lead" как отдельный сигнал:
    if LEAD_WORD_PAT.search(s):
        return "Lead"

    # 4) Junior
    if JUNIOR_EXPLICIT_PAT.search(s):
        return "Junior"

    if JUNIOR_ROLE_PAT.search(s):
        # если явно senior, то senior
        if re.search(r"\b(senior|sr)\b", s):
            return "Senior"
        return "Junior"

    # 5) Senior (по умолчанию)
    if SENIOR_CUES_PAT.search(s):
        return "Senior"

    return "Senior"

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


url = "https://drive.google.com/uc?export=download&id=17rxzgzyZbgtBV0gj9Ai1sBmnrrggShZj"
df = pd.read_csv(url)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["label"])

test_pred = test_df["text"].map(predict_seniority)
print(classification_report(test_df["label"], test_pred))

              precision    recall  f1-score   support

    Director       0.98      0.97      0.98       197
      Junior       0.93      0.91      0.92        82
        Lead       0.91      0.96      0.94       709
  Management       0.83      0.94      0.88       151
      Senior       0.94      0.88      0.91       747

    accuracy                           0.92      1886
   macro avg       0.92      0.93      0.92      1886
weighted avg       0.93      0.92      0.92      1886



In [1]:
import re, unicodedata

def normalize_title(title: str) -> str:
    s = str(title).lower()
    s = unicodedata.normalize("NFKD", s)
    s = "".join(ch for ch in s if not unicodedata.combining(ch))
    s = re.sub(r"[^\w\s]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

DIRECTOR_PAT = re.compile(r"\b(managing director|director|direktor|directeur)\b")

CLEVEL_PAT = re.compile(r"\b(ceo|cto|cfo|coo|cio|cdo|chief|svp|evp|vp|vice president|president)\b")
ASSIST_PAT = re.compile(r"\b(assistant|executive assistant|assistent|assistentin|assistante)\b")

# Management без "partner" (слишком шумит)
MANAGEMENT_PAT = re.compile(
    r"\b(ceo|cto|cfo|coo|cio|cdo|chief|svp|evp|vp|vice president|president|"
    r"owner|founder|co founder|gruender|grunder|"
    r"geschaftsfuhrer|geschaftsfuhrerin|geschaftsfuhrung|vorstand|vorsitzender)\b"
)
MANAGING_PARTNER_PAT = re.compile(r"\bmanaging partner\b")
PARTNER_CONTEXT_NONMGMT_PAT = re.compile(
    r"\b(business partner|hr business partner|partner manager|partner management|channel partner|partner sales|sales partner)\b"
)

HEAD_PAT = re.compile(r"\bhead\b")
LEAD_PAT = re.compile(r"\b(\w*leiter\w*|\w*leitung\w*|teamlead|team lead|principal|staff)\b")
LEAD_WORD_PAT = re.compile(r"\blead\b")

TEAM_LEAD_EN_PAT = re.compile(r"\bteam lead\b")
PROJECT_LEAD_EN_PAT = re.compile(r"\bproject lead\b")
LEAD_FUNCTIONAL_FOLLOW_PAT = re.compile(r"\blead\s+(engineer|analyst|consultant|specialist|architect|developer|scientist|designer|product owner)\b")

JUNIOR_EXPLICIT_PAT = re.compile(r"\b(junior|jr|praktikant|werkstudent|trainee|intern|internship|apprentice|entry|graduate)\b")
JUNIOR_ROLE_PAT = re.compile(r"\b(mitarbeiterin|mitarbeiter|referentin|referent|assistentin|assistent|analyst)\b")
SENIOR_CUES_PAT = re.compile(r"\b(senior|sr|manager|engineer|consultant|specialist|expert)\b")

def predict_seniority(position: str) -> str:
    s = normalize_title(position)

    if DIRECTOR_PAT.search(s):
        return "Director"

    # ассистенты при C-level обычно НЕ Management в твоей разметке
    if ASSIST_PAT.search(s) and CLEVEL_PAT.search(s):
        return "Senior"

    if (MANAGEMENT_PAT.search(s) or MANAGING_PARTNER_PAT.search(s)):
        if PARTNER_CONTEXT_NONMGMT_PAT.search(s):
            return "Senior"
        return "Management"

    # в твоей разметке "team lead"/"project lead" чаще Senior
    if TEAM_LEAD_EN_PAT.search(s) or PROJECT_LEAD_EN_PAT.search(s):
        return "Senior"
    if LEAD_FUNCTIONAL_FOLLOW_PAT.search(s):
        return "Senior"

    if HEAD_PAT.search(s):
        if ("assistant" in s) or re.search(r"\bmanager\b", s):
            pass
        else:
            return "Lead"

    if LEAD_PAT.search(s) or LEAD_WORD_PAT.search(s):
        return "Lead"

    if JUNIOR_EXPLICIT_PAT.search(s):
        return "Junior"
    if JUNIOR_ROLE_PAT.search(s):
        if re.search(r"\b(senior|sr)\b", s):
            return "Senior"
        return "Junior"

    if SENIOR_CUES_PAT.search(s):
        return "Senior"

    return "Senior"


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


url = "https://drive.google.com/uc?export=download&id=17rxzgzyZbgtBV0gj9Ai1sBmnrrggShZj"
df = pd.read_csv(url)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["label"])

test_pred = test_df["text"].map(predict_seniority)
print(classification_report(test_df["label"], test_pred))

              precision    recall  f1-score   support

    Director       0.98      0.96      0.97       197
      Junior       0.91      0.91      0.91        82
        Lead       0.92      0.96      0.94       709
  Management       0.88      0.91      0.90       151
      Senior       0.94      0.89      0.92       747

    accuracy                           0.93      1886
   macro avg       0.93      0.93      0.93      1886
weighted avg       0.93      0.93      0.93      1886



In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv("https://drive.google.com/uc?export=download&id=17rxzgzyZbgtBV0gj9Ai1sBmnrrggShZj")  # столбцы: text, label
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["label"])

test_df = test_df.copy()
test_df["pred"] = test_df["text"].map(predict_seniority)

err = test_df[test_df["label"] != test_df["pred"]]


In [6]:
cases = err[(err["label"]=="Senior") & (err["pred"]=="Management")]["text"]
cases.head(50).to_list()

['Senior Vice President International Sales Tulip Food Company',
 'Senior Vice President (Messe Kontakt)',
 'CEO / Recruiting Manager (Executive Search)',
 'Solution Architect | Managing Partner',
 'Senior Product Owner CRM',
 'Recruiting Specialist & Managing Partner',
 'Executive Assistant / Assistentin der Geschäftsführung',
 'Senior Vice President IT & Prozesse - Mitglied der operativen Geschäftsleitung',
 'Chief Executive Manager',
 'Senior Vice President',
 'Assistentin der Geschäftsführung / Management Office']

In [7]:
cases_partner = cases[cases.str.contains("partner", case=False, na=False)]
cases_partner.head(50).to_list()

['Solution Architect | Managing Partner',
 'Recruiting Specialist & Managing Partner']

In [8]:
top_pair = err.groupby(["label","pred"]).size().sort_values(ascending=False).index[0]
true_lab, pred_lab = top_pair

sample = err[(err["label"]==true_lab) & (err["pred"]==pred_lab)]["text"].head(80).to_list()
true_lab, pred_lab, sample[:20]

('Senior',
 'Lead',
 ['Lead Sales Management',
  'Bereichsleitung Key Account Management CRM',
  'Growth LEAD Manager',
  'Teamleiter Außendienst / Key Account Manager D-A-CH',
  'Gruppenleiter Project Management Office & Proce...',
  'Head of Visitor Management',
  'CRM Lead Solution Consultant - Salesforce@TietoEVRY',
  'Head Business Application Management',
  'Prokurist / Senior Projektleiter',
  'Marketingleiterin / Marketing Manager',
  'Sales Manager / Vertriebsleiter',
  'Head of Key Account Management Hotel & Gastronomy',
  'Leiter Akquisition & Key Account Management Modernisation',
  'Senior Integration Engineer (Teamleiter)',
  'CRM & Lead Manager',
  'Marketingleitung /Marketing Manager',
  'Head of Area Sales Management',
  'Deputy Head of CRM / Senior Credit Officer (NPL...',
  'Head of Managed Hosting',
  'Leiter Vertrieb Managed Services / IT-Strategie...'])