In [1]:
import re
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

In [2]:
def filter_location(location):
    """Reduce a location string to its trailing two-letter uppercase code.

    When ``location`` ends with a comma, one whitespace character and two
    uppercase ASCII letters (for example ``"Austin, TX"``), only those two
    letters are returned. Any other string is returned unchanged.
    """
    suffix = re.search(r"\,\s[A-Z]{2}$", location)
    if suffix is None:
        return location
    # The match is ",<ws>XX"; skip the comma and whitespace character.
    return suffix.group(0)[2:]

# Load the ODS sheet with every column read as text, then discard any row
# that has at least one missing value.
data = pd.read_excel('./final_project.ods', engine='odf', dtype=str).dropna(axis=0)


In [3]:
target = 'career_level'

# Normalise the location column with filter_location before building features.
data['location'] = data['location'].apply(filter_location)

# Features are every column except the label column.
x = data.drop(columns=target)
y = data[target]

# Stratified 80/20 split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [5]:
# Per-column feature extraction: TF-IDF for the free-text columns, one-hot
# encoding for the categorical ones. Text vectorizers are given a bare column
# name (1-D input); encoders are given a list of columns (2-D input).
preprocessor = ColumnTransformer(transformers=[
    ('title', TfidfVectorizer(stop_words='english', ngram_range=(1, 1)), 'title'),
    ('location', OneHotEncoder(handle_unknown='ignore'), ['location']),
    # Performance note: ngram_range=(1, 2) adds bigrams on top of unigrams,
    # which can produce a very large number of features for this column.
    ('description', TfidfVectorizer(stop_words='english', ngram_range=(1, 2)), 'description'),
    ('industry', TfidfVectorizer(stop_words='english', ngram_range=(1, 1)), 'industry'),
    # handle_unknown='ignore' (matching the 'location' encoder) encodes a
    # category not seen during fit as all zeros instead of raising at predict time.
    ('function', OneHotEncoder(handle_unknown='ignore'), ['function']),
])

# Full model: preprocessing followed by an SVC with default hyperparameters.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', SVC())
])

In [6]:
# Fit the whole pipeline (preprocessor, then classifier) on the training split.
model.fit(x_train, y_train)

In [7]:
# Predict labels for the held-out test split with the fitted pipeline.
y_predict = model.predict(x_test)

In [8]:
# Per-class precision / recall / F1 on the test split.
# zero_division=0 reports 0.0 for classes the model never predicts (the same
# value the default 'warn' setting produces) without emitting an
# UndefinedMetricWarning for each affected metric.
print(classification_report(y_test, y_predict, zero_division=0))

                                        precision    recall  f1-score   support

                        bereichsleiter       0.66      0.30      0.41       192
         director_business_unit_leader       0.75      0.21      0.33        14
                   manager_team_leader       0.67      0.69      0.68       534
managing_director_small_medium_company       0.00      0.00      0.00         1
  senior_specialist_or_project_manager       0.82      0.92      0.87       868
                            specialist       0.00      0.00      0.00         6

                              accuracy                           0.76      1615
                             macro avg       0.48      0.35      0.38      1615
                          weighted avg       0.75      0.76      0.74      1615



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [10]:
# Show how many rows of the cleaned dataset fall into each career_level class.
# The counts recorded below are heavily skewed: thousands of rows for the
# largest classes, single digits for the rarest.
print(data['career_level'].value_counts())

career_level
senior_specialist_or_project_manager      4337
manager_team_leader                       2672
bereichsleiter                             960
director_business_unit_leader               70
specialist                                  30
managing_director_small_medium_company       4
Name: count, dtype: int64
