In [38]:
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import SelectKBest, chi2, SelectPercentile
from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report

# Read the job postings spreadsheet (OpenDocument format), loading every
# column as strings.
DATASET_PATH = "job_dataset.ods"
data = pd.read_excel(DATASET_PATH, engine="odf", dtype="str")

In [39]:
# Discard every row that has a missing value in any column.
data = data.dropna(axis="index", how="any")

In [40]:
def filter_location(location):
    """Reduce a comma-separated location string to its second component.

    Args:
        location: Location text, e.g. ``"Berlin, BE"``.

    Returns:
        The second comma-separated component with surrounding whitespace
        removed (``"BE"`` for the example above). If ``location`` contains
        no comma it is returned unchanged.
    """
    parts = location.split(",")
    if len(parts) > 1:
        # strip() rather than dropping the first character: the separator is
        # not guaranteed to be exactly ", " ("Berlin,BE" must not become "E"),
        # and stray whitespace would otherwise create duplicate categories.
        return parts[1].strip()
    return location
    
# Replace each location with the value returned by filter_location.
cleaned_locations = data['location'].apply(filter_location)
data['location'] = cleaned_locations

In [41]:
# Label column; every remaining column is used as an input feature.
target = 'career_level'
# print(data[target].value_counts())

y = data[target]
x = data.drop(columns=target)

# 80/20 train/test split with a fixed seed, stratified on the label.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
x_train, x_test, y_train, y_test = train_test_split(x, y, **split_options)

In [42]:
# print(x_train.shape, x_test.shape)

# print(y_train.value_counts())
# print(y_test.value_counts())

# vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 1))
# processed_data = vectorizer.fit_transform(x_train['title'])
# print(vectorizer.vocabulary_)
# print(len(vectorizer.vocabulary_))
# print(processed_data.shape)

# encoder = OneHotEncoder()
# processed_data = encoder.fit_transform(x_train[['location']])
# print(processed_data.shape)

# unigram: 66745 features (TF-IDF assigns a score to each of these tokens: the more important a token is, the higher its score)
# unigram + bigram: 847124
# unigram + bigram + min df + max df: 4359
# Removed: unnecessary / less important tokens: 1) tokens that appear in too few documents 2) tokens that appear in too many documents
# vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), min_df=0.01, max_df=0.99)
# processed_data = vectorizer.fit_transform(x_train['description'])
# print(vectorizer.vocabulary_)
# print(len(vectorizer.vocabulary_))
# print(processed_data.shape)

# encoder = OneHotEncoder()
# processed_data = encoder.fit_transform(x_train[['function']])
# print(processed_data.shape)

# encoder = OneHotEncoder()
# processed_data = encoder.fit_transform(x_train[['industry']])
# print(processed_data.shape)

In [43]:
# One feature extractor per input column: TF-IDF for the free-text columns,
# one-hot encoding (ignoring categories unseen during fit) for the others.
column_transformers = [
    ("title", TfidfVectorizer(ngram_range=(1, 1), stop_words='english'), "title"),
    ("location", OneHotEncoder(handle_unknown='ignore'), ["location"]),
    (
        "description",
        TfidfVectorizer(ngram_range=(1, 2), min_df=0.01, max_df=0.99, stop_words='english'),
        "description",
    ),
    ("function", OneHotEncoder(handle_unknown='ignore'), ["function"]),
    ("industry", TfidfVectorizer(ngram_range=(1, 1), stop_words='english'), "industry"),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

pipeline_steps = [
    ("preprocessor", preprocessor),
    # Score every feature: the features most related to / most influential
    # on the target receive the highest scores.
    # ("feature_selector", SelectKBest(chi2, k=500)),
    ("feature_selector", SelectPercentile(chi2, percentile=5)),
    ("classifier", RandomForestClassifier(random_state=100)),
]
model = Pipeline(steps=pipeline_steps)

# processed_data = model.fit_transform(x_train, y_train)
# processed_data.shape

In [None]:
# Fit the whole pipeline on the training split.
model.fit(x_train, y_train)

# Predict labels for the held-out test split.
y_predict = model.predict(x_test)

# zero_division=0 reports 0.0 for classes whose precision is undefined (no
# predicted samples) instead of emitting a warning on every such metric;
# the reported numbers are the same as with the default setting.
print("RandomForestClassifier: \n", classification_report(y_test, y_predict, zero_division=0))

# Without min_df + max_df
#                                         precision    recall  f1-score   support

#                         bereichsleiter       0.69      0.05      0.09       192
#          director_business_unit_leader       1.00      0.29      0.44        14
#                    manager_team_leader       0.62      0.51      0.56       534
# managing_director_small_medium_company       0.00      0.00      0.00         1
#   senior_specialist_or_project_manager       0.71      0.95      0.81       868
#                             specialist       0.00      0.00      0.00         6

#                               accuracy                           0.69      1615
#                              macro avg       0.50      0.30      0.32      1615
#                           weighted avg       0.68      0.69      0.64      1615

# With min_df + max_df
#                                          precision    recall  f1-score   support

#                         bereichsleiter       0.64      0.05      0.09       192
#          director_business_unit_leader       1.00      0.21      0.35        14
#                    manager_team_leader       0.62      0.68      0.65       534
# managing_director_small_medium_company       0.00      0.00      0.00         1
#   senior_specialist_or_project_manager       0.79      0.92      0.85       868
#                             specialist       0.00      0.00      0.00         6

#                               accuracy                           0.73      1615
#                              macro avg       0.51      0.31      0.32      1615
#                           weighted avg       0.72      0.73      0.69      1615

#                                          precision    recall  f1-score   support

# With min_df + max_df + top 1000 features
#                         bereichsleiter       0.69      0.11      0.20       192
#          director_business_unit_leader       1.00      0.29      0.44        14
#                    manager_team_leader       0.63      0.72      0.67       534
# managing_director_small_medium_company       0.00      0.00      0.00         1
#   senior_specialist_or_project_manager       0.82      0.91      0.87       868
#                             specialist       0.00      0.00      0.00         6

#                               accuracy                           0.75      1615
#                              macro avg       0.52      0.34      0.36      1615
#                           weighted avg       0.74      0.75      0.71      1615

# With min_df + max_df + top 500 features
#                                          precision    recall  f1-score   support

#                         bereichsleiter       0.72      0.12      0.21       192
#          director_business_unit_leader       1.00      0.29      0.44        14
#                    manager_team_leader       0.64      0.75      0.69       534
# managing_director_small_medium_company       0.00      0.00      0.00         1
#   senior_specialist_or_project_manager       0.84      0.92      0.88       868
#                             specialist       1.00      0.17      0.29         6

#                               accuracy                           0.76      1615
#                              macro avg       0.70      0.37      0.42      1615
#                           weighted avg       0.76      0.76      0.73      1615

#                                          precision    recall  f1-score   support

# With min_df + max_df + top 5% features
#                         bereichsleiter       0.62      0.15      0.24       192
#          director_business_unit_leader       1.00      0.29      0.44        14
#                    manager_team_leader       0.65      0.76      0.70       534
# managing_director_small_medium_company       0.00      0.00      0.00         1
#   senior_specialist_or_project_manager       0.85      0.92      0.88       868
#                             specialist       1.00      0.17      0.29         6

#                               accuracy                           0.77      1615
#                              macro avg       0.69      0.38      0.42      1615
#                           weighted avg       0.76      0.77      0.74      1615


RandomForestClassifier: 
                                         precision    recall  f1-score   support

                        bereichsleiter       0.62      0.15      0.24       192
         director_business_unit_leader       1.00      0.29      0.44        14
                   manager_team_leader       0.65      0.76      0.70       534
managing_director_small_medium_company       0.00      0.00      0.00         1
  senior_specialist_or_project_manager       0.85      0.92      0.88       868
                            specialist       1.00      0.17      0.29         6

                              accuracy                           0.77      1615
                             macro avg       0.69      0.38      0.42      1615
                          weighted avg       0.76      0.77      0.74      1615



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [45]:
# 1. Try different models
# Experiment with Logistic Regression

# from sklearn.linear_model import LogisticRegression

# model.set_params(classifier=LogisticRegression(random_state=100))
# model.fit(x_train, y_train)
# y_predict = model.predict(x_test)
# print("LogisticRegression: \n", classification_report(y_test, y_predict))   

In [46]:
# Experiment with Support Vector Machine (SVM)

# from sklearn.svm import SVC

# model.set_params(classifier=SVC(random_state=100))
# model.fit(x_train, y_train)
# y_predict = model.predict(x_test)
# print("SVM: \n", classification_report(y_test, y_predict))