In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler, SMOTEN
from sklearn.feature_selection import SelectKBest, chi2, SelectPercentile

In [None]:
# RandomOverSampler : dùng randomer sample theo phương pháp bình thương nhân bản các class thiểu số 
# SMOTEN: tính toán theo phương pháp smote dể tạo ra những sample có cùng tính chất với các class thiểu số chỉ dùng được với categorical feature
# SMOTENC: tính toán theo phương pháp smote dể tạo ra những sample có cùng tính chất với các class thiểu số chỉ dùng được với  Nominal and Continuous

In [2]:
df = pd.read_excel('final_project.ods', engine='odf', dtype= str)  #dtype= str ép kiểu cho dữ liệu từ đầu


In [3]:
def my_function(loc):
    result = re.findall("\s[A-Z]{2}$",loc)
    if len(result) >0:
        return result[0][1:]
    else:
        return loc


In [4]:
df['location'] = df['location'].apply(my_function)

In [5]:
df.head()

Unnamed: 0,title,location,description,function,industry,career_level
0,Technical Professional Lead - Process,TX,"Responsible for the study, design, and specifi...",production_manufacturing,Machinery and Industrial Facilities Engineering,senior_specialist_or_project_manager
1,Cnslt - Systems Eng- Midrange 1,WA,"Participates in design, development and implem...",information_technology_telecommunications,Financial Services,senior_specialist_or_project_manager
2,SharePoint Developers and Solution Architects,TX,We are currently in need of Developers who can...,consulting,IT Consulting,senior_specialist_or_project_manager
3,Business Information Services - Strategic Acco...,North Carolina,Experian is seeking an experienced Account Exe...,sales,"Security, Risk, Restructuring Consulting",senior_specialist_or_project_manager
4,Strategic Development Director (procurement),TX,Â Want to join a world-class global procuremen...,procurement_materials_logistics,Information Technology,bereichsleiter


In [6]:
len(df['location'].unique())

97

# check null data

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8074 entries, 0 to 8073
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   title         8074 non-null   object
 1   location      8074 non-null   object
 2   description   8074 non-null   object
 3   function      8074 non-null   object
 4   industry      8074 non-null   object
 5   career_level  8074 non-null   object
dtypes: object(6)
memory usage: 378.6+ KB


In [8]:
df.columns

Index(['title', 'location', 'description', 'function', 'industry',
       'career_level'],
      dtype='object')

# check balance data

In [9]:
df['career_level'].value_counts()

senior_specialist_or_project_manager      4338
manager_team_leader                       2672
bereichsleiter                             960
director_business_unit_leader               70
specialist                                  30
managing_director_small_medium_company       4
Name: career_level, dtype: int64

In [10]:
target = 'career_level'
x = df.drop(target, axis= 1)
y = df[target]

In [12]:
x.columns

Index(['title', 'location', 'description', 'function', 'industry'], dtype='object')

In [13]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size= 0.2, random_state= 42, stratify = y)

In [14]:
ros = SMOTEN(random_state= 0, sampling_strategy={"director_business_unit_leader":500, "specialist" : 500,"managing_director_small_medium_company": 500 }, k_neighbors = 2)

In [15]:
x_train, y_train = ros.fit_resample(x_train,y_train )

In [16]:
y_train.value_counts()

senior_specialist_or_project_manager      3470
manager_team_leader                       2138
bereichsleiter                             768
specialist                                 500
director_business_unit_leader              500
managing_director_small_medium_company     500
Name: career_level, dtype: int64

In [17]:
preprocessor = ColumnTransformer(transformers=[
    ("title", TfidfVectorizer(stop_words="english"), "title"),
    ("location", OneHotEncoder(handle_unknown="ignore"), ["location"]),
    ("description", TfidfVectorizer(stop_words="english", ngram_range=(1, 2), min_df=0.01, max_df=0.99), "description"),
    ("function", OneHotEncoder(handle_unknown="ignore"), ["function"]),
    ("industry", TfidfVectorizer(stop_words="english"), "industry")
])

In [23]:
cls = Pipeline(steps = [
    ("preprocessor", preprocessor),
    ("feature_selection", SelectPercentile(chi2, percentile= 5)),
    ("model", RandomForestClassifier())
])

In [24]:
# feature selection: giúp loại bỏ các feature ít ảnh hưởng đến cột target 
# chi2: phổ biến nhất và chỉ dùng cho non-negative numerical  cho bài toán classifier
# NOVA: dùng cho cả negative and non negative trong bài toán classifier

# đối với bài toán regression thì ta dùng 

In [25]:
cls.fit(x_train, y_train)

In [26]:
y_pred = cls.predict(x_test)

In [27]:
print(classification_report(y_test, y_pred))

                                        precision    recall  f1-score   support

                        bereichsleiter       0.59      0.22      0.32       192
         director_business_unit_leader       0.61      0.79      0.69        14
                   manager_team_leader       0.63      0.65      0.64       534
managing_director_small_medium_company       0.00      0.00      0.00         1
  senior_specialist_or_project_manager       0.80      0.89      0.84       868
                            specialist       1.00      0.33      0.50         6

                              accuracy                           0.73      1615
                             macro avg       0.60      0.48      0.50      1615
                          weighted avg       0.72      0.73      0.71      1615



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# training with original x, y

In [65]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size= 0.2, random_state= 42, stratify = y) #  stratify chia đúng tỉ lệ cho các lablel

In [66]:
cls = RandomForestClassifier()
cls.fit(x_train, y_train)

In [67]:
y_predict = cls.predict(x_test)

In [68]:
print(classification_report(y_test,y_predict))

                                        precision    recall  f1-score   support

                        bereichsleiter       0.61      0.06      0.10       192
         director_business_unit_leader       1.00      0.21      0.35        14
                   manager_team_leader       0.62      0.64      0.63       534
managing_director_small_medium_company       0.00      0.00      0.00         1
  senior_specialist_or_project_manager       0.78      0.94      0.85       868
                            specialist       0.00      0.00      0.00         6

                              accuracy                           0.73      1615
                             macro avg       0.50      0.31      0.32      1615
                          weighted avg       0.71      0.73      0.68      1615



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
