In [192]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [193]:
test_info = pd.read_csv('data/info_test.csv')
train_info = pd.read_csv('data/info_train.csv')

test_label = pd.read_csv('data/label_test.csv')
train_label = pd.read_csv('data/label_train.csv')

test_work = pd.read_csv('data/work_test.csv')
train_work = pd.read_csv('data/work_train.csv')

## Data Preprocessing

In [194]:
train_work.head(10)

Unnamed: 0,id,id_bh,id_management,id_office,company_type,job/role,from_date,to_date,employee_lv,address
0,1,113039360,106,TF2212F,-1,Giám đốc,20130100,20151200,7.0,Hà Nội
1,1,113039360,106,TF2212F,-1,Giám đốc,20160100,20220400,10.0,Hà Nội
2,2,116074930,102,TB16010,-1,Nhân viên lễ tân,20160600,20161200,7.0,Hà Nội
3,2,116074930,102,TB16010,-1,Nhân viên lễ tân,20170100,20170300,8.0,Hà Nội
4,2,116074930,102,,-1,,20170400,20170700,-1.0,
5,2,116074930,102,TB16010,-1,Nhân viên Sales Admin,20170800,20191200,8.0,Hà Nội
6,2,116074930,102,TB16010,-1,Nhân viên Sales Admin,20200100,20200600,9.0,Việt Nam
7,2,116074930,102,TB16010,-1,,20200700,20201200,9.0,
8,2,116074930,102,TB16010,-1,Nhân viên Sales Admin,20210100,20210100,9.0,Hà Nội
9,4,203060233,102,CB16291,-1,Kế toán,20030300,20050900,1.0,


### Rename columns

In [195]:
train_work.rename(columns = {'address': 'work_address'}, inplace = True)
test_work.rename(columns = {'address': 'work_address'}, inplace = True)

train_info.rename(columns = {'address': 'home_address'}, inplace = True)
test_info.rename(columns = {'address': 'home_address'}, inplace = True)

### Drop old data

In [196]:
train_work = train_work.drop_duplicates(subset=['id'], keep='last')
train_work.head()

Unnamed: 0,id,id_bh,id_management,id_office,company_type,job/role,from_date,to_date,employee_lv,work_address
1,1,113039360,106,TF2212F,-1,Giám đốc,20160100,20220400,10.0,Hà Nội
8,2,116074930,102,TB16010,-1,Nhân viên Sales Admin,20210100,20210100,9.0,Hà Nội
22,4,203060233,102,TB0280B,-1,Phó giám đốc,20200100,20220400,10.0,Hà Nội
31,5,131373210,105,TE0785E,-1,,20210600,20220400,9.0,
36,6,198079441,102,QW00999,-1,,20151000,20220400,8.0,


In [197]:
test_work = test_work.drop_duplicates(subset=['id'], keep='last')
test_work.head()

Unnamed: 0,id,id_bh,id_management,id_office,company_type,job/role,from_date,to_date,employee_lv,work_address
1,3,116303809,102,IC0041B,3,"Giám đốc khối, khối tài chính",20190700,20210100,59.0,Việt Nam
3,10,116301808,102,TB1378B,-1,Nhân viên,20200100,20210200,9.0,TP. Hà Nội
4,11,131644973,102,IC0108B,6,Chuyên gia,20190900,20220100,59.0,TP. Hà Nội
8,12,131264273,2706,TF0010F,-1,Công nhân bông,20210500,20220400,7.0,Bắc Ninh
25,14,113003795,100,HW01180,6,"Chuyên viên cao cấp, Phó vụ trưởng",20140100,20220400,18.0,HN


In [198]:
X_train = pd.merge(train_work, train_info, on='id_bh')
X_train.head()

Unnamed: 0,id,id_bh,id_management,id_office,company_type,job/role,from_date,to_date,employee_lv,work_address,bithYear,gender,home_address
0,1,113039360,106,TF2212F,-1,Giám đốc,20160100,20220400,10.0,Hà Nội,1971,MALE,hà Nội
1,2,116074930,102,TB16010,-1,Nhân viên Sales Admin,20210100,20210100,9.0,Hà Nội,1993,FEMALE,Thành phố Hà Nội
2,4,203060233,102,TB0280B,-1,Phó giám đốc,20200100,20220400,10.0,Hà Nội,1977,MALE,
3,5,131373210,105,TE0785E,-1,,20210600,20220400,9.0,,1996,FEMALE,
4,6,198079441,102,QW00999,-1,,20151000,20220400,8.0,,1971,MALE,


In [199]:
print(X_train["home_address"].isna().sum())

14288


In [200]:
train_info.head()

Unnamed: 0,bithYear,gender,home_address,id_bh
0,1971,MALE,hà Nội,113039360
1,1993,FEMALE,Thành phố Hà Nội,116074930
2,1977,MALE,,203060233
3,1996,FEMALE,,131373210
4,1971,MALE,,198079441


In [201]:
train_work.head()

Unnamed: 0,id,id_bh,id_management,id_office,company_type,job/role,from_date,to_date,employee_lv,work_address
1,1,113039360,106,TF2212F,-1,Giám đốc,20160100,20220400,10.0,Hà Nội
8,2,116074930,102,TB16010,-1,Nhân viên Sales Admin,20210100,20210100,9.0,Hà Nội
22,4,203060233,102,TB0280B,-1,Phó giám đốc,20200100,20220400,10.0,Hà Nội
31,5,131373210,105,TE0785E,-1,,20210600,20220400,9.0,
36,6,198079441,102,QW00999,-1,,20151000,20220400,8.0,


In [202]:
X_test = pd.merge(test_work, test_info, on='id_bh')
X_test.head()

Unnamed: 0,id,id_bh,id_management,id_office,company_type,job/role,from_date,to_date,employee_lv,work_address,bithYear,gender,home_address
0,3,116303809,102,IC0041B,3,"Giám đốc khối, khối tài chính",20190700,20210100,59.0,Việt Nam,1975,FEMALE,số 16 Phan Chu Trinh
1,10,116301808,102,TB1378B,-1,Nhân viên,20200100,20210200,9.0,TP. Hà Nội,1971,MALE,Nam Định
2,11,131644973,102,IC0108B,6,Chuyên gia,20190900,20220100,59.0,TP. Hà Nội,1989,MALE,
3,12,131264273,2706,TF0010F,-1,Công nhân bông,20210500,20220400,7.0,Bắc Ninh,1983,FEMALE,
4,14,113003795,100,HW01180,6,"Chuyên viên cao cấp, Phó vụ trưởng",20140100,20220400,18.0,HN,1962,MALE,Hà Nội


In [203]:
y_train = pd.merge(X_train, train_label, on='id_bh')['label'].astype('category')
y_train.head()

0    4
1    2
2    4
3    2
4    2
Name: label, dtype: category
Categories (7, int64): [1, 2, 3, 4, 5, 6, 7]

### Fill in NaN value

In [204]:
X_train["job/role"] = X_train["job/role"].replace(np.nan, "thiếu")
X_train["work_address"] = X_train["work_address"].replace(np.nan, "việt nam")
X_train["home_address"] = X_train["home_address"].replace(np.nan, "việt nam")

X_test["job/role"] = X_test["job/role"].replace(np.nan, "thiếu")
X_test["work_address"] = X_test["work_address"].replace(np.nan, "việt nam")
X_test["home_address"] = X_test["home_address"].replace(np.nan, "việt nam")

In [205]:
X_train["home_address"].value_counts()

việt nam              14288
Vĩnh Phúc              2984
Bắc Giang              1664
Nghệ An                1246
Hà Nội                 1082
                      ...  
Khối 8                    1
TP Lạng Sơn               1
Bắc Ninh_409416           1
Thành phố Lạng Sơn        1
xã Văn Tiến               1
Name: home_address, Length: 924, dtype: int64

### Delete Vietnamese accent

In [206]:
import re

In [207]:
def delete_vn_accent(s):
    s = re.sub('[ÁÀẢÃẠĂẮẰẲẴẶÂẤẦẨẪẬ]', 'A', s)
    s = re.sub('[áàảãạăắằẳẵặâấầẩẫậ]', 'a', s)
    s = re.sub('Đ', 'D', s)
    s = re.sub('đ', 'd', s)
    s = re.sub('[ÉÈẺẼẸÊẾỀỂỄỆ]', 'E', s)
    s = re.sub('[éèẻẽẹêếềểễệ]', 'e', s)
    s = re.sub('[ÍÌỈĨỊ]', 'I', s)
    s = re.sub('[íìỉĩị]', 'i', s)
    s = re.sub('[óòỏõọôốồổỗộơớờởỡợ]', 'o', s)
    s = re.sub('[ÓÒỎÕỌÔỐỒỔỖỘƠỚỜỞỠỢ]', 'O', s)
    s = re.sub('[ÚÙỦŨỤƯỨỪỬỮỰ]', 'U', s)
    s = re.sub('[úùủũụưứừửữự]', 'u', s)
    s = re.sub('[ÝỲỶỸỴ]', 'Y', s)
    s = re.sub('[ýỳỷỹỵ]', 'y', s)
    return s

In [208]:
X_train['home_address'] = X_train['home_address'].apply(delete_vn_accent)
X_train['work_address'] = X_train['work_address'].apply(delete_vn_accent)
X_train['job/role'] = X_train['job/role'].apply(delete_vn_accent) 

X_test['home_address'] = X_test['home_address'].apply(delete_vn_accent)
X_test['work_address'] = X_test['work_address'].apply(delete_vn_accent)
X_test['job/role'] = X_test['job/role'].apply(delete_vn_accent) 

In [209]:
X_train.head()

Unnamed: 0,id,id_bh,id_management,id_office,company_type,job/role,from_date,to_date,employee_lv,work_address,bithYear,gender,home_address
0,1,113039360,106,TF2212F,-1,Giam doc,20160100,20220400,10.0,Ha Noi,1971,MALE,ha Noi
1,2,116074930,102,TB16010,-1,Nhan vien Sales Admin,20210100,20210100,9.0,Ha Noi,1993,FEMALE,Thanh pho Ha Noi
2,4,203060233,102,TB0280B,-1,Pho giam doc,20200100,20220400,10.0,Ha Noi,1977,MALE,viet nam
3,5,131373210,105,TE0785E,-1,thieu,20210600,20220400,9.0,viet nam,1996,FEMALE,viet nam
4,6,198079441,102,QW00999,-1,thieu,20151000,20220400,8.0,viet nam,1971,MALE,viet nam


In [210]:
X_test.head()

Unnamed: 0,id,id_bh,id_management,id_office,company_type,job/role,from_date,to_date,employee_lv,work_address,bithYear,gender,home_address
0,3,116303809,102,IC0041B,3,"Giam doc khoi, khoi tai chinh",20190700,20210100,59.0,Viet Nam,1975,FEMALE,so 16 Phan Chu Trinh
1,10,116301808,102,TB1378B,-1,Nhan vien,20200100,20210200,9.0,TP. Ha Noi,1971,MALE,Nam Dinh
2,11,131644973,102,IC0108B,6,Chuyen gia,20190900,20220100,59.0,TP. Ha Noi,1989,MALE,viet nam
3,12,131264273,2706,TF0010F,-1,Cong nhan bong,20210500,20220400,7.0,Bac Ninh,1983,FEMALE,viet nam
4,14,113003795,100,HW01180,6,"Chuyen vien cao cap, Pho vu truong",20140100,20220400,18.0,HN,1962,MALE,Ha Noi


### Logisticalize Gender

In [211]:
X_train['gender'].replace('FEMALE', 0 ,inplace=True)
X_train['gender'].replace('MALE', 1, inplace=True)

X_test['gender'].replace('FEMALE', 0 ,inplace=True)
X_test['gender'].replace('MALE', 1, inplace=True)

X_train['gender'] = X_train['gender'].astype('category')
X_test['gender'] = X_test['gender'].astype('category')

In [212]:
X_train['gender'].dtype

CategoricalDtype(categories=[0, 1], ordered=False)

### Synchronize address to 1 form

#### Change assdress's data type

In [213]:
X_train.info()
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27502 entries, 0 to 27501
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   id             27502 non-null  int64   
 1   id_bh          27502 non-null  int64   
 2   id_management  27502 non-null  int64   
 3   id_office      27502 non-null  object  
 4   company_type   27502 non-null  int64   
 5   job/role       27502 non-null  object  
 6   from_date      27502 non-null  int64   
 7   to_date        27502 non-null  int64   
 8   employee_lv    27502 non-null  float64 
 9   work_address   27502 non-null  object  
 10  bithYear       27502 non-null  int64   
 11  gender         27502 non-null  category
 12  home_address   27502 non-null  object  
dtypes: category(1), float64(1), int64(7), object(4)
memory usage: 2.8+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 18134 entries, 0 to 18133
Data columns (total 13 columns):
 #   Column         Non-Null Cou

#### Delete special symbols


In [214]:
def delete_special_symbol(s):
    s = re.sub('[!@#$%^&*/*-+=_.,~`\"\']', ' ', s)
    return s

In [215]:
X_train['home_address'] = X_train['home_address'].apply(delete_special_symbol)
X_train['work_address'] = X_train['work_address'].apply(delete_special_symbol)
X_train["job/role"] = X_train["job/role"].apply(delete_special_symbol)

X_test['home_address'] = X_test['home_address'].apply(delete_special_symbol)
X_test['work_address'] = X_test['work_address'].apply(delete_special_symbol)
X_test["job/role"] = X_test["job/role"].apply(delete_special_symbol)

#### Lowercase Remove redundant white space

In [216]:
X_train['home_address'] = X_train['home_address'].str.lower()
X_test['home_address'] = X_test['home_address'].str.lower()

X_train['home_address'] = X_train['home_address'].replace(r'\s+', ' ', regex=True)
X_test['home_address'] = X_test['home_address'].replace(r'\s+', ' ', regex=True)

In [217]:
X_train['work_address'] = X_train['work_address'].str.lower()
X_test['work_address'] = X_test['work_address'].str.lower()

X_train['work_address'] = X_train['work_address'].replace(r'\s+', ' ', regex=True)
X_test['work_address'] = X_test['work_address'].replace(r'\s+', ' ', regex=True)

In [218]:
X_train['job/role'] = X_train["job/role"].str.lower()
X_test["job/role"] = X_test["job/role"].str.lower()

X_train["job/role"] = X_train["job/role"].replace(r'\s+', ' ', regex=True)
X_test["job/role"] = X_test["job/role"].replace(r'\s+', ' ', regex=True)

#### Keep province's name only

In [219]:
def address_group(s):
    if type(s) == str:
        if ('ha noi' in s) or ('hn' in s) or ('ha no' in s) or ('thµnh phe hµ nei' in s) or ('ha n?i' in s) or ('ha n' in s)\
            or ('dang tien dong' in s) or ('duong k2' in s) or ('tu liem' in s) or ('kim ma' in s)\
            or ('ha dong' in s) or ('tay ho' in s) or ('hai ba' in s) or ('dong da' in s) or ('thanh xuan' in s)\
            or ('phuong mai' in s) or ('hh2abc' in s) or ('quan thanh' in s) or ('tho quan' in s)\
            or ('ct 10b' in s) or ('khu tap the 664' in s) or ('dang van ngu' in s) or ('nha d4 tap the det 8 3' in s)\
            or ('khu toi dinh cu duong 32' in s) or ('khu chung cu viet hung' in s) or ('tt138a giang vu' in s)\
            or ('dao tan' in s) or ('hoang dao thyy' in s) or ('to ngoc van' in s) or ('bach dang' in s)\
            or ('ba dinh' in s) or ('nguyen ngoc nai' in s) or ('ngo thai thinh 1' in s) or ('tan lap' in s)\
            or ('chung cu dai thanh' in s) or ('hoang van thai' in s) or ('trinh luong' in s) or ('doi cung' in s)\
            or ('thanh van' in s) or ('nghia dung' in s) or ('van ho 3' in s) or ('nghia tan' in s)\
            or ('bach khoa' in s) or ('ngai cau' in s) or ('tap the thanh cong' in s)\
            or ('thanh binh' in s) or ('tan trieu' in s) or ('vinh tuy' in s) or ('giang vo' in s)\
            or ('kham thien' in s) or ('nguyen chi thanh' in s) or ('pham tuan tai' in s) or ('nui truc' in s)\
            or ('keangnam' in s) or ('ngach 46' in s) or ('truong dinh' in s) or ('nhan chinh' in s)\
            or ('thon trung' in s) or ('ngoc hoi' in s) or ('doc tam da' in s) or ('de la thanh' in s)\
            or ('doi can' in s) or ('dong chanh' in s) or ('mandarin garden' in s)\
            or ('hoang minh giam' in s) or ('bo de' in s) or ('hoang hoa tham' in s) or ('hoai duc' in s)\
            or ('phuc xa' in s) or ('chelsea park' in s) or ('le loi' in s) or ('linh dam' in s)\
            or ('nguyen khang' in s) or ('nguyen trai' in s) or ('ngo quyen' in s) or ('me tro thuong' in s)\
            or ('nguyen xien' in s) or ('hung vuong' in s) or ('phu kieu' in s) or ('quan nhan' in s)\
            or ('chung cu 24t1' in s) or ('ngoc chi' in s) or ('phan dinh giot' in s) or ('quan tho' in s)\
            or ('yen phu' in s) or ('thuong coc' in s) or ('thai ha' in s) or ('van phuc' in s)\
            or ('an hoa' in s) or ('yen thi' in s) or ('lang ha' in s) or ('lotte center' in s)\
            or ('vong duc' in s) or ('nguyen van cu' in s) or ('huynh cung' in s) or ('dich vong' in s)\
            or ('duong co' in s) or ('van cao' in s) or ('huu thanh oai' in s) or ('yen hoa' in s)\
            or ('kim lien moi' in s) or ('h.noi' in s) or ('ba trieu' in s) or ('dong anh' in s)\
            or ('thuong din' in s) or ('hoan kiem' in s) or ('to huu' in s) or ('cau giay' in s)\
            or ('minh khai' in s) or ('kim giang' in s) or ('thanh pho ha' in s) or ('trung tu' in s)\
            or ('tran thanh tong' in s) or ('tan tay do' in s) or ('dinh cong' in s) or ('xom dinh' in s)\
            or ('ho dac di' in s) or ('giai phong' in s) or ('nguyen thai hoc' in s) or ('long bien' in s)\
            or ('huynh thuc khang' in s) or ('my dinh' in s) or ('hong ha' in s) or ('xom soi' in s)\
            or ('phuc tan' in s) or ('xuan dinh' in s) or ('tan dan' in s) or ('tran nguyen han' in s)\
            or ('phu ha' in s) or ('duong buoi' in s) or ('cau trang' in s) or ('trieu khuc' in s)\
            or ('nguyen bieu' in s) or ('hang dau' in s) or ('ao sen' in s) or ('dai linh' in s)\
            or ('dong xa' in s) or ('kim nguu' in s) or ('quang tho' in s) or ('phan chu trinh' in s)\
            or ('nguyen khanh toan' in s) or ('chien thang' in s) or ('thai thinh' in s)\
            or ('vien hoa' in s) or ('dai hoc xay dung' in s) or ('an duong' in s):
                return 'ha noi'
        elif ('bac ninh' in s) or ('an binh' in s) or ('bn' in s) or ('dong tho' in s)\
            or ('bac binh' in s) or ('bacninh' in s):
            return 'bac ninh'
        elif ('tay ninh' in s) or ('trang bang' in s):
            return 'tay ninh'
        elif ('vinh long' in s):
            return 'vinh long'
        elif ('tra vinh' in s):
            return 'tra vinh'
        elif ('an giang' in s) or ('tinh bien' in s):
            return 'an giang'
        elif ('hoa binh' in s) or ('hb' in s) or ('huu nghi' in s) or ('ngoc son' in s):
            return 'hoa binh'
        elif ('binh thuan' in s) or ('bt' in s):
            return 'binh thuan'
        elif ('kien giang' in s):
            return 'kien giang'
        elif ('phu yen' in s):
            return 'phu yen'
        elif ('cao bang' in s):
            return 'cao bang'
        elif ('dalat' in s) or ('da lat' in s) or ("le hong phong" in s)\
            or ('duc trong' in s) or ('bao loc' in s) or ('lam ha' in s)\
            or ('lam dong' in s) or ('ld' in s) or ('don duong' in s) or ('di linh' in s) or ('bao lam' in s)\
            or ('cat lam' in s) or ('phi nom' in s) or ('lam hong' in s) or ('lac duong' in s)\
            or ('tan vuong' in s) or ('nghia hiep' in s):
                return 'lam dong'
        elif ('quang binh' in s):
            return 'quang binh'
        elif ('quang ngai' in s):
            return 'quang ngai'
        elif ('thai nguyen' in s) or ('tn' in s) or ('dai tu') in s or ('dong tien' in s):
            return 'thai nguyen'
        elif ('thanh hoa' in s) or ('t.hoa' in s):
            return 'thanh hoa'
        elif ('khe sanh' in s) or ('quang tri' in s) or ('qt' in s):
            return 'quang tri'
        elif ('ca mau' in s):
            return 'ca mau'
        elif ('long an' in s):
            return 'long an'
        elif ('khanh hoa' in s) or ('tinh lo 2' in s):
            return 'khanh hoa'
        elif ('bac lieu' in s):
            return 'bac lieu'
        elif ('ha tinh' in s) or ('xuan yen' in s):
            return 'ha tinh'
        elif ('dong nai' in s) or ('ql 20' in s) or ('long khanh' in s) or ('quoc lo 20' in s)\
            or ('ql20' in s) or ('dn' in s):
            return 'dong nai'
        elif ('bg' in s) or ('bac giang' in s) or ('bac gian' in s) or ('b.giang' in s) or ('bac giag' in s)\
            or ('yen the' in s) or ('lang giang' in s) or ('hiep hoa' in s) or ('son dong' in s)\
            or ('luc nam' in s) or ('tan yen' in s) or ('viet yen' in s) or ('xom gia' in s)\
            or ('bacgiang' in s) or ('dinh ke' in s) or ('yen dung' in s):
                return 'bac giang'
        elif ('nghe an' in s) or ('nghee an' in s) or ('anh son' in s) or ('thai hoa' in s) or ('nghi loc' in s)\
            or ('quynh luu' in s) or ('q.luu' in s) or('dien chau' in s) or ('do luong' in s) or ('tuong duong' in s)\
            or ('hien son' in s) or ('con cuong' in s) or ('qluu' in s) or ('na' in s) or ('yen thanh' in s)\
            or ('nge an' in s) or ('n an' in s) or ('nghe a' in s) or ('nghia dan' in s):
            return 'nghe an' 
        elif ('ninh binh' in s):
            return 'ninh binh'
        elif ('daklak' in s) or ('dak lak' in s) or ('dakk lak' in s) or ('lak' in s) or ('dac lac' in s)\
            or ('dak lac' in s) or ('krong pac' in s) or ('krongpac' in s) or ('bmt' in s) or ('krong pak' in s)\
            or ('cu kuin' in s) or ('buon ma thuot' in s) or ('krongpak' in s) or ('krong bong' in s):
                return 'dak lak'
        elif ('lang son' in s) or ("na sam" in s) or ('van lang' in s) or ('huu lung' in s)\
            or ('chi lang' in s) or ('hoang viet' in s) or ('ls' in s) or ('lson' in s):
            return 'lang son'
        elif ('vinh phuc' in s) or ('vinh phu' in s) or ('vinhphu' in s) or ('vp' in s) or ('v inh phuc' in s)\
            or ('van tien' in s) or ('thon dam noi' in s) or ('tam dao' in s) or ('luu quang' in s)\
            or ('dong tam' in s) or ('vinh phyc' in s) or ('tam duong' in s) or ('phu nong' in s)\
            or ('kha do' in s):
            return 'vinh phuc'
        elif ('lai chau' in s):
            return 'lai chau'
        elif ("phu tho" in s) or ('que trao' in s) or ('phy tho' in s):
            return 'phu tho'
        elif ('tien giang' in s):
            return 'tien giang'
        elif ('quang ninh' in s) or ('qn' in s) or ('bai chay' in s) or ('ha long' in s)\
            or ('quang nhinh' in s):
            return 'quang ninh'
        elif ('quang nam' in s) or ('dien hong' in s):
            return 'quang nam'
        elif ("binh phuoc" in s) or ('bp' in s) or ('bonh phuoc' in s) or ('binh phu' in s)\
            or ('long thuy' in s) or ('b×nh ph­ic' in s):
            return 'binh phuoc'
        elif ('bac kan' in s) or ('bac can' in s):
            return 'bac kan'
        elif ("hai phong" in s) or ('khu mieu' in s) or ('phuong khe' in s):
            return 'hai phong'
        elif ('dak nong' in s) or ('daknong' in s) or ('dacknong' in s) or ('dak mol' in s) or ('dac nong' in s)\
            or ('dak song' in s):
            return 'dak nong'
        elif ('nam dinh' in s) or ('y yen' in s) or ('duong dien bion' in s):
            return 'nam dinh'
        elif ('hung yen' in s) or ('lac long quan' in s) or ('hung yon' in s) or ('h­ng yªn' in s)\
            or ('phu thinh' in s):
            return 'hung yen'
        elif ('hue' in s) or ('phu le' in s):
            return 'hue'
        elif ('dong thap' in s) or ('dt' in s):
            return 'dong thap'
        elif ('thai binh' in s) or ('dong long' in s) or ('bich du' in s) or ('tb' in s) or ('ly bon' in s):
            return 'thai binh'
        elif ('brvt' in s) or ('vung tau' in s) or ('ba ria' in s):
            return 'ba ria vung tau'
        elif ('binh duong' in s) or ('bd' in s) or ('phuoc hoa' in s):
            return 'binh duong'
        elif ('yen bai' in s):
            return 'yen bai'
        elif ('son la' in s):
            return 'son la'
        elif ('tuyen quang' in s) or ('dong quy' in s):
            return 'tuyen quang'
        elif ('hai duong' in s) or ('an nhan tay' in s) or ('buom' in s):
            return 'hai duong'
        elif ('ninh thuan' in s):
            return 'ninh thuan'
        elif ('lao cai' in s):
            return 'lao cai'
        elif ('binh dinh' in s):
            return 'binh dinh'
        elif ('kon tum' in s) or ('kt' in s):
            return 'kon tum'
        elif ('hcm' in s) or ('ho chi minh' in s) or ('dl nguyen van linh' in s) or ('khu 7a' in s)\
            or ('q.12' in s) or ("quan 9" in s) or ('long phuoc' in s) or ('phan van hon' in s)\
            or ('q.5' in s) or ('ap 1b' in s) or ('q2' in s) or ('phu nhuan' in s)\
            or ('tan thuan dong' in s) or ('hiep thanh' in s):
                return 'ho chi minh'
        else:
            return s

In [220]:
X_train["home_address"] = X_train["home_address"].apply(address_group)
X_test["home_address"] = X_test["home_address"].apply(address_group)

X_train["work_address"] = X_train["work_address"].apply(address_group)
X_test["work_address"] = X_test["work_address"].apply(address_group)

In [221]:
X_train["home_address"].value_counts()
print(X_train["home_address"].isna().sum())

0


In [222]:
X_test["home_address"].value_counts()
print(X_test["home_address"].isna().sum())

0


In [223]:
X_train["home_address"].value_counts()

nghe an                   15983
vinh phuc                  3299
bac giang                  2214
ha noi                     1694
thai nguyen                 952
lam dong                    756
lang son                    386
dak lak                     317
quang ninh                  292
phu tho                     290
hoa binh                    210
binh phuoc                  160
dak nong                    152
thai binh                    62
thanh hoa                    61
bac ninh                     59
hai duong                    32
ho chi minh                  31
yen bai                      30
ha tay                       29
ha tinh                      25
hung yen                     25
tuyen quang                  23
dong nai                     20
hai phong                    17
ninh binh                    16
bac kan                      10
hue                           9
cao bang                      9
binh duong                    8
ha bac                        8
khanh ho

In [224]:
X_test["home_address"].value_counts()

nghe an                                       10567
vinh phuc                                      2139
bac giang                                      1412
ha noi                                         1115
thai nguyen                                     623
lam dong                                        488
lang son                                        267
dak lak                                         214
quang ninh                                      213
phu tho                                         186
hoa binh                                        155
binh phuoc                                      129
dak nong                                        100
thanh hoa                                        46
bac ninh                                         36
thai binh                                        27
hai duong                                        22
ha tay                                           19
ho chi minh                                      19
ha tinh     

### Job

#### Change job's data type

In [225]:
X_train["job/role"] = X_train["job/role"].astype('string')
X_test["job/role"] = X_test["job/role"].astype('string')

In [226]:
def job_group(x):
    if type(x) == str:
        if "cong nhan vien" in x or "nhan vien" in x or "giao dich vien" in x or "van thu" in x \
            or "thu ky" in x or "tro ly" in x or "nv" in x or "kiem soat vien" in x  or 'kiem tra vien' in x\
            or "van phong" in x or "xa vien" in x or "kiem thu vien" in x or "vien chuc" in x \
            or "kiem lam vien" in x or "chap hanh vien" in x or "hanh chinh" in x or "kiem tra chat luong" in x \
            or "thu vien" in x or "quan trac vien" in x or "phu trach quan he nguoi tieu dung" in x:
                return "nhan vien"
        elif "cong nhan" in x or "may cong nghiep" in x or "san pham" in x or "cn" in x or "khai thac" in x or "tho" in x \
            or "phet keo de va mu giay" in x or "ve sinh may chai ,day truyen soi ,det ,nhuom" in x or "c.n" in x \
            or "son, in da va pha che hoa chat de son, in da" in x or "sua chua" in x or "kt 3d" in x \
            or "cung nhon lap rop mach dien tu " in x or "luu hoa cac san pham cao su" in x or "qc" in x \
            or "thuy thu" in x or "cat vai trong cong nghe may" in x or "son, in da va pha che hoa chat de son, in da" in x \
            or "phay" in x or "phet keo mu giay" in x:
                return "cong nhan"
        elif "can bo" in x or "can su" in x:
            return "can bo/ can su"
        elif "chuyen vien" in x or "chuyen gia" in x:
            return "chuyen vien"
        elif "chu tich" in x:
            return "chu tich"
        elif "pho chu tich" in x:
            return "pho chu tich"
        elif "giam doc" in x or "tong giam doc" in x:
            return "giam doc"
        elif "pho giam doc" in x or "pho tong giam doc" in x or "pho gd" in x \
            or "p. giam doc" in x or "p.giam doc" in x or 'phu giom doc' in x:
            return "pho giam doc"
        elif "hieu truong" in x:
            return "hieu truong"
        elif "pho hieu truong" in x or "hieu pho" in x:
            return "pho hieu truong"
        elif "to truong" in x or "truong phong" in x or "quan ly" in x or "chuyen truong " in x or "doi truong" in x or 'doi truuong' in x\
            or "chu nhiem" in x or "quan ly" in x or "giam sat" in x or "nhom truong" in x or "truong ca" in x \
            or "truong nhom" in x or "quan doc" in x or "cua hang truong" in x or "thuyen truong" in x \
            or "kiem soat truong" in x or "doc cong" in x or "chu quan" in x or "tram truong" in x or "ca truong" in x:
                return "quan ly truong"
        elif "to pho " in x or "pho truong phong" in x or "pho phong" in x or "pho chanh" in x or "pho chu nhiem" in x \
            or "pho quan doc" in x or "doi pho" in x or "pho chi cuc truong" in x:
                return "quan ly pho"
        elif "bi thu" in x or "uy vien ban thuong vu" in x or "uy vien uy ban kiem tra" in x or "thuong truc dang uy" in x:
            return "bi thu"
        elif "pho bi thu" in x or 'pho bt' in x:
            return "pho bi thu"
        elif "chi huy truong quan su " in x or "truong cong an" in x:
            return "truong quan su"
        elif 'pho chi huy' in x:
            return 'phi chi huy'
        elif "bo doi" in x or "thanh tra vien" in x or "chien sy" in x:
            return "quan su"
        elif "ky su" in x or "ky thuat" in x or "kien truc su" in x or "lap trinh vien" in x or "ky thuat" in x:
            return "ky su"
        elif "giao vien" in x or "giang vien" in x:
            return "giao vien"
        elif "lai xe" in x or "tai xe " in x or "lai" in x or "lx" in x or "phu xe" in x or 'van hanh xe' in x:
            return "lai xe"
        elif "bao ve" in x:
            return "bao ve"
        elif "ke toan truong" in x:
            return "ke toan truong"
        elif "ke toan" in x or "kinh te vien" in x or "mau dich vien" in x or "kinh te" in x or "thong ke" in x:
            return "kinh te vien"
        elif "lao dong" in x or "ldpt" in x:
            return "lao dong"
        elif "y sy" in x or "bac sy" in x or "trinh duoc vien" in x or "duoc sy" in x:
            return "y sy"
        elif "dieu duong" in x or "ho ly" in x or "ho sinh" in x or "y ta" in x or "duoc ta" in x:
            return "dieu duong"
        elif "thu quy" in x or "thu kho" in x:
            return "thu nhan"
        elif "phong vien" in x or "bien tap vien" in x:
            return "truyen hinh"
        elif "nghien cuu vien" in x:
            return "nghien cuu vien"
        elif "ban hang" in x or "kinh doanh" in x:
            return "kinh doanh"
        elif "phien dich" in x or "phien dich tieng han" in x:
            return "phien dich"
        elif "dia chinh xay dung " in x:
            return "dia chinh xay dung"
        elif "van hoa xa hoi" in x:
            return "van hoa xa hoi"
        elif "tu phap ho tich" in x or "luat su" in x or "kiem sat vien" in x or "tu phap ho tich" in x:
            return "tu phap ho tich"
        elif "tham phan" in x:
            return "tham phan"
        elif "tap vu" in x:
            return "tap vu"
        elif "nghi thai san" in x:
            return "nghi thai san"
        elif "cap duong" in x:
            return "cap duong"
        elif "dien vien" in x:
            return "dien vien"
        elif "nau an" in x or "phu bep" in x:
            return "nau an"
        else:
            return x

In [227]:
X_train["job/role"] = X_train["job/role"].apply(job_group)
X_test["job/role"] = X_test["job/role"].apply(job_group)

In [228]:
X_train["job/role"].value_counts()

cong nhan                   9074
nhan vien                   7018
thieu                       2144
giam doc                    1515
quan ly truong              1398
                            ... 
cong chuc vptk                 1
cong chuc dc-xd                1
tuyen truyen vien              1
pho ban to chuc kiem tra       1
linh lieu                      1
Name: job/role, Length: 881, dtype: int64

In [229]:
X_test["job/role"].value_counts()

cong nhan                                     5926
nhan vien                                     4514
thieu                                         1417
giam doc                                      1056
quan ly truong                                 952
                                              ... 
truong tram y te                                 1
bac si da khoa                                   1
dia chinh-xay dung -nong nghiep-moi truong       1
giam dinh vien                                   1
truong may be 机长                                 1
Name: job/role, Length: 666, dtype: int64

In [230]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27502 entries, 0 to 27501
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   id             27502 non-null  int64   
 1   id_bh          27502 non-null  int64   
 2   id_management  27502 non-null  int64   
 3   id_office      27502 non-null  object  
 4   company_type   27502 non-null  int64   
 5   job/role       27502 non-null  object  
 6   from_date      27502 non-null  int64   
 7   to_date        27502 non-null  int64   
 8   employee_lv    27502 non-null  float64 
 9   work_address   27502 non-null  object  
 10  bithYear       27502 non-null  int64   
 11  gender         27502 non-null  category
 12  home_address   27502 non-null  object  
dtypes: category(1), float64(1), int64(7), object(4)
memory usage: 2.8+ MB


In [231]:
X_train

Unnamed: 0,id,id_bh,id_management,id_office,company_type,job/role,from_date,to_date,employee_lv,work_address,bithYear,gender,home_address
0,1,113039360,106,TF2212F,-1,giam doc,20160100,20220400,10.0,ha noi,1971,1,ha noi
1,2,116074930,102,TB16010,-1,nhan vien,20210100,20210100,9.0,ha noi,1993,0,ha noi
2,4,203060233,102,TB0280B,-1,giam doc,20200100,20220400,10.0,ha noi,1977,1,nghe an
3,5,131373210,105,TE0785E,-1,thieu,20210600,20220400,9.0,nghe an,1996,0,nghe an
4,6,198079441,102,QW00999,-1,thieu,20151000,20220400,8.0,nghe an,1971,1,nghe an
...,...,...,...,...,...,...,...,...,...,...,...,...,...
27497,55001,2615101097,2600,YN0199Z,-1,cong nhan,20210300,20220400,10.0,vinh phuc,1985,0,vinh phuc
27498,55002,2611027516,2600,YN0094Z,-1,cong nhan,20200400,20210300,11.0,vinh phuc,1985,0,nghe an
27499,55005,2612210147,2600,YN0239Z,-1,cong nhan,20190300,20210300,9.0,vinh phuc,1986,0,vinh phuc
27500,55006,7516183730,7509,YN0168I,-1,cong nhan,20200100,20220400,10.0,dong nai,1975,1,nghe an


#### Change data type

In [232]:
X_train["home_address"] = X_train["home_address"].astype('string')
X_train["work_address"] = X_train["work_address"].astype('string')

X_test["home_address"] = X_test["home_address"].astype('string')
X_test["work_address"] = X_test["work_address"].astype('string')

In [233]:
# X_train['id_office']  = X_train['id_office'].astype('string')
# X_test['id_office'] = X_test['id_office'].astype('string')

In [234]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18134 entries, 0 to 18133
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   id             18134 non-null  int64   
 1   id_bh          18134 non-null  int64   
 2   id_management  18134 non-null  int64   
 3   id_office      18134 non-null  object  
 4   company_type   18134 non-null  int64   
 5   job/role       18134 non-null  object  
 6   from_date      18134 non-null  int64   
 7   to_date        18134 non-null  int64   
 8   employee_lv    18134 non-null  float64 
 9   work_address   18134 non-null  string  
 10  bithYear       18134 non-null  int64   
 11  gender         18134 non-null  category
 12  home_address   18134 non-null  string  
dtypes: category(1), float64(1), int64(7), object(2), string(2)
memory usage: 1.8+ MB


In [235]:
X_train["company_type"] = X_train["company_type"].replace(-1, 9)
X_test["company_type"] = X_test["company_type"].replace(-1, 9)

## Feature Engineering

### Age

In [236]:
import datetime
X_train['age'] = datetime.datetime.now().year - X_train['bithYear']
X_train['age_class'] = (X_train['age'] // 10).astype('int')

X_test['age'] = datetime.datetime.now().year - X_test['bithYear']
X_test['age_class'] = (X_test['age'] // 10).astype('int')

### Address 

In [237]:
group_1 = ['ba ria vung tau', 'quang ninh', 'binh duong', 'bac ninh', 'hai phong', 'ho chi minh', 'ha noi', 'dong nai', 'vinh phuc']
group_2 = ['thai nguyen', 'da nang', 'long an', 'lao cai', 'can tho', 'ninh binh', 'khanh hoa', 'quang nam', 'lam dong']
group_3 = ['tay ninh', 'quang ngai', 'binh phuoc', 'hai duong', 'hung yen', 'ha nam', 'bac giang', 'nam dinh', 'binh thuan']
group_4 = ['ha tinh', 'binh dinh', 'hoa binh', 'kien giang', 'tien giang', 'tuyen quang', 'gia lai', 'dak nong', 'vinh long']
group_5 = ['tra vinh', 'quang tri', 'ca mau', 'bac lieu', 'thanh hoa', 'dak lak', 'hue', 'dong thap', 'phu yen']
group_6 = ['ninh thuan', 'phu tho', 'lang son', 'hau giang', 'thai binh', 'son la', 'quang binh', 'soc trang', 'kon tum']
group_7 = ['nghe an', 'an giang', 'yen bai', 'lai chau', 'ben tre', 'bac kan', 'dien bien', 'cao bang', 'ha giang']

In [238]:
len(group_1), len(group_2), len(group_3), len(group_4), len(group_5), len(group_6), len(group_7)

(9, 9, 9, 9, 9, 9, 9)

In [239]:
area_group_train = []
for i in X_train["work_address"]:
    if i == 'viet nam':
        area_group_train.append(0)
    elif i in group_1:
        area_group_train.append(1)
    elif i in group_2:
        area_group_train.append(2)
    elif i in group_3:
        area_group_train.append(3)
    elif i in group_4:
        area_group_train.append(4)
    elif i in group_5:
        area_group_train.append(5)
    elif i in group_6:
        area_group_train.append(6)   
    elif i in group_7:
        area_group_train.append(7)
    else:
        area_group_train.append(8)

X_train["work_area_group"] = area_group_train
X_train["work_area_group"] = X_train["work_area_group"].astype('category')

In [240]:
area_group_test = []
for i in X_test["work_address"]:
    if i == 'viet nam':
        area_group_test.append(0)
    elif i in group_1:
        area_group_test.append(1)
    elif i in group_2:
        area_group_test.append(2)
    elif i in group_3:
        area_group_test.append(3)
    elif i in group_4:
        area_group_test.append(4)
    elif i in group_5:
        area_group_test.append(5)
    elif i in group_6:
        area_group_test.append(6)   
    elif i in group_7:
        area_group_test.append(7)
    else:
        area_group_test.append(8)

X_test["work_area_group"] = area_group_test
X_test["work_area_group"] = X_test["work_area_group"].astype('category')

In [241]:
X_train.head()

Unnamed: 0,id,id_bh,id_management,id_office,company_type,job/role,from_date,to_date,employee_lv,work_address,bithYear,gender,home_address,age,age_class,work_area_group
0,1,113039360,106,TF2212F,9,giam doc,20160100,20220400,10.0,ha noi,1971,1,ha noi,51,5,1
1,2,116074930,102,TB16010,9,nhan vien,20210100,20210100,9.0,ha noi,1993,0,ha noi,29,2,1
2,4,203060233,102,TB0280B,9,giam doc,20200100,20220400,10.0,ha noi,1977,1,nghe an,45,4,1
3,5,131373210,105,TE0785E,9,thieu,20210600,20220400,9.0,nghe an,1996,0,nghe an,26,2,7
4,6,198079441,102,QW00999,9,thieu,20151000,20220400,8.0,nghe an,1971,1,nghe an,51,5,7


In [242]:
X_train.columns.to_list

<bound method IndexOpsMixin.tolist of Index(['id', 'id_bh', 'id_management', 'id_office', 'company_type', 'job/role',
       'from_date', 'to_date', 'employee_lv', 'work_address', 'bithYear',
       'gender', 'home_address', 'age', 'age_class', 'work_area_group'],
      dtype='object')>

In [243]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27502 entries, 0 to 27501
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   id               27502 non-null  int64   
 1   id_bh            27502 non-null  int64   
 2   id_management    27502 non-null  int64   
 3   id_office        27502 non-null  object  
 4   company_type     27502 non-null  int64   
 5   job/role         27502 non-null  object  
 6   from_date        27502 non-null  int64   
 7   to_date          27502 non-null  int64   
 8   employee_lv      27502 non-null  float64 
 9   work_address     27502 non-null  string  
 10  bithYear         27502 non-null  int64   
 11  gender           27502 non-null  category
 12  home_address     27502 non-null  string  
 13  age              27502 non-null  int64   
 14  age_class        27502 non-null  int32   
 15  work_area_group  27502 non-null  category
dtypes: category(2), float64(1), int32(1), in

### Job/Role

#### Encode to Integer

In [244]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(X_train["job/role"])

X_train["job/role"] = integer_encoded

In [245]:
integer_encoded = label_encoder.fit_transform(X_test["job/role"])

X_test["job/role"] = integer_encoded

#### Scale

In [246]:
mean_job = X_train["job/role"].mean()
min_job = X_train["job/role"].min()
max_job = X_train["job/role"].max()

scaled_job = []
for i in X_train["job/role"]:
    scaled_job.append((i - mean_job)/(max_job - min_job))

X_train["job/role"] = scaled_job
X_train.head()

Unnamed: 0,id,id_bh,id_management,id_office,company_type,job/role,from_date,to_date,employee_lv,work_address,bithYear,gender,home_address,age,age_class,work_area_group
0,1,113039360,106,TF2212F,9,-0.053339,20160100,20220400,10.0,ha noi,1971,1,ha noi,51,5,1
1,2,116074930,102,TB16010,9,0.10348,20210100,20210100,9.0,ha noi,1993,0,ha noi,29,2,1
2,4,203060233,102,TB0280B,9,-0.053339,20200100,20220400,10.0,ha noi,1977,1,nghe an,45,4,1
3,5,131373210,105,TE0785E,9,0.342116,20210600,20220400,9.0,nghe an,1996,0,nghe an,26,2,7
4,6,198079441,102,QW00999,9,0.342116,20151000,20220400,8.0,nghe an,1971,1,nghe an,51,5,7


In [247]:
mean_job = X_test["job/role"].mean()
min_job = X_test["job/role"].min()
max_job = X_test["job/role"].max()

scaled_job = []
for i in X_test["job/role"]:
    scaled_job.append((i - mean_job)/(max_job - min_job))

X_test["job/role"] = scaled_job
X_test.head()

Unnamed: 0,id,id_bh,id_management,id_office,company_type,job/role,from_date,to_date,employee_lv,work_address,bithYear,gender,home_address,age,age_class,work_area_group
0,3,116303809,102,IC0041B,3,-0.053123,20190700,20210100,59.0,nghe an,1975,0,ha noi,47,4,7
1,10,116301808,102,TB1378B,9,0.107779,20200100,20210200,9.0,ha noi,1971,1,nghe an,51,5,1
2,11,131644973,102,IC0108B,6,-0.248612,20190900,20220100,59.0,ha noi,1989,1,nghe an,33,3,1
3,12,131264273,2706,TF0010F,9,-0.186958,20210500,20220400,7.0,bac ninh,1983,0,nghe an,39,3,1
4,14,113003795,100,HW01180,6,-0.248612,20140100,20220400,18.0,ha noi,1962,1,ha noi,60,6,1


In [248]:
X_train.head()

Unnamed: 0,id,id_bh,id_management,id_office,company_type,job/role,from_date,to_date,employee_lv,work_address,bithYear,gender,home_address,age,age_class,work_area_group
0,1,113039360,106,TF2212F,9,-0.053339,20160100,20220400,10.0,ha noi,1971,1,ha noi,51,5,1
1,2,116074930,102,TB16010,9,0.10348,20210100,20210100,9.0,ha noi,1993,0,ha noi,29,2,1
2,4,203060233,102,TB0280B,9,-0.053339,20200100,20220400,10.0,ha noi,1977,1,nghe an,45,4,1
3,5,131373210,105,TE0785E,9,0.342116,20210600,20220400,9.0,nghe an,1996,0,nghe an,26,2,7
4,6,198079441,102,QW00999,9,0.342116,20151000,20220400,8.0,nghe an,1971,1,nghe an,51,5,7


In [249]:
X_test.head()

Unnamed: 0,id,id_bh,id_management,id_office,company_type,job/role,from_date,to_date,employee_lv,work_address,bithYear,gender,home_address,age,age_class,work_area_group
0,3,116303809,102,IC0041B,3,-0.053123,20190700,20210100,59.0,nghe an,1975,0,ha noi,47,4,7
1,10,116301808,102,TB1378B,9,0.107779,20200100,20210200,9.0,ha noi,1971,1,nghe an,51,5,1
2,11,131644973,102,IC0108B,6,-0.248612,20190900,20220100,59.0,ha noi,1989,1,nghe an,33,3,1
3,12,131264273,2706,TF0010F,9,-0.186958,20210500,20220400,7.0,bac ninh,1983,0,nghe an,39,3,1
4,14,113003795,100,HW01180,6,-0.248612,20140100,20220400,18.0,ha noi,1962,1,ha noi,60,6,1


### Date

In [250]:
def date_normalize(s):
    s = str(s)[0:6]
    s = pd.to_datetime(s, format="%Y%m")
    return s

In [251]:
X_train["from_date"] = X_train["from_date"].apply(date_normalize)
X_test["from_date"] = X_test["from_date"].apply(date_normalize)

X_train["to_date"] = X_train["to_date"].apply(date_normalize)
X_test["to_date"] = X_test["to_date"].apply(date_normalize)

In [252]:
X_train["months_distance"] = (X_train["to_date"] - X_train["from_date"]).astype('timedelta64[M]').astype('int')
X_test["months_distance"] = (X_test["to_date"] - X_test["from_date"]).astype('timedelta64[M]').astype('int')

### Drop unuse features

In [253]:
features = ['id_management', 'company_type', "job/role", 'months_distance', 'employee_lv', 'age_class', 'work_area_group']

In [254]:
id_bh = X_test['id_bh']
X_train = X_train[features]
X_test = X_test[features]
X_train.head()

Unnamed: 0,id_management,company_type,job/role,months_distance,employee_lv,age_class,work_area_group
0,106,9,-0.053339,74,10.0,5,1
1,102,9,0.10348,0,9.0,2,1
2,102,9,-0.053339,26,10.0,4,1
3,105,9,0.342116,9,9.0,2,7
4,102,9,0.342116,77,8.0,5,7


In [255]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27502 entries, 0 to 27501
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   id_management    27502 non-null  int64   
 1   company_type     27502 non-null  int64   
 2   job/role         27502 non-null  float64 
 3   months_distance  27502 non-null  int32   
 4   employee_lv      27502 non-null  float64 
 5   age_class        27502 non-null  int32   
 6   work_area_group  27502 non-null  category
dtypes: category(1), float64(2), int32(2), int64(2)
memory usage: 1.3 MB


## ML Model

In [256]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, train_size=0.8, test_size=0.2, random_state=0)

In [257]:
from sklearn.ensemble import RandomForestClassifier

dt_model = RandomForestClassifier(random_state=1, n_estimators=1000)

In [258]:
# Fit training data into model
dt_model.fit(X_train, y_train)

RandomForestClassifier(n_estimators=1000, random_state=1)

### Train

In [263]:
from sklearn.metrics import f1_score, classification_report, confusion_matrix, accuracy_score

y_pred = dt_model.predict(X_valid)

print(classification_report(y_valid, y_pred))
print(accuracy_score(y_valid, y_pred))

f1_score(y_valid, y_pred, average='macro')

              precision    recall  f1-score   support

           1       1.00      0.47      0.64        15
           2       0.92      0.92      0.92      2023
           3       0.90      0.88      0.89       739
           4       0.92      0.93      0.93      1576
           5       0.89      0.96      0.92       491
           6       0.91      0.87      0.89       642
           7       1.00      0.87      0.93        15

    accuracy                           0.91      5501
   macro avg       0.93      0.84      0.87      5501
weighted avg       0.91      0.91      0.91      5501

0.9147427740410834


0.8737381151943543

In [260]:
y_pred = dt_model.predict(X_test)

d = {'id_bh': id_bh, 'label': y_pred}
df = pd.DataFrame(data=d)

df.to_csv("./submission.csv", index=False)

In [261]:
df.shape

(18134, 2)

In [262]:
df.head()

Unnamed: 0,id_bh,label
0,116303809,5
1,116301808,2
2,131644973,5
3,131264273,3
4,113003795,5
