<a href="https://colab.research.google.com/github/jackykwok2002/Resume_model_train/blob/main/resume_model_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd # load and manipulate data and for One-Hot Encoding
import numpy as np # calculate the mean and standard deviation
import re 
import xgboost as xgb # XGBoost stuff
from sklearn.model_selection import train_test_split # split  data into training and testing sets
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, make_scorer # for scoring during cross validation
from sklearn.model_selection import GridSearchCV # cross validation
from sklearn.metrics import confusion_matrix # creates a confusion matrix
from sklearn.metrics import plot_confusion_matrix # draws a confusion matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder

In [2]:
df = pd.read_csv("sdfz_resume.csv")
df = df.drop(['原始文本','姓名','投递时间','电话','邮箱','岗位','工作经历'], axis=1) #temporarily drop 工作经历
df = df.drop_duplicates() 

In [3]:
global locations
filter_words = ['深圳','上海','厦门','北京','南山区', '（实习）','nan', '(实习)', '—', '冯德宇', '李涵', '廖舸凯', '张育霖', ' ', '张锴旺', '实习生', '实习']

replace_words = {
    'ai研究员/ai工程师':'ai工程师',
    'hr（招聘方向）':'hr',
    'ui':'ui设计师',
    '前端开发':'前端工程师',
    '前端开发工程师':'前端工程师',
    '后台工程师':'后台开发',
    '后台开发工程师':'后台开发',
    '后端':'后台开发',
    '爬虫后端':'爬虫后端工程师',
    '市场营销bd':'市场营销',
    '高级后台开发工程师':'高级后台工程师'
}

# 将应聘职位过滤
def filterJobs(x):
  s = str(x["应聘职位"]).lower()
  if s == '':
    return None
  s = s.replace('-','').replace('（','(').replace('）',')')
  for word in filter_words:
     if word in s:
       s = s.replace(word, '')
  if s in replace_words:
    s = replace_words[s]
  return s.strip()

df["应聘职位"] = df.apply(filterJobs, axis=1)
df["应聘职位"].unique()

array(['数据分析', '算法工程师(运筹优化方向)', '人工智能算法研究员', '后台开发', '高级后台工程师', '前端工程师',
       '项目经理', '算法工程师', '解决方案工程师', '销售经理', '大数据/ai架构工程师',
       '行业解决方案高级工程师(后台方向)', 'hr(招聘方向)', '产品经理', '微服务架构师',
       '行业解决方案工程师(后台方向)', '自然语言处理算法工程师', '人事行政专员', '搜索算法工程师',
       '行业解决方案工程师(运筹优化方向算法)', '智能物流解决方案业务经理', '行业解决方案工程师(机器学习方向)',
       '推荐算法工程师', '计算机视觉算法工程师', 'ceo助理', '爬虫后端工程师', '算法工程师(仿真平台方向)',
       '算法工程师(决策方向)', '机器学习算法工程师', '行业解决方案工程师(强化学习方向)', '运筹优化产品经理',
       '售前工程师', '系统架构师', '强化学习算法工程师', '知识图谱算法工程师', 'ui设计师',
       '金融/工业/零售行业高级咨询专家', '运维工程师', '数据中台工程师', '市场营销', '音频算法工程师',
       '大数据后台开发工程师', '用户画像工程师', '测试工程师', '大数据研发工程师', '搜索推荐高级研发工程师',
       '商务助理', '后端开发工程师(搜索推荐方向)', 'ai工程师', '大数据工程师(搜索推荐方向)', 'python工程师',
       '', '会计', '大客户经理', 'hrbp', 'hr', '视觉/运营设计师', '商务总监助理',
       '行业研究(解决方案产品助理)', 'ai解决方案工程师', '视觉设计师', '视觉设计师(三维方向)'],
      dtype=object)

In [4]:
# 工作年限转换为Integer
def toNumber(x):
  if x["工作年限"] is None:
    return 

  s = str(x["工作年限"])

  if '毕业' in s:
    return 0
  elif '1年及以内' in s or '1年以内' in s:
    return 0.5
  else:
    import re
    return re.sub("[^0-9]", "", s)

df["工作年限"] = df.apply(toNumber, axis=1)
df['工作年限'] = pd.to_numeric(df['工作年限'])
df['工作年限'].unique()

# 将学历过滤
def filterEducation(x):
  if '深圳' in str(x["学历"]):
    return None 
  else:
    return x["学历"]

df["学历"] = df.apply(filterEducation, axis=1)

# 性别过滤
def filterSex(x):
  if '男' in str(x["性别"]) or '女' in str(x["性别"]):
    return x["性别"]
  else:
    return None

df["性别"] = df.apply(filterSex, axis=1)


In [5]:
# 毕业时间
def filterGraduation(x):
  s = str(x["毕业时间"])
  if '-' in s:
    s = s.split('-')[0]
  if '/' in s:
    s = s.split('/')[2]
  s = s.replace("届", "").replace('级别','').replace('级','').replace('15659420518','0').replace('nan','0')
  return s

df["毕业时间"] = df.apply(filterGraduation, axis=1)
df['毕业时间'] = pd.to_numeric(df['毕业时间'])

In [6]:
# Use number to represent schools
# 学校转换为数字

global school_lst
school_lst = list(df['毕业院校'].unique().astype(str).copy())
school_lst.sort()

df['毕业院校'] = df['毕业院校'].apply(lambda x: school_lst.index(str(x)))


In [7]:
# 处理“技能”
# 转换为list
def skillsToList(x):
  if str(x["技能"]) == 'nan':
    return []
  return str(x["技能"]).split(' | ')
df["技能"] = df.apply(skillsToList, axis=1)

# One Hot Encoded #source:https://stackoverflow.com/questions/45312377/how-to-one-hot-encode-from-a-pandas-column-containing-a-list
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
df = df.join(pd.DataFrame(mlb.fit_transform(df.pop('技能')),
                          columns=mlb.classes_,
                          index=df.index))

In [8]:
# Split the Data into Dependent and Independent Variables
X = df.drop('应聘职位', axis=1).copy() 
y = df['应聘职位'].copy().astype(str)

In [9]:
# One Hot Encode the rest
X_encoded = pd.get_dummies(X, columns=['专业', 
                                       '学历', 
                                       '工作地点', 
                                       '性别',
                                       '类别',
                                       '应聘渠道'])

In [10]:
le = LabelEncoder()
y = le.fit_transform(y)

In [11]:
# Spliting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size = 0.2, random_state = 1002)

In [13]:
clf_xgb = xgb.XGBClassifier(objective='multi:softmax',
                            eval_metric="logloss", ## this avoids a warning...
                            seed=42, 
                            use_label_encoder=False)
clf_xgb.fit(X_train, 
            y_train,
            verbose=True,
            early_stopping_rounds=10,
            eval_metric='mlogloss',
            eval_set=[(X_test, y_test)])

[0]	validation_0-mlogloss:3.56771
Will train until validation_0-mlogloss hasn't improved in 10 rounds.
[1]	validation_0-mlogloss:3.32722
[2]	validation_0-mlogloss:3.16851
[3]	validation_0-mlogloss:3.04333
[4]	validation_0-mlogloss:2.94184
[5]	validation_0-mlogloss:2.85508
[6]	validation_0-mlogloss:2.77776
[7]	validation_0-mlogloss:2.7083
[8]	validation_0-mlogloss:2.64838
[9]	validation_0-mlogloss:2.59392
[10]	validation_0-mlogloss:2.54622
[11]	validation_0-mlogloss:2.50197
[12]	validation_0-mlogloss:2.46135
[13]	validation_0-mlogloss:2.4238
[14]	validation_0-mlogloss:2.38937
[15]	validation_0-mlogloss:2.35832
[16]	validation_0-mlogloss:2.32971
[17]	validation_0-mlogloss:2.30185
[18]	validation_0-mlogloss:2.27799
[19]	validation_0-mlogloss:2.2535
[20]	validation_0-mlogloss:2.23241
[21]	validation_0-mlogloss:2.21134
[22]	validation_0-mlogloss:2.19151
[23]	validation_0-mlogloss:2.17373
[24]	validation_0-mlogloss:2.15676
[25]	validation_0-mlogloss:2.14039
[26]	validation_0-mlogloss:2.12506

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=42,
              silent=None, subsample=1, use_label_encoder=False, verbosity=1)

In [15]:
# save the model to disk
import pickle
filename = 'finalized_model.sav'
pickle.dump(clf_xgb, open(filename, 'wb'))

In [19]:
result = clf_xgb.score(X_test, y_test)
print(result)

0.46883468834688347
