In [1]:
import csv
import json
import pickle

import joblib
from konlpy.tag import Okt
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

In [2]:
def openStopword(path='dataset/stopwords/stopwords.csv'):
    """Read the stopword list from a CSV file.

    Only the first column of every row is used; blank lines are skipped.

    Args:
        path: Location of the stopword CSV. Defaults to
            ``dataset/stopwords/stopwords.csv`` (relative to the working
            directory), which is the path the notebook has always used.

    Returns:
        list: The first field of each non-empty row, in file order.
    """
    # The context manager closes the handle even if parsing fails;
    # newline='' is the mode the csv module expects for its input files.
    with open(path, 'r', encoding='utf-8', newline='') as f:
        # csv.reader yields [] for a blank line, so guard before indexing.
        return [row[0] for row in csv.reader(f) if row]

In [3]:
def tokenizer(raw, pos=["Noun","Verb"], stopword=openStopword()):
    """Split ``raw`` into the tokens that should be fed to the vectorizer.

    The text is tagged with the module-level ``okt`` instance (called with
    ``norm=True`` and ``stem=True``). A token is kept only when it is longer
    than one character, its tag is listed in ``pos`` and it does not appear
    in ``stopword``.

    Note that the default ``stopword`` collection is produced by calling
    ``openStopword()`` once, when this ``def`` statement is executed.

    Args:
        raw: Text to tokenize.
        pos: Tags that are allowed through.
        stopword: Words to drop.

    Returns:
        list: The surviving tokens, in the order the tagger produced them.
    """
    tagged = okt.pos(raw, norm=True, stem=True)
    kept = []
    for token, label in tagged:
        # Checks run in the same order as the original combined condition.
        if len(token) <= 1:
            continue
        if label not in pos:
            continue
        if token in stopword:
            continue
        kept.append(token)
    return kept

In [4]:
# Shared Okt tagger instance. tokenizer() looks up this global at call time,
# so this cell must have run before tokenizer() is first invoked.
okt = Okt()

In [5]:
# Display names for the predicted classes. SVMpredict() indexes this list with
# (label - 1), so the classifier's labels are presumably 1-based and follow
# this order — TODO confirm against the notebook that trained the model.
keyword_names = ['글로벌역량', '능동', '도전', '성실', '소통', '인내심', '정직', '주인의식', '창의', '팀워크']

In [6]:
# TF-IDF vectorizer configured with the custom Okt-based tokenizer.
# No later cell shown in this notebook uses `vectorize`; predictions go through
# the estimator loaded from disk instead.
vectorize = TfidfVectorizer(
    ngram_range=(1,2),      # unigrams and bigrams
    tokenizer=tokenizer,
    max_df=0.95,            # drop terms present in more than 95% of documents
    # An integer min_df is an absolute document count. 1 keeps every term that
    # occurs in at least one document, which is all the previous value 0 could
    # ever select, and recent scikit-learn releases reject integers below 1.
    min_df=1,
    sublinear_tf=True
)

In [7]:
# Load the previously trained estimator from disk.
# SECURITY: joblib.load unpickles the file and can therefore execute arbitrary
# code — only load model files that come from a trusted source.
# NOTE(review): the saved estimator presumably references tokenizer(), which
# would have to be defined before this cell runs — confirm.
filename = 'SVMmodel_190516.joblib'
clf_from_pickle = joblib.load(filename)

In [8]:
def SVMpredict(text):
    """Return the keyword name predicted for a single text.

    The text is wrapped in a one-element list, passed to
    ``clf_from_pickle.predict`` and the first predicted label, minus one, is
    used as an index into ``keyword_names``.

    Args:
        text: The document to classify.

    Returns:
        The entry of ``keyword_names`` selected by the predicted label.
    """
    labels = clf_from_pickle.predict([text])
    position = labels[0] - 1
    return keyword_names[position]

In [9]:
def SVMdecision(text):
    """Return the classifier's decision-function output for a single text.

    Args:
        text: The document to score.

    Returns:
        The result of ``clf_from_pickle.decision_function([text])`` converted
        with ``.tolist()``.
    """
    scores = clf_from_pickle.decision_function([text])
    return scores.tolist()

In [10]:
# Companies: Samsung, Hyundai, LG, SK, CJ


def load_resumes(path):
    """Parse one UTF-8 encoded JSON file and return the decoded object.

    Args:
        path: Location of the JSON file.

    Returns:
        Whatever ``json.load`` produces for the file's contents.
    """
    # The context manager closes the handle as soon as parsing is done; the
    # earlier open(...).read() form never closed the files explicitly.
    with open(path, encoding="utf-8") as fh:
        return json.load(fh)


samsung_origin = load_resumes('dataset/resumes/educe_samsung.json')
hyundai_origin = load_resumes('dataset/resumes/educe_hyundai.json')
LG_origin = load_resumes('dataset/resumes/educe_LG.json')
SK_origin = load_resumes('dataset/resumes/educe_SK.json')
CJ_origin = load_resumes('dataset/resumes/educe_CJ.json')

In [11]:
def make_txt(origin_json, sep=""):
    """Join the ``'document'`` field of every resume into a single string.

    Args:
        origin_json: Iterable of mappings, each providing a ``'document'``
            string.
        sep: Text inserted between consecutive documents. The default, an
            empty string, reproduces the original behaviour of gluing the
            documents together directly.

    Returns:
        str: All documents concatenated in iteration order; ``""`` when
        ``origin_json`` is empty.
    """
    # str.join builds the result in one pass instead of re-copying the
    # accumulated string on every += iteration.
    return sep.join(resume['document'] for resume in origin_json)

In [12]:
# Build one concatenated text per company, in the order
# Samsung, Hyundai, LG, SK, CJ.
samsung, hyundai, LG, SK, CJ = [
    make_txt(resumes)
    for resumes in (samsung_origin, hyundai_origin, LG_origin, SK_origin, CJ_origin)
]

In [None]:
# Classify the five company texts in one batch; the result list follows the
# input order (Samsung, Hyundai, LG, SK, CJ).
predict_result = clf_from_pickle.predict([samsung,hyundai,LG,SK,CJ]).tolist()
# Presumably the labels recorded from an earlier run — TODO confirm: 2,2,8,2,2

In [None]:
# Decision-function output for the same five texts, as nested lists with one
# inner list per text (Samsung, Hyundai, LG, SK, CJ).
decision_result = clf_from_pickle.decision_function([samsung,hyundai,LG,SK,CJ]).tolist()
# Presumably the values recorded from an earlier run — TODO confirm:
#[[-1.037130696722559309e+00, -9.585070147284189268e-01, -9.974816779480838447e-01, -9.960434378430856883e-01, -1.044291144562182971e+00, -1.011416175802230422e+00, -1.082306028024373701e+00, -9.723745689274718718e-01, -1.054980302160777761e+00, -1.038985179996822383e+00],
#[-1.022905058780172016e+00, -9.729714011140895291e-01, -9.979109424116415772e-01, -9.960838890592615957e-01, -1.035563573794652914e+00, -1.006832867792128772e+00, -1.079813400441117510e+00, -9.743427615183873902e-01, -1.045494667635081809e+00, -1.025537419468688594e+00],
#[-1.016472614211637815e+00, -9.766467738352675454e-01, -1.000375379621457395e+00, -9.978555971847123951e-01, -1.038529457331845096e+00, -1.005774030426986432e+00, -1.073706419588563943e+00, -9.765262185815020146e-01, -1.042025601114519251e+00, -1.037331119062074425e+00],
#[-1.019009529590173768e+00, -9.757679547367305029e-01, -1.000952694031802315e+00, -1.001619339550971599e+00, -1.030700573938479581e+00, -1.010927717493489641e+00, -1.073901315828346359e+00, -9.765517184248780902e-01, -1.048375640102809703e+00, -1.034617815315719946e+00],
#[-1.010540533808717001e+00, -9.654996280275474074e-01, -9.992614800895766036e-01, -9.941822080072016954e-01, -1.034704905158396748e+00, -1.021293330430268931e+00, -1.068412167230169185e+00, -9.774429429272191161e-01, -1.036193310539011980e+00, -1.018506850812170628e+00]]

In [None]:
# Persist the predicted labels as text, one entry per line.
with open('predict_190516.txt', 'w') as out:
    out.writelines("%s\n" % label for label in predict_result)

In [None]:
# Persist the decision-function rows as text, one row per line.
with open('decision_190516.txt', 'w') as out:
    out.writelines("%s\n" % scores for scores in decision_result)