In [1]:
from konlpy.tag import Okt
import pandas as pd
import numpy as np
import csv

# Directories that each hold one CSV per category (global.csv, active.csv, ...).
# The order is significant: rows are stacked blog -> news -> keyword_daum.
KEYWORD_SOURCE_DIRS = ('dataset/blog', 'dataset/news', 'dataset/keyword_daum')


def load_keyword_frames(name):
    """Read ``<dir>/<name>.csv`` from every source directory and stack the rows.

    Uses ``pd.concat`` instead of the chained ``DataFrame.append`` calls, because
    ``DataFrame.append`` was deprecated in pandas 1.4 and removed in pandas 2.0.
    As with ``append``, each frame keeps its own row index (no re-indexing).

    Parameters
    ----------
    name : str
        File stem of the category, e.g. ``'global'``.

    Returns
    -------
    pandas.DataFrame
        Rows of the three CSV files, in ``KEYWORD_SOURCE_DIRS`` order.
    """
    return pd.concat([
        pd.read_csv('{}/{}.csv'.format(source_dir, name), encoding="utf-8")
        for source_dir in KEYWORD_SOURCE_DIRS
    ])


global_csv = load_keyword_frames('global')
active_csv = load_keyword_frames('active')
challenge_csv = load_keyword_frames('challenge')
sincerity_csv = load_keyword_frames('sincerity')
communication_csv = load_keyword_frames('communication')
patient_csv = load_keyword_frames('patient')
honesty_csv = load_keyword_frames('honesty')
responsibility_csv = load_keyword_frames('responsibility')
creative_csv = load_keyword_frames('creative')
teamwork_csv = load_keyword_frames('teamwork')

def openStopword():
    """Load the stopword list from ``dataset/stopwords/stopwords.csv``.

    The file is parsed as CSV and the first column of every non-empty row is
    collected. Blank lines (which ``csv.reader`` yields as empty rows) are
    skipped instead of raising ``IndexError``.

    Returns
    -------
    list of str
        Stopwords in file order.
    """
    # ``with`` guarantees the handle is closed; ``newline=''`` is what the csv
    # module expects so that it can handle line endings itself.
    with open('dataset/stopwords/stopwords.csv', 'r', encoding='utf-8', newline='') as f:
        return [row[0] for row in csv.reader(f) if row]

def tokenizer(raw, pos=["Noun","Verb"], stopword=openStopword()):
    """Split ``raw`` into tokens, keeping only selected parts of speech.

    The text is tagged with the module-level ``okt`` object (looked up when
    the function is called, so it must exist by then). A word is kept when it
    is longer than one character, its tag is listed in ``pos`` and it is not
    in ``stopword``.

    Note: the ``stopword`` default is evaluated once, when this ``def`` runs,
    so the stopword file is read a single time at definition.

    Parameters
    ----------
    raw : str
        Text to tokenize.
    pos : list of str
        Tags to keep.
    stopword : list of str
        Words to drop.

    Returns
    -------
    list of str
        Kept words, in the order ``okt.pos`` produced them.
    """
    tagged_words = okt.pos(
        raw,
        norm=True,   # normalization (text-cleaning step)
        stem=True    # stemming (text-cleaning step)
    )

    kept = []
    for word, tag in tagged_words:
        if len(word) <= 1:
            continue
        if tag not in pos:
            continue
        if word in stopword:
            continue
        kept.append(word)
    return kept

In [2]:
# Tagger instance from konlpy; tokenizer() reads this global when it is called.
okt = Okt()

keyword_names = ['글로벌역량', '능동', '도전', '성실', '소통', '인내심', '정직', '주인의식', '창의', '팀워크']

# Stack the ten per-category frames in one call. pd.concat replaces the chained
# DataFrame.append calls, which were removed in pandas 2.0; row order and the
# per-frame indices are the same as before.
all_data = pd.concat([
    global_csv, active_csv, challenge_csv, sincerity_csv, communication_csv,
    patient_csv, honesty_csv, responsibility_csv, creative_csv, teamwork_csv,
])

from sklearn.model_selection import train_test_split
# Fixed random_state keeps the 67/33 split reproducible across runs.
train, test = train_test_split(all_data, test_size=0.33, random_state=42)

from sklearn.feature_extraction.text import TfidfVectorizer
vectorize = TfidfVectorizer(
    ngram_range=(1,3),  # unigrams, bigrams and trigrams
    tokenizer=tokenizer,
    max_df=0.95,
    # An integer min_df must be >= 1 in scikit-learn >= 1.2 (0 is rejected by
    # parameter validation). 1 keeps every term, exactly like the old 0 did.
    min_df=1,
    sublinear_tf=True
)

In [3]:
# Fit the TF-IDF vectorizer on the training sentences and build the
# term-document matrix. Every value is passed through np.str_ first, so
# non-string cells are turned into strings before tokenizing.
tdm = vectorize.fit_transform(train["sentence"].map(np.str_))

In [4]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier

In [5]:
# LinearSVC is used below but was never imported anywhere in the notebook,
# which made this cell fail with NameError on a fresh kernel.
from sklearn.svm import LinearSVC

# TF-IDF features followed by a one-vs-rest linear SVM.
SVC_pipeline = Pipeline([
    ('tfidf', vectorize),
    ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=1)),
])

# Build the model from the sentences and labels in train. Its predictions can
# then be checked against the labels in test.
text_clf_svc = SVC_pipeline.fit(train["sentence"].apply(lambda x: np.str_(x)), train["label"])

In [6]:
from sklearn.svm import SVC

In [7]:
# TF-IDF features followed by a one-vs-rest SVC. probability=True is set so
# the classifier can produce class probabilities.
SVC_PROB_pipeline = Pipeline(steps=[
    ('tfidf', vectorize),
    ('clf', OneVsRestClassifier(estimator=SVC(probability=True), n_jobs=1)),
])

In [8]:
# Train the probability-enabled pipeline on the training sentences (each
# value passed through np.str_) and their labels.
text_clf_svc_prob = SVC_PROB_pipeline.fit(
    X=train["sentence"].map(np.str_),
    y=train["label"],
)



In [10]:
import pickle
import joblib

# Persist the fitted probability pipeline to disk.
# NOTE(review): the pipeline's vectorizer holds a reference to the notebook's
# tokenizer function, so loading this file presumably requires that function
# to be defined under the same name first; confirm before reusing elsewhere.
joblib.dump(value=text_clf_svc_prob, filename='SVC_PROB_190519.joblib')

['SVC_PROB_190519.joblib']

In [11]:
# Print the fraction of correct predictions of the LinearSVC pipeline on each split.
for split_name, split_frame in (("train", train), ("test", test)):
    split_predictions = text_clf_svc.predict(split_frame["sentence"].map(np.str_))
    print(split_name, np.mean(split_predictions == split_frame["label"]))

train 0.9958405244664076
test 0.9409866646499786


In [112]:
# Load one CSV per company; the unpacking order matches the path order below.
cj, sk, lg, hyundai, samsung = (
    pd.read_csv(csv_path, encoding="utf-8")
    for csv_path in (
        "dataset/CJ.csv",
        "dataset/SK.csv",
        "dataset/LG.csv",
        "dataset/hyundai.csv",
        "dataset/samsung.csv",
    )
)

In [113]:
# Load one CSV per job field; the unpacking order matches the path order below.
archit, IT, manage, product, sales = (
    pd.read_csv(csv_path, encoding="utf-8")
    for csv_path in (
        "dataset/architecture.csv",
        "dataset/IT.csv",
        "dataset/management.csv",
        "dataset/production.csv",
        "dataset/sales.csv",
    )
)

In [None]:
# Class probabilities for every document of each company frame.
# The frames were loaded as lowercase cj / sk / lg; the uppercase names used
# here before were never defined. The Hyundai result is stored as
# hyundai_prob (previously misspelled hyundia_prob), which is the name the
# later cells read.
cj_prob = text_clf_svc_prob.predict_proba(cj["document"])
sk_prob = text_clf_svc_prob.predict_proba(sk["document"])
lg_prob = text_clf_svc_prob.predict_proba(lg["document"])
hyundai_prob = text_clf_svc_prob.predict_proba(hyundai["document"])
samsung_prob = text_clf_svc_prob.predict_proba(samsung["document"])

In [None]:
# Class probabilities for every document of each job-field frame.
# Each result is now computed from its own frame; before, the company frames
# were scored a second time (copy-paste error), so these variables did not
# describe the job fields at all.
# NOTE(review): assumes the job-field CSVs have a "document" column like the
# company CSVs; confirm against the data files.
archit_prob = text_clf_svc_prob.predict_proba(archit["document"])
IT_prob = text_clf_svc_prob.predict_proba(IT["document"])
manage_prob = text_clf_svc_prob.predict_proba(manage["document"])
product_prob = text_clf_svc_prob.predict_proba(product["document"])
sales_prob = text_clf_svc_prob.predict_proba(sales["document"])

In [92]:
# Sanity check: print the Python type of every probability result, one per line.
for prob_result in (
    cj_prob, sk_prob, lg_prob, samsung_prob, hyundai_prob,
    archit_prob, IT_prob, manage_prob, product_prob, sales_prob,
):
    print(type(prob_result))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [109]:
# Average each company's probabilities over axis 0 and print one vector per company.
for company_prob in (cj_prob, sk_prob, lg_prob, hyundai_prob, samsung_prob):
    print(company_prob.mean(axis=0))

[2.04467562e-02 7.05937943e-01 1.11781851e-01 2.54034393e-02
 8.81388318e-02 4.55867949e-03 1.26688466e-03 3.30487493e-02
 9.13387643e-03 2.82988187e-04]
[3.52537717e-02 6.50264203e-01 1.54631750e-01 2.02076444e-02
 7.10805401e-02 7.31223318e-03 6.45861332e-06 5.79216537e-02
 3.29431109e-03 2.74346349e-05]
[3.78608867e-02 6.42035316e-01 1.35633849e-01 3.17778888e-02
 6.70169453e-02 2.54389122e-02 2.25059004e-05 4.96160935e-02
 1.03044619e-02 2.93140179e-04]
[3.78867768e-02 6.47128059e-01 1.39605425e-01 3.37944692e-02
 9.06473459e-02 1.54920968e-02 9.66145612e-06 3.09573369e-02
 3.78152244e-03 6.97307026e-04]
[3.36802721e-02 6.20427806e-01 1.70529581e-01 3.92434594e-02
 7.35320883e-02 2.21431603e-02 1.36473676e-05 3.33700692e-02
 6.55470694e-03 5.05209728e-04]


In [None]:
# Average each job field's probabilities over axis 0 and print one vector per field.
for field_prob in (archit_prob, IT_prob, manage_prob, product_prob, sales_prob):
    print(field_prob.mean(axis=0))