In [1]:
import json
from IPython.display import display
# display(pd.Dataframe(data))
import pandas as pd
import numpy as np
import matplotlib.font_manager as fm
import matplotlib.pyplot as plt
from math import pi
from pprint import pprint
from konlpy.tag import Okt
from collections import Counter
from sklearn.manifold import TSNE
from future.utils import iteritems
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

In [2]:
import os

def read_data(filename):
    """Read a UTF-8 text file and blank out every quotation mark.

    The curly single and double quotes, the backtick, and the ASCII single
    and double quotes are each replaced by one space; all other characters
    are returned unchanged.

    Args:
        filename: Path of the text file to read.

    Returns:
        str: The file content with the quote characters replaced by spaces.
    """
    # One translation table covers all seven quote characters in a single pass.
    quote_to_space = str.maketrans({mark: " " for mark in "‘’“”`'\""})
    with open(filename, 'r', encoding='utf-8') as source:
        return source.read().translate(quote_to_space)


# Load the raw text corpus of each keyword category (one file per category).
(
    global_data,
    active_data,
    challenge_data,
    sincerity_data,
    communication_data,
    patient_data,
    honesty_data,
    responsibility_data,
    creative_data,
    teamwork_data,
) = map(read_data, (
    'dataset/keywords/global_large.txt',
    'dataset/keywords/active_large.txt',
    'dataset/keywords/challenge_large.txt',
    'dataset/keywords/sincerity_large.txt',
    'dataset/keywords/communication_large.txt',
    'dataset/keywords/patient_large.txt',
    'dataset/keywords/honesty_large.txt',
    'dataset/keywords/responsibility_large.txt',
    'dataset/keywords/creative_large.txt',
    'dataset/keywords/teamwork_large.txt',
))

In [3]:
import csv

# Turn off pandas' chained-assignment warning and fix NumPy's global seed.
pd.set_option('mode.chained_assignment', None)
np.random.seed(0)

# KoNLPy Okt tagger instance shared by the tokenizer.
okt = Okt()

# The ten keyword corpora, in the same order as `keyword_names` below.
rawdata = [
    global_data, active_data, challenge_data, sincerity_data, communication_data,
    patient_data, honesty_data, responsibility_data, creative_data, teamwork_data,
]

# Display name of each keyword category.
keyword_names = [
    '글로벌역량', '능동', '도전', '성실', '소통',
    '인내심', '정직', '주인의식', '창의', '팀워크',
]

# Load the stop-word list: the first field of every row of the CSV file.
# The `with` block closes the file handle once reading is done, and empty
# rows are skipped because csv.reader yields [] for a blank line, on which
# `row[0]` would raise IndexError.
with open('dataset/stopwords/stopwords.csv', 'r', encoding='utf-8', newline='') as f:
    reader = csv.reader(f)
    stopwords = [row[0] for row in reader if row]

In [4]:
# Reduce raw sentence data to the tokens that are suitable for analysis.
def tokenizer(raw, pos=["Noun","Verb"], stopword=stopwords):
    """Return the tokens of `raw` that pass the tag, length and stop-word filters.

    Args:
        raw: Text to tag with the module-level `okt` tagger.
        pos: Tags to keep; a token survives only if its tag is in `pos`.
        stopword: Words to discard. The default is the `stopwords` object
            that exists at the moment this function is defined, so re-run
            this definition after rebinding `stopwords`.

    Returns:
        list of str: Tokens (after normalization and stemming by
        `okt.pos`) that are longer than one character, carry a tag in
        `pos`, and are not in `stopword`.
    """
    # A list/tuple of stop words is scanned linearly on every `in` test;
    # hashing it once per call makes each lookup constant-time. Any other
    # container type is left untouched so its own `in` semantics still apply.
    if isinstance(stopword, (list, tuple)):
        stopword = frozenset(stopword)
    return [
        word
        for word, tag in okt.pos(
            raw,
            norm=True,   # normalization step
            stem=True    # stemming step
        )
        if len(word) > 1 and tag in pos and word not in stopword
    ]

In [5]:
# TF-IDF over unigrams and bigrams of the tokens produced by `tokenizer`.
vectorize = TfidfVectorizer(
    ngram_range=(1,2),
    tokenizer=tokenizer,
    max_df=0.95,
    # An integer min_df must be >= 1 on current scikit-learn (0 is rejected by
    # parameter validation). Every vocabulary term occurs in at least one
    # document, so 1 keeps exactly the same terms that 0 kept.
    min_df=1,
    sublinear_tf=True    # apply 1+log(tf) so the tf value cannot grow without bound
)

X = vectorize.fit_transform(rawdata)

# X.shape[0] is the number of documents (10); X.shape[1] is the number of
# features (the set of characteristic terms).
print('fit_transform, (sentence {}, feature {})'.format(X.shape[0], X.shape[1]))

# Feature names extracted from the documents, kept as a plain list because
# later cells call list methods on it. `get_feature_names()` was removed in
# scikit-learn 1.2; prefer `get_feature_names_out()` when it exists and fall
# back to the old method on older installations.
if hasattr(vectorize, 'get_feature_names_out'):
    features = vectorize.get_feature_names_out().tolist()
else:
    features = vectorize.get_feature_names()

print(type(X.toarray()))
print(type(features))


fit_transform, (sentence 10, feature 1783416)
<class 'numpy.ndarray'>
<class 'list'>


In [6]:
# Wrap the dense TF-IDF matrix and the feature-name list in DataFrames for export.
vector_X, vector_X_features = pd.DataFrame(X.toarray()), pd.DataFrame(features)

In [7]:
# Write the TF-IDF matrix as bare values: no header row and no index column.
vector_X.to_csv(
    'dataset/result/vector_X_large.csv',
    index=False,
    header=False,
)

In [8]:
# Write one feature name per row. index=False keeps the row index out of the
# file, so the feature name is the FIRST field of every row: the layout this
# notebook's loader (`features.append(row[0])`) reads back, and the same
# convention as the TF-IDF matrix export.
# NOTE(review): the loader cell reads 'dataframe_features.csv', not this path.
# Confirm which file is the intended round-trip artefact.
vector_X_features.to_csv(
    'dataset/result/vector_X_features_large.csv',
    header=False,
    index=False,
)

In [5]:
# Dense TF-IDF table: one row per keyword document, one column per feature,
# with the positional row labels replaced by the keyword names.
data_array = X.toarray()
data = pd.DataFrame(data_array, columns=features).rename(
    index={0:"글로벌역량", 1:"능동", 2:"도전", 3:"성실", 4:"소통", 5:"인내심", 6:"정직", 7:"주인의식", 8:"창의", 9:"팀워크"}
)
display(data)

Unnamed: 0,가가,가가 오다,가게,가게 난로,가게 불복종,가격,가격 고객,가격 구매,가격 나볼,가격 반영,...,힘내다 항상,힘쓰다,힘쓰다 나가다,힘쓰다 달라,힘쓰다 대구시,힘쓰다 모음,힘쓰다 분당,힘쓰다 체적,힙합,힙합 대부
글로벌역량,0.0,0.0,0.0,0.0,0.0,0.007628,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
능동,0.008029,0.008029,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
도전,0.0,0.0,0.013852,0.008181,0.008181,0.004858,0.0,0.0,0.0,0.0,...,0.008181,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
성실,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.005335,0.0,0.008985,0.0,0.0,0.0,0.0,0.0,0.0
소통,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.008226,0.0,0.0,0.008182,0.0,0.0,0.008182,0.0,0.0
인내심,0.0,0.0,0.0,0.0,0.0,0.011182,0.0,0.0,0.0,0.008973,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
정직,0.0,0.0,0.0,0.0,0.0,0.02014,0.014198,0.008386,0.020011,0.0,...,0.0,0.00498,0.008386,0.0,0.0,0.0,0.0,0.0,0.0,0.0
주인의식,0.0,0.0,0.0,0.0,0.0,0.004884,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
창의,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.004587,0.0,0.0,0.0,0.007725,0.0,0.0,0.0,0.0
팀워크,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.008812,0.0,0.0,0.0,0.0,0.01484,0.0,0.020915,0.020915


In [3]:
# Analyze newly supplied input using the files saved by an earlier analysis.
# Skip this cell on the very first analysis run.
import csv

raw_keyword = ['글로벌역량', '능동', '도전', '성실', '소통', '인내심', '정직', '주인의식', '창의', '팀워크']


def _read_first_column(path):
    """Return the first field of every non-empty row of the UTF-8 CSV at `path`."""
    # `with` closes the handle after reading; blank lines come back from
    # csv.reader as [] and are skipped so `row[0]` cannot raise IndexError.
    with open(path, 'r', encoding='utf-8', newline='') as handle:
        return [row[0] for row in csv.reader(handle) if row]


# Load the stored TF-IDF values as a plain NumPy array.
# NOTE(review): these file names differ from the
# 'dataset/result/vector_X*_large.csv' files written by the export cells.
# Confirm which artefacts this cell is meant to read.
X = pd.read_csv('dataframe_X.csv', header=None)
X = np.array(X)
print(X)
print(X.shape)
print(type(X))

# Load the learned terms (features).
features = _read_first_column('dataframe_features.csv')

# Load the stop-word list.
stopwords = _read_first_column('dataset/stopwords/stopwords.csv')

print(type(features))


okt = Okt()

def read_data(filename):
    """Read a UTF-8 text file and replace every quotation mark with a space.

    Args:
        filename: Path of the text file to read.

    Returns:
        str: The file content in which the curly single/double quotes, the
        backtick, and the ASCII single/double quotes are each replaced by
        one space.
    """
    # Decode explicitly as UTF-8. Without `encoding`, open() uses the
    # platform's default encoding, which can fail or garble Korean text on
    # systems whose default is not UTF-8.
    with open(filename, 'r', encoding='utf-8') as f:
        data = f.read()
    # Remove quotes.
    for quote in ("‘", "’", "“", "”", "`", "'", "\""):
        data = data.replace(quote, " ")
    return data

def tokenizer(raw, pos=["Noun","Verb"], stopword=stopwords):
    """Return the tokens of `raw` that pass the tag, length and stop-word filters.

    Args:
        raw: Text to tag with the module-level `okt` tagger.
        pos: Tags to keep; a token survives only if its tag is in `pos`.
        stopword: Words to discard. The default is the `stopwords` object
            that exists at the moment this function is defined.

    Returns:
        list of str: Tokens (after normalization and stemming by
        `okt.pos`) that are longer than one character, carry a tag in
        `pos`, and are not in `stopword`.
    """
    # Hash a list/tuple of stop words once per call so each membership test
    # is constant-time instead of a linear scan. Other container types are
    # left as they are so their own `in` semantics still apply.
    if isinstance(stopword, (list, tuple)):
        stopword = frozenset(stopword)
    return [
        word
        for word, tag in okt.pos(
            raw,
            norm=True,   # normalization step
            stem=True    # stemming step
        )
        if len(word) > 1 and tag in pos and word not in stopword
    ]



[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.00802943 0.00802943 0.         ... 0.         0.         0.        ]
 [0.         0.         0.01385216 ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.02091511 0.02091511]]
(10, 51276)
<class 'numpy.ndarray'>
<class 'list'>


In [4]:
# Score an input resume against every keyword document.
#
# `X` is used here as a dense array (np.asarray is a no-copy view of it) and
# column positions are looked up in `features`. When this cell is run right
# after fit_transform instead (X still sparse), index X.toarray() and take
# the column positions from vectorize.vocabulary_.get(term).
# srch = search terms, dtm = document-term matrix.

# Tokenize the resume once. The path is relative, like every other dataset
# path in this notebook, so the cell does not depend on one user's home dir.
resume_data = read_data('dataset/input_resume.txt')
resume_tokens = tokenizer(resume_data)
print(resume_tokens)

# term -> column position. setdefault keeps the FIRST position of a term,
# matching what list.index() returns, while making each lookup constant-time
# instead of a scan over the whole feature list.
feature_index = {}
for position, term in enumerate(features):
    feature_index.setdefault(term, position)

# Keep only the resume tokens that are known features (duplicates are kept,
# so a repeated term contributes once per occurrence).
srch = [t for t in resume_tokens if t in feature_index]

print(srch)

# Extract only the searched features from the document-term matrix.
srch_dtm = np.asarray(X)[:, [feature_index[t] for t in srch]]

print(srch_dtm.shape)

data = pd.DataFrame(srch_dtm, columns=srch)
srch_dtm = data.rename({0:"글로벌역량", 1:"능동", 2:"도전", 3:"성실", 4:"소통", 5:"인내심", 6:"정직", 7:"주인의식", 8:"창의", 9:"팀워크"}, axis='index')
display(srch_dtm)


# score: per-document sum of the matched feature weights.
score = srch_dtm.sum(axis=1)

# Walk the documents from highest to lowest score. `score` is labelled with
# keyword names, so positions from argsort must go through .iloc; plain
# `score[i]` with an integer relies on a deprecated positional fallback.
# NOTE(review): `>= 0` presumably never filters anything if the weights are
# non-negative. Confirm whether `> 0` was intended.
for i in np.argsort(score.values)[::-1]:
    if score.iloc[i] >= 0:
        print('{} // score : {}'.format(raw_keyword[i], score.iloc[i]))


['본인', '채용', '포스코켐텍', '영향', '긍정', '측면', '기회', '대해', '구체', '기술', '문제', '파악', '해결', '위해', '다각도', '방면', '보다', '전기', '설비', '기술', '직무', '역량', '불량', '현상', '파악', '원인', '알아내다', '문제', '개선', '위해', '대책', '수립', '통찰', '생각', '강점', '조직', '목표', '달성', '달성', '기여', '생각', '학부', '인턴', '디지털', '논리회로', '프로젝트', '진행', '통찰', '발견', '되어다', '프로젝트', '오다', '이용', '디지털', '암호', '장치', '만들다', '프로젝트', '이다', '맡다', '부분', '학번', '입력', '맞다', '초록색', '들어오다', '틀리다', '빨간색', '들어오다', '이다', '회로', '설계', '종로', '한일전', '방문', '부품', '구매', '납땜', '과정', '문제', '생기다', '회로도', '기름종이', '대다', '보다', '납땜', '과정', '실수', '피복', '벗겨지다', '되어다', '이르다', '즉시', '전선', '보완', '문제', '해결', '회로도', '납땜', '과정', '수십', '확인', '작동시키다', '정상', '작동시키다', '역량', '바탕', '포스코켐텍', '되어다', '전기', '설비', '기술', '업무', '신입사원', '최근', '가장', '도전', '목표', '세우다', '성취', '해내다', '구체', '경험', '대해', '과정', '결과', '기술', '프로세스', '개선', '중요성', '단체', '이끌다', '공동', '목표', '도전', '달성', '해내다', '가장', '고객', '만족', '위해', '도전', '포스코켐텍', '도전', '성향', '요구', '생각', '유레카', '학회', '전공', '공부', '학우', '도움', '고자', '스터디', '조장', '맡다'

Unnamed: 0,본인,채용,영향,긍정,측면,구체,기술,파악,해결,보다,...,협력,중요성,깨닫다,경험,토대,설비,개선,시스템,안전성,이바지
글로벌역량,0.0,0.0,0.004505,0.0,0.0,0.0,0.016262,0.0,0.003698,0.009051,...,0.017432,0.0,0.0,0.0,0.003368,0.006449,0.008037,0.007068,0.01309,0.005017
능동,0.0,0.005972,0.008073,0.003914,0.0,0.0,0.018217,0.011142,0.017599,0.010013,...,0.010927,0.024895,0.0,0.010976,0.00748,0.0,0.012971,0.011396,0.0,0.005309
도전,0.004392,0.0,0.0,0.008369,0.0,0.0,0.006149,0.0,0.0,0.013238,...,0.0,0.0,0.0,0.007622,0.0,0.0,0.008667,0.0,0.018852,0.0
성실,0.010122,0.0,0.0,0.0,0.006682,0.0,0.0,0.0,0.0,0.00616,...,0.009191,0.0,0.0,0.0,0.0,0.0,0.0,0.006753,0.0,0.01962
소통,0.004392,0.0,0.0,0.0,0.0,0.0,0.00615,0.00916,0.015732,0.006954,...,0.019345,0.0,0.0,0.012948,0.003632,0.0,0.01014,0.007622,0.0,0.0
인내심,0.004817,0.0,0.015696,0.019101,0.0,0.0,0.003983,0.0,0.013984,0.015869,...,0.004374,0.0,0.008973,0.006744,0.003983,0.0,0.008359,0.011734,0.0,0.0
정직,0.0,0.01056,0.012994,0.006921,0.0,0.0,0.0,0.005545,0.0,0.013569,...,0.0,0.0,0.0,0.003723,0.006303,0.0,0.007812,0.0,0.0,0.0
주인의식,0.009267,0.010358,0.00827,0.00401,0.006118,0.0,0.008713,0.009209,0.008415,0.009299,...,0.00401,0.005439,0.0,0.003651,0.007663,0.0,0.009528,0.007663,0.005439,0.009209
창의,0.0,0.0,0.0,0.003766,0.0,0.007725,0.016247,0.0,0.006376,0.0,...,0.015047,0.005108,0.0,0.008184,0.003429,0.017137,0.0,0.013342,0.0,0.0
팀워크,0.007966,0.0,0.0,0.011928,0.006519,0.0,0.003891,0.0,0.01728,0.018921,...,0.0,0.012162,0.0,0.003891,0.003891,0.0,0.010862,0.011462,0.005795,0.0


능동 // score : 2.71436915004898
창의 // score : 2.308569006105548
주인의식 // score : 1.8737782782867227
팀워크 // score : 1.7750897810434707
글로벌역량 // score : 1.7620693476038398
소통 // score : 1.57463818424262
인내심 // score : 1.5111446950206615
정직 // score : 1.296506386636496
도전 // score : 1.2896825590325922
성실 // score : 0.7096652251741485


In [9]:
# Show the installed plotly version (the last expression is echoed as the cell output).
import plotly
plotly.__version__

'3.8.1'

In [10]:
import plotly.plotly as py
import plotly.graph_objs as go

In [None]:
# Bar chart of the weight each matched resume term has in one keyword document.
bar_data = [go.Bar(
            x=srch,
            # Row position 1 is the '능동' document. `.iloc` selects the row by
            # position; `srch_dtm[1]` would be a COLUMN lookup, and the columns
            # are term strings, so it raises KeyError.
            y=srch_dtm.iloc[1],
            marker=dict(
                color='rgb(158,202,225)',
                line=dict(
                    color='rgb(8,48,107)',
                    width=1.5),
            ),
            opacity=0.6
    )]

plotly.offline.plot(bar_data)

print(srch)
print(len(srch))
# Row order of srch_dtm, for picking another document:
# ['글로벌역량', '능동', '도전', '성실', '소통', '인내심', '정직', '주인의식', '창의', '팀워크']

In [12]:
# Peek at the first 100 feature names.
print(features[:100])

['00초', '00초 기록', '00초 기록 남자', '00초 기록 차지', '010-9307', '010-9307 3810', '010-9307 3810 15일', '010-9307 3810 윤상', '010-9307 3810 책임지다', '010-9307 3810.29', '010-9307 3810.29 명의', '02-401', '02-401 7790', '02-401 7790 능동', '02-446', '02-446 7790', '02-446 7790 유동균', '020560', '020560 한국', '020560 한국 생산', '03월', '03월 19일', '03월 19일 주식', '03월 19일 현재', '069%', '069% 상태', '069% 상태 몰다', '1,678', '1,678 명과', '1,678 명과 유공', '1/3', '1/3 책임지다', '1/3 책임지다 오다', '10', '10 가지', '10 가지 한국', '10 개인', '10 개인 10', '10 구가', '10 구가 운영', '10 대한', '10 대한 조치', '10 대한 처분', '10 대한민국', '10 대한민국 올해', '10 대해', '10 대해 성남시', '10 명의', '10 명의 세무사', '10 방문', '10 방문 개별', '10 보건', '10 보건 기구', '10 보령시', '10 보령시 올해', '10 선정', '10 선정 성실', '10 성남시', '10 성남시 성실', '10 성실', '10 성실 납세', '10 시리즈', '10 시리즈 사용', '10 여명', '10 여명 성희롱', '10 오르다', '10 오르다 중국', '10 오르다 한국', '10 인증', '10 인증 수여', '10 주기', '10 주기 맞다', '10 진로', '10 진로 캠프', '10 진입', '10 진입 모습', '10 짜다', '10 짜다 용기', '10 초등학교', '10 초등학교 활용', '10 출시', '10 출시 밝히다', '10 출시 식품첨가물

In [14]:
# Radar (polar) chart of the total score of each keyword.

# Upper bound of the radial axis: the best score plus 0.5 of headroom.
max_value = max(score) + 0.5

radar_data = [
    go.Scatterpolar(r=score, theta=raw_keyword, fill='toself'),
]

layout = go.Layout(
    polar={'radialaxis': {'visible': True, 'range': [0, max_value]}},
    showlegend=False,
)

fig = go.Figure(data=radar_data, layout=layout)
plotly.offline.plot(fig)

'temp-plot.html'