# Classification of text documents using sparse features

This is an example showing how scikit-learn can be used to classify documents
by topics using a bag-of-words approach. This example uses a scipy.sparse
matrix to store the features and demonstrates various classifiers that can
efficiently handle sparse matrices.


In [1]:
# Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>
#         Olivier Grisel <olivier.grisel@ensta.org>
#         Mathieu Blondel <mathieu@mblondel.org>
#         Lars Buitinck
# License: BSD 3 clause

from __future__ import print_function
import pandas as pd
import logging
import numpy as np
import nltk.stem
from optparse import OptionParser
import sys, copy
from time import time
from random import randint
#import matplotlib.pyplot as plt
from IPython.display import display, HTML
from sklearn.datasets import fetch_20newsgroups
from sklearn.datasets import fetch_rcv1

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import RidgeClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.svm import SVR
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.extmath import density
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [2]:
def apk_per_class(clf, actual, predicted, k=5):
	"""
    Computes the average precision at k.
    This function computes the average prescision at k between two lists of
    items per each class
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : multi list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists
	"""
	class_score = {}
	micro_correct = 0.0
	length = 0
    
	if len(actual) == len(predicted):		
			
		for	i in range(len(actual)):
			if actual[i] not in class_score:
				class_score[actual[i]] = [0.0, 0.0] # correct_score, length
			well_classified = False
			if(guess[i] is not ""):
				predicted[i][0] = guess[i]    
			for pred in predicted[i]:
				if actual[i] in pred:
					#if(randint(0, 500) == 50):
					#	print("act: -" , actual[i], "-pred:", pred,":")
					class_score[actual[i]][0] += 1.0
					micro_correct += 1.0
					well_classified = True
			if(type(actual[i]) is list):
				print("!! actual[",i,"] is ",actual[i])
			if(actual[i] == CODE):
				print(well_classified , "docID:",i,"prediction of ",CODE," was:",predicted[i])
				fail_list.append(i)
				#print(test_dtm[i].toarray()[0])
				#for j,word_rat in enumerate(test_dtm[i].toarray()[0]):
				#	if(word_rat>0.2):
				#		print(word_rat)
			class_score[actual[i]][1] += 1.0
			length+=1
				
	avg_acc = 0.0 
	for cl in class_score.keys():
		avg = class_score[cl][0]/class_score[cl][1]
		if(avg<0.4 and count[cl]-class_score[cl][1] > class_score[cl][1]):
			print("!Low precision :! #Correct:", class_score[cl][0], "#Tested:", class_score[cl][1],"#Train",count[cl]-class_score[cl][1])
		print ("\t", cl, "Acc.:", avg, "Correct:", class_score[cl][0], "Tested:", class_score[cl][1],"#Train",count[cl]-class_score[cl][1])
		avg_acc +=avg

	print ('Total Test Examples', length, "\nMicro Acc.(item level)", micro_correct/length)
	return avg_acc/len(class_score)

###############################################################################
# Benchmark classifiers
def benchmark(clf):
	print('_' * 80)
	print("Training: ")
	print(clf)
	t0 = time()
	clf.fit(X_train, y_train)
	train_time = time() - t0
	print("train time: %0.3fs" % train_time)

	t0 = time()
	
	# Top 1 
	pred = clf.predict(X_test)    
	probs = clf.predict_proba(X_test)
    
	for topk in range(5,6):
		best_n_label = transform_label(clf, probs, topk)
		
		test_time = time() - t0
		print("test time:  %0.3fs" % test_time)

		pred = best_n_label
		print ("Top-", topk)
		print ("Macro Acc.(class level)", apk_per_class(clf, y_test, best_n_label, topk), "\n\n")

In [3]:
def transform_label(clf, prob, topk):
	global target_names
	
	rst_arr = np.empty( (len(prob), topk), dtype=object) 
	for i in range(len(prob)):
		s_items = np.argsort(prob[i])[-topk:]
		
		for j in range(len(s_items)):
			rst_arr[i][j] = clf.classes_[s_items[j]]
			
			
	return rst_arr
def apk(actual, predicted, k=5):
	"""
    Computes the average precision at k.
    This function computes the average prescision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : multi list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists
	"""

	score = 0.0
	num_hits = 0.0
	if len(actual) == len(predicted):
		for	i in range(len(actual)):
			for pred in predicted[i]:
				if actual[i] in pred:
					score += 1

			if not actual:
				return 0.0

	return score / len(actual)

In [19]:

###############################################################################
# Load some categories from the training set
CODE = "LC04"
categories =  [x for x in open('KSCC_sample_data_170206_Codelist.dat','r').read().split('\n') if len(x) > 0]
	
remove = ()

print("Loading NSCC  dataset for categories:")
print(categories if categories else "all")
print("Category size",len(categories))
	
#data_train = open('KSSC_sample_data_170206_Train.dat').readlines()
#data_test = open('KSSC_sample_data_170206_Test.dat').readlines()
data_train = open('KSSC_sample_data_170206_Train.dat').readlines()
#all_data = open('rev_reserved_new_data_all.dat').readlines()
all_data = open('rev_kkma_data_all_3cols.dat').readlines()
#data_test = open('rev_utf8_test.dat').readlines()
data_test = open("KSSC_sample_data_170206_Test.dat").readlines()
ENCODING = 'euc-kr'
data_train_data, data_test_data = [], []
y_train, y_test = [], []
all_x = []
count = {}
all_y = []
for cat in categories:
    count[cat] = 0
    """
for i,line in enumerate(all_data):
    items = line.split('\t')
    if len(items) == 2:
        all_x.append(items[1].decode(ENCODING, 'ignore'))
        #if(items[0] == CODE):
        #    print(line)
        all_y.append(items[0])
        if not count.has_key(items[0]):
            count[items[0]] = 0
        count[items[0]] += 1
    else:
        print("ERROR in train",i,len(items))
    """      
for i,line in enumerate(data_train):
    items = line.split('\t')
    if len(items) == 2:
        data_train_data.append(items[1].decode(ENCODING, 'ignore'))
        if(i == 0):
            print(items[1].decode(ENCODING, 'ignore'))
        y_train.append(items[0])
        if not count.has_key(items[0]):
            count[items[0]] = 0
        count[items[0]] += 1
    else:
        print("ERROR in train",i,len(items))
for i,line in enumerate(data_test):
    items = line.split('\t')
    if len(items) == 2:
        data_test_data.append(items[1].decode(ENCODING, 'ignore'))
        y_test.append(items[0])
        if not count.has_key(items[0]):
            count[items[0]] = 0
        count[items[0]] += 1
    else:
        print("ERROR in test",i,len(items))
   


Loading NSCC  dataset for categories:
['ED10', 'ED11', 'EI02', 'EI03', 'EI06', 'EI07', 'EI05', 'EI08', 'EH06', 'EF99', 'EB01', 'EA09', 'EE11', 'EE10', 'EE13', 'EE12', 'EI99', 'EE14', 'EA04', 'EA05', 'ED07', 'ED06', 'ED05', 'ED04', 'ED03', 'ED01', 'EE99', 'ED08', 'EA02', 'EH10', 'EI11', 'EI12', 'EA14', 'EA11', 'EA10', 'EA13', 'EA07', 'EF05', 'EF06', 'ED99', 'EE08', 'EE09', 'EE06', 'EE07', 'EE04', 'EE05', 'EE02', 'EE03', 'EE01', 'SB99', 'ND07', 'OA04', 'LC06', 'SI04', 'SH07']
Category size 55
운전 안전 편의 향상 운전자 시야 중심 차량 증강 현실 정보 제공 시스템 기술 개발 증강 현실 안전 운전 헤드업 디스플레이 나이트비전 장애물 인식 Head up display Night vision Object recognition Augmented reality Driving safety차량 증강 현실 정보 제공 핵심 요소 기술 개발 차량 증강 현실 정보 제공 통합 개발 증강 현실 시제품 개발 성능 검증 차량 증강 현실 실차 테스트 베드 구축 차량 증강 현실 정보 제공 서비스 프로토타입 개발 차량 증강 현실 정보 제공 표준 추진 주관 기관 한국전자통신연구원 차량 증강 현실 정보 제공 통합 개발 테스트 베드 구축 참여 기관 경북대학교 산학 협력단 실세계 정보 실시간 인식 추적 기술 개발 참여 기관 자동차 부품 연구원 운전자 인지 향상 증강 현실 처리 기술 개발 참여 기관 현대오 트론 증강 현실 시제품 제어 개발 실용 방안 도출 참여 기관 현대엠엔소프트 고정밀 차원 디지털 맵 내비게이션 연동

In [20]:
#data_train_data,data_test_data,y_train,y_test = train_test_split(all_x,all_y,random_state =1, train_size = 0.65)
print (len(data_train_data), len(data_test_data))
print('data loaded')
guess = 80000*[""]
#data_train_data,data_test_data = data_test_data,data_train_data
#y_test,y_train=y_train,y_test
#data_train_data = data_test_data
#y_train = y_test
# order of labels in `target_names` can be different from `categories`
for i,line in enumerate(data_test_data):
    if(u'선박' in line and u'해양' in line):
        #print("Obvious: EA10",y_test[i])
        #guess[i] = "EA10"
        pass
    if(u'power' in line and u'에너지' in line):
        #print("Obvious:",CODE,y_test[i])
        #guess[i] = "EF05"
        pass
    #if(u'데이터' in line and u'임상' in line and u'적용' in line and  u'측정' in line):
        #print("Obvious:",CODE,y_test[i])
        #guess[i] = "LC04"
target_names = categories #data_train.target_names

7106 4856
data loaded


In [21]:
# Add Word Embedding (Word Embedding, Topic Embedding, Topic-Event Embedding) Features

print("Extracting features from the training data using a sparse vectorizer")
t0 = time()
my_stop_words = [np.unicode(x.strip(), 'utf-8','ignore') for x in open('kor_stop_word.txt', 'r').read().split('\n')]


#print (my_stop_words)

vectorizer = TfidfVectorizer(max_df=0.5,stop_words=my_stop_words,max_features = 100000,
                             min_df=3)
#vectorizer = StemmedTfidfVectorizer(stop_words=my_stop_words,max_df=0.5,max_features = 50000,min_df=3)    
X_train = vectorizer.fit_transform(data_train_data)

duration = time() - t0

print("n_samples: %d, n_features: %d" % X_train.shape)
print()

print("Extracting features from the test data using the same vectorizer")
t0 = time()
X_test = vectorizer.transform(data_test_data)
duration = time() - t0
print("n_samples: %d, n_features: %d" % X_test.shape)
# mapping from integer feature name to original token string
feature_names = vectorizer.get_feature_names()

feature_names = np.asarray(feature_names)
#test_dtm = vectorizer.transform(data_test_data)
#train_dtm = vectorizer.transform(data_train_data)

Extracting features from the training data using a sparse vectorizer
n_samples: 7106, n_features: 15254

Extracting features from the test data using the same vectorizer
n_samples: 4856, n_features: 15254


In [22]:
#results = []
# Train SGD model
fail_list = []
suggested_n_iter = np.ceil(10**6/len(data_train_data))
clf = SGDClassifier(loss='log', alpha=.0001, n_iter=50, penalty="l2")
benchmark(clf)

________________________________________________________________________________
Training: 
SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='log', n_iter=50, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)
train time: 9.181s
test time:  0.102s
Top- 5
!Low precision :! #Correct: 0.0 #Tested: 14.0 #Train 17.0
	 ED10 Acc.: 0.0 Correct: 0.0 Tested: 14.0 #Train 17.0
	 ED11 Acc.: 0.548387096774 Correct: 17.0 Tested: 31.0 #Train 42.0
!Low precision :! #Correct: 0.0 #Tested: 6.0 #Train 7.0
	 EI02 Acc.: 0.0 Correct: 0.0 Tested: 6.0 #Train 7.0
!Low precision :! #Correct: 3.0 #Tested: 8.0 #Train 9.0
	 EI03 Acc.: 0.375 Correct: 3.0 Tested: 8.0 #Train 9.0
	 EI06 Acc.: 0.777777777778 Correct: 7.0 Tested: 9.0 #Train 9.0
	 EI07 Acc.: 0.166666666667 Correct: 1.0 Tested: 6.0 #Train 5.0
	 EI05 Acc.: 0.526315789474 Correct: 10.

In [23]:
out = file("features.txt","w")

for x in feature_names:
    out.write(x.encode('utf-8','ignore')+"\n")
out.close()

In [25]:
#class index of EE02 is 34
#class index of LC06 is 94
#class index of EE99 is 47
#class index of EA10 is 9 
#class index of EF05 is 50 
#
CODE_INDEX = clf.classes_.tolist().index('EE14')
CODE_INDEX

51

In [24]:
#deceiving category? EE02?
#Powerful features of EE02, the biggest category
black_list = []
print(clf.classes_[CODE_INDEX])
for i,x in enumerate(vectorizer.get_feature_names()):
    if(3>clf.coef_[CODE_INDEX][i]>1):
        print(clf.coef_[CODE_INDEX][i],x)
        black_list.append(x)


EE02
1.53800387477 3d
1.63228134361 analysi
1.70881888026 cloud
1.52782369874 data
1.31686778922 learn
1.23723081528 manag
1.2987756951 market
1.26135604594 mobil
1.78876945712 platform
1.7069120214 softwar
2.00206514621 sw
1.26679179568 web
1.58100307866 개발자
1.76401415654 고객
1.4221152699 공유
1.33577778595 과정
2.04288484033 관리
1.435774204 관리자
1.34185970961 구매
1.76651210379 기능
1.21183394259 내역
1.36073905463 데이터
1.50577911949 디드
1.37907259094 마케팅
1.79116460966 모바일
1.36236947874 문서
1.41723440859 보유
1.47796186426 비용
2.42864122576 소프트
2.44022018702 소프트웨어
2.34722952153 솔루션
1.22888352847 실시간
1.95531655258 엔진
1.30565146131 오픈
1.34030017058 온라인
1.36857061292 운영체제
2.19949840272 웨어
1.20756084026 유저
1.42051339564 융합기술고도화
1.27463307125 이미지
1.50058178073 임베디드
1.43362201748 입력
1.39416773254 자동
1.4067433942 전문
1.46837666544 전문가
1.64888012043 지원
1.45743705306 체크
1.78531145051 추가
1.46590194919 출시
1.70585892825 컨텐츠
1.51502761312 프로그램
1.89084775999 플랫폼


In [44]:
white_list = []
for i,x in enumerate(vectorizer.get_feature_names()):
    if(clf.coef_[CODE_INDEX][i]>2):
        print(clf.coef_[CODE_INDEX][i],x)
        white_list.append(x)
    

2.6364365261 film
2.59125747492 health
2.63322161012 medic
2.67100666961 심장
2.79196081601 의료
3.61525651388 의료기기


In [46]:

print(y_test[fail_list[0]],data_test_data[fail_list[0]])
fail_data = []
for x in fail_list:
    fail_data.append(data_test_data[x])

LC04  센서 센서배열 배열 모바일 기기 기반 탄성 도플러 영상 영상진단기 진단기 개발



In [47]:
v = vectorizer.transform(fail_data)
v_array = v.toarray()
important_word = 80000*[0]
buffer = 80000*[0]
for i,doc in enumerate(fail_data):
    print(doc)
    #print(v.toarray()[i])
    #rev_list =  reversed(np.argsort(v.toarray()[i]))
    for j in range(len(v_array[i])):
        if(v_array[i][j]>0.0):
     #       print(index,feature_names[index])
            buffer[j] = 1
        
    for j in range(len(buffer)):
        if buffer[j] > 0:
            important_word[j] += 1
            buffer[j] = 0
            


 센서 센서배열 배열 모바일 기기 기반 탄성 도플러 영상 영상진단기 진단기 개발

machine learning , deep learning , artificial intelligence , fund us , diabetic ret in opa thy , age related mac u lar degeneration , glaucoma , retinal vein occlusion , retinal artery occlusion 딥 딥러닝 러닝 기술 이용 안저 안저사진 사진 판독 원천 원천기술 확보 응용 머신 머신러닝 인공 인공지능 지능 당뇨 당뇨망막병증 망막 병증 노인 노인성황반변성 성황반 변성 녹내장 망막정맥폐쇄 정맥 폐쇄 망막동맥폐쇄 동맥 한국인 평균 평균수명 수명 증가 의학 의학연구 연구 방향 단순 연장 사회 전체 삶 질 보정 목표 시력 중요 요소 수의 사람 나이 나이관련황반 관련 황 반 등 저하 실명 유발 수 중대 질환 이환 이 로 손상 방법 정기적 검진 조기 진단 최근 보급 무산 무산동안 동안 사진촬영장비 촬영 장비 주요 안과 안과질환 민감도 숙련 안과의사 의사 진료 필요 이미지 분야 인식 기계 기계학습 학습 중 발전 의료 의료분야 적용 노력 시도 최고 최고수준 수준 임상 임상의사 상기 나 아가 이 바탕 질병 예측 예후 평가 시스템 구축 초석 세계적 수준의 환자 임상정보 정보 이브 정확 데이터 데이터베이스 베이스 후 개입 최소화 의미 알고리즘 개발 정상 정상안저 등 시작 다양 개별 망막질환 예 망막혈관폐쇄 혈관 유전성 유전성망막질환 의 자동화 모델 뿐 성능 시험용 상용 가능성 타진 기반 마련 현재 개인적 경험 역량 좌우 영역 양의 정량화 근거 이 정량적 측정 데 본 실제 일치 검정 성능개선 개선 고도화 각광 근래 의료영역 잠재력 기대 초기 단계 임상환자정보 연구진 실정 과제 자동 자동판독 기술적 돌파구 돌파구임 임 입증 경우 파급 향후 진행 뇌혈류 뇌혈류장애 장애 검출 개별적 연구가 보고 자체 어려움 연구자 연구자간의 간의 편차 동반 전신 전신질환 추

In [50]:
for i,word_id in enumerate(important_word):
    if word_id > 4:
        print(word_id,feature_names[i])

5 데이터
5 센서
5 임상
5 적용
6 측정


In [84]:
#print(v)
prob = clf.predict_proba(v)
print(prob.shape)
Max = 0
for i,x in enumerate(prob[0,:]):
    if(x > 0.0001):
#        print(i,x)
        if(x > Max):
            Max = x
            index = i
    
print("Max",Max,index,vectorizer.get_feature_names()[index])
for word in np.argsort(prob):
    print(word,feature_names[word])

(1L, 140L)
Max 0.673850471003 34 acronym
[ 49 127 101   7  17  74 128 121  83  98 125 124 123 106  57 107  76  96
 109   2  23  69 136  95  20  58 132 139  55 126  84  78  80 137 116  90
 134  75  54  19  52  18   6  25 133  16 117 102 110 103  60  53  15  87
 108 120 122 130  86 114 105  13 135  72 100  14  48  21  61  29 118 138
  59   3  66  85  62  63  73 115  99   0   5  64  81 131  91 129  26  88
  92  93 104  56  82  94  67  97  89  22 119  10  51 111  79  12 112  11
   1   8  70  71  27   4  30  28 113  32  36  68  24  65  50  35  39  77
  40  38  31  43  33   9  46  45  42  37  47  41  44  34] [u'addit' u'alm' u'airbag' u'abort' u'acc' u'aesthet' u'alon' u'aliz' u'ag'
 u'aid' u'alloc' u'allianc' u'allerg' u'ajax' u'administr' u'al' u'affect'
 u'ah' u'alarm' u'abbrevi' u'accord' u'advis' u'ambient' u'agricultur'
 u'access' u'admiss' u'alu' u'ami' u'adm' u'alloy' u'age' u'affin'
 u'africa' u'ambigu' u'ali' u'agnat' u'alzheim' u'af' u'adjust' u'accept'
 u'adher' u'acceler' u'abno

In [54]:

table = pd.DataFrame(test_dtm.toarray())
s = pd.Series([45])
problems = pd.DataFrame()
f_names = pd.Series(vectorizer.get_feature_names())
for i,v in enumerate(s):
    problems = problems.append(table.loc[s[i], table.loc[s[i]]>0],ignore_index = True)
vocab = problems.T
vocab['meaning']= f_names[vocab.index]
fn_list = f_names.tolist()
#vocab['meaning']= vocab['meaning'].apply(lambda x: x.encode(encoding='utf-8',errors="ignore"))
vocab = vocab[vocab[0]>0.1]
#display(vocab)

for i,x in enumerate(vocab['meaning']):
    if x in black_list:
        #print("Black L found",i,x)
        print(x)

for i,x in enumerate(vocab['meaning']):
    if x in white_list:
        print("White L found",i,x)
        

base
model
simulator
test
격자
계산
관계식
국방
국토
도움
반사량
비교
시뮬레이터
재질
판독


In [26]:
english_stemmer = nltk.stem.SnowballStemmer('english')
class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(TfidfVectorizer,self).build_analyzer()
        return lambda doc: (
        english_stemmer.stem(w) for w in analyzer(doc))

In [71]:
table = pd.DataFrame(train_dtm.toarray())
s = pd.Series([59,60,61,62,63,64,65])
problems = pd.DataFrame()
f_names = pd.Series(vectorizer.get_feature_names())
for i in range(6):
    problems = problems.append(table.loc[s[i], table.loc[s[i]]>0],ignore_index = True)
vocab = problems.T
vocab['meaning']= f_names[vocab.index]
fn_list = f_names.tolist()
#vocab['meaning']= vocab['meaning'].apply(lambda x: x.encode('utf-8'))
#vocab