In [1]:
import numpy as np
import pandas as pd
import re
import tensorflow as tf
import random

In [2]:
df = pd.read_csv("news_dataset.csv", index_col = 0)
df.head()

Unnamed: 0,date,title,content,label
0,20170101,"朴대통령 ""뇌물죄, 완전히 엮은 것…세월호 허위 걷혀야""(종합)","새해 첫날 청와대서 사실상 기자간담회…직무정지 23일 만에 첫 입장표명""공모나 누구...",BH
1,20170102,"정유라, 덴마크서 불법 체류 혐의로 체포···특검 “송환 협조중” (종합)",[아시아경제 정준영 기자] 이화여대 학사부정 및 삼성 특혜지원 의혹의 수혜자 겸 공...,BH
2,20170103,"[단독]정유라, “(주사 아줌마)누구인지 알 것 같다”…현지 답변태도 분석, 사전 ...",덴마크 올보르 법원에서 잠시 휴정중 기자들의 질문에 답변하는 정유라씨 사진=현지교...,BH
3,20170104,"[단독]""정유라, 이대학장실 등 교내서 교수 6명에 학점취득 코치받아""","[연합뉴스TV제공]김병욱, 교육부 자료 확인…""학점 좋은이유 모른다더니""담당교수들 ...",Politic
4,20170105,"윤전추 ""기억안나. 몰라. 말못해""… 헌재 ""본인범죄 외 답해라""","""외부인 동행 없다"" 주장하다 ""세월호 당일 미용사 태워왔다"" 윤전추 헌재 탄핵심리...",BH


##### 형태소 분석기 Twiter 사용, tokenize 함수 정의(명사(Noun) 만 사용)

In [3]:
from konlpy.tag import Komoran
tag = Komoran()

In [4]:
def kor_noun(text):
    words = []
    for w in tag.nouns(text):
        if len(w) > 1:
            words.append(w)
    return words

##### TfidfVectorizer 사용 단어들을 vector화 하고 TermDocumentMatrix(df_tfidf) 생성

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
text_data_list = df["content"].astype(str).tolist()
text_data_arr = np.array(["".join(text) for text in text_data_list])

In [7]:
vectorizer = TfidfVectorizer(min_df = 2, tokenizer = kor_noun, norm = 'l2')
text_data = vectorizer.fit_transform(text_data_arr)

In [8]:
df_tfidf = pd.DataFrame(text_data.A, columns = vectorizer.get_feature_names())
df_tfidf.head()

Unnamed: 0,12월 14일,1시간,1월 2일,2007년 남북정상회담,2010년 9월,2012년 10월,2012년 12월,2013년 10월,2013년 12월,2013년 1월,...,휴전선,휴정,휴학,흐름,흔적,흡수,희망,희생,희생자,흰색
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.072151,0.037957,0.0,0.036075,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
df_tfidf["label"] = df["label"]

In [10]:
df = df_tfidf
df.head(10)

Unnamed: 0,12월 14일,1시간,1월 2일,2007년 남북정상회담,2010년 9월,2012년 10월,2012년 12월,2013년 10월,2013년 12월,2013년 1월,...,휴정,휴학,흐름,흔적,흡수,희망,희생,희생자,흰색,label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BH
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BH
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.072151,0.037957,0.0,0.036075,0.0,0.0,0.0,0.0,0.0,BH
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Politic
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BH
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Politic
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BH
7,0.05459,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BH
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BH
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BH


In [11]:
## Date Spliting
from sklearn.model_selection import train_test_split

In [12]:
X = df.drop(["label"], axis = 1)
y = df["label"]

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

##### Machine Learning 으로 분류 수행

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.neighbors import NearestCentroid

In [15]:
lr = LogisticRegression()
tree = DecisionTreeClassifier()
mlp = MLPClassifier()
ridge = RidgeClassifier(tol = 1e-2, solver = 'lsqr', alpha = .5)
sgd = SGDClassifier(loss = "hinge", penalty = "l2", alpha = 1e-3, random_state = 42, max_iter = 100, tol = None)
rf = RandomForestClassifier(max_features = 9, n_estimators = 100)
percep = Perceptron(n_iter = 50)
pass_agg = PassiveAggressiveClassifier(n_iter = 50)
near_cent = NearestCentroid()

In [16]:
from sklearn.metrics import classification_report
for clf in (lr, tree, mlp, ridge, sgd, percep, pass_agg, near_cent):
    clf.fit(X_train, y_train)
    print("=" * 25 + "   " + clf.__class__.__name__ +  "   " + "=" * 30)
    print(clf.__class__.__name__, classification_report(y_test, clf.predict(X_test)));

LogisticRegression                precision    recall  f1-score   support

           BH       0.52      1.00      0.68        32
    Con/Party       1.00      0.21      0.35        14
Defence/Diplo       0.75      0.38      0.50         8
        North       0.75      0.33      0.46         9
      Politic       0.00      0.00      0.00        10

  avg / total       0.59      0.56      0.48        73

DecisionTreeClassifier                precision    recall  f1-score   support

        Admin       0.00      0.00      0.00         0
           BH       0.66      0.72      0.69        32
    Con/Party       0.56      0.36      0.43        14
Defence/Diplo       0.43      0.38      0.40         8
        North       0.64      1.00      0.78         9
      Politic       0.14      0.10      0.12        10

  avg / total       0.54      0.56      0.54        73



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


MLPClassifier                precision    recall  f1-score   support

           BH       0.67      0.91      0.77        32
    Con/Party       0.70      0.50      0.58        14
Defence/Diplo       0.71      0.62      0.67         8
        North       0.80      0.89      0.84         9
      Politic       0.67      0.20      0.31        10

  avg / total       0.70      0.70      0.67        73

RidgeClassifier                precision    recall  f1-score   support

           BH       0.62      0.94      0.75        32
    Con/Party       0.78      0.50      0.61        14
Defence/Diplo       1.00      0.50      0.67         8
        North       0.82      1.00      0.90         9
      Politic       0.00      0.00      0.00        10

  avg / total       0.63      0.68      0.63        73

SGDClassifier                precision    recall  f1-score   support

           BH       0.65      0.94      0.77        32
    Con/Party       0.75      0.43      0.55        14
Defence/Diplo 



Perceptron                precision    recall  f1-score   support

           BH       0.56      0.94      0.70        32
    Con/Party       0.88      0.50      0.64        14
Defence/Diplo       1.00      0.25      0.40         8
        North       0.88      0.78      0.82         9
      Politic       0.00      0.00      0.00        10

  avg / total       0.63      0.63      0.57        73





PassiveAggressiveClassifier                precision    recall  f1-score   support

           BH       0.65      0.94      0.77        32
    Con/Party       0.67      0.43      0.52        14
Defence/Diplo       1.00      0.50      0.67         8
        North       0.75      1.00      0.86         9
      Politic       0.50      0.10      0.17        10

  avg / total       0.68      0.68      0.64        73

NearestCentroid                precision    recall  f1-score   support

           BH       0.69      0.91      0.78        32
    Con/Party       0.79      0.79      0.79        14
Defence/Diplo       1.00      0.50      0.67         8
        North       0.82      1.00      0.90         9
      Politic       0.50      0.10      0.17        10

  avg / total       0.73      0.74      0.70        73



In [17]:
from sklearn.metrics import accuracy_score
for clf in (lr, tree, mlp, ridge, sgd, percep, pass_agg, near_cent):
    clf.fit(X_train, y_train)
    print(clf.__class__.__name__, " : ", accuracy_score(y_test, clf.predict(X_test)))
    print("\n");

LogisticRegression  :  0.5616438356164384


DecisionTreeClassifier  :  0.6164383561643836


MLPClassifier  :  0.726027397260274


RidgeClassifier  :  0.684931506849315


SGDClassifier  :  0.684931506849315






Perceptron  :  0.6301369863013698






PassiveAggressiveClassifier  :  0.6986301369863014


NearestCentroid  :  0.7397260273972602




In [18]:
from sklearn.metrics import accuracy_score
for clf in (lr, tree, mlp, ridge, sgd, percep, pass_agg, near_cent):
    clf.fit(X_train, y_train)
    print(clf.__class__.__name__, " : ", accuracy_score(y_train, clf.predict(X_train)))
    print("\n");

LogisticRegression  :  0.773972602739726


DecisionTreeClassifier  :  1.0


MLPClassifier  :  1.0


RidgeClassifier  :  1.0


SGDClassifier  :  1.0






Perceptron  :  1.0






PassiveAggressiveClassifier  :  1.0


NearestCentroid  :  0.8904109589041096




일반적인 ML 알고리즘으로 분류하였을 경우, 가장 높은 성능인 Nearest Centroid의 val_acc의 값은 0.74 정도이었음.