# ロジスティック回帰（パーセプトロン）による記事ジャンルの分類

In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


## 訓練・テストデータの作成

In [2]:
from glob import glob
path_to_corpus = 'text' # livedoor ニュースコーパスのフォルダを指定してください．
sports_list = glob(path_to_corpus + '/sports-watch/sports-watch*.txt')
others_list = [p for p in glob(path_to_corpus + '/*/*.txt') if 'sports-watch' not in p]

In [3]:
import random
random.seed(1)

def write_files(out, filenames, label):
    for f in filenames:
        out.write(' '.join(l.strip().replace('\t', ' ') for l in open(f).readlines()[2:]))
        out.write('\t' + label + "\n")

out_train = open('train.tsv', 'w')
out_test = open('test.tsv', 'w')
sports_files = random.sample(sports_list, 500)
others_files = random.sample(others_list, 500)

write_files(out_train, sports_files[:400], 'sports')
write_files(out_train, others_files[:400], 'others')
write_files(out_test, sports_files[400:], 'sports')
write_files(out_test, others_files[400:], 'others')

out_train.close()
out_test.close()

## 訓練データの読み込みとモデルの学習

In [4]:
data = [l.strip().split('\t') for l in open('train.tsv')]
sents, labels = list(zip(*data))
print(sents[0])
print(labels[0])

【Sports Watch】安藤美姫“日本の人の声は気になったり、今でも怖い” 日本テレビ「バンクーバー2010」（27日放送）には女子フィギュアスケートで活躍した安藤美姫＆鈴木明子が出演、大舞台での演技から一夜明け、その心境を語った。  「トリノと比べると、落ち着いて一日一日を過ごせて、メダルが目標だったので、残念だったんですけど、演技自体はオリンピックの舞台でミスなく終われて幸せでした。スケートやっててよかったな。初めて心から幸せだなと思えた」という安藤に、パーソナルベストを出し8位入賞を果たした鈴木明子は「無事に終わってホッとしています。想像していた通り、緊張するだろうなと思って入ってきていたので、予想通りすごい緊張感はあったんですけど、会場の空気感を目で見て、耳で感じて、肌でも感じられるようにしたいなと思って、そこから滑り出したいと思っていた」と振り返った。  また、日本女子フィギュア勢で唯一トリノ五輪を経験している安藤は、前大会と比較したプレッシャーの差を訊かれると、「日本の人の声は気になったり、今でも怖い。どうやって言われるだろうとか。でも、そういうものを全日本で出してしまってジャンプの失敗に繋がったので、今回は一つの作品として、とにかく難易度を下げてでも一つの作品として滑りたかった」と明かし、そのプレッシャーの重さを感じさせた。
sports


In [5]:
import MeCab
tagger = MeCab.Tagger('-Owakati')
tokenized_sents = [tagger.parse(sent) for sent in sents]

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=2, max_df=0.5, max_features=1000)
tfs = vectorizer.fit_transform(tokenized_sents)

In [7]:
print(vectorizer.get_feature_names())

['00', '000', '01', '05', '06', '10', '100', '1000', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '200', '2009', '2010', '2011', '2012', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '300', '31', '32', '35', '40', '48', '50', '500', '60', 'akb', 'android', 'arrows', 'au', 'cm', 'com', 'cpu', 'facebook', 'fi', 'galaxy', 'gb', 'ghz', 'google', 'hd', 'htc', 'http', 'ipad', 'iphone', 'is', 'isw', 'it', 'jp', 'livedoor', 'lte', 'mah', 'max', 'mbps', 'mm', 'movie', 'news', 'ntt', 'on', 'one', 'optimus', 'os', 'pc', 'phone', 'play', 'salon', 'sh', 'smaxjp', 'softbank', 'sports', 'store', 'sx', 'tbs', 'the', 'tv', 'twitter', 'watch', 'web', 'wi', 'xi', 'xperia', 'ああ', 'あげ', 'あっ', 'あと', 'あなた', 'あの', 'あまり', 'あり', 'あれ', 'いい', 'いう', 'いえ', 'いか', 'いかが', 'いき', 'いく', 'いけ', 'いただき', 'いっ', 'いつ', 'いつも', 'いろいろ', 'うち', 'うまく', 'おく', 'おすすめ', 'および', 'おり', 'お知らせ', 'お金', 'かけ', 'かなり', 'かも', 'かわいい', 'ください', 'くらい', 'くる', 'くれ', 'くれる', 'けど', 'こう', 'ここ', 'こそ', 'こちら', 'この', 'これ', 'これから', 'こ

In [8]:
print(tfs[0])

  (0, 84)	1
  (0, 91)	1
  (0, 651)	4
  (0, 743)	3
  (0, 188)	2
  (0, 210)	5
  (0, 744)	1
  (0, 20)	1
  (0, 29)	1
  (0, 734)	1
  (0, 640)	2
  (0, 805)	1
  (0, 562)	1
  (0, 898)	2
  (0, 935)	1
  (0, 846)	1
  (0, 192)	2
  (0, 256)	3
  (0, 135)	2
  (0, 335)	1
  (0, 230)	1
  (0, 676)	2
  (0, 208)	1
  (0, 296)	2
  (0, 303)	1
  :	:
  (0, 520)	1
  (0, 704)	3
  (0, 309)	1
  (0, 302)	1
  (0, 181)	2
  (0, 171)	1
  (0, 720)	1
  (0, 277)	1
  (0, 498)	1
  (0, 869)	1
  (0, 631)	1
  (0, 441)	2
  (0, 923)	1
  (0, 310)	2
  (0, 219)	1
  (0, 921)	1
  (0, 170)	1
  (0, 290)	1
  (0, 155)	1
  (0, 502)	1
  (0, 475)	2
  (0, 528)	2
  (0, 215)	2
  (0, 217)	1
  (0, 747)	1


In [9]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(tfs, labels)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

## テストデータの読み込みと予測

In [10]:
data = [l.strip().split('\t') for l in open('test.tsv')]
test_sents, test_labels = list(zip(*data))
tokenized_test_sents = [tagger.parse(sent) for sent in test_sents]
test_tfs = vectorizer.transform(tokenized_test_sents)
predict_labels = lr.predict(test_tfs)
print(predict_labels)

['sports' 'sports' 'sports' 'sports' 'sports' 'sports' 'sports' 'sports'
 'sports' 'sports' 'sports' 'sports' 'sports' 'sports' 'sports' 'sports'
 'sports' 'sports' 'sports' 'sports' 'sports' 'sports' 'sports' 'sports'
 'sports' 'sports' 'sports' 'sports' 'sports' 'sports' 'sports' 'sports'
 'sports' 'sports' 'sports' 'sports' 'sports' 'sports' 'sports' 'sports'
 'sports' 'sports' 'sports' 'sports' 'sports' 'sports' 'sports' 'sports'
 'sports' 'sports' 'sports' 'sports' 'sports' 'sports' 'sports' 'sports'
 'sports' 'sports' 'sports' 'sports' 'sports' 'sports' 'others' 'sports'
 'sports' 'sports' 'sports' 'sports' 'sports' 'sports' 'sports' 'sports'
 'sports' 'sports' 'sports' 'sports' 'sports' 'sports' 'sports' 'sports'
 'sports' 'sports' 'sports' 'sports' 'sports' 'sports' 'sports' 'sports'
 'sports' 'sports' 'sports' 'sports' 'sports' 'sports' 'sports' 'sports'
 'sports' 'sports' 'sports' 'sports' 'others' 'others' 'others' 'others'
 'others' 'others' 'others' 'others' 'others' 'spor

In [11]:
print(np.sum(test_labels == predict_labels) / len(test_labels))

0.985


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=2, max_df=0.5, max_features=1000)
tfidfs = vectorizer.fit_transform(tokenized_sents)

In [13]:
print(tfidfs[0])

  (0, 747)	0.07380085127830598
  (0, 217)	0.09993823831728248
  (0, 215)	0.08959786816243354
  (0, 528)	0.17795114504550474
  (0, 475)	0.18418028856411983
  (0, 502)	0.05676255456918252
  (0, 155)	0.07027789153819866
  (0, 290)	0.049124751732099466
  (0, 170)	0.07256645527774962
  (0, 921)	0.07027789153819866
  (0, 219)	0.06286989812051866
  (0, 310)	0.07915914379680627
  (0, 923)	0.08802341416663526
  (0, 441)	0.20479146898451642
  (0, 631)	0.07825714603561588
  (0, 869)	0.07984446861632172
  (0, 498)	0.07457411460341987
  (0, 277)	0.041849460743490344
  (0, 720)	0.08802341416663526
  (0, 171)	0.07735414394791652
  (0, 181)	0.09356391980772065
  (0, 302)	0.03975560775837314
  (0, 309)	0.062243695658803125
  (0, 704)	0.18445160456104961
  (0, 520)	0.09381868610786533
  :	:
  (0, 303)	0.09634272909183711
  (0, 296)	0.12479735446107872
  (0, 208)	0.07256645527774962
  (0, 676)	0.20655202816707366
  (0, 230)	0.049124751732099466
  (0, 335)	0.10327601408353683
  (0, 135)	0.1000439278253486

In [14]:
lr = LogisticRegression()
lr.fit(tfidfs, labels)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [15]:
data = [l.strip().split('\t') for l in open('test.tsv')]
test_sents, test_labels = list(zip(*data))
tokenized_test_sents = [tagger.parse(sent) for sent in test_sents]
test_tfs = vectorizer.transform(tokenized_test_sents)
predict_labels = lr.predict(test_tfs)
print(predict_labels)

['sports' 'sports' 'sports' 'sports' 'sports' 'sports' 'sports' 'sports'
 'sports' 'sports' 'sports' 'sports' 'sports' 'sports' 'sports' 'sports'
 'sports' 'sports' 'sports' 'sports' 'sports' 'sports' 'sports' 'sports'
 'sports' 'sports' 'sports' 'sports' 'sports' 'sports' 'sports' 'sports'
 'sports' 'sports' 'sports' 'sports' 'sports' 'sports' 'sports' 'sports'
 'sports' 'sports' 'sports' 'sports' 'sports' 'sports' 'sports' 'sports'
 'sports' 'sports' 'sports' 'sports' 'sports' 'sports' 'sports' 'sports'
 'sports' 'sports' 'sports' 'sports' 'sports' 'sports' 'others' 'sports'
 'sports' 'others' 'sports' 'sports' 'sports' 'sports' 'sports' 'sports'
 'sports' 'sports' 'sports' 'sports' 'sports' 'sports' 'sports' 'sports'
 'sports' 'sports' 'sports' 'sports' 'sports' 'sports' 'sports' 'sports'
 'sports' 'sports' 'sports' 'sports' 'sports' 'sports' 'sports' 'sports'
 'sports' 'sports' 'sports' 'sports' 'others' 'others' 'others' 'others'
 'others' 'others' 'others' 'others' 'others' 'spor

In [16]:
print(np.sum(test_labels == predict_labels) / len(test_labels))


0.975
