In [6]:
#coding:utf-8

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Problem 50: Download and Preprocess Dataset

In [None]:
#データのダウンロードと解凍
!wget https://archive.ics.uci.edu/ml/datasets/News+Aggregator
!unzip NewsAggregatorDataset.zip

In [3]:
#1 readme.txt
!cat readme.txt

SUMMARY: Dataset of references (urls) to news web pages

DESCRIPTION: Dataset of references to news web pages collected from an online aggregator in the period from March 10 to August 10 of 2014. The resources are grouped into clusters that represent pages discussing the same news story. The dataset includes also references to web pages that point (has a link to) one of the news page in the collection.

TAGS: web pages, news, aggregator, classification, clustering

LICENSE: Public domain - Due to restrictions on content and use of the news sources, the corpus is limited to web references (urls) to web pages and does not include any text content. The references have been retrieved from the news aggregator through traditional web browsers. 

FILE ENCODING: UTF-8

FORMAT: Tab delimited CSV files. 

DATA SHAPE AND STATS: 422937 news pages and divided up into:

152746 	news of business category
108465 	news of science and technology category
115920 	news of business category

In [32]:
#2 publisherの抽出

attr = ['id','title','url','publisher','category','story','hostname','timestamp']

df = pd.read_csv('newsCorpora.csv',delimiter='\t',header=None, names=attr)
exdf = df[df['publisher'].isin(['Reuters', 'Huffington Post', 'Businessweek', 'Contactmusic.com', 'Daily Mail'])]

#shuffleしてtrain:validation:test=8:1:1に分割
train, test = train_test_split(exdf, test_size=0.2)
val, test = train_test_split(test, test_size=0.5)

#ファイル出力
train[['title','category']].to_csv('train.txt', sep='\t')
val[['title','category']].to_csv('valid.txt', sep='\t')
test[['title','category']].to_csv('test.txt', sep='\t')

#各カテゴリの事例数確認
print('train:\n', train['category'].value_counts())
print('val:\n', val['category'].value_counts())
print('test:\n', test['category'].value_counts())

train:
 b    4469
e    4244
t    1222
m     737
Name: category, dtype: int64
val:
 b    576
e    533
t    141
m     84
Name: category, dtype: int64
test:
 b    582
e    502
t    161
m     89
Name: category, dtype: int64


# Problem 51: Feature extraction

In [85]:
#試したけどよく分からんかったやつ

# stop_wordsの設定
stop_words = nltk.corpus.stopwords.words('english')
symbol = ["'", '"', ':', ';', '.', ',', '-', '!', '?', "'s"]

feature_words = set()
for string in list(df['title']):
    words = nltk.word_tokenize(string)
    clean_words = [w.lower() for w in words if w.lower() not in stop_words+symbol]
    [feature_words.add(w) for w in clean_words]
    
print(len(feature_words))

78647


In [109]:
train.append(val)

Unnamed: 0,id,title,url,publisher,category,story,hostname,timestamp
71201,71277,GM's Barra Faces New Allegations Added to Corv...,http://www.businessweek.com/news/2014-03-31/gm...,Businessweek,t,dLuV3nUomf1hshMPo13W-T3R4PqIM,www.businessweek.com,1396286891876
83036,83112,9 Ways To Walk More On National Walking Day,http://www.huffingtonpost.com/2014/04/02/walk-...,Huffington Post,m,dn9fvRMu49UtO4MVqJs-zSBgqqM8M,www.huffingtonpost.com,1396443419865
128358,128694,The service will be done in partnership with E...,http://www.huffingtonpost.com/huff-wires/20140...,Huffington Post,b,djTJEXdELNBQs8MEfG5eLPI4Ez7jM,www.huffingtonpost.com,1397765133449
56754,56755,Walmart Suing Visa Over Credit Card Swipe Fees,http://www.huffingtonpost.com/2014/03/27/walma...,Huffington Post,b,d_hW9mavhlXZ9mMP2sSzq7LOou5ZM,www.huffingtonpost.com,1396012571448
158735,159071,US STOCKS-Apple lifts Nasdaq; Ukraine drags on...,http://in.reuters.com/article/2014/04/24/marke...,Reuters,b,dyDpOQaVwycXqTMjxqQnnujCnxu2M,in.reuters.com,1398393136881
394175,394694,UPDATE 2-China regulator announces anti-monopo...,http://www.reuters.com/article/2014/07/29/micr...,Reuters,t,d-mbL8_DqBYvEdMBqoh-6GQ42xlLM,www.reuters.com,1406644091934
99393,99590,Ben Savage and Danielle Fishel reunite in trai...,http://www.dailymail.co.uk/tvshowbiz/article-2...,Daily Mail,e,dLiDSAjVbSgSD_MAv1rhmLRWqxRhM,www.dailymail.co.uk,1397242837205
376454,376914,Cynk Surges 36000% as Buzz Builds for 1-Employ...,http://www.businessweek.com/news/2014-07-10/cy...,Businessweek,b,dax1Q6Cmjrv68KMcD_I6EYbFqvo5M,www.businessweek.com,1405030322647
223321,223767,US STOCKS SNAPSHOT-Wall St ends higher; Intern...,http://in.reuters.com/article/2014/05/19/marke...,Reuters,b,dOffeaHt7dLDo1McbDBnUNC_uoNBM,in.reuters.com,1400580001535
405043,405562,WTO Talks Near Collapse Over India's Objections,http://www.businessweek.com/news/2014-07-31/wt...,Businessweek,b,dE0u6ysVPTmsiJM9qzLDFJuQ-0soM,www.businessweek.com,1406849439513


In [119]:
from sklearn.feature_extraction.text import TfidfVectorizer

train_val = train.append(val)

vectorizer = TfidfVectorizer(min_df=10)
X = vectorizer.fit_transform(train_val['title'])

X_train = vectorizer.transform(train['title'])
X_val = vectorizer.transform(val['title'])
X_test = vectorizer.transform(test['title'])

X_train = pd.DataFrame(X_train.toarray(), columns=vectorizer.get_feature_names())
X_val = pd.DataFrame(X_val.toarray(), columns=vectorizer.get_feature_names())
X_test = pd.DataFrame(X_test.toarray(), columns=vectorizer.get_feature_names())

X_train.to_csv('train.feature.txt', sep='\t', index=False)
X_val.to_csv('valid.feature.txt', sep='\t', index=False)
X_test.to_csv('test.feature.txt', sep='\t', index=False)

In [121]:
X_train.head()

Unnamed: 0,10,100,11,12,13,14,15,16,17,18,...,yields,york,you,young,your,yr,yuan,zac,zendaya,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Problem 52: Training

# Problem 53: Prediction

# Problem 54: Accuracy score

# Problem 55: Confusion matrix

# Problem 56: Precision, recall and F1 score

# Problem 57: Feature weights

# Problem 58: Regularization

# Problem 59: Hyper-parameter tuning