In [25]:
import numpy as np
import joblib

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from nltk.stem.porter import PorterStemmer as PS
import re
from gensim.models import KeyedVectors

model = KeyedVectors.load_word2vec_format('../ch07/GoogleNews-vectors-negative300.bin', binary=True)
ps = PS() #steming用

train = pd.read_table('train.txt', header=None)
valid = pd.read_table('valid.txt', header=None)
test = pd.read_table('test.txt', header=None)

cols = ['CATEGORY', 'TITLE']
train.columns = cols
valid.columns = cols
test.columns = cols

In [3]:
#出現頻度の高い単語の除去
def remove_stopwords(text):
    #入力と出力は同じstring形式
    stop_words = set(stopwords.words("english")) #stopwords
    
    #文区切りと単語区切りを順に行い　単語をストップワードと照らし合わせる
    period=[]
    for i in re.split("[.]",text):
        word=[j for j in i.split(" ") if j != "" and j not in stop_words]
        period.append(word)
        word=[]
    
    #元の形に復元
    tmp=[]
    for k in period:
        if k != []:
            tmp.append(' '.join(k))

    result='.'.join(tmp)
    return result

In [4]:
def clean_text(text):
    replaced_text = text.lower()
    replaced_text = re.sub(r'[【】]', '', replaced_text)       # 【】の除去
    replaced_text = re.sub(r'[（）()]', '', replaced_text)     # （）の除去
    replaced_text = re.sub(r'[［］\[\]]', '', replaced_text)   # ［］の除去
    replaced_text = re.sub(r'[{}\{\}]', '', replaced_text)   #  {}
    replaced_text = re.sub(r'[@＠]\w+', '', replaced_text)  # メンションの除去
    replaced_text = re.sub(r'https?:\/\/.*?[\r\n ]', '', replaced_text)  # URLの除去
    replaced_text = re.sub(r'\-',' ', replaced_text) #ハイフンは空白へ
    replaced_text = re.sub(r'　', ' ', replaced_text) # 全角空白の除去
    replaced_text = re.sub(r'  ', '', replaced_text) #２連続の半角空白を1つに
    replaced_text = re.sub(r'\d+\.\d+','', replaced_text) #小数点を含む数列の除去
    replaced_text = re.sub(r'\;','', replaced_text) #セミコロン
    replaced_text = re.sub(r'\:','', replaced_text) #コロン
    replaced_text = re.sub(r'\'','', replaced_text) #クオーテーション
    replaced_text = re.sub(r'\`','', replaced_text) #クオーテーション
    replaced_text = re.sub(r'\,','', replaced_text) #カンマ
    replaced_text = re.sub(r'\_','', replaced_text) #アンダーバー
    replaced_text = re.sub(r'\\','', replaced_text) #バックスラッシュ
    replaced_text = re.sub(r'\?','', replaced_text) #クエスチョン
    replaced_text = re.sub(r'\!','', replaced_text) #感嘆符　（ピリオドど同様に文末に使われる）
    replaced_text = re.sub(r'\+','', replaced_text) #プラス
    replaced_text = re.sub(r'\*','', replaced_text) #アスタリスク
    replaced_text = re.sub(r'\/','', replaced_text) #スラッシュ
    replaced_text = re.sub(r'\<','', replaced_text) #小なり
    replaced_text = re.sub(r'\>','', replaced_text) #大なり
    replaced_text = re.sub(r'\=','', replaced_text) #イコール
    replaced_text = re.sub(r'\%','', replaced_text) #パーセント
    replaced_text = re.sub(r'\&','', replaced_text) #アンパサント
    replaced_text = re.sub(r'\$','', replaced_text) #ドル
    replaced_text = re.sub(r'\#','', replaced_text) #シャープ
    replaced_text = re.sub(r'.\..\.','', replaced_text) #U.S. or U.N.の除去
    #replaced_text = re.sub(r'\d+','', replaced_text) #数列の除去
    replaced_text = re.sub(r'\d{1,3}','', replaced_text) #1から3桁の数列の除去
    replaced_text = re.sub(r'\d{5,}','', replaced_text) #5桁以上の数列の除去
    replaced_text = ps.stem("%s" % replaced_text) #語幹抽出
    return remove_stopwords(replaced_text)

In [5]:
train["shaped_txt"] = list(map(clean_text,train["TITLE"]))
valid["shaped_txt"] = list(map(clean_text,valid["TITLE"]))
test["shaped_txt"] = list(map(clean_text,test["TITLE"]))

In [6]:
train["type"] = "train"
valid["type"] = "valid"
test["type"] = "test"

In [7]:
data = pd.concat([train, valid, test]).reset_index(drop=True)

In [10]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le = le.fit(data['CATEGORY'])
data['label'] = le.transform(data['CATEGORY'])

In [11]:
data

Unnamed: 0,CATEGORY,TITLE,shaped_txt,type,label
0,m,The Flu Tricked Google. Can Wikipedia Do Better?,flu tricked google.wikipedia bett,train,2
1,b,UPDATE 2-ECB's Constancio watching more than j...,update ecbs constancio watching april inflatio...,train,0
2,t,Amazon to unveil smartphone in time for holida...,amazon unveil smartphone time holidayswsj,train,3
3,e,Mighty Morphin Power Rangers - Power Rangers F...,mighty morphin power rangers power rangers fea...,train,1
4,m,Biogen Idec wins Canadian approval for hemophi...,biogen idec wins canadian approval hemophilia ...,train,2
...,...,...,...,...,...
10880,m,STOCKS NEWS EUROPE-Novartis jumps on positive ...,stocks news europe novartis jumps positive hea...,test,2
10881,e,Bachelor Juan Pablo Galavis chooses Nikki Ferr...,bachelor juan pablo galavis chooses nikki ferr...,test,1
10882,b,Kingfisher Starts Returning Cash as Confidence...,kingfisher starts returning cash confidence ou...,test,0
10883,t,EPA says Ford to correct fuel economy standard...,epa says ford correct fuel economy standard si...,test,3


In [15]:
df_dict = {}
for name, group in data.groupby('type'):
    df_dict[name] = group

In [17]:
df_dict['train']

Unnamed: 0,CATEGORY,TITLE,shaped_txt,type,label
0,m,The Flu Tricked Google. Can Wikipedia Do Better?,flu tricked google.wikipedia bett,train,2
1,b,UPDATE 2-ECB's Constancio watching more than j...,update ecbs constancio watching april inflatio...,train,0
2,t,Amazon to unveil smartphone in time for holida...,amazon unveil smartphone time holidayswsj,train,3
3,e,Mighty Morphin Power Rangers - Power Rangers F...,mighty morphin power rangers power rangers fea...,train,1
4,m,Biogen Idec wins Canadian approval for hemophi...,biogen idec wins canadian approval hemophilia ...,train,2
...,...,...,...,...,...
8703,b,Delaware Probing Tilting Bridge Clogging Inter...,delaware probing tilting bridge clogging inter...,train,0
8704,m,African Camels Show MERS Virus Is More Widespr...,african camels show mers virus widespread believ,train,2
8705,t,"UPDATE 3-GM recalls half million Camaros, safe...",update gm recalls half million camaros safety ...,train,3
8706,b,REFILE-Euro zone manufacturing growth eases in...,refile euro zone manufacturing growth eases ju...,train,0


In [52]:
X_train = []
for i in df_dict['train']['shaped_txt']:
    X_train.append(np.mean([model[i] for i in i.split() if i in model.vocab],axis=0))
X_train = np.array(X_train)

y_train = np.array(df_dict['train']['label'])

joblib.dump(X_train,'X_train.joblib')
joblib.dump(y_train,'y_train.joblib')

['y_train.joblib']

In [53]:
X_valid = []
for i in df_dict['valid']['shaped_txt']:
    X_valid.append(np.mean([model[i] for i in i.split() if i in model.vocab],axis=0))
X_valid = np.array(X_valid)

y_valid = np.array(df_dict['valid']['label'])

joblib.dump(X_valid,'X_valid.joblib')
joblib.dump(y_valid,'y_valid.joblib')

['y_valid.joblib']

In [54]:
X_test = []
for i in df_dict['test']['shaped_txt']:
    X_test.append(np.mean([model[i] for i in i.split() if i in model.vocab],axis=0))
X_test = np.array(X_test)

y_test = np.array(df_dict['test']['label'])

joblib.dump(X_test,'X_test.joblib')
joblib.dump(y_test,'y_test.joblib')

['y_test.joblib']