In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

train = pd.read_table('train.txt', header=None)
valid = pd.read_table('valid.txt', header=None)
test = pd.read_table('test.txt', header=None)

cols = ['CATEGORY', 'TITLE']
train.columns = cols
valid.columns = cols
test.columns = cols

In [2]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from nltk.stem.porter import PorterStemmer as PS
import re
ps = PS() #steming用

In [3]:
#出現頻度の高い単語の除去
def remove_stopwords(text):
    #入力と出力は同じstring形式
    stop_words = set(stopwords.words("english")) #stopwords
    
    #文区切りと単語区切りを順に行い　単語をストップワードと照らし合わせる
    period=[]
    for i in re.split("[.]",text):
        word=[j for j in i.split(" ") if j != "" and j not in stop_words]
        period.append(word)
        word=[]
    
    #元の形に復元
    tmp=[]
    for k in period:
        if k != []:
            tmp.append(' '.join(k))

    result='.'.join(tmp)
    return result

In [4]:
def clean_text(text):
    replaced_text = text.lower()
    replaced_text = re.sub(r'[【】]', '', replaced_text)       # 【】の除去
    replaced_text = re.sub(r'[（）()]', '', replaced_text)     # （）の除去
    replaced_text = re.sub(r'[［］\[\]]', '', replaced_text)   # ［］の除去
    replaced_text = re.sub(r'[{}\{\}]', '', replaced_text)   #  {}
    replaced_text = re.sub(r'[@＠]\w+', '', replaced_text)  # メンションの除去
    replaced_text = re.sub(r'https?:\/\/.*?[\r\n ]', '', replaced_text)  # URLの除去
    replaced_text = re.sub(r'\-',' ', replaced_text) #ハイフンは空白へ
    replaced_text = re.sub(r'　', ' ', replaced_text) # 全角空白の除去
    replaced_text = re.sub(r'  ', '', replaced_text) #２連続の半角空白を1つに
    replaced_text = re.sub(r'\d+\.\d+','', replaced_text) #小数点を含む数列の除去
    replaced_text = re.sub(r'\;','', replaced_text) #セミコロン
    replaced_text = re.sub(r'\:','', replaced_text) #コロン
    replaced_text = re.sub(r'\'','', replaced_text) #クオーテーション
    replaced_text = re.sub(r'\`','', replaced_text) #クオーテーション
    replaced_text = re.sub(r'\,','', replaced_text) #カンマ
    replaced_text = re.sub(r'\_','', replaced_text) #アンダーバー
    replaced_text = re.sub(r'\\','', replaced_text) #バックスラッシュ
    replaced_text = re.sub(r'\?','', replaced_text) #クエスチョン
    replaced_text = re.sub(r'\!','', replaced_text) #感嘆符　（ピリオドど同様に文末に使われる）
    replaced_text = re.sub(r'\+','', replaced_text) #プラス
    replaced_text = re.sub(r'\*','', replaced_text) #アスタリスク
    replaced_text = re.sub(r'\/','', replaced_text) #スラッシュ
    replaced_text = re.sub(r'\<','', replaced_text) #小なり
    replaced_text = re.sub(r'\>','', replaced_text) #大なり
    replaced_text = re.sub(r'\=','', replaced_text) #イコール
    replaced_text = re.sub(r'\%','', replaced_text) #パーセント
    replaced_text = re.sub(r'\&','', replaced_text) #アンパサント
    replaced_text = re.sub(r'\$','', replaced_text) #ドル
    replaced_text = re.sub(r'\#','', replaced_text) #シャープ
    replaced_text = re.sub(r'.\..\.','', replaced_text) #U.S. or U.N.の除去
    #replaced_text = re.sub(r'\d+','', replaced_text) #数列の除去
    replaced_text = re.sub(r'\d{1,3}','', replaced_text) #1から3桁の数列の除去
    replaced_text = re.sub(r'\d{5,}','', replaced_text) #5桁以上の数列の除去
    replaced_text = ps.stem("%s" % replaced_text) #語幹抽出
    return remove_stopwords(replaced_text)

In [5]:
train["shaped_txt"] = list(map(clean_text,train["TITLE"]))
valid["shaped_txt"] = list(map(clean_text,valid["TITLE"]))
test["shaped_txt"] = list(map(clean_text,test["TITLE"]))

In [6]:
train["type"] = "train"
valid["type"] = "valid"
test["type"] = "test"

In [7]:
data = pd.concat([train, valid, test]).reset_index(drop=True)

In [8]:
data

Unnamed: 0,CATEGORY,TITLE,shaped_txt,type
0,m,The Flu Tricked Google. Can Wikipedia Do Better?,flu tricked google.wikipedia bett,train
1,b,UPDATE 2-ECB's Constancio watching more than j...,update ecbs constancio watching april inflatio...,train
2,t,Amazon to unveil smartphone in time for holida...,amazon unveil smartphone time holidayswsj,train
3,e,Mighty Morphin Power Rangers - Power Rangers F...,mighty morphin power rangers power rangers fea...,train
4,m,Biogen Idec wins Canadian approval for hemophi...,biogen idec wins canadian approval hemophilia ...,train
...,...,...,...,...
10880,m,STOCKS NEWS EUROPE-Novartis jumps on positive ...,stocks news europe novartis jumps positive hea...,test
10881,e,Bachelor Juan Pablo Galavis chooses Nikki Ferr...,bachelor juan pablo galavis chooses nikki ferr...,test
10882,b,Kingfisher Starts Returning Cash as Confidence...,kingfisher starts returning cash confidence ou...,test
10883,t,EPA says Ford to correct fuel economy standard...,epa says ford correct fuel economy standard si...,test


In [9]:
vectorizer = CountVectorizer()

In [10]:
txt_vec = vectorizer.fit_transform(data["shaped_txt"])

In [11]:
data = pd.concat([data, pd.DataFrame(txt_vec.toarray())], axis=1)

In [12]:
data.head()

Unnamed: 0,CATEGORY,TITLE,shaped_txt,type,0,1,2,3,4,5,...,14012,14013,14014,14015,14016,14017,14018,14019,14020,14021
0,m,The Flu Tricked Google. Can Wikipedia Do Better?,flu tricked google.wikipedia bett,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,b,UPDATE 2-ECB's Constancio watching more than j...,update ecbs constancio watching april inflatio...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,t,Amazon to unveil smartphone in time for holida...,amazon unveil smartphone time holidayswsj,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,e,Mighty Morphin Power Rangers - Power Rangers F...,mighty morphin power rangers power rangers fea...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,m,Biogen Idec wins Canadian approval for hemophi...,biogen idec wins canadian approval hemophilia ...,train,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
train = data.query('type=="train"').drop(cols + ['type','shaped_txt'], axis=1)
valid = data.query('type=="valid"').drop(cols + ['type','shaped_txt'], axis=1)
test = data.query('type=="test"').drop(cols + ['type','shaped_txt'], axis=1)

train.to_csv('train.feature.txt', sep='\t', index=False, header=None)
valid.to_csv('valid.feature.txt', sep='\t', index=False, header=None)
test.to_csv('test.feature.txt', sep='\t', index=False, header=None)

In [16]:
train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14012,14013,14014,14015,14016,14017,14018,14019,14020,14021
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8703,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8704,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8705,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8706,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
