In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

train = pd.read_table('train.txt', header=None)
valid = pd.read_table('valid.txt', header=None)
test = pd.read_table('test.txt', header=None)

cols = ['CATEGORY', 'TITLE']
train.columns = cols
valid.columns = cols
test.columns = cols

In [2]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from nltk.stem.porter import PorterStemmer as PS
import re
ps = PS() #steming用

In [3]:
#出現頻度の高い単語の除去
def remove_stopwords(text):
    #入力と出力は同じstring形式
    stop_words = set(stopwords.words("english")) #stopwords
    
    #文区切りと単語区切りを順に行い　単語をストップワードと照らし合わせる
    period=[]
    for i in re.split("[.]",text):
        word=[j for j in i.split(" ") if j != "" and j not in stop_words]
        period.append(word)
        word=[]
    
    #元の形に復元
    tmp=[]
    for k in period:
        if k != []:
            tmp.append(' '.join(k))

    result='.'.join(tmp)
    return result

In [4]:
def clean_text(text):
    replaced_text = text.lower()
    replaced_text = re.sub(r'[【】]', '', replaced_text)       # 【】の除去
    replaced_text = re.sub(r'[（）()]', '', replaced_text)     # （）の除去
    replaced_text = re.sub(r'[［］\[\]]', '', replaced_text)   # ［］の除去
    replaced_text = re.sub(r'[{}\{\}]', '', replaced_text)   #  {}
    replaced_text = re.sub(r'[@＠]\w+', '', replaced_text)  # メンションの除去
    replaced_text = re.sub(r'https?:\/\/.*?[\r\n ]', '', replaced_text)  # URLの除去
    replaced_text = re.sub(r'\-',' ', replaced_text) #ハイフンは空白へ
    replaced_text = re.sub(r'　', ' ', replaced_text) # 全角空白の除去
    replaced_text = re.sub(r'  ', '', replaced_text) #２連続の半角空白を1つに
    replaced_text = re.sub(r'\d+\.\d+','', replaced_text) #小数点を含む数列の除去
    replaced_text = re.sub(r'\;','', replaced_text) #セミコロン
    replaced_text = re.sub(r'\:','', replaced_text) #コロン
    replaced_text = re.sub(r'\'','', replaced_text) #クオーテーション
    replaced_text = re.sub(r'\`','', replaced_text) #クオーテーション
    replaced_text = re.sub(r'\,','', replaced_text) #カンマ
    replaced_text = re.sub(r'\_','', replaced_text) #アンダーバー
    replaced_text = re.sub(r'\\','', replaced_text) #バックスラッシュ
    replaced_text = re.sub(r'\?','', replaced_text) #クエスチョン
    replaced_text = re.sub(r'\!','', replaced_text) #感嘆符　（ピリオドど同様に文末に使われる）
    replaced_text = re.sub(r'\+','', replaced_text) #プラス
    replaced_text = re.sub(r'\*','', replaced_text) #アスタリスク
    replaced_text = re.sub(r'\/','', replaced_text) #スラッシュ
    replaced_text = re.sub(r'\<','', replaced_text) #小なり
    replaced_text = re.sub(r'\>','', replaced_text) #大なり
    replaced_text = re.sub(r'\=','', replaced_text) #イコール
    replaced_text = re.sub(r'\%','', replaced_text) #パーセント
    replaced_text = re.sub(r'\&','', replaced_text) #アンパサント
    replaced_text = re.sub(r'\$','', replaced_text) #ドル
    replaced_text = re.sub(r'\#','', replaced_text) #シャープ
    replaced_text = re.sub(r'.\..\.','', replaced_text) #U.S. or U.N.の除去
    #replaced_text = re.sub(r'\d+','', replaced_text) #数列の除去
    replaced_text = re.sub(r'\d{1,3}','', replaced_text) #1から3桁の数列の除去
    replaced_text = re.sub(r'\d{5,}','', replaced_text) #5桁以上の数列の除去
    replaced_text = ps.stem("%s" % replaced_text) #語幹抽出
    return remove_stopwords(replaced_text)

In [5]:
train["shaped_txt"] = list(map(clean_text,train["TITLE"]))
valid["shaped_txt"] = list(map(clean_text,valid["TITLE"]))
test["shaped_txt"] = list(map(clean_text,test["TITLE"]))

In [6]:
train["type"] = "train"
valid["type"] = "valid"
test["type"] = "test"

In [7]:
data = pd.concat([train, valid, test]).reset_index(drop=True)

In [8]:
vectorizer = CountVectorizer()

In [9]:
txt_vec = vectorizer.fit_transform(data["shaped_txt"])

In [10]:
data = pd.concat([data, pd.DataFrame(txt_vec.toarray())], axis=1)

In [11]:
train = data.query('type=="train"').drop(cols + ['type','shaped_txt'], axis=1)
valid = data.query('type=="valid"').drop(cols + ['type','shaped_txt'], axis=1)
test = data.query('type=="test"').drop(cols + ['type','shaped_txt'], axis=1)

In [22]:
features = vectorizer.get_feature_names()

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
X_train = train

y_train = pd.read_table('train.txt', header=None)

y_train[0]
le = LabelEncoder()
le = le.fit(y_train[0])
y_train = le.transform(y_train[0])

clf = LogisticRegression(penalty='l2', solver='sag', random_state=0)
clf.fit(X_train, y_train)



LogisticRegression(random_state=0, solver='sag')

In [21]:
len(clf.coef_[0])

14022

In [30]:
for i in clf.coef_:
    ind_ = [(m,l) for l,m in enumerate(i)]
    top_lst = [features[int(j[1])] for j in sorted(ind_)[:10]]
    print(top_lst)
    bottom_lst = [features[int(k[1])] for k in sorted(ind_,reverse=True)[:10]]
    print(bottom_lst)

['video', 'activision', 'ebola', 'aereo', 'fcc', 'nintendo', 'samsung', 'microsoft', 'sony', 'sprint']
['bank', 'fed', 'ecb', 'stocks', 'china', 'euro', 'dollar', 'growth', 'fitch', 'ukraine']
['google', 'china', 'facebook', 'gm', 'billion', 'scientists', 'buy', 'ceo', 'sales', 'data']
['kardashian', 'chris', 'paul', 'miley', 'cyrus', 'film', 'kim', 'transformers', 'george', 'spotify']
['ceo', 'gm', 'climate', 'twitter', 'apple', 'profit', 'costs', 'deal', 'bank', 'amazon']
['ebola', 'drug', 'fda', 'cancer', 'mers', 'virus', 'cigarettes', 'cdc', 'study', 'alzheimers']
['stocks', 'grows', 'drug', 'american', 'shares', 'raise', 'spotify', 'wants', 'york', 'second']
['google', 'microsoft', 'apple', 'facebook', 'fcc', 'nasa', 'comcast', 'tesla', 'gm', 'climate']
