In [1]:
import MeCab,re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import pandas as pd
import warnings

In [2]:
# 表示関連
# DataFrameの列数設定
pd.set_option('display.max_columns', 500)
pd.set_option("display.max_rows", 500)
warnings.filterwarnings('ignore')

In [3]:
# テキストデータの読み込み
wikiData = pd.read_csv("csv/train.csv", encoding="utf-8")

# 読み込みデータの表示
wikiData.head()

Unnamed: 0,text,target
0,ドール・フード・カンパニー ドール・フード・カンパニー（Dole Food Compan...,1
1,南ヶ丘牧場 株式会社 南ヶ丘牧場（みなみがおかぼくじょう）は、栃木県那須高原に本拠を置く...,1
2,岩瀬牧場 岩瀬牧場（いわせぼくじょう）は、日本の福島県岩瀬郡鏡石町にある牧場である。 ...,1
3,"クラギ クラギ株式会社（くらぎ、""KURAGI Co.Ltd."" ）は、三重県を中心に「...",1
4,"ハッピーネモファーム 株式会社ハッピーネモファーム（""""）は、北海道浦河郡浦河町にある競...",1


In [4]:
X = wikiData.text.values
y = wikiData.target.values

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X , y, test_size=0.2, random_state=42, stratify=y)

In [6]:
def tokenize(text):
    tokens = []
    tagger = MeCab.Tagger( "-Ochasen" )
    node = tagger.parseToNode(text)
    while node:
        if node.feature.split(",")[0] == "名詞":
                replace_node = re.sub( re.compile( "[!-/:-@[-`{-~]" ), "", node.surface )
                if replace_node != "" and replace_node != " ":
                    tokens.append(replace_node)
        node = node.next
    return tokens

In [7]:
vectorizer = TfidfVectorizer(tokenizer=tokenize)
train_matrix = vectorizer.fit_transform(X_train)
test_matrix = vectorizer.transform(X_test)

In [8]:
clf = MultinomialNB()
clf.fit(train_matrix, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [9]:
print(clf.score(train_matrix, y_train))
print(clf.score(test_matrix, y_test))

0.478403141361
0.389616055846


In [10]:
from sklearn.ensemble import RandomForestClassifier

clf2 = RandomForestClassifier(n_estimators=50,random_state=42)
clf2.fit(train_matrix, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [11]:
print(clf2.score(train_matrix, y_train))
print(clf2.score(test_matrix, y_test))

1.0
0.650959860384


In [12]:
import lightgbm as lgb

clf3 = lgb.LGBMClassifier(objective='multiclass', num_class=33)
clf3.fit(train_matrix, y_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               learning_rate=0.1, max_depth=-1, min_child_samples=20,
               min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
               n_jobs=-1, num_class=33, num_leaves=31, objective='multiclass',
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [13]:
print(clf3.score(train_matrix, y_train))
print(clf3.score(test_matrix, y_test))

0.999672774869
0.742582897033
