In [None]:
# install MeCab
!apt-get -q -y install sudo file mecab libmecab-dev mecab-ipadic-utf8 git curl python-mecab > /dev/null
!git clone --depth 1 https://github.com/neologd/mecab-ipadic-neologd.git > /dev/null 
!echo yes | mecab-ipadic-neologd/bin/install-mecab-ipadic-neologd -n > /dev/null 2>&1
!pip install mecab-python3 > /dev/null

In [None]:
# check path to "ipadic-neologd" 
!echo `mecab-config --dicdir`"/mecab-ipadic-neologd"

In [2]:
!pip install japanize-matplotlib

Collecting japanize-matplotlib
[?25l  Downloading https://files.pythonhosted.org/packages/ac/6d/ac9891b0b2c76b9a2cee6302c3083f7992a9249583cd2b611cb9c7e2fb34/japanize-matplotlib-1.0.4.tar.gz (4.1MB)
[K     |████████████████████████████████| 4.1MB 3.5MB/s 
[?25hBuilding wheels for collected packages: japanize-matplotlib
  Building wheel for japanize-matplotlib (setup.py) ... [?25l[?25hdone
  Created wheel for japanize-matplotlib: filename=japanize_matplotlib-1.0.4-cp36-none-any.whl size=4118717 sha256=5222d3dbf0a98a989af57f74a922b5b3cc236146cde8dfadec00772127ed9b85
  Stored in directory: /root/.cache/pip/wheels/47/d1/ba/1a686af7cc042edde2c2f4cb18bd981f7eefdcbfe688590d25
Successfully built japanize-matplotlib
Installing collected packages: japanize-matplotlib
Successfully installed japanize-matplotlib-1.0.4


In [1]:
# Google Driveへマウント
from google.colab import drive
drive.mount('/content/drive')
# 必要ならば以下のようにディレクトリ移動する
%cd /content/drive/'My Drive'/'Google Colab'/'industryPrediction'/

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive
/content/drive/My Drive/Google Colab/industryPrediction


In [0]:
# データ分析・操作用ライブラリ
import numpy as np
import pandas as pd

# NLP用ライブラリ
import MeCab,re
from sklearn.feature_extraction.text import TfidfVectorizer

# スコア評価用ライブラリ
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_recall_fscore_support

# エラー表示の抑制
import warnings

In [0]:
# 表示関連
# DataFrameの列数設定
pd.set_option('display.max_columns', 500)
pd.set_option("display.max_rows", 500)
warnings.filterwarnings('ignore')

In [6]:
# テキストデータの読み込み
wikiData = pd.read_csv("csv/train_cleaned.csv", encoding="utf-8")

# 読み込みデータの表示
wikiData.head()

Unnamed: 0,text,target
0,ドール・フード・カンパニー ドール・フード・カンパニー（Dole Food Compan...,1
1,南ヶ丘牧 南ヶ丘牧（みなみがおかぼくじょう）は、栃那須原に拠を置く酪農経営企。現在4つ...,1
2,岩瀬牧 岩瀬牧（いわせぼくじょう）は、福島岩瀬郡鏡石にある牧である。 鏡石シンボルソン...,1
3,"クラギ クラギ（くらぎ、""KURAGI Co.Ltd."" ）は、重を心に「農」を展開する...",1
4,"ハッピーネモファーム ハッピーネモファーム（""""）は、北海浦河郡浦河にある競走馬（サラブ...",1


In [0]:
# Numpy配列へ変換
X = wikiData.text.values
y = wikiData.target.values

In [0]:
# トレーニングデータ、テストデータの分割
X_train, X_test, y_train, y_test = train_test_split(X , y, test_size=0.2, random_state=42, stratify=y)

In [0]:
# 形態素解析用関数
def tokenize(text):
    tokens = []
    tagger = MeCab.Tagger( "-Ochasen" )
    tagger.parse("")
    node = tagger.parseToNode(text)
    while node:
        if node.feature.split(",")[0] == "名詞":
                replace_node = re.sub( re.compile( "[!-/:-@[-`{-~]" ), "", node.surface )
                if replace_node != "" and replace_node != " ":
                    tokens.append(replace_node)
        node = node.next
    return tokens

In [0]:
# 形態素解析を実行
vectorizer = TfidfVectorizer(tokenizer=tokenize)
vectorizer.fit(X)
train_matrix = vectorizer.transform(X_train)
test_matrix = vectorizer.transform(X_test)

In [17]:
# ナイーブベイズ
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()
clf.fit(train_matrix, y_train)
cv_scores = cross_val_score(clf, train_matrix, y_train, cv=5)

print("Training score：" + str(clf.score(train_matrix, y_train)))
print("Cross-Validation score：" + str(np.mean(cv_scores)))
print("Test score：" + str(clf.score(test_matrix, y_test)))

Training score：0.49214659685863876
Cross-Validation score：0.4014950540418466
Test score：0.39965095986038396


In [18]:
# ランダムフォレスト
from sklearn.ensemble import RandomForestClassifier

clf2 = RandomForestClassifier(n_estimators=50,random_state=42)
clf2.fit(train_matrix, y_train)
cv_scores = cross_val_score(clf2, train_matrix, y_train, cv=5)

print("Training score：" + str(clf2.score(train_matrix, y_train)))
print("Cross-Validation score：" + str(np.mean(cv_scores)))
print("Test score：" + str(clf2.score(test_matrix, y_test)))

Training score：1.0
Cross-Validation score：0.6622117787799463
Test score：0.6570680628272252


In [0]:
# Light GBM
import lightgbm as lgb

clf3 = lgb.LGBMClassifier(objective='multiclass', num_class=33)
clf3.fit(train_matrix, y_train)
cv_scores = cross_val_score(clf3, train_matrix, y_train, cv=5)

print("Training score：" + str(clf3.score(train_matrix, y_train)))
print("Cross-Validation score：" + str(np.mean(cv_scores)))
print("Test score：" + str(clf3.score(test_matrix, y_test)))

In [0]:
# SVM
from sklearn.svm import SVC

## パラメーター設定
C = 1.
kernel = 'rbf'
gamma  = 0.01

## One-versus-the-rest
clf4 = SVC(C=C, kernel=kernel, gamma=gamma, decision_function_shape='ovr')
clf4.fit(train_matrix, y_train)
cv_scores = cross_val_score(clf4, train_matrix, y_train, cv=5)

print("Training score：" + str(clf4.score(train_matrix, y_train)))
print("Cross-Validation score：" + str(np.mean(cv_scores)))
print("Test score：" + str(clf4.score(test_matrix, y_test)))


## One-versus-the-one(multi-class default)
clf5 = SVC(C=C, kernel=kernel, gamma=gamma, decision_function_shape='ovo')
clf5.fit(train_matrix, y_train)
cv_scores = cross_val_score(clf5, train_matrix, y_train, cv=5)

print("Training score：" + str(clf5.score(train_matrix, y_train)))
print("Cross-Validation score：" + str(np.mean(cv_scores)))
print("Test score：" + str(clf5.score(test_matrix, y_test)))

Training score：0.206806282723
Cross-Validation score：0.188141785068
Test score：0.200261780105
Training score：0.206806282723
Cross-Validation score：0.188141785068
Test score：0.200261780105
