In [39]:
import MeCab
import math

In [5]:
wakati=MeCab.Tagger("-Owakati")
sentence_wakati = wakati.parse("私は猫が好きです").split()
print(sentence_wakati)

['私', 'は', '猫', 'が', '好き', 'です']


In [4]:
wakati=MeCab.Tagger("-Ochasen")
sentence_wakati = wakati.parse("私は猫が好きです").split()
print(sentence_wakati)

['私', 'ワタシ', '私', '名詞-代名詞-一般', 'は', 'ハ', 'は', '助詞-係助詞', '猫', 'ネコ', '猫', '名詞-一般', 'が', 'ガ', 'が', '助詞-格助詞-一般', '好き', 'スキ', '好き', '名詞-形容動詞語幹', 'です', 'デス', 'です', '助動詞', '特殊・デス', '基本形', 'EOS']


In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [7]:
train_data = pd.read_csv("./data/train.csv")

In [8]:
train_data.head()


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [9]:
train_data.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [11]:
xi, yi = train_data.shape

In [31]:
print("xi: 行数:{}".format(xi))
print("yi: 列数:{}".format(yi))

xi: 行数:7613
yi: 列数:5


In [42]:
w_list = []
labels = []

for i in range(xi):
   w = train_data["text"][i].replace("¥n", ' ')
   w_list.append(w)
   labels.append(train_data["target"][i])

In [43]:
import MeCab

mecab = MeCab.Tagger("mecabrc")

def tokenize(text):
   node = mecab.parseToNode(text)
   while node:
      if node.feature.split(',')[0] == '名詞':
         yield node.surface.lower()
      node = node.next
      
def get_words(contents):
   ret = []
   for content in contents:
      ret.append(get_words_main(content))
   return ret

def get_words_main(content):
   return [token for token in tokenize(content)]



In [49]:
words = get_words(w_list)
words[0]

['our',
 'deeds',
 'are',
 'the',
 'reason',
 'of',
 'this',
 '#',
 'earthquake',
 'may',
 'allah',
 'forgive',
 'us',
 'all']

In [52]:
from gensim import corpora

In [53]:
dictionary = corpora.Dictionary(words)
dictionary.filter_extremes(no_below = 200, no_above = 0.2)
#dictionary.save_as_text("./tmp/dictionary.txt") で、作成した辞書を保存可能
#dictionary = corpora.Dictionary.load_from_text("./tmp/dictionary.txt") で読み込み
courpus = [dictionary.doc2bow(word) for word in words]

In [69]:
label_list = ["災害なし", "災害あり"]

In [55]:
from gensim import matutils

def vec2dense(vec, num_terms):
    return list(matutils.corpus2dense([vec], num_terms=num_terms).T[0])
data_all = [vec2dense(dictionary.doc2bow(words[i]),len(dictionary)) for i in range(len(words))]

In [56]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

#トレーニング・テストデータの設定
train_data = data_all
X_train, X_test, y_train, y_test = train_test_split(train_data, labels, test_size=0.4, random_state=1)

#データの標準化
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

#学習モデルの作成
clf = SVC(C = 1, kernel = 'rbf')
clf.fit(X_train_std, y_train)

SVC(C=1)

In [57]:
score = clf.score(X_test_std, y_test)
print("{:.3g}".format(score))

0.687


In [58]:
test_data = pd.read_csv("./data/test.csv")

In [60]:
test_list = []
test_doc = ""
xti, yti = test_data.shape

In [61]:
for i in range(xti):
   test_doc = test_data["text"][i]
   test_list.append(test_doc)

In [62]:
test_words = get_words(test_list)
test_dense = [vec2dense(dictionary.doc2bow(test_words[i]),len(dictionary)) for i in range(len(test_words))]

In [74]:
predicted0 = clf.predict(test_dense)
print(predicted0[0])

0


In [75]:
data_f = pd.DataFrame(predicted0)

In [78]:
test_data["target"] = data_f

In [81]:
test_data.head(20)

Unnamed: 0,id,keyword,location,text,target
0,0,,,Just happened a terrible car crash,0
1,2,,,"Heard about #earthquake is different cities, s...",0
2,3,,,"there is a forest fire at spot pond, geese are...",0
3,9,,,Apocalypse lighting. #Spokane #wildfires,0
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,0
5,12,,,We're shaking...It's an earthquake,0
6,21,,,They'd probably still show more life than Arse...,0
7,22,,,Hey! How are you?,0
8,27,,,What a nice hat?,0
9,29,,,Fuck off!,0


In [85]:
submit_data = pd.read_csv("./data/sample_submission.csv")