In [None]:
import pandas as pd
import numpy as np
import MeCab

from tqdm import tqdm_notebook

In [None]:
%%time
# read original data
pd.set_option('display.max_colwidth', -1)
csvData = pd.read_csv("rental.csv", delimiter=",")
display(csvData)

In [None]:
%%time
# extract text columns
txtCols = ['item_name','catch_copy','pc_caption','caption']
dataTxt = csvData[txtCols + ['Category']]

display(dataTxt)

In [None]:
%%time
# remove irrelavant words
dataTxtCopy = dataTxt.copy()
dataTxtCopy.replace(['&nbsp;'],'',regex=True,inplace=True)
dataTxtCopy.replace({'0':'', np.nan:''},inplace=True)
dataTxtCopy.applymap(lambda x: x.rstrip() if type(x) is str else x)

display(dataTxtCopy)

In [None]:
%%time
# combine text columns
allTxt = pd.DataFrame({'text': dataTxtCopy['item_name'] + dataTxtCopy['catch_copy'] + dataTxtCopy['pc_caption'] + dataTxtCopy['caption'], 'label': dataTxtCopy['Category']})
display(allTxt)

In [None]:
%%time
# extract nouns and verbs (feature selecting)
mecab = MeCab.Tagger('-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd')

def extractWords(doc, tagger=None):
    """Return the nouns and verbs found in ``doc`` as a list of keywords.

    The text is run through a MeCab-style tagger whose ``parse`` output has one
    token per line in the form ``surface<TAB>feature``, where ``feature`` is a
    comma-separated string. A token is kept when its feature string starts with
    '名詞' (noun) or '動詞' (verb). The seventh feature field is used as the
    lemma; when it is missing or '*', the surface form is used instead.

    Args:
        doc: Text to analyse. ``None`` or an empty string yields ``[]``.
        tagger: Object exposing ``parse(str) -> str``. Defaults to the
            module-level ``mecab`` tagger.

    Returns:
        list of str: One keyword per selected token, in order of appearance.
    """
    if doc is None or doc == '':
        return []
    if tagger is None:
        # Resolved at call time so the function keeps working with the
        # notebook-level tagger when no explicit one is supplied.
        tagger = mecab

    # Defensive: treat a None/empty parse result as "no tokens".
    parsedStr = tagger.parse(doc) or ''

    keywords = []
    for chunk in parsedStr.splitlines():
        # Only "surface<TAB>feature" lines are tokens; the terminating 'EOS'
        # marker and blank lines contain no tab and are skipped.
        surface, tab, feature = chunk.partition('\t')
        if not tab:
            continue
        if not feature.startswith(('名詞', '動詞')):
            continue
        fields = feature.split(',')
        # Short feature strings would otherwise raise IndexError on field 6.
        lemma = fields[6] if len(fields) > 6 else '*'
        keywords.append(surface if lemma == '*' else lemma)
    return keywords


# Tokenize every combined document into its keyword list ("bag of words"),
# iterating through tqdm_notebook for a progress bar.
txtList = list(allTxt['text'])
bows = [extractWords(text) for text in tqdm_notebook(txtList)]
# Preview the keyword lists of the first five documents.
display(pd.DataFrame(bows[:5]))

In [None]:
%%time
# make all words set
wordSet = set().union(*bows)

display(wordSet)

In [None]:
%%time
wordDicts = []
wordDictInit = dict.fromkeys(wordSet, 0)

for bow in tqdm_notebook(bows):
    wordDict = wordDictInit
    for word in bow:
        wordDict[word] += 1
    wordDicts.append(wordDict)

display(pd.DataFrame.from_dict(wordDicts[:5]))