In [None]:
import collections
import math

import MeCab
import numpy as np
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score, train_test_split
from tqdm import tqdm_notebook
from tqdm.auto import tqdm



In [None]:
%%time
# read original data
# Show cell text untruncated when frames are displayed. `None` means "no
# limit"; the older spelling `-1` was deprecated in pandas 1.0 and is no
# longer accepted by pandas 2.x.
pd.set_option('display.max_colwidth', None)
csvData = pd.read_csv("rental.csv", delimiter=",")
display(csvData)

In [None]:
# label distribution and frame dimensions
display(csvData['Category'].value_counts())
display(csvData.shape)
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in display() only appends a stray "None" to the cell output.
csvData.info()

In [None]:
%%time
# keep only the free-text columns plus the target column
txtCols = ['item_name', 'catch_copy', 'pc_caption', 'caption']
keepCols = txtCols + ['Category']
dataTxt = csvData[keepCols]

display(dataTxt)

In [None]:
%%time
# remove irrelevant words
def _rstrip_if_str(value):
    """Strip trailing whitespace from strings; pass other values through."""
    return value.rstrip() if isinstance(value, str) else value


# Build the cleaned frame as a new object (no in-place mutation) so that
# re-running this cell always starts from the untouched `dataTxt`.
# NOTE(review): the replacements also run over the 'Category' column because
# it is part of `dataTxt` -- confirm that blanking '0'/NaN labels is intended.
dataTxtCopy = (
    dataTxt
    # drop HTML non-breaking-space entities wherever they occur in a cell
    .replace(['&nbsp;'], '', regex=True)
    # cells that are exactly '0' or missing become empty strings
    .replace({'0': '', np.nan: ''})
    # The original called applymap(...) without keeping its return value, so
    # the trailing-whitespace strip never took effect; keep the result here.
    .apply(lambda column: column.map(_rstrip_if_str))
)

display(dataTxtCopy)

In [None]:
%%time
# combine text columns
# Join the fields with a space. Plain concatenation glues the last word of
# one field onto the first word of the next, so the tokenizer can see a bogus
# compound spanning the field boundary.
# NOTE(review): assumes the tokenizer treats whitespace as a delimiter and
# drops it -- confirm for the MeCab configuration in use.
FIELD_SEP = ' '
allTxt = pd.DataFrame({
    'text': (
        dataTxtCopy['item_name'] + FIELD_SEP
        + dataTxtCopy['catch_copy'] + FIELD_SEP
        + dataTxtCopy['pc_caption'] + FIELD_SEP
        + dataTxtCopy['caption']
    ),
    'label': dataTxtCopy['Category'],
})
display(allTxt)

In [None]:
%%time
# extract nouns and verbs (feature selecting)
# Tagger options: `-d` selects the dictionary directory handed to MeCab.
# NOTE(review): this is an absolute, machine-specific path -- confirm it
# exists on the machine running the notebook.
MECAB_ARGS = '-d /usr/local/lib/mecab/dic/mecab-ipadic-neologd'
mecab = MeCab.Tagger(MECAB_ARGS)

def extractWords(doc, tagger=None):
    """Return the nouns and verbs found in ``doc``, preferring base forms.

    The tagger output is read line by line as ``surface<TAB>feature``, where
    ``feature`` is a comma-separated list. A token is kept when its feature
    string starts with '名詞' (noun) or '動詞' (verb). The 7th feature field
    is used as the keyword; when that field is '*' or absent, the surface
    form is used instead.

    Args:
        doc: Text to analyse. Empty or non-string values yield ``[]``.
        tagger: Object with a ``parse(str) -> str`` method. Defaults to the
            notebook-level ``mecab`` tagger.

    Returns:
        list[str]: Keywords in order of appearance (duplicates kept).
    """
    if tagger is None:
        tagger = mecab  # notebook-level tagger created in this cell

    # Missing values (e.g. NaN) or empty text have nothing to tokenise.
    if not isinstance(doc, str) or not doc:
        return []

    parsedStr = tagger.parse(doc)
    if parsedStr is None:
        # Defensive: treat "no analysis result" as "no keywords".
        return []

    lemmaIndex = 6
    keywords = []
    for chunk in parsedStr.splitlines():
        surface, tab, feature = chunk.partition('\t')
        if not tab:
            # Lines without a tab (the closing 'EOS' marker, blank lines)
            # carry no token.
            continue
        if not feature.startswith(('名詞', '動詞')):
            continue
        fields = feature.split(',')
        # Short feature lists have no base-form field; fall back to surface.
        lemma = fields[lemmaIndex] if len(fields) > lemmaIndex else '*'
        keywords.append(surface if lemma == '*' else lemma)
    return keywords


# Tokenise every document into its list of keywords (one list per row).
# `tqdm` comes from tqdm.auto, replacing the deprecated `tqdm_notebook` helper.
txtList = list(allTxt['text'])
bows = [extractWords(text) for text in tqdm(txtList, desc='tokenising')]
display(pd.DataFrame(bows[:5]))

In [None]:
%%time
# make all words set
vocabulary = {word for bow in bows for word in bow}

# Show the size and a short sorted sample; dumping the whole set floods the
# cell output and bloats the saved notebook.
display(len(vocabulary))
display(sorted(vocabulary)[:20])

In [None]:
%%time
# TF
tfDicts = [collections.Counter(i) for i in bows]

display(pd.DataFrame(tfDicts[:5]))

In [None]:
%%time
# log normalize TF and calculate IDF
N = len(tfDicts)  # number of documents

# Log-scaled term frequency: 1 + ln(count). Every entry of a per-document
# Counter has count >= 1, so the logarithm is always defined.
tfNmDicts = [
    {word: 1 + math.log(count) for word, count in tfDict.items()}
    for tfDict in tqdm(tfDicts, desc='log-normalising TF')
]

# Document frequency: in how many documents each word occurs. Counting from
# the documents themselves (rather than pre-seeding from `vocabulary`) means
# every key has df >= 1, so the division below cannot hit zero.
docFreq = collections.Counter()
for tfDict in tfDicts:
    docFreq.update(tfDict.keys())

# Inverse document frequency: ln(N / df).
idfDict = {word: math.log(N / df) for word, df in docFreq.items()}

display(pd.DataFrame.from_dict(tfNmDicts[:5]))
# Preview the 20 lowest-IDF (most widespread) words instead of dumping the
# whole dictionary into the output.
display(pd.Series(idfDict, dtype='float64', name='idf').sort_values().head(20))

In [None]:
%%time
# TFIDF
# Weight the log-normalised term frequencies (`tfNmDicts`) by IDF. The
# original loop multiplied the raw counts in `tfDicts`, which left the
# normalisation computed in the previous cell unused.
tfidfDicts = [
    {word: tf * idfDict[word] for word, tf in tfNm.items()}
    for tfNm in tqdm(tfNmDicts, desc='TF-IDF')
]

display(pd.DataFrame.from_dict(tfidfDicts[:5]))

In [None]:
# number of TF-IDF dicts built above (one per entry of tfDicts)
display(len(tfidfDicts))

In [None]:
# Turn the list of {word: tf-idf weight} dicts into the feature matrix X.
# The fitted vectorizer `v` is kept so the same mapping can be reused later.
v = DictVectorizer()
X = v.fit_transform(tfidfDicts)

display(X)

In [None]:
# target vector: the 'label' column of allTxt (filled from 'Category' when
# allTxt was built)
y = allTxt['label']

display(y)

In [None]:
# Fixed seed so the train/test split, and every score derived from it, is
# identical on each Restart & Run All.
SEED = 42

clf = LogisticRegression()

# hold out 30% of the rows for the final evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=SEED
)
# 5-fold cross-validation on the training portion only
all_accuracies = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=5)

display(all_accuracies)

In [None]:
# mean of the 5 cross-validation scores held in all_accuracies
display(all_accuracies.mean())

In [None]:
# fit on the training split, then score predictions for the held-out split
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

confMat = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

display(confMat)
print(report)