In [1]:
import os

In [2]:
DATA_DIR = './text'
labels = [
    x
    for x in os.listdir(DATA_DIR)
    if os.path.isdir(os.path.join(DATA_DIR, x))
]
data_files = {
    label: [
        file
        for file in os.listdir(os.path.join(DATA_DIR, label))
        if file != 'LICENSE.txt'
    ]
    for label in labels
}

In [3]:
from collections import ChainMap
import pandas as pd

In [4]:
def read_file(file: str) -> dict:
    with open(file) as f:
        s = f.read()
    l = s.split('\n')
    return {
        'url': l[0],
        'date': l[1],
        'title': l[2],
        'text': '\n'.join(l[3:])
    }

In [5]:
data_list = [
    ChainMap(
        read_file(os.path.join(DATA_DIR, label, file)),
        {
            'id': os.path.splitext(file)[0],
            'label': label
        }
    )
    for label in labels
    for file in data_files[label]
]
df = pd.DataFrame(data_list, columns=['id', 'label', 'url', 'date', 'title', 'text'])

In [6]:
from normalize_neologd import normalize_neologd
df['normalized_title'] = df.apply(lambda x: normalize_neologd(x.title), axis=1)
df['normalized_text'] = df.apply(lambda x: normalize_neologd(x.text), axis=1)

In [7]:
from janome.tokenizer import Tokenizer
tokenizer = Tokenizer(mmap=True)

In [10]:
POS_LIST = ['名詞,サ変活用', '名詞,一般', '名詞,固有名詞']
def select_pos(token: 'Token') -> bool:
    pos = token.part_of_speech
    for pos_def in POS_LIST:
        if pos.startswith(pos_def):
            return True
    return False

def select_token(tokens: list) -> list:
    return [
        token.surface
        for token in tokens
        if select_pos(token)
    ]

def join_tokens(tokens: list) -> str:
    return ' '.join(tokens)

In [11]:
df['wakati'] = df.title.apply(lambda x: join_tokens(select_token(tokenizer.tokenize(x))))

In [12]:
df.wakati.head()

0             家政婦のミタ 忽那汐里 声優 妹
1             特別映像 生物 救世主 公開処刑
2    日本 トップ 現実 2011年 まとめ vol.3
3             マツコ・デラックス 世界 セレブ
4              妖 ヶ 劇場 第9話 生徒 巻
Name: wakati, dtype: object

In [13]:
from sklearn.model_selection import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(1)
teacher_index, test_index = next(sss.split(df, df.label))
teacher_df = df.iloc[teacher_index]
test_df = df.iloc[test_index]

sss = StratifiedShuffleSplit(1)
train_index, valid_index = next(sss.split(teacher_df, teacher_df.label))
train_df = df.iloc[train_index]
valid_df = df.iloc[valid_index]

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

with open('./stop_words.txt') as f:
    STOP_WORDS = f.readlines()
    
tfidf_vectorizer = TfidfVectorizer()

In [15]:
X_train = tfidf_vectorizer.fit_transform(train_df.wakati).toarray()
y_train = train_df.label
X_valid = tfidf_vectorizer.transform(valid_df.wakati).toarray()
y_valid = valid_df.label

In [16]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_valid)

In [18]:
from sklearn.metrics import accuracy_score

accuracy_score(y_valid, y_pred)

0.75414781297134237

In [20]:
X_teacher = tfidf_vectorizer.fit_transform(teacher_df.wakati).toarray()
y_teacher = teacher_df.label
X_test = tfidf_vectorizer.transform(test_df.wakati).toarray()
y_test = test_df.label

In [21]:
model.fit(X_teacher, y_teacher)
y_pred = model.predict(X_test)

In [22]:
accuracy_score(y_test, y_pred)

0.74898236092265946