In [1]:
USE_NMF = True
N_NEIGHBORS = 20

In [2]:
import os
import re
import random
import pandas as pd
from pyod.models.lof import LOF
from gensim.models import LdaModel
from gensim.models.nmf import Nmf
from gensim.corpora.dictionary import Dictionary
from sklearn.metrics import classification_report

In [3]:
with open('./dataset/id.stopwords.02.01.2016.txt') as f:
    stop_words = f.read().split('\n')
    stop_words_regex = r'\b(' + '|'.join(stop_words) + r')\b'

def clean(doc):
    doc = str(doc).lower()
    doc = re.sub('[^A-Za-z\ ]', ' ', doc)
    doc = re.sub(r'\b\w{0,1}\b', '', doc)
    doc = re.sub(stop_words_regex, '', doc)
    doc = re.sub('\s{2,}', ' ', doc)
    doc = doc.split(' ')

    return doc

def process_csv(path):
    df = pd.read_csv(path).astype(str)
    df['Article'] = df.apply(
        lambda d: d['Title'] + ' ' + d['Ingredients'] + ' ' + d['Steps'], axis=1
    )
    df['Article'] = df['Article'].apply(clean)
    df['Label'] = 1
    return df[['Article', 'Label']]

def process_json(path):
    df = pd.read_json(path, orient='column').astype(str)
    df['Article'] = df['content'].apply(clean)
    df['Label'] = 0
    return df[['Article', 'Label']]

In [4]:
path_train = [
    './dataset/recipe/dataset-ayam.csv',
    './dataset/recipe/dataset-ikan.csv',
    './dataset/recipe/dataset-kambing.csv',
    './dataset/recipe/dataset-sapi.csv',
    './dataset/recipe/dataset-udang.csv'
]
path_test = [
    './dataset/recipe/dataset-tahu.csv',
    './dataset/recipe/dataset-telur.csv',
    './dataset/recipe/dataset-tempe.csv',
    './dataset/criminality_news.json'
]
train_docs, test_docs = [], []

for p in path_train:
    train_docs.append(process_csv(p))
X_train = pd.concat(train_docs, ignore_index=True)['Article'].tolist()
for p in path_test:
    if p[-4:] == '.csv':
        test_docs.append(process_csv(p))
    else:
        test_docs.append(process_json(p))
X_test = pd.concat(test_docs, ignore_index=True)['Article'].tolist()
y_test = pd.concat(test_docs, ignore_index=True)['Label'].tolist()

In [5]:
dictionary = Dictionary(X_train)
dictionary.filter_extremes(no_below=25, no_above=0.5, keep_n=10000)
train_corpus = [dictionary.doc2bow(doc) for doc in X_train]
test_corpus = [dictionary.doc2bow(doc) for doc in X_test]

In [6]:
if USE_NMF:
    tm = Nmf(corpus=train_corpus, id2word=dictionary, num_topics=5, w_max_iter=100)
else:
    tm = LdaModel(corpus=train_corpus, id2word=dictionary, num_topics=5, iterations=100)

for i in range(5):
    print(f'Topic #{i} |', [dictionary[idx] for idx, prob in tm.get_topic_terms(i, topn=9)])

Topic #0 | ['daging', 'kecap', 'sdm', 'sapi', 'manis', 'kambing', 'sdt', 'saos', 'halus']
Topic #1 | ['ayam', 'bubuk', 'halus', 'kaldu', 'salam', 'santan', 'sdt', 'daging', 'rebus']
Topic #2 | ['udang', 'saus', 'tepung', 'sdm', 'telur', 'iris', 'bombay', 'ayam', 'masukan']
Topic #3 | ['ayam', 'sdm', 'tepung', 'bahan', 'sdt', 'jeruk', 'sambal', 'menit', 'adonan']
Topic #4 | ['cabe', 'ikan', 'jeruk', 'rawit', 'halus', 'tomat', 'ruas', 'salam', 'jahe']


In [7]:
# obtain LDA or NMF vector for train/test data
def get_vector(doc):
    doc_vector = [[i, 0.0] for i in range(5)]
    partial_doc_vector = tm.get_document_topics(dictionary.doc2bow(doc), minimum_probability=0)
    for p in partial_doc_vector:
        doc_vector[p[0]][1] = p[1]
    return [d[1] for d in doc_vector]

train_vector = [get_vector(d) for d in X_train]
test_vector = [get_vector(d) for d in X_test]

In [8]:
# train LOF
lof = LOF(n_neighbors=N_NEIGHBORS)
lof.fit(train_vector)

LOF(algorithm='auto', contamination=0.1, leaf_size=30, metric='minkowski',
  metric_params=None, n_jobs=1, n_neighbors=20, novelty=True, p=2)

In [9]:
y_pred = lof.predict(test_vector)
print(classification_report(y_test, y_pred, target_names=['Resep', 'Berita']))

              precision    recall  f1-score   support

       Resep       0.65      0.89      0.75     10751
      Berita       0.39      0.13      0.20      5945

    accuracy                           0.62     16696
   macro avg       0.52      0.51      0.47     16696
weighted avg       0.56      0.62      0.55     16696

