In [1]:
USE_NMF = True
N_NEIGHBORS = 20

In [2]:
import os
import re
import random
import ktrain
import pandas as pd
from sklearn.metrics import classification_report
random.seed(42)

In [3]:
def clean(doc):
    doc = str(doc).lower()
    doc = re.sub('[^A-Za-z\ ]', ' ', doc)
    doc = re.sub(r'\b\w{0,1}\b', '', doc)
    doc = re.sub('\s{2,}', ' ', doc)

    return doc

def process_csv(path):
    df = pd.read_csv(path).astype(str)
    print(df.shape)
    df['Article'] = df.apply(
        lambda d: d['Title'] + ' ' + d['Ingredients'] + ' ' + d['Steps'], axis=1
    )
    df['Article'] = df['Article'].apply(clean)
    df['Label'] = 1
    return df[['Article', 'Label']]

def process_json(path):
    df = pd.read_json(path, orient='column').astype(str)
    print(df.shape)
    df['Article'] = df['content'].apply(clean)
    df['Label'] = 0
    return df[['Article', 'Label']]

In [4]:
path_train = [
    './dataset/recipe/dataset-ayam.csv',
    './dataset/recipe/dataset-ikan.csv',
    './dataset/recipe/dataset-kambing.csv',
    './dataset/recipe/dataset-sapi.csv',
    './dataset/recipe/dataset-udang.csv'
]
path_test = [
    './dataset/recipe/dataset-tahu.csv',
    './dataset/recipe/dataset-telur.csv',
    './dataset/recipe/dataset-tempe.csv',
    './dataset/criminality_news.json'
]
train_docs, test_docs = [], []

for p in path_train:
    train_docs.append(process_csv(p))
X_train = pd.concat(train_docs, ignore_index=True)['Article'].tolist()
for p in path_test:
    if p[-4:] == '.csv':
        test_docs.append(process_csv(p))
    else:
        test_docs.append(process_json(p))
X_test = pd.concat(test_docs, ignore_index=True)['Article'].tolist()
y_test = pd.concat(test_docs, ignore_index=True)['Label'].tolist()

In [5]:
with open('./dataset/id.stopwords.02.01.2016.txt') as f:
    stop_words = f.read().split('\n')

In [6]:
if USE_NMF:
    tm = ktrain.text.get_topic_model(
        X_train, n_topics=5, stop_words=stop_words, 
        min_df=25, max_df=0.5, model_type='nmf',
        lda_max_iter=100, n_features=10000, verbose=0
    )
else:
    tm = ktrain.text.get_topic_model(
        X_train, n_topics=5, stop_words=stop_words, 
        min_df=25, max_df=0.5, model_type='lda', lda_mode='batch',
        lda_max_iter=100, n_features=10000, verbose=0
    )



In [7]:
tm.build(X_train, threshold=0.2)
tm.print_topics(n_words=9, show_counts=True)
tm.train_scorer(n_neighbors=N_NEIGHBORS)

topic:0 | count:1925 | daging sapi kecap sdm kambing sdt manis bubuk iris
topic:3 | count:1827 | udang sdm tepung saus cabe saos iris telur tomat
topic:1 | count:1639 | ayam tepung sdm sdt bubuk telur bahan yg kecap
topic:2 | count:1432 | cabe jeruk salam halus santan ruas jahe rawit lembar
topic:4 | count:1155 | ikan cabe jeruk tomat nipis iris sdm matang tongkol


In [8]:
y_pred = tm.score(X_test)
y_pred = [1 if s >= 0 else 0 for s in y_pred]
print(classification_report(y_test, y_pred, target_names=['Resep', 'Berita']))

              precision    recall  f1-score   support

       Resep       0.71      1.00      0.83     10751
      Berita       0.97      0.27      0.43      5945

    accuracy                           0.74     16696
   macro avg       0.84      0.63      0.63     16696
weighted avg       0.81      0.74      0.69     16696

