In [1]:
import json
import numpy as np
import pandas as pd
from collections import Counter

from nltk import word_tokenize, sent_tokenize

from tqdm.auto import tqdm
# Register tqdm with pandas so the `progress_apply` calls below are available.
tqdm.pandas()

# Allow up to 200 characters per cell when a frame is displayed.
pd.options.display.max_colwidth = 200

In [2]:
# Read the JSON dump into `href2synopsis`; the file is decoded as UTF-8.
with open("../data/href2synopsis.json", encoding="utf-8") as synopsis_file:
    href2synopsis = json.loads(synopsis_file.read())

# BOW stats

In [3]:
# Build the working frame in a single step: one row per (key, value) pair of
# `href2synopsis`, with the key in "href" and the value in "synopsis".
# Constructing from items() keeps each key paired with its value by construction
# and avoids growing an empty DataFrame column by column from dict views.
data = pd.DataFrame(list(href2synopsis.items()), columns=["href", "synopsis"])


In [4]:
# Size of every synopsis at three granularities: characters, word tokens
# (word_tokenize) and sentences (sent_tokenize). Columns are created in the
# order listed here, one progress bar per column.
synopsis_sizers = {
    "char_total": len,
    "token_total": lambda text: len(word_tokenize(text)),
    "sent_total": lambda text: len(sent_tokenize(text)),
}
for column_name, sizer in synopsis_sizers.items():
    data[column_name] = data["synopsis"].progress_apply(sizer)

  0%|          | 0/19393 [00:00<?, ?it/s]

  0%|          | 0/19393 [00:00<?, ?it/s]

  0%|          | 0/19393 [00:00<?, ?it/s]

In [27]:
# Number of newline-separated lines per synopsis.
# "".split("\n") returns [""] (length 1), so blank synopses are mapped to 0
# explicitly; this keeps lines_total consistent with the other *_total columns
# and with the `x.strip()` guard used for line_max_char_len.
data["lines_total"] = data["synopsis"].progress_apply(
    lambda x: len(x.split("\n")) if x.strip() else 0
)

  0%|          | 0/19393 [00:00<?, ?it/s]

In [5]:
def _longest_piece(text, split):
    """Return the character length of the longest piece produced by `split(text)`.

    Blank text (empty or whitespace-only) yields 0 without calling `split`.
    """
    if not text.strip():
        return 0
    return max([len(piece) for piece in split(text)])


# Longest word token, longest sentence and longest line of each synopsis.
data["token_max_char_len"] = data["synopsis"].progress_apply(
    lambda text: _longest_piece(text, word_tokenize)
)

data["sent_max_char_len"] = data["synopsis"].progress_apply(
    lambda text: _longest_piece(text, sent_tokenize)
)

data["line_max_char_len"] = data["synopsis"].progress_apply(
    lambda text: _longest_piece(text, lambda raw: raw.split("\n"))
)

  0%|          | 0/19393 [00:00<?, ?it/s]

  0%|          | 0/19393 [00:00<?, ?it/s]

  0%|          | 0/19393 [00:00<?, ?it/s]

In [13]:
# Rows whose synopsis is exactly the empty string.
data.loc[data["synopsis"].eq("")]

Unnamed: 0,href,synopsis,char_total,token_total,sent_total,token_max_char_len,sent_max_char_len,line_max_char_len
10,/736899-doughnuts-mori,,0,0,0,0,0,0
58,/736027-extraordinary-attorney-woo-season-2,,0,0,0,0,0,0
70,/744201-themselves,,0,0,0,0,0,0
90,/743807-roommates-of-poongduck-304-special,,0,0,0,0,0,0
94,/744195-mina-ni-ko-are,,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
19357,/705713-welcome-to-toei-slaughterhouse,,0,0,0,0,0,0
19362,/698583-jae-joong-on-the-road,,0,0,0,0,0,0
19370,/51401-unified-three-kingdoms,,0,0,0,0,0,0
19376,/742441-start-up,,0,0,0,0,0,0


In [10]:
# Summary statistics from describe(); casting to int drops the fractional part
# so the table stays compact.
numeric_summary = data.describe()
numeric_summary.astype(int)

Unnamed: 0,char_total,token_total,sent_total,token_max_char_len,sent_max_char_len,line_max_char_len
count,19393,19393,19393,19393,19393,19393
mean,493,99,4,11,149,372
std,359,72,3,3,69,241
min,0,0,0,0,0,0
25%,233,46,2,11,112,198
50%,433,87,4,12,147,349
75%,683,138,7,13,185,504
max,5804,1201,75,188,1701,2865


In [7]:
# Feature matrix for outlier detection: everything except the identifier and the raw text.
# The detector's own outputs ("pred", "negative_outlier_factor_") are removed as well, so
# re-running this cell after the outlier-detection cell does not feed earlier predictions
# back in as features. errors="ignore" is applied only to those two columns because they
# do not exist yet on a fresh kernel; a missing "href"/"synopsis" still raises.
stats = (
    data
    .drop(columns=["href", "synopsis"])
    .drop(columns=["pred", "negative_outlier_factor_"], errors="ignore")
)

# Outlier detection

[The svm.OneClassSVM is known to be sensitive to outliers and thus does not perform very well for outlier detection](https://scikit-learn.org/stable/modules/outlier_detection.html#outlier-detection), so `LocalOutlierFactor` is used below instead.

In [37]:
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest

In [41]:
# Fit a Local Outlier Factor model (20 neighbours) on the numeric statistics and
# label every row in one pass.
clf = LocalOutlierFactor(n_neighbors=20)
preds = clf.fit_predict(stats)

# Keep both the label and the raw score next to the text. Per the scikit-learn
# docs, fit_predict returns -1 for outliers and 1 for inliers, and a lower
# negative_outlier_factor_ means a more abnormal sample.
data["pred"] = preds
data["negative_outlier_factor_"] = clf.negative_outlier_factor_

In [12]:
# data[data["pred"] == -1]["synopsis"].sample(10).to_list()

In [50]:
# Synopses of the rows labelled -1, sorted ascending by negative_outlier_factor_;
# the last 20 entries of that ordering are shown.
flagged_rows = data.loc[data["pred"].eq(-1)]
flagged_rows.sort_values("negative_outlier_factor_")["synopsis"].tail(20).to_list()

['A loving friend, a son, and lover, who is being haunted by his deep dark past...\n \nA story of love, betrayal, and revenge...',
 '~~ Adapted from the web novel "Once Upon a Time, There Was a Spirit Sword Mountain" (从前有座灵剑山) by His Majesty the King (国王陛下).',
 'Where can there be a teenager without a story?\nIf I send a DM to IABC, the Broadcasting Department of Iyego,\nIABC will take care of it!\n\nYou must see it before you see Part 5!?\nThe most fun way to enjoy Real:Time:Love Series!\nTeen Love Story of Iye High Broadcasters [I:Love:DM]\n\n(Source: 콬TV)',
 'On the advice of Professor Tsukuba, a physician she respects, houseman Togano Makoto takes training at forensic medicine classes. She gingerly steps into a classroom with a strange atmosphere. The eccentric but outstanding forensic professor Matsuzaki Tojiro gives her a polite brush-off and leaves the place. Makoto trains under Kosaki in a story that throws the ethical conflict surrounding autopsies and human relationships into