In [56]:
import json
import pandas as pd
from collections import Counter

from nltk import word_tokenize, sent_tokenize

from tqdm.auto import tqdm
# Register tqdm with pandas so the `progress_apply` calls used below are available.
tqdm.pandas()

# Widen column display so long synopsis strings are not cut off early in outputs.
pd.set_option("display.max_colwidth", 200)

In [31]:
# Parse the JSON file into `href2synopsis`; the cells below use its keys as the
# "href" column and its values as the "synopsis" column.
with open("../data/href2synopsis.json", encoding="utf-8") as f:
    href2synopsis = json.loads(f.read())

# BOW stats

In [63]:
# One row per entry of `href2synopsis`, in dict iteration order:
# the key goes to "href", the value to "synopsis".
data = pd.DataFrame(
    {
        "href": list(href2synopsis.keys()),
        "synopsis": list(href2synopsis.values()),
    }
)

In [64]:
# Length of each synopsis at three granularities: characters, NLTK word tokens,
# and NLTK sentences. Each metric is a callable applied to the raw synopsis value.
length_metrics = {
    "char_len": len,
    "token_len": lambda x: len(word_tokenize(x)),
    "sent_len": lambda x: len(sent_tokenize(x)),
}

# One progress bar per metric, in the order listed above.
for column, measure in length_metrics.items():
    data[column] = data["synopsis"].progress_apply(measure)

  0%|          | 0/19393 [00:00<?, ?it/s]

  0%|          | 0/19393 [00:00<?, ?it/s]

  0%|          | 0/19393 [00:00<?, ?it/s]

In [62]:
# Summary statistics of the numeric columns of `data`, rounded to two decimals.
data.describe().round(decimals=2)

Unnamed: 0,char_len,token_len,sent_len
count,19393.0,19393.0,19393.0
mean,493.76,99.52,4.99
std,359.1,72.71,3.6
min,0.0,0.0,0.0
25%,233.0,46.0,2.0
50%,433.0,87.0,4.0
75%,683.0,138.0,7.0
max,5804.0,1201.0,75.0


In [68]:
# Feature matrix for outlier detection: only the three length statistics.
# Columns are selected explicitly (rather than dropping "href"/"synopsis") so that
# re-running this cell after `data["pred"]` has been added does not feed earlier
# predictions back into the detector as a feature.
stats = data[["char_len", "token_len", "sent_len"]]

# Outlier detection

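> "The svm.OneClassSVM is known to be sensitive to outliers and thus does not perform very well for outlier detection." — [scikit-learn: Novelty and Outlier Detection](https://scikit-learn.org/stable/modules/outlier_detection.html#outlier-detection)

For that reason, this notebook uses other detectors (e.g. `LocalOutlierFactor`) instead of `OneClassSVM`.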

In [69]:
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest

In [83]:
# Local Outlier Factor over the length statistics, comparing each row with its
# 20 nearest neighbours.
# NOTE(review): the features are unscaled and, per the describe() table, char_len
# is roughly 100x larger than sent_len, so neighbour distances are presumably
# dominated by char_len — consider standardising `stats` first; confirm intent.
clf = LocalOutlierFactor(n_neighbors=20)

# fit_predict labels every row; rows labelled -1 are treated as outliers below.
# The underlying scores remain available as clf.negative_outlier_factor_.
preds = clf.fit_predict(X=stats)
data["pred"] = preds

In [84]:
# Eyeball a few of the synopses flagged as outliers (pred == -1).
outlier_synopses = data.loc[data["pred"] == -1, "synopsis"]

# Seeded so that Restart & Run All shows the same examples every time, and capped
# so the cell does not raise when fewer than 10 rows were flagged.
outlier_synopses.sample(n=min(10, len(outlier_synopses)), random_state=42).to_list()

['Mini beauty drama.',
 'Following the story of policewomen in their twenties and the physical and psychological interaction and training with police dogs.',
 'Where can there be a teenager without a story?\nIf I send a DM to IABC, the Broadcasting Department of Iyego,\nIABC will take care of it!\n\nYou must see it before you see Part 5!?\nThe most fun way to enjoy Real:Time:Love Series!\nTeen Love Story of Iye High Broadcasters [I:Love:DM]\n\n(Source: 콬TV)',
 'Ahn So Hee, Han Ji Yeon, and Kang Ji Gu enjoy hanging out after work and drinking together. The three women are single and in their 30s. As for their jobs, So Hee works as a broadcasting writer, Ji Yeon as a yoga instructor, and Ji Gu as a YouTuber. Meanwhile, Kang Buk Gu works as a PD of the TV variety show. He hangs out with these three women.\n\n(Source: AsianWiki) ~~ Adapted from the webtoon "Sooldo Girl" (술꾼도시처녀들) by Mi Kkang (미깡).',
 "A murder takes place. Hong Yi Young, who is a timpanist, does not remember what happened 