In [1]:
import json
import numpy as np
import pandas as pd
from collections import Counter

from nltk import word_tokenize, sent_tokenize

from tqdm.auto import tqdm
tqdm.pandas()

pd.set_option('display.max_colwidth', 200)

In [2]:
with open("../data/href2synopsis.json", encoding="utf-8") as f:
    href2synopsis = json.load(f)

# BOW stats

In [3]:
data = pd.DataFrame()

data["href"] = href2synopsis.keys()
data["synopsis"] = href2synopsis.values()

In [14]:
data = data[data["synopsis"] != ""]

In [4]:
data["char_total"] = data["synopsis"].progress_apply(len)
data["token_total"] = data["synopsis"].progress_apply(lambda x: len(word_tokenize(x)))
data["sent_total"] = data["synopsis"].progress_apply(lambda x: len(sent_tokenize(x)))

  0%|          | 0/19393 [00:00<?, ?it/s]

  0%|          | 0/19393 [00:00<?, ?it/s]

  0%|          | 0/19393 [00:00<?, ?it/s]

In [27]:
data["lines_total"] = data["synopsis"].progress_apply(lambda x: len(x.split("\n")))

  0%|          | 0/19393 [00:00<?, ?it/s]

In [5]:
data["token_max_char_len"] = data["synopsis"].progress_apply(
    lambda x: max([len(t) for t in word_tokenize(x)]) if x.strip() else 0
)

data["sent_max_char_len"] = data["synopsis"].progress_apply(
    lambda x: max([len(t) for t in sent_tokenize(x)]) if x.strip() else 0
)

data["line_max_char_len"] = data["synopsis"].progress_apply(
    lambda x: max([len(t) for t in x.split("\n")]) if x.strip() else 0
)

  0%|          | 0/19393 [00:00<?, ?it/s]

  0%|          | 0/19393 [00:00<?, ?it/s]

  0%|          | 0/19393 [00:00<?, ?it/s]

In [16]:
data["sent_char_ratio"] = [t.sent_total/t.char_total for t in data.itertuples()]
data["sent_token_ratio"] = [t.sent_total/t.token_total for t in data.itertuples()]
data["token_char_ratio"] = [t.token_total/t.char_total for t in data.itertuples()]
data["token_sent_ratio"] = [t.token_total/t.sent_total for t in data.itertuples()]
data["char_sent_ratio"] = [t.char_total/t.sent_total for t in data.itertuples()]
data["char_token_ratio"] = [t.char_total/t.token_total for t in data.itertuples()]

In [17]:
data.describe().astype(int)

Unnamed: 0,char_total,token_total,sent_total,token_max_char_len,sent_max_char_len,line_max_char_len,sent_char_ratio,sent_token_ratio,token_char_ratio,token_sent_ratio,char_sent_ratio,char_token_ratio
count,18374,18374,18374,18374,18374,18374,18374,18374,18374,18374,18374,18374
mean,521,105,5,12,157,393,0,0,0,20,102,4
std,349,70,3,2,61,230,0,0,0,7,39,1
min,1,1,1,1,1,1,0,0,0,1,1,1
25%,262,53,3,11,118,225,0,0,0,16,78,4
50%,454,92,5,12,151,364,0,0,0,19,97,4
75%,702,141,7,13,188,515,0,0,0,24,120,5
max,5804,1201,75,188,1701,2865,1,1,1,124,1701,130


In [18]:
stats = data.drop(columns=["href", "synopsis"])

# Outlier detection

[The svm.OneClassSVM is known to be sensitive to outliers and thus does not perform very well for outlier detection](https://scikit-learn.org/stable/modules/outlier_detection.html#outlier-detection)

In [19]:
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest

In [20]:
clf = LocalOutlierFactor(n_neighbors=20)
preds = clf.fit_predict(stats)
# clf.negative_outlier_factor_
data["pred"] = preds
data["negative_outlier_factor_"] = clf.negative_outlier_factor_

In [25]:
# data[data["pred"] == -1]["synopsis"].sample(10).to_list()

In [31]:
data[data["pred"] == -1].sort_values(by="negative_outlier_factor_"
                                    )[['href', 'synopsis', 'negative_outlier_factor_']].head(20)

Unnamed: 0,href,synopsis,negative_outlier_factor_
9039,/740317-la-itzy,"ITZY spends a special youth friendship trip in Los Angeles, U.S.A.\n\n(Source: Korean = News.Naver.com || Tranlateion = MyDramaList)",-8.094004
9387,/56367-test-content-season-4,押イおッ人高ラヘ致24格会ケロヲク放文イメラシ倍極ろぼるけ島判険おが会影のたト床用じフ索手工ふ決39客芝2会促縮繰しー。議づだーめ景能オマ囲索趣こイが車74客芝6交いかへづ談情そゆあん明臨ヲホセサ博位カフク逃転地掘ークゃ年柏沼烈翌け。樹ネエ未著艦ぎう請報ア時阪便テコス学医テニ限物うてを嘉一るなむあ豪権ノマ慢活ーひや頑受やきイょ制団ぴゅめ捕活そクぱ泡死スチメ法乏咋塞墨はラい。\n\n扱ドみば...,-7.934917
11213,/3471-beauties-of-the-emperor,"He loved her and threw in his empire for her but they had no fate...\n\nShe wanted to be the Emperor's woman and finally claimed the empire, but the one beside her wasn't him...",-7.623897
11287,/69817-num-kala,"Path can't give up searching for his love Prim who, because of her debt, can't help but ran away from him...\nHow much longer will he keep waiting for her and will they ever meet again?",-7.217971
18052,/31000-replaylist,"Three different web dramas :\nFirst episode -Seventeen - Yang Yoseop - On the Road\nSecond episode - Not All Right, But It's Alright - Rothy - Butterfly Effect\nThird episode - Just One Bite - BEN...",-6.440586
11140,/8355-spec-rei,"This is a prequel to Keizoku 2: SPEC, it depicts how Saya Toma lost her \nleft hand and how Sebumi Takeru joined SIT.",-6.266474
12393,/724485-spring-is-gone-by-chance,"A popular romance webtoon writer, 'Woo-yeon'\nA no-boyfriend-since-birth who haven't dated anyone, 'Hyang-gi'\nAnd her younger sibling and cute love coach, 'Ro-woon'\n\nA heart-fluttering romance ...",-6.034445
14223,/41937-2012-big-bang-alive-tour-bigshow,2012 Big Bang Alive Tour: Bigshow\nnative\t2012 빅뱅 얼라이브 투어: 빅쇼,-5.870948
12251,/54215-alien-the-joker,"Third movie of the ""Alien"" series:\nA mysterious killer dressed as a clown appears in town and Lan Jue starts to investigate.",-5.43662
18488,/39935-i-am-yu-huanshui,"I Am Yu Huanshui is a story for the underdogs about a man who tolerates as much as he can, yet things change when he discovers that he is terminal ill.\n\n(Source: DramaPanda)\n\n~~ Adapted from t...",-5.108256
