In [25]:
import json
import numpy as np
import pandas as pd
from collections import Counter

from nltk import word_tokenize, sent_tokenize

from tqdm.auto import tqdm
# Register tqdm's pandas integration; this is what provides the
# `.progress_apply` method used on the "synopsis" column in later cells.
tqdm.pandas()

# Widen text columns in rendered frames so long synopses are not cut off too early.
pd.options.display.max_colwidth = 200

In [2]:
# Read the JSON mapping whose keys are used as "href" and whose values are used
# as "synopsis" when the DataFrame is built below.
with open("../data/href2synopsis.json", encoding="utf-8") as synopsis_file:
    href2synopsis = json.loads(synopsis_file.read())

# BOW stats

In [15]:
# One row per entry of href2synopsis: the key goes to "href", the value to "synopsis".
# keys() and values() of the same dict iterate in matching order, so rows stay paired.
data = pd.DataFrame(
    {
        "href": list(href2synopsis.keys()),
        "synopsis": list(href2synopsis.values()),
    }
)

In [16]:
# Per-synopsis size features: raw character count, number of NLTK word tokens,
# and number of NLTK sentences. Each feature is computed in its own pass, so
# three progress bars are shown.
for column, measure in (
    ("char_total", len),
    ("token_total", lambda text: len(word_tokenize(text))),
    ("sent_total", lambda text: len(sent_tokenize(text))),
):
    data[column] = data["synopsis"].progress_apply(measure)

  0%|          | 0/19393 [00:00<?, ?it/s]

  0%|          | 0/19393 [00:00<?, ?it/s]

  0%|          | 0/19393 [00:00<?, ?it/s]

In [27]:
# Number of newline-separated lines per synopsis; text without any newline counts as 1.
data["lines_total"] = data["synopsis"].progress_apply(
    lambda text: text.count("\n") + 1
)

  0%|          | 0/19393 [00:00<?, ?it/s]

In [32]:
# Length (in characters) of the longest word token and of the longest sentence
# in each synopsis. Whitespace-only synopses are mapped to 0 up front, because
# max() raises on an empty sequence.
for column, tokenizer in (
    ("token_max_char_len", word_tokenize),
    ("sent_max_char_len", sent_tokenize),
):
    data[column] = data["synopsis"].progress_apply(
        # `split=tokenizer` binds the current tokenizer to this lambda.
        lambda text, split=tokenizer: (
            max(len(piece) for piece in split(text)) if text.strip() else 0
        )
    )

  0%|          | 0/19393 [00:00<?, ?it/s]

  0%|          | 0/19393 [00:00<?, ?it/s]

In [33]:
# Summary statistics of the numeric columns, cast to int for a compact table.
# astype(int) drops the fractional part (truncation, not rounding), so the
# mean/std rows are slightly understated.
# NOTE(review): the saved output includes a "pred" column, which is only created
# by the outlier-detection cell further down -- the cells were run out of order.
data.describe().astype(int)

Unnamed: 0,char_total,token_total,sent_total,pred,token_max_char_len,sent_max_char_len,lines_total
count,19393,19393,19393,19393,19393,19393,19393
mean,493,99,4,0,11,149,2
std,359,72,3,0,3,69,2
min,0,0,0,-1,0,0,1
25%,233,46,2,1,11,112,1
50%,433,87,4,1,12,147,3
75%,683,138,7,1,13,185,4
max,5804,1201,75,1,188,1701,42


In [36]:
# Feature matrix for outlier detection: only the per-synopsis statistics.
# The identifier and raw text are dropped first (a KeyError here means `data`
# was not built correctly).
# The columns that the detector cell writes back into `data` ("pred",
# "negative_outlier_factor_") are dropped as well. Otherwise re-running this
# cell after a fit would feed the previous predictions back in as features.
# errors="ignore" keeps the cell working on a fresh kernel, where those two
# columns do not exist yet.
stats = data.drop(columns=["href", "synopsis"]).drop(
    columns=["pred", "negative_outlier_factor_"],
    errors="ignore",
)

# Outlier detection

[The svm.OneClassSVM is known to be sensitive to outliers and thus does not perform very well for outlier detection](https://scikit-learn.org/stable/modules/outlier_detection.html#outlier-detection)

In [37]:
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest

In [41]:
# Flag atypical synopses with Local Outlier Factor over the statistics in `stats`.
clf = LocalOutlierFactor(n_neighbors=20)

# Columns written by an earlier run of this cell are excluded from the input, so
# a `stats` frame that was built after a previous fit cannot leak old
# predictions into the new one. errors="ignore" makes this a no-op when they
# are absent.
preds = clf.fit_predict(
    stats.drop(columns=["pred", "negative_outlier_factor_"], errors="ignore")
)

# fit_predict labels outliers as -1 and inliers as 1; for
# negative_outlier_factor_, lower values mean more abnormal samples.
data["pred"] = preds
data["negative_outlier_factor_"] = clf.negative_outlier_factor_

In [12]:
# data[data["pred"] == -1]["synopsis"].sample(10).to_list()

In [49]:
# The 20 synopses flagged as outliers (pred == -1) with the lowest
# negative_outlier_factor_, i.e. the first 20 after an ascending sort.
(
    data.loc[data["pred"] == -1]
    .sort_values("negative_outlier_factor_")["synopsis"]
    .head(20)
    .to_list()
)

['.',
 '.',
 '-',
 '.',
 '.',
 '.',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'NA',
 '押イおッ人高ラヘ致24格会ケロヲク放文イメラシ倍極ろぼるけ島判険おが会影のたト床用じフ索手工ふ決39客芝2会促縮繰しー。議づだーめ景能オマ囲索趣こイが車74客芝6交いかへづ談情そゆあん明臨ヲホセサ博位カフク逃転地掘ークゃ年柏沼烈翌け。樹ネエ未著艦ぎう請報ア時阪便テコス学医テニ限物うてを嘉一るなむあ豪権ノマ慢活ーひや頑受やきイょ制団ぴゅめ捕活そクぱ泡死スチメ法乏咋塞墨はラい。\n\n扱ドみば質削ぶな中説ケラリ田斉ロカシ月己もむトみ民合変が康6務的何名歳ツス節5断ヘカヌ食芸に長算却ぱむ。変テヲフヱ格漢せッ鳥日ロオ応留サ面医てゆりル投写ぴいだな交委チツテミ変田むルど平生トフ出安じめきる込候96坊訴テヤハ台動ヨワ礎府トと給工ぼめ軒賞そなも明軽アニモウ喬1親がぐめさ。\n\n稿あン批政セホ表会書ロチム紙景びたぜ業7都オヤハク媛格せごぶ断売ドれびげ測記果ス育変だた綿問いにぼ更年ぎルち問表こんだン禁7報王のねスが点53賀みなびそ。演こ医路紗フモイア自擬はぱお伊月へそ女一を祝観まよゃょ社健ナヨイ含歩児ハキトノ石記ろへ億狙円録フサニ雰聞イエ能人テノイメ夜56待ニ管思ヤ断輸ア大井計さッきあ。\n\n特サヨ京慎オヤツ治疑づんみリ東今れぴリフ優給すぜご問測ぞ覚1大さぞず掲犯ぴびちあ保幕チツモ能理横2科光っゆゃょ。発クホキ制日ヒ聴6透ユ財焼アヨム題渡政後ユ車分ルシ立開べ強橋はふご都29人血キホソ味殺うるイえ。読きりょ禁指ッねしゅ原実ざス情改さぜぎ計高タ国和とげほ予岸ト物索惑えず図根っぽつ被能遊案メス護好さこ代初ヒユセホ山各こたづむ本情わ久42前襲ヱ経下相らりづゃ。\n\n東おゃはぱ更部イッリ予応ぎば舞約的20改働り増弱ロト陸請オ高祖ゆー載供ホユロ毎校やただべ養更イぐ後票エテ申本セルスメ訓党そうぱ京更ノム聞仰巣はん。官さほぱル木用ゅろぐ関暮る真権ナメタネ事残ソヘトテ注自えくみ新代近ユ田亡げゅほ県提じ身89前ラツロ曲夜何じ。量毎4睡モハイホ量文齢士モノカ下67響イ今大ク上芸き文部ホテイ健手たぶ法査タロカ目込らじ南37乏咋ばどーざ。\n\n五みゅいは面出ーぽめ利投昭エ