In [1]:
import json
import numpy as np
import pandas as pd
from collections import Counter

from nltk import word_tokenize, sent_tokenize

from tqdm.auto import tqdm
# Register the `progress_*` methods (e.g. `progress_apply`) on pandas objects;
# the row-wise feature computations below rely on them.
tqdm.pandas()

# Synopses are long: let up to 200 characters of a cell show when a frame is displayed.
pd.options.display.max_colwidth = 200

In [2]:
# Load the href -> synopsis mapping; the file is read as UTF-8 text and
# parsed as JSON in one go.
with open("../data/href2synopsis.json", encoding="utf-8") as synopsis_file:
    href2synopsis = json.loads(synopsis_file.read())

    Takeaways:
        - collapse each run of \n / \r line breaks into a single \n
        - use a language-specific tokenizer

# BOW stats

In [3]:
# One row per entry of the mapping: the key becomes `href`, the value `synopsis`.
data = pd.DataFrame(
    list(href2synopsis.items()),
    columns=["href", "synopsis"],
)

In [4]:
# Drop rows without any synopsis text.
# - Compare the *stripped* text: a whitespace-only synopsis is not equal to "",
#   but the newline normalisation below strips it down to an empty string,
#   which then breaks min()/max() and the ratio divisions further down.
# - `.copy()` makes the result an independent frame, so the column assignments
#   in later cells do not trigger SettingWithCopyWarning.
# The original index labels are kept on purpose (no reset_index).
data = data[data["synopsis"].str.strip() != ""].copy()

In [8]:
import regex as re

# A run of whitespace that contains at least one line break (\n or \r).
# Raw string: the previous non-raw "\s" was an invalid string escape.
# "\r" is included so that CRLF and lone-CR line endings collapse to "\n" too,
# as the takeaways note above asks for.
LINE_BREAK_RUN = re.compile(r"\s*[\r\n]\s*")

# Collapse every such run into a single "\n" and trim the ends of the text.
data["synopsis"] = data["synopsis"].apply(
    lambda text: LINE_BREAK_RUN.sub("\n", text).strip()
)

In [20]:
ORD_STAT_COLUMNS = ["min_ord", "max_ord", "mean_ord", "median_ord", "mode_ord"]


def _code_point_stats(text):
    """Summarise the Unicode code points of ``text``.

    Args:
        text: a non-empty string.

    Returns:
        dict with the keys in ``ORD_STAT_COLUMNS``: minimum, maximum, mean,
        median and most frequent code point (ties go to the code point that
        occurs first in ``text``).

    Raises:
        ValueError: if ``text`` is empty (there is nothing to summarise).
    """
    if not text:
        raise ValueError("cannot compute code point statistics of an empty synopsis")

    # Convert once and reuse the list for all five statistics.
    code_points = [ord(char) for char in text]
    return {
        "min_ord": min(code_points),
        "max_ord": max(code_points),
        "mean_ord": np.mean(code_points),
        "median_ord": np.median(code_points),
        "mode_ord": Counter(code_points).most_common(1)[0][0],
    }


# Single pass over the synopses (one progress bar instead of five).
ord_stats = pd.DataFrame(
    data["synopsis"].progress_apply(_code_point_stats).tolist(),
    index=data.index,
    columns=ORD_STAT_COLUMNS,
)

# Column-by-column assignment keeps the cell re-runnable: existing columns are
# simply overwritten.
for column in ORD_STAT_COLUMNS:
    data[column] = ord_stats[column]

  0%|          | 0/18374 [00:00<?, ?it/s]

  0%|          | 0/18374 [00:00<?, ?it/s]

  0%|          | 0/18374 [00:00<?, ?it/s]

  0%|          | 0/18374 [00:00<?, ?it/s]

  0%|          | 0/18374 [00:00<?, ?it/s]

In [12]:
def _count_tokens(text):
    """Number of tokens `word_tokenize` produces for ``text``."""
    return len(word_tokenize(text))


def _count_sentences(text):
    """Number of sentences in ``text``, tokenising each non-blank line separately."""
    return sum(
        len(sent_tokenize(line))
        for line in text.split("\n")
        if line.strip()
    )


# Size of every synopsis in characters, tokens and sentences.
data["char_total"] = data["synopsis"].progress_apply(len)
data["token_total"] = data["synopsis"].progress_apply(_count_tokens)
data["sent_total"] = data["synopsis"].progress_apply(_count_sentences)

  0%|          | 0/18374 [00:00<?, ?it/s]

  0%|          | 0/18374 [00:00<?, ?it/s]

  0%|          | 0/18374 [00:00<?, ?it/s]

In [27]:
# data["lines_total"] = data["synopsis"].progress_apply(lambda x: len(x.split("\n")))

  0%|          | 0/19393 [00:00<?, ?it/s]

In [5]:
def _longest_piece(text, split):
    """Character length of the longest piece ``split(text)`` yields.

    Blank text (empty or whitespace only) short-circuits to 0 without calling
    ``split`` at all.
    """
    if not text.strip():
        return 0
    return max(map(len, split(text)))


# Longest token, sentence and line of every synopsis, in characters.
# Note: sentences are tokenised over the whole text here, not line by line.
data["token_max_char_len"] = data["synopsis"].progress_apply(
    lambda text: _longest_piece(text, word_tokenize)
)

data["sent_max_char_len"] = data["synopsis"].progress_apply(
    lambda text: _longest_piece(text, sent_tokenize)
)

data["line_max_char_len"] = data["synopsis"].progress_apply(
    lambda text: _longest_piece(text, lambda body: body.split("\n"))
)

  0%|          | 0/19393 [00:00<?, ?it/s]

  0%|          | 0/19393 [00:00<?, ?it/s]

  0%|          | 0/19393 [00:00<?, ?it/s]

In [16]:
# All six pairwise ratios between the sentence, token and character counts,
# created in the same column order as before.
# Vectorised column division replaces six row-by-row `itertuples()` passes and
# gives the same float64 values for non-zero counts.
# NOTE: a zero denominator now yields inf/NaN in that row instead of raising
# ZeroDivisionError; counts are expected to be >= 1 for non-empty synopses.
RATIO_PAIRS = [
    ("sent", "char"),
    ("sent", "token"),
    ("token", "char"),
    ("token", "sent"),
    ("char", "sent"),
    ("char", "token"),
]

for numerator, denominator in RATIO_PAIRS:
    data[f"{numerator}_{denominator}_ratio"] = (
        data[f"{numerator}_total"] / data[f"{denominator}_total"]
    )

In [13]:
# Summary statistics of the numeric columns, cast to int for a compact
# display (fractions are truncated, not rounded).
(
    data
    .describe()
    .astype(int)
)

Unnamed: 0,char_total,token_total,sent_total
count,18374,18374,18374
mean,519,105,5
std,348,70,3
min,1,1,1
25%,262,53,3
50%,454,92,5
75%,700,141,7
max,5799,1201,79


In [14]:
# Feature matrix for the outlier detectors: every column except the
# identifier and the raw text (these two must exist, so a missing one still
# raises KeyError).
# The detector outputs written back into `data` further down are dropped as
# well when present; without this, re-running the cell after the outlier step
# would feed the previous predictions back in as features.
stats = (
    data
    .drop(columns=["href", "synopsis"])
    .drop(columns=["pred", "negative_outlier_factor_"], errors="ignore")
)

# Outlier detection

[The svm.OneClassSVM is known to be sensitive to outliers and thus does not perform very well for outlier detection](https://scikit-learn.org/stable/modules/outlier_detection.html#outlier-detection)

In [15]:
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest

In [16]:
# Fit a Local Outlier Factor model on the feature matrix and label each row
# in the same call.
clf = LocalOutlierFactor(n_neighbors=20)
preds = clf.fit_predict(stats)

# Store the label and the raw score next to the text so flagged synopses can
# be inspected.
lof_outputs = {
    "pred": preds,
    "negative_outlier_factor_": clf.negative_outlier_factor_,
}
for column_name, column_values in lof_outputs.items():
    data[column_name] = column_values

In [17]:
# data[data["pred"] == -1]["synopsis"].sample(10).to_list()

In [18]:
# The 20 rows with the lowest (most negative) outlier factor, sorted ascending.
(
    data
    .sort_values("negative_outlier_factor_")
    .loc[:, ["href", "synopsis", "pred", "negative_outlier_factor_"]]
    .head(20)
)

Unnamed: 0,href,synopsis,pred,negative_outlier_factor_
9387,/56367-test-content-season-4,押イおッ人高ラヘ致24格会ケロヲク放文イメラシ倍極ろぼるけ島判険おが会影のたト床用じフ索手工ふ決39客芝2会促縮繰しー。議づだーめ景能オマ囲索趣こイが車74客芝6交いかへづ談情そゆあん明臨ヲホセサ博位カフク逃転地掘ークゃ年柏沼烈翌け。樹ネエ未著艦ぎう請報ア時阪便テコス学医テニ限物うてを嘉一るなむあ豪権ノマ慢活ーひや頑受やきイょ制団ぴゅめ捕活そクぱ泡死スチメ法乏咋塞墨はラい。\n扱ドみば質削...,-1,-11.101223
6767,/35149-yo-nimo-kimyou-na-monogatari-2019-rain-special,"***Eien no Hero\nWith the advances in science, the world has developed cyborg technology that draws from the best in human abilities. Monsters abuse this technology, threatening the lives and safe...",-1,-4.931302
6765,/701325-yo-nimo-kimyou-na-monogatari-2021-fall-special,"Story 01 - Die in 15 Seconds\nAt midnight, the pharmacist Mikami Megumi, who was working in the drug room of the clinic, suddenly got stuck. And in front of her, a red splash and a stationary bull...",-1,-4.135296
9622,/27458-gay-out-soon-3-some,"A South Korean short film collection:\n• Some (10 mins, 2014)\n• Turquoise Sky (15 mins, 2014)\n• Green light (8 mins, 2014)\n• One Summer Night (10 mins, 2014)\n• The Name Game (7 mins, 2014)\n• ...",-1,-3.984794
6876,/15101-mr.-x-and-i,"Four different gay love stories based on real-life events: ""Us Against the World"" (Ep. 1), ""Beijing Beijing"" (Ep. 2), ""The Groomsman"" (Ep. 3), and ""Promise You a City"" (Ep. 4).",-1,-3.703648
3440,/40105-gank-your-heart-special,The special clips tell seven stories that happen after the finale:\n1. New Legends team members\n2. Pei Xi & Summer\n3. Lin Yi Xuan & Lu Yi Yi\n4. Mi Ya\n5. Sun Ze Yi & Shu Wen\n6. Xia Ling\n7. Ji...,-1,-3.485384
14027,/5696-13-real-asian-horror-stories,Scary stories based on true Japanese incidents.\nStory 1 - Only two of us\nStory 2 - Three people are coming\nStory 3 - Copy Machine\nStory 4 - Blue Raincoat\nStory 5 - Animal Odour\nStory 6 - Sle...,-1,-3.390385
7797,/735329-revival,"Documentary using reenactments to explore the potential of beneficial microorganisms in restoring the environment. The documentary focuses on the effects of microbes such as lactic acid bacteria, ...",-1,-3.382842
17921,/29385-test-title-22341,"Donec id justo. Etiam ultricies nisi vael augue. Nulla porta dolor. In hac habitasse platea dictumst. Cras id dui. test\nIn dui magna, posuere eget, vestibulum et, tempor auctor, justo. Nullam acc...",-1,-3.239255
4185,/734799-honto-ni-atta-kowai-hanashi-summer-special-2022,"Hijou Tsuhou\nTakahashi Ryota, a new security guard working for a security company, is doing the night shift with a senior colleague Nishida Tetsushi when an emergency call comes in from a centre ...",-1,-3.138633
