In [1]:
import json
import numpy as np
import pandas as pd
from collections import Counter

from nltk import word_tokenize, sent_tokenize

from tqdm.auto import tqdm
# Register tqdm with pandas; the cells below call `progress_apply` on Series.
tqdm.pandas()

# Show up to 200 characters per column so synopsis text stays readable in tables.
pd.set_option('display.max_colwidth', 200)

In [2]:
# Read the href -> synopsis mapping from the UTF-8 encoded JSON dump.
with open("../data/href2synopsis.json", encoding="utf-8") as synopsis_file:
    href2synopsis = json.loads(synopsis_file.read())

    Takeaways:
        - collapse runs of \n or \r (together with surrounding whitespace) into a single \n
        - use a language-specific tokenizer
        - when using max_count_token_ratio, make sure to remove stopwords first
            (unless you specifically wish to locate examples with too many stopwords)
        - 

# BOW stats
    # TODO: add actual bow stats

In [3]:
# One row per entry of the mapping: the key goes to "href", the value to "synopsis".
data = pd.DataFrame(
    {
        "href": list(href2synopsis.keys()),
        "synopsis": list(href2synopsis.values()),
    }
)

In [4]:
# Drop rows that carry no usable text.
# - Compare the *stripped* text: a whitespace-only synopsis would otherwise pass
#   this filter, be reduced to "" by the normalisation cell, and then break the
#   per-character statistics (min()/max() of an empty sequence).
# - `.copy()` makes `data` an independent frame, so the column assignments in
#   the following cells do not write into a slice of the original frame.
data = data[data["synopsis"].str.strip() != ""].copy()

In [5]:
import regex as re

# Collapse every newline, together with the whitespace around it, into a single
# "\n", then trim the ends of the text.
# The pattern is a raw string: in a plain literal "\s" is an invalid escape
# sequence and triggers a warning on recent Python versions.
# NOTE(review): a run of "\r" that contains no "\n" is not matched by this
# pattern and is left untouched.
NEWLINE_RUN = re.compile(r"\s*\n\s*")

data["synopsis"] = data["synopsis"].apply(
    lambda text: NEWLINE_RUN.sub("\n", text).strip()
)

In [7]:
# Per-synopsis statistics over the Unicode code points of its characters.
# Each entry maps the output column to the reducer applied to the list of
# code points; the columns are created in the order listed here.
ORD_REDUCERS = {
    "min_ord": min,
    "max_ord": max,
    "mean_ord": np.mean,
    "median_ord": np.median,
    # Most frequent code point.
    "mode_ord": lambda code_points: Counter(code_points).most_common()[0][0],
}

for ord_column, reduce_ords in ORD_REDUCERS.items():
    data[ord_column] = data["synopsis"].progress_apply(
        lambda text, reduce_ords=reduce_ords: reduce_ords([ord(ch) for ch in text])
    )

  0%|          | 0/18374 [00:00<?, ?it/s]

  0%|          | 0/18374 [00:00<?, ?it/s]

  0%|          | 0/18374 [00:00<?, ?it/s]

  0%|          | 0/18374 [00:00<?, ?it/s]

  0%|          | 0/18374 [00:00<?, ?it/s]

In [8]:
def _top_count(counts):
    """Return the highest count stored in ``counts`` (a non-empty Counter)."""
    return counts.most_common(1)[0][1]


def _lowercased_token_counts(text):
    """Tokenize ``text`` once.

    Returns a pair ``(counts, total)``: a Counter of the lower-cased tokens and
    the total number of tokens produced by ``word_tokenize``.
    """
    tokens = word_tokenize(text)
    return Counter(token.lower() for token in tokens), len(tokens)


def _max_count_token_ratio(text):
    """Share of all tokens taken by the single most frequent lower-cased token."""
    counts, total = _lowercased_token_counts(text)
    # Same convention as max_count_token_ratio_no_stops: nothing counted -> 0.
    if not counts:
        return 0.
    return _top_count(counts) / total


# Character-level statistics.
data["mode_count"] = data["synopsis"].progress_apply(
    lambda s: _top_count(Counter(s))
)

data["mode_ratio"] = data["synopsis"].progress_apply(
    lambda s: _top_count(Counter(s)) / len(s)
)

data["mean_char_count"] = data["synopsis"].progress_apply(
    lambda s: np.mean(list(Counter(s).values()))
)

# Token-level statistics; each synopsis is tokenized once per column instead of
# twice for the ratio.
data["mean_token_count"] = data["synopsis"].progress_apply(
    lambda s: np.mean(list(_lowercased_token_counts(s)[0].values()))
)

data["max_count_token_ratio"] = data["synopsis"].progress_apply(_max_count_token_ratio)

  0%|          | 0/18374 [00:00<?, ?it/s]

  0%|          | 0/18374 [00:00<?, ?it/s]

  0%|          | 0/18374 [00:00<?, ?it/s]

  0%|          | 0/18374 [00:00<?, ?it/s]

  0%|          | 0/18374 [00:00<?, ?it/s]

In [9]:
from stopwordsiso import stopwords

In [10]:
def max_count_token_ratio_no_stops(text, stopwords=stopwords("en")):
    """Ratio of the most frequent non-stopword token to the total token count.

    Tokens come from ``word_tokenize`` and are lower-cased before being checked
    against ``stopwords`` (any container supporting ``in``). The denominator is
    the number of *all* tokens, stopwords included. Returns ``0.`` when no
    token survives the stopword filter.
    """
    tokens = word_tokenize(text)
    content_counts = Counter(
        lowered
        for lowered in (token.lower() for token in tokens)
        if lowered not in stopwords
    )
    if len(content_counts) == 0:
        return 0.
    (_, top_count), = content_counts.most_common(1)
    return top_count / len(tokens)

# NOTE(review): the commented-out call below refers to `max_count_token_ratio_stops`,
# which is not defined in the visible part of this notebook.
# data["max_count_token_ratio_stops"] = data["synopsis"].progress_apply(max_count_token_ratio_stops)
# Per-synopsis ratio, computed with the function's default `stopwords` argument.
data["max_count_token_ratio_no_stops"] = data["synopsis"].progress_apply(max_count_token_ratio_no_stops)

  0%|          | 0/18374 [00:00<?, ?it/s]

In [11]:
def _count_sentences(text):
    """Count sentences line by line, ignoring lines that are blank once stripped."""
    return sum(
        len(sent_tokenize(line))
        for line in text.split("\n")
        if line.strip()
    )


# Size of each synopsis in characters, tokens and sentences.
data["char_total"] = data["synopsis"].progress_apply(len)
data["token_total"] = data["synopsis"].progress_apply(
    lambda text: len(word_tokenize(text))
)
data["sent_total"] = data["synopsis"].progress_apply(_count_sentences)

  0%|          | 0/18374 [00:00<?, ?it/s]

  0%|          | 0/18374 [00:00<?, ?it/s]

  0%|          | 0/18374 [00:00<?, ?it/s]

In [27]:
# data["lines_total"] = data["synopsis"].progress_apply(lambda x: len(x.split("\n")))

  0%|          | 0/19393 [00:00<?, ?it/s]

In [12]:
# Length (in characters) of the longest token, sentence and line of each
# synopsis. Each entry maps the output column to the function that splits the
# text into pieces; blank text yields 0 without calling the splitter.
PIECE_SPLITTERS = {
    "token_max_char_len": word_tokenize,
    "sent_max_char_len": sent_tokenize,
    "line_max_char_len": lambda text: text.split("\n"),
}

for length_column, split_text in PIECE_SPLITTERS.items():
    data[length_column] = data["synopsis"].progress_apply(
        lambda text, split_text=split_text: (
            max(len(piece) for piece in split_text(text)) if text.strip() else 0
        )
    )

  0%|          | 0/18374 [00:00<?, ?it/s]

  0%|          | 0/18374 [00:00<?, ?it/s]

  0%|          | 0/18374 [00:00<?, ?it/s]

In [13]:
# Pairwise ratios between the sentence, token and character totals:
# "<a>_<b>_ratio" = <a>_total / <b>_total.
# Computed as vectorised column division instead of six row-by-row Python
# loops over itertuples(). Column order matches the listing below.
# Note: a zero denominator yields inf/NaN here rather than raising
# ZeroDivisionError.
RATIO_PAIRS = [
    ("sent", "char"),
    ("sent", "token"),
    ("token", "char"),
    ("token", "sent"),
    ("char", "sent"),
    ("char", "token"),
]

for numerator, denominator in RATIO_PAIRS:
    data[f"{numerator}_{denominator}_ratio"] = (
        data[f"{numerator}_total"] / data[f"{denominator}_total"]
    )

In [15]:
# Summary statistics of the computed feature columns, rounded to two decimals.
data.describe().round(2)

Unnamed: 0,min_ord,max_ord,mean_ord,median_ord,mode_ord,mode_count,mode_ratio,mean_char_count,mean_token_count,max_count_token_ratio,...,sent_total,token_max_char_len,sent_max_char_len,line_max_char_len,sent_char_ratio,sent_token_ratio,token_char_ratio,token_sent_ratio,char_sent_ratio,char_token_ratio
count,18374.0,18374.0,18374.0,18374.0,18374.0,18374.0,18374.0,18374.0,18374.0,18374.0,...,18374.0,18374.0,18374.0,18374.0,18374.0,18374.0,18374.0,18374.0,18374.0,18374.0
mean,17.55,5485.71,164.34,104.31,33.29,88.63,0.17,12.25,1.43,0.07,...,5.44,12.32,157.41,392.94,0.01,0.06,0.2,19.92,99.24,4.97
std,10.47,11839.38,329.14,95.17,90.66,61.0,0.02,6.83,0.24,0.04,...,3.64,2.94,61.0,230.86,0.02,0.04,0.02,6.83,36.1,1.01
min,9.0,45.0,45.0,45.0,32.0,1.0,0.02,1.0,1.0,0.03,...,1.0,1.0,1.0,1.0,0.0,0.01,0.01,1.0,1.0,1.0
25%,10.0,121.0,91.13,103.0,32.0,44.0,0.16,7.16,1.25,0.05,...,3.0,11.0,118.0,225.0,0.01,0.04,0.19,15.5,76.33,4.73
50%,10.0,121.0,92.31,104.0,32.0,77.0,0.17,11.2,1.42,0.07,...,5.0,12.0,151.0,363.5,0.01,0.05,0.2,19.0,94.44,4.95
75%,32.0,8217.0,108.58,104.0,32.0,120.0,0.18,16.08,1.59,0.08,...,7.0,13.0,188.0,515.0,0.01,0.06,0.21,23.12,116.0,5.18
max,65.0,65374.0,18556.63,12516.0,12290.0,1017.0,1.0,93.53,2.84,1.0,...,79.0,188.0,1692.0,2865.0,1.0,1.0,1.0,118.0,727.0,130.15


In [11]:
# Feature matrix for outlier detection: everything except the identifier and
# the raw text. The outlier-detection cell below writes "pred" and
# "negative_outlier_factor_" back into `data`; they are dropped here as well
# (when present) so that re-running this cell after a fit does not feed the
# previous predictions back in as features.
stats = (
    data
    .drop(columns=["href", "synopsis"])
    .drop(columns=["pred", "negative_outlier_factor_"], errors="ignore")
) #.drop(columns=["line_max_char_len"])

# Outlier detection

[The svm.OneClassSVM is known to be sensitive to outliers and thus does not perform very well for outlier detection](https://scikit-learn.org/stable/modules/outlier_detection.html#outlier-detection)

In [12]:
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest

In [13]:
# Fit a Local Outlier Factor model (20 neighbours) on the feature matrix.
# NOTE(review): `stats` is passed unscaled and its columns span very different
# ranges (see the describe() output above); presumably the large-valued columns
# dominate the neighbour distances — consider standardising first. TODO confirm.
clf = LocalOutlierFactor(n_neighbors=20)
preds = clf.fit_predict(stats)

# Store both outputs next to the text; rows with pred == -1 are the ones
# inspected as outliers in the cells below.
data["pred"] = preds
data["negative_outlier_factor_"] = clf.negative_outlier_factor_

In [14]:
# data[data["pred"] == -1]["synopsis"].sample(10).to_list()

In [17]:
# The 50 flagged rows (pred == -1) with the shortest synopsis; ties on length
# are ordered by ascending negative_outlier_factor_.
(
    data.loc[data["pred"] == -1]
    .assign(len=lambda flagged: flagged["synopsis"].apply(len))
    .sort_values(by=["len", "negative_outlier_factor_"])
    .loc[:, ['href', 'synopsis', 'pred', 'negative_outlier_factor_']]
    .head(50)
)

Unnamed: 0,href,synopsis,pred,negative_outlier_factor_
16056,/12266-joshikou-keisatsu,-,-1,-2.241926
17836,/4896-uniform-survigirl-ii,.,-1,-2.241926
17837,/4895-uniform-survigirl-i,.,-1,-2.241926
18061,/59645-monsta-x-amigo-tv-season-1,.,-1,-2.241926
19248,/11085-the-inspector-wear-skirts-iv,.,-1,-2.241926
19338,/14347-inside-architecture-a-challenge-to-japanese-society,.,-1,-2.241926
14367,/21378-forever-young-2,,-1,-15.61669
9567,/27070-facebook-caf,,-1,-2.241926
11721,/14783-sympathy,,-1,-2.241926
13349,/24017-wifi-society-the-horror-home,,-1,-2.241926


In [16]:
# data.assign(emperor_ratio=data["synopsis"].progress_apply(
#     lambda s: len(re.findall("emperor", s.lower()))/len(word_tokenize(s))
# )).sort_values(by="emperor_ratio", ascending=False).head(10)["synopsis"].to_list()