In [1]:
import json
import numpy as np
import pandas as pd
from collections import Counter

from nltk import word_tokenize, sent_tokenize

from tqdm.auto import tqdm
tqdm.pandas()

pd.set_option('display.max_colwidth', 200)

In [2]:
with open("../data/href2synopsis.json", encoding="utf-8") as f:
    href2synopsis = json.load(f)

    Takeaways:
        - replace \n or \r sequences with single \n
        - use language-specific tokenizer
        - when using max_count_token_ratio make sure do delete stopwords

# BOW stats
    # TODO: add actual bow stats

In [13]:
data = pd.DataFrame()

data["href"] = href2synopsis.keys()
data["synopsis"] = href2synopsis.values()

In [14]:
data = data[data["synopsis"] != ""]

In [15]:
import regex as re

data["synopsis"] = data["synopsis"].apply(
    lambda x: re.sub("\s*\n\s*", "\n", x).strip()
)

In [75]:
# data["min_ord"] = data["synopsis"].progress_apply(
#     lambda s: min([ord(char) for char in s])
# )

# data["max_ord"] = data["synopsis"].progress_apply(
#     lambda s: max([ord(char) for char in s])
# )

# data["mean_ord"] = data["synopsis"].progress_apply(
#     lambda s: np.mean([ord(char) for char in s])
# )

# data["median_ord"] = data["synopsis"].progress_apply(
#     lambda s: np.median([ord(char) for char in s])
# )

# data["mode_ord"] = data["synopsis"].progress_apply(
#     lambda s: Counter([ord(char) for char in s]).most_common()[0][0]
# )

In [16]:
# data["mode_count"] = data["synopsis"].progress_apply(
#     lambda s: Counter(s).most_common()[0][1]
# )

# data["mode_ratio"] = data["synopsis"].progress_apply(
#     lambda s: Counter(s).most_common()[0][1]/len(s)
# )

# data["mean_char_count"] = data["synopsis"].progress_apply(
#     lambda s: np.mean(list(Counter(s).values()))
# )

# data["mean_token_count"] = data["synopsis"].progress_apply(
#     lambda s: np.mean(list(Counter([w.lower() for w in word_tokenize(s)]).values()))
# )

data["max_count_token_ratio"] = data["synopsis"].progress_apply(
    lambda s: Counter([w.lower() for w in word_tokenize(s)]).most_common()[0][1]/len(word_tokenize(s))
)

  0%|          | 0/18374 [00:00<?, ?it/s]

In [12]:
data["char_total"] = data["synopsis"].progress_apply(len)
data["token_total"] = data["synopsis"].progress_apply(
    lambda x: len(word_tokenize(x))
)
data["sent_total"] = data["synopsis"].progress_apply(
    lambda x: len([sent for line in x.split("\n") for sent in  sent_tokenize(line) if line.strip()])
)

  0%|          | 0/18374 [00:00<?, ?it/s]

  0%|          | 0/18374 [00:00<?, ?it/s]

  0%|          | 0/18374 [00:00<?, ?it/s]

In [27]:
# data["lines_total"] = data["synopsis"].progress_apply(lambda x: len(x.split("\n")))

  0%|          | 0/19393 [00:00<?, ?it/s]

In [5]:
data["token_max_char_len"] = data["synopsis"].progress_apply(
    lambda x: max([len(t) for t in word_tokenize(x)]) if x.strip() else 0
)

data["sent_max_char_len"] = data["synopsis"].progress_apply(
    lambda x: max([len(t) for t in sent_tokenize(x)]) if x.strip() else 0
)

data["line_max_char_len"] = data["synopsis"].progress_apply(
    lambda x: max([len(t) for t in x.split("\n")]) if x.strip() else 0
)

  0%|          | 0/19393 [00:00<?, ?it/s]

  0%|          | 0/19393 [00:00<?, ?it/s]

  0%|          | 0/19393 [00:00<?, ?it/s]

In [16]:
data["sent_char_ratio"] = [t.sent_total/t.char_total for t in data.itertuples()]
data["sent_token_ratio"] = [t.sent_total/t.token_total for t in data.itertuples()]
data["token_char_ratio"] = [t.token_total/t.char_total for t in data.itertuples()]
data["token_sent_ratio"] = [t.token_total/t.sent_total for t in data.itertuples()]
data["char_sent_ratio"] = [t.char_total/t.sent_total for t in data.itertuples()]
data["char_token_ratio"] = [t.char_total/t.token_total for t in data.itertuples()]

In [20]:
data.describe().astype("float16")

Unnamed: 0,max_count_token_ratio
count,18368.0
mean,0.072083
std,0.037292
min,0.026321
25%,0.054932
50%,0.065796
75%,0.081055
max,1.0


In [23]:
stats = data.drop(columns=["href", "synopsis"]) #.drop(columns=["line_max_char_len"])

# Outlier detection

[The svm.OneClassSVM is known to be sensitive to outliers and thus does not perform very well for outlier detection](https://scikit-learn.org/stable/modules/outlier_detection.html#outlier-detection)

In [24]:
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest

In [25]:
clf = LocalOutlierFactor(n_neighbors=20)
preds = clf.fit_predict(stats)

data["pred"] = preds
data["negative_outlier_factor_"] = clf.negative_outlier_factor_

In [26]:
# data[data["pred"] == -1]["synopsis"].sample(10).to_list()

In [28]:
data[data["pred"] == -1].sort_values(by="negative_outlier_factor_"
                                    )[['href', 'synopsis', 'pred', 'negative_outlier_factor_']].head(20)

Unnamed: 0,href,synopsis,pred,negative_outlier_factor_
14125,/30050-got7ing,"Second installment of GOT7ing.\n""NO script, NO mission, NO game, NO penalty"" !",-1,-78380980.0
11526,/4350-dae-jo-yeong,"The drama tells the story of the life of Dae Jo Yeong, also known as the founder of the kingdom of Balhae.",-1,-69892090.0
10179,/26992-the-flowers-filled-the-palace-and-missed-the-time-season-1,"The story is about the love triangle between the fourth prince, the crown prince and the number one female scholar – ShuYuan during the Qing dynasty. With the war for love is also the war for the ...",-1,-65944140.0
8242,/30980-making-of-bt21,"The members of BTS take part in creating their beloved LINE characters - RJ, Shooky, Mang, Koya, Chimmy, Tata, Cooky and VAN.",-1,-64348910.0
12508,/681279-lodi-x,"""LODI X"" is a survival show created to find the best female idol group in Thailand. Bringing together 20 girl groups, they go through battles to decide the best group chosen by the jury. The show ...",-1,-64185900.0
6364,/698855-destined-fated,"Destined/Fated is a story of love, betrayal, family, and friendship.",-1,-64185900.0
13818,/10399-reply-1994-epilogue,Special episodes that show behind the scenes footage and the first meetings with the actors before the filming for the drama starts.,-1,-61520100.0
10096,/63403-the-moment-special,"This is the special episode or behind the scenes episode of the drama ""The Moment"".",-1,-60120720.0
14311,/8064-fake-fiction,About a con artist posing as a magician.,-1,-60120720.0
6940,/686593-nana,Adapted from the Japanese Manga of the same name,-1,-60120720.0
