In [1]:
import json
import numpy as np
import pandas as pd
from collections import Counter

from nltk import word_tokenize, sent_tokenize

from tqdm.auto import tqdm
# Register tqdm's `progress_apply` on pandas objects.
tqdm.pandas()

# Widen text columns in rendered frames to 200 characters.
pd.options.display.max_colwidth = 200

In [2]:
# Load the JSON file into `href2synopsis`; the name implies an
# href -> synopsis text mapping, and later cells use it as a dict.
with open("../data/href2synopsis.json", encoding="utf-8") as synopsis_file:
    href2synopsis = json.load(synopsis_file)

    Takeaways:
        - collapse runs of \n and \r into a single \n
        - use a language-specific tokenizer

# BOW stats

In [3]:
# One row per (href, synopsis) pair, built in a single constructor call.
data = pd.DataFrame(
    {
        "href": list(href2synopsis.keys()),
        "synopsis": list(href2synopsis.values()),
    }
)

In [14]:
# Keep only synopses that contain some text. Whitespace-only strings are
# dropped as well: they tokenize to zero tokens/sentences, which would make
# the ratio features further down divide by zero.
has_text = data["synopsis"].str.strip() != ""

# `.copy()` detaches the filtered frame from the original, so the column
# assignments in the following cells do not raise SettingWithCopyWarning.
data = data.loc[has_text].copy()

In [4]:
def _count_tokens(text):
    """Return the number of NLTK word tokens in ``text``."""
    return len(word_tokenize(text))


def _count_sentences(text):
    """Return the number of NLTK sentences in ``text``."""
    return len(sent_tokenize(text))


# Size of every synopsis at three granularities: characters, tokens, sentences.
synopses = data["synopsis"]
data["char_total"] = synopses.progress_apply(len)
data["token_total"] = synopses.progress_apply(_count_tokens)
data["sent_total"] = synopses.progress_apply(_count_sentences)

  0%|          | 0/19393 [00:00<?, ?it/s]

  0%|          | 0/19393 [00:00<?, ?it/s]

  0%|          | 0/19393 [00:00<?, ?it/s]

In [27]:
# Number of "\n"-separated lines: one more than the number of newline characters.
data["lines_total"] = data["synopsis"].progress_apply(
    lambda text: text.count("\n") + 1
)

  0%|          | 0/19393 [00:00<?, ?it/s]

In [5]:
def _longest(split_text):
    """Build a function measuring the longest piece that ``split_text`` yields.

    The returned function maps a string to the character length of its
    longest piece. Empty or whitespace-only strings map to 0 without
    calling ``split_text`` at all.
    """
    def _measure(text):
        if not text.strip():
            return 0
        return max(len(piece) for piece in split_text(text))

    return _measure


# Target column -> how the synopsis is cut into pieces.
_splitters = {
    "token_max_char_len": word_tokenize,
    "sent_max_char_len": sent_tokenize,
    "line_max_char_len": lambda text: text.split("\n"),
}

for _column, _split in _splitters.items():
    data[_column] = data["synopsis"].progress_apply(_longest(_split))

  0%|          | 0/19393 [00:00<?, ?it/s]

  0%|          | 0/19393 [00:00<?, ?it/s]

  0%|          | 0/19393 [00:00<?, ?it/s]

In [16]:
# Pairwise ratios between the three size measures, as
# `<numerator>_<denominator>_ratio` columns. The pairs are listed explicitly
# so the columns are created in a fixed, documented order.
ratio_pairs = [
    ("sent", "char"),
    ("sent", "token"),
    ("token", "char"),
    ("token", "sent"),
    ("char", "sent"),
    ("char", "token"),
]

# Vectorised column division replaces six separate Python-level passes over
# `itertuples()`. Unlike plain Python division, a zero denominator does not
# raise here: pandas yields inf (or NaN for 0/0) for that row.
for numerator, denominator in ratio_pairs:
    data[f"{numerator}_{denominator}_ratio"] = (
        data[f"{numerator}_total"] / data[f"{denominator}_total"]
    )

In [17]:
# Summary statistics of every numeric column, shown as whole numbers.
# NOTE: the integer cast truncates, so ratio columns whose values are below 1
# are displayed as 0 in this table.
summary_stats = data.describe()
summary_stats.astype(int)

Unnamed: 0,char_total,token_total,sent_total,token_max_char_len,sent_max_char_len,line_max_char_len,sent_char_ratio,sent_token_ratio,token_char_ratio,token_sent_ratio,char_sent_ratio,char_token_ratio
count,18374,18374,18374,18374,18374,18374,18374,18374,18374,18374,18374,18374
mean,521,105,5,12,157,393,0,0,0,20,102,4
std,349,70,3,2,61,230,0,0,0,7,39,1
min,1,1,1,1,1,1,0,0,0,1,1,1
25%,262,53,3,11,118,225,0,0,0,16,78,4
50%,454,92,5,12,151,364,0,0,0,19,97,4
75%,702,141,7,13,188,515,0,0,0,24,120,5
max,5804,1201,75,188,1701,2865,1,1,1,124,1701,130


In [18]:
# Columns that must never be fed to the outlier detector: the raw text
# columns, plus the detector's own outputs, which get written back into
# `data` further down. Excluding the latter keeps this cell idempotent:
# re-running it after the detector cell would otherwise leak the previous
# predictions into the features.
NON_FEATURE_COLUMNS = ["href", "synopsis", "pred", "negative_outlier_factor_"]

# errors="ignore": the detector output columns do not exist on the first run.
stats = data.drop(columns=NON_FEATURE_COLUMNS, errors="ignore")

# Outlier detection

[The svm.OneClassSVM is known to be sensitive to outliers and thus does not perform very well for outlier detection](https://scikit-learn.org/stable/modules/outlier_detection.html#outlier-detection)

In [19]:
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest

In [20]:
# Neighbourhood size used by the Local Outlier Factor detector.
LOF_N_NEIGHBORS = 20

# NOTE(review): `stats` is passed in unscaled. LOF is distance-based, so
# large-magnitude columns (e.g. char_total) likely dominate the small ratio
# columns; confirm whether that is intended.
clf = LocalOutlierFactor(n_neighbors=LOF_N_NEIGHBORS)
preds = clf.fit_predict(stats)

# Store the per-row label (-1 marks an outlier) next to the raw LOF score.
data["negative_outlier_factor_"] = clf.negative_outlier_factor_
data["pred"] = preds
data = data[
    [c for c in data.columns if c != "negative_outlier_factor_"]
    + ["negative_outlier_factor_"]
]

In [25]:
# data[data["pred"] == -1]["synopsis"].sample(10).to_list()