In [2]:
!pip install sentence-transformers umap-learn hdbscan scikit-learn regex numpy pandas tqdm



In [3]:
import re
import json
from pathlib import Path
from typing import List, Dict, Any
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import umap
import hdbscan
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

  $max \{ core_k(a), core_k(b), 1/\alpha d(a,b) \}$.


In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### setting up **lexicons**

In [5]:
VIOLENCE_WORDS = [r'\bkill(s|ed|ing)?\b', r'\bshoot(s|ed|ing)?\b', r'\bshot\b', r'\bstab(s|bed|bing)?\b',
                   r'\bknife\b', r'\bexplod(e|es|ed|ing)?\b', r'\bbang\b', r'\bgun(s)?\b', r'\bpistol\b',
                   r'\brifle\b', r'\bassassin\a', r'\bslaughter\b','blood','dead','corpse','torture']
GORE_WORDS = ['blood drenched','skin off','skinned','guts','entrails','brain','mangled','third degree burns']
SEX_WORDS = [r'\brape\b', r'\bsexual\b', r'\bnaked\b', r'\bbreast(s)?\b', r'\bintercourse\b']
PROFANITY = [r'\bfuck\b', r'\bshit\b', r'\bmotherfucker\b', r'\bbitch\b', r'\bсука\b', r'\bбляд\b']
DRUG_WORDS = [r'\bdrug(s)?\b', r'\bheroin\b', r'\bcocaine\b', r'\bmarijuana\b', r'\bpill(s)?\b', r'\bweed\b']
CHILD_PATTERN = [r'\bchild\b', r'\bboy\b', r'\bgirl\b', r'\bdaughter\b', r'\bson\b', r'\bteen\b', r'\b\[?[0-9]{1,2}\]?\b']

In [6]:
#compile_regex
VIOLENCE_RE = [re.compile(pat, flags=re.I) for pat in VIOLENCE_WORDS]
SEX_RE = [re.compile(pat, flags=re.I) for pat in SEX_WORDS]
PROF_RE = [re.compile(pat, flags=re.I) for pat in PROFANITY]
DRUG_RE = [re.compile(pat, flags=re.I) for pat in DRUG_WORDS]
CHILD_RE = [re.compile(pat, flags=re.I) for pat in CHILD_PATTERN]

### scene **parser**

In [7]:
def parse_script_to_scenes(txt: str) -> List[Dict[str,Any]]:
    scenes = []
    #split_by_typical_scene_headings_words
    parts = re.split(r'(?=(?:INT\.|EXT\.|SCENE HEADING:|scene_heading:))', txt, flags=re.I)
    idx = 0
    for p in parts:
        text = p.strip()
        if not text:
            continue
        heading_match = re.match(r'((?:INT\.|EXT\.).{0,120})', text, flags=re.I)
        heading = heading_match.group(1).strip() if heading_match else f"sc_{idx}"
        scenes.append({'scene_id': idx, 'heading': heading, 'text': text})
        idx += 1
    return scenes

### feature **extraction** per **scene**

In [8]:
def count_matches(regex_list, text):
    s=0
    for rx in regex_list:
        matches = rx.findall(text)
        if matches:
            s += len(matches)
    return s

def scene_feature_vector(scene_text: str) -> Dict[str, float]:
    txt = scene_text.lower()
    v_viol = count_matches(VIOLENCE_RE, txt)
    v_gore = sum(1 for pat in GORE_WORDS if pat in txt)
    v_sex = count_matches(SEX_RE, txt)
    v_prof = count_matches(PROF_RE, txt)
    v_drug = count_matches(DRUG_RE, txt)
    v_child = count_matches(CHILD_RE, txt)
    length = len(txt.split())
    return {
        'violence_count': v_viol,
        'gore_count': v_gore,
        'sex_count': v_sex,
        'profanity_count': v_prof,
        'drug_count': v_drug,
        'child_mentions': v_child,
        'length': length
    }

### **Embeddings**

In [9]:
MODEL_NAME = "all-MiniLM-L6-v2" #SentenceTransformer (пока пусть будет готовая)
embedder = SentenceTransformer(MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [10]:
def compute_scene_embeddings(scenes: List[Dict[str,Any]]):
    texts = [s['text'] for s in scenes]
    embs = embedder.encode(texts, show_progress_bar=True, convert_to_numpy=True)
    return embs

### Clustering (**UMAP + HDBSCAN**)

In [11]:
def cluster_embeddings(embs: np.ndarray, min_cluster_size=15):
    reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, metric='cosine', random_state=42)
    emb2d = reducer.fit_transform(embs)
    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, metric='euclidean', cluster_selection_method='eom')
    labels = clusterer.fit_predict(emb2d)
    return labels, emb2d, clusterer

### Map **cluster+features**

In [12]:
def score_scene_for_age(feat: Dict[str,float]) -> Dict[str,float]:
    v = feat['violence_count']
    gore = feat['gore_count']
    sex = feat['sex_count']
    prof = feat['profanity_count']
    drug = feat['drug_count']
    child = feat['child_mentions']
    length = max(1, feat['length'])
    violence_score = min(1.0, v/3.0 + gore*0.5)
    sex_score = min(1.0, sex/1.0)
    profanity_score = min(1.0, prof/2.0)
    drug_score = min(1.0, drug/1.0)
    child_risk = min(1.0, child * 0.5)
    return {
        'violence': violence_score,
        'gore': min(1.0, gore*0.6),
        'sex': sex_score,
        'profanity': profanity_score,
        'drugs': drug_score,
        'child_risk': child_risk
    }

In [13]:
def map_scores_to_rating(agg_scores: Dict[str,float]) -> Dict[str,Any]:
    if agg_scores['sex'] >= 0.8 or agg_scores['gore'] >= 0.8: #file-aggregated_maxima/amounts
        return {'rating':'18+','reasons':['graphic gore/sexual content']}
    if agg_scores['violence'] >= 0.7 and agg_scores['child_risk']>0:
        return {'rating':'18+','reasons':['severe violence involving children']}
    if agg_scores['violence'] >= 0.6 or agg_scores['gore'] >= 0.5:
        return {'rating':'16+','reasons':['explicit violence/murders']}
    if agg_scores['profanity'] >= 0.6 or agg_scores['drugs'] >= 0.5:
        return {'rating':'12+','reasons':['common swearing/drugs']}
    if agg_scores['violence'] >= 0.2:
        return {'rating':'12+','reasons':['some stage collisions/shooting']}
    return {'rating':'6+','reasons':['minimal questionable content']}

### Full **pipeline** by **file**

In [14]:
def analyze_script_file(path: str):
    txt = Path(path).read_text(encoding='utf-8', errors='ignore')
    scenes = parse_script_to_scenes(txt)
    feats = [scene_feature_vector(s['text']) for s in scenes]
    embs = compute_scene_embeddings(scenes)
    labels, emb2d, clusterer = cluster_embeddings(embs)
    #per-scene_scores
    scene_scores = [score_scene_for_age(f) for f in feats]
    # aggregate per-file (max)
    agg = {k: max([s[k] for s in scene_scores]) for k in scene_scores[0].keys()}
    #map_agg_to_rating
    rating_info = map_scores_to_rating(agg)
    #collect_top_triggering_scenes
    #sort_by_violence+gore+sex_contributions
    ranking = []
    for s, sc, scene in zip(scenes, scene_scores, scenes):
        weight = sc['violence']*0.5 + sc['gore']*0.8 + sc['sex']*0.9 + sc['profanity']*0.2 + sc['drugs']*0.3 + sc['child_risk']*1.0
        ranking.append((weight, s))
    ranking.sort(reverse=True, key=lambda x: x[0])
    top = []
    for w, scene in ranking[:5]:
        top.append({
            'scene_id': scene['scene_id'],
            'heading': scene['heading'],
            'sample_text': scene['text'][:600].replace('\n',' ')[:600],
            'weight': float(w)
        })
    result = {
        'file': str(path),
        'predicted_rating': rating_info['rating'],
        'reasons': rating_info['reasons'],
        'agg_scores': agg,
        'top_trigger_scenes': top
    }
    return result

In [16]:
if __name__ == '__main__':
    import sys
    p = "/content/drive/MyDrive/Colab Notebooks/dataset/BERT_annotations/My Best Friend s Wedding_0119738_anno.txt" #sys.argv[1]
    out = analyze_script_file(p)
    print(json.dumps(out, ensure_ascii=False, indent=2))

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

  warn(


{
  "file": "/content/drive/MyDrive/Colab Notebooks/dataset/BERT_annotations/My Best Friend s Wedding_0119738_anno.txt",
  "predicted_rating": "18+",
  "reasons": [
    "graphic gore/sexual content"
  ],
  "agg_scores": {
    "violence": 1.0,
    "gore": 0.6,
    "sex": 1.0,
    "profanity": 1.0,
    "drugs": 1.0,
    "child_risk": 1.0
  },
  "top_trigger_scenes": [
    {
      "scene_id": 19,
      "heading": "INT. DRESSING ROOM - LATER",
      "sample_text": "INT. DRESSING ROOM - LATER text: Julianne sits on the loveseat in her bra and panties, lighting one text: cigarette from the butt of another. Suddenly, she hears... text: ... an ARGUMENT outside the door. Strains to listen. It does sound text: like Michael and Kim. We can't make out the words. She jumps up, text: dashes for the door, STUBBING her toe on the platform, SHIT!, falls text: heavily AGAINST the door with a THUD that makes her wince, and... text: ... the argument stops. Damn. She opens the door a crack. Sees text: noth

