## named entity recognition

In [1]:
import spacy
from nltk import sent_tokenize

In [2]:
!python -m spacy download en_core_web_md


Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
     ---------------------------------------- 0.0/33.5 MB ? eta -:--:--
     ---------------------------------------- 0.3/33.5 MB ? eta -:--:--
     - -------------------------------------- 1.0/33.5 MB 6.6 MB/s eta 0:00:05
     -- ------------------------------------- 2.1/33.5 MB 4.8 MB/s eta 0:00:07
     --- ------------------------------------ 2.6/33.5 MB 4.6 MB/s eta 0:00:07
     --- ------------------------------------ 3.1/33.5 MB 3.7 MB/s eta 0:00:09
     ---- ----------------------------------- 3.4/33.5 MB 3.0 MB/s eta 0:00:10
     ---- ----------------------------------- 3.4/33.5 MB 3.0 MB/s eta 0:00:10
     ---- ----------------------------------- 3.4/33.5 MB 3.0 MB/s eta 0:00:10
     ---- ----------------------------------- 3.4/33.5 MB 3.0 MB/s eta 0:00:10
     ---- ----------------------------------- 3.

## load model

In [3]:
def load_model():
    nlp = spacy.load("en_core_web_md")
    return nlp

In [4]:
nlp_model = load_model()

  from .autonotebook import tqdm as notebook_tqdm


## load model

In [5]:
import os 
import sys
import pathlib
folder_path = pathlib.Path().parent.resolve()
sys.path.append(os.path.join(folder_path, '../'))
from utils import load_subtitles_dataset

In [6]:
dataset_path = "../data/Subtitles-cleaned/"
df = load_subtitles_dataset(dataset_path)

In [25]:
df.head()

Unnamed: 0,season,episode,script
0,1,1,"Oh, my god. Christ! Shit. Oh, God. Oh, my God...."
1,1,2,"Are you okay? - You are a lifesaver. - Yeah, m..."
2,1,3,<i>Let's break it down. </i> Hydrogen. What do...
3,1,4,Operation Icebreaker. - How we liking that? - ...
4,1,5,"Here's my rsum. I mean, technically it says ""c..."


In [26]:
sample_script = df.iloc[0]['script']
sample_script

'Oh, my god. Christ! Shit. Oh, God. Oh, my God. Oh, my God. Oh, my God. Think, think, think. Oh, my gosh. Okay. Come on, come on, come on. Come on. My name is Walter Hartwell White. I live at 308 Negra Arroyo Lane, Albuquerque, New Mexico, 87104. To all law-enforcement entities, this is not an admission of guilt. I am speaking to my family now. Skyler. You are the love of my life. I hope you know that. Walter Jr. You\'re my big man. There are... There are going to be some things... Things that you\'ll come to learn about me... ...in the next few days. I just want you to know that no matter how it may look... ...I only had you in my heart. Goodbye. <i>Money before the panel earlier this year. </i> Happy birthday. Look at that. That is veggie bacon. Believe it or not. Zero cholesterol... ...and you won\'t even taste the difference. What time do you think you\'ll be home? Same time. I don\'t want him dicking you around tonight. You get paid till 5, you work till 5. No later. Hey. - Hey, h

In [27]:
sentences = sent_tokenize(sample_script)

In [28]:
sentences = sentences[0:100]


In [29]:
sentence = ".".join(sentences)


In [30]:
sentence


"Oh, my god..Christ!.Shit..Oh, God..Oh, my God..Oh, my God..Oh, my God..Think, think, think..Oh, my gosh..Okay..Come on, come on, come on..Come on..My name is Walter Hartwell White..I live at 308 Negra Arroyo Lane, Albuquerque, New Mexico, 87104..To all law-enforcement entities, this is not an admission of guilt..I am speaking to my family now..Skyler..You are the love of my life..I hope you know that..Walter Jr. You're my big man..There are....There are going to be some things....Things that you'll come to learn about me... ...in the next few days..I just want you to know that no matter how it may look.......I only had you in my heart..Goodbye..<i>Money before the panel earlier this year..</i> Happy birthday..Look at that..That is veggie bacon..Believe it or not..Zero cholesterol... ...and you won't even taste the difference..What time do you think you'll be home?.Same time..I don't want him dicking you around tonight..You get paid till 5, you work till 5..No later..Hey..- Hey, happy 

## run the model

In [31]:
doc = nlp_model(sentence)


In [32]:
doc.ents


(Walter Hartwell,
 308,
 Negra,
 Albuquerque,
 New Mexico,
 87104,
 Skyler,
 Walter Jr.,
 the next few days,
 earlier this year,
 Zero,
 tonight,
 5,
 5,
 You're,
 first,
 million-billionth,
 We're,
 Chad,
 Chapter six,
 One,
 two,
 three,
 10,
 10,
 20)

In [33]:
for entity in doc.ents:
    print(entity, entity.label_)

Walter Hartwell PERSON
308 CARDINAL
Negra NORP
Albuquerque GPE
New Mexico GPE
87104 DATE
Skyler PERSON
Walter Jr. PERSON
the next few days DATE
earlier this year DATE
Zero CARDINAL
tonight TIME
5 DATE
5 CARDINAL
You're PERSON
first ORDINAL
million-billionth QUANTITY
We're PERSON
Chad PERSON
Chapter six LAW
One CARDINAL
two CARDINAL
three CARDINAL
10 CARDINAL
10 CARDINAL
20 CARDINAL


In [61]:
def get_ners_inference(script):
    # Character mapping integrated inside
    character_mapping = {
        # Walter White variations
        'walt': 'Walter White',
        'walter': 'Walter White', 
        'white': 'Walter White',
        'walt white': 'Walter White',
        'walter white': 'Walter White',
        'mr white': 'Walter White',
        'heisenberg': 'Walter White',
        
        # Jesse Pinkman variations  
        'jesse': 'Jesse Pinkman',
        'pinkman': 'Jesse Pinkman',
        'jesse pinkman': 'Jesse Pinkman',
        
        # Skyler White variations
        'skyler': 'Skyler White',
        'sky': 'Skyler White',
        'skyler white': 'Skyler White',
        
        # Hank Schrader variations
        'hank': 'Hank Schrader',
        'schrader': 'Hank Schrader',
        'hank schrader': 'Hank Schrader',
        
        # Marie Schrader variations
        'marie': 'Marie Schrader',
        'marie schrader': 'Marie Schrader',
        
        # Walter Jr variations
        'walter jr': 'Walter Jr',
        'walt jr': 'Walter Jr',
        'flynn': 'Walter Jr',
        
        # Saul Goodman variations
        'saul': 'Saul Goodman',
        'goodman': 'Saul Goodman',
        'saul goodman': 'Saul Goodman',
        'jimmy': 'Saul Goodman',
        'jimmy mcgill': 'Saul Goodman',
        
        # Gus Fring variations
        'gus': 'Gus Fring',
        'fring': 'Gus Fring',
        'gus fring': 'Gus Fring',
        'gustavo': 'Gus Fring',
        
        # Mike Ehrmantraut variations
        'mike': 'Mike Ehrmantraut',
        'ehrmantraut': 'Mike Ehrmantraut',
        'mike ehrmantraut': 'Mike Ehrmantraut',
    }

    script_sentences = sent_tokenize(script)
    ner_output = []

    for sentence in script_sentences:
        doc = nlp_model(sentence)
        ners = set()
        for entity in doc.ents:
            if entity.label_ == "PERSON":
                full_name = entity.text.strip().lower()

                # Normalize using mapping
                if full_name in character_mapping:
                    mapped_name = character_mapping[full_name]
                else:
                    # fallback on first token
                    first_name = full_name.split(" ")[0]
                    mapped_name = character_mapping.get(first_name, entity.text.strip())
                
                ners.add(mapped_name)
        ner_output.append(ners)

    return ner_output


df['ners'] = df['script'].apply(get_ners_inference)

In [62]:
df = df.head(10)

In [63]:
df


Unnamed: 0,season,episode,script,ners,ners_normalized,ners_filtered
0,1,1,"Oh, my god. Christ! Shit. Oh, God. Oh, my God....","[{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {...","[Walter White, Walter White, Ben, Lonic, Bogda...","[Walter White, Walter White, Bogdan, Walter Wh..."
1,1,2,"Are you okay? - You are a lifesaver. - Yeah, m...","[{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {...","[Walter White, Dan, We've, Lisa, Joan, Wonderb...","[Walter White, Skyler White, Walter White, Wal..."
2,1,3,<i>Let's break it down. </i> Hydrogen. What do...,"[{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {...","[Calcium, Walter White, Walter White, Marie Sc...","[Walter White, Walter White, Marie Schrader, M..."
3,1,4,Operation Icebreaker. - How we liking that? - ...,"[{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {...","[Gom, Domingo, a., Emilio, Walter White, Jesus...","[Emilio, Walter White, Jesus, Skyler White, Ha..."
4,1,5,"Here's my rsum. I mean, technically it says ""c...","[{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {...","[Jesse Pinkman, Walter White, Elliott, Elliott...","[Jesse Pinkman, Walter White, Elliott, Elliott..."
5,1,6,"Let's get something straight. This, the chemis...","[{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {...","[Walter White, Walter White, Elliott, i., Walt...","[Walter White, Walter White, Elliott, Walter W..."
6,1,7,"In closing, I can tell you we take this very s...","[{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {...","[Walter White, Walter White, Kjeldahl, Lex, Je...","[Walter White, Walter White, Tuco Salamanca, T..."
7,2,1,<i> Chemistry is the study of change. You unde...,"[{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {...","[Tuco Salamanca, Michvanilly, Tuco Salamanca, ...","[Tuco Salamanca, Tuco Salamanca, Walter White,..."
8,2,2,"Synch: Tyno, Michvanilly www.forom.com www.sub...","[{Michvanilly}, {}, {Tuco Salamanca}, {}, {}, ...","[Michvanilly, Tuco Salamanca, Tuco Salamanca, ...","[Tuco Salamanca, Tuco Salamanca, Tuco Salamanc..."
9,2,3,"It's a bold plan, Mr. White. You sure this is ...","[{Walter White}, {}, {Michvanilly}, {}, {}, {}...","[Walter White, Michvanilly, Walter White, Uncl...","[Walter White, Walter White, Marie Schrader, H..."


In [64]:
df['ners'] = df['script'].apply(get_ners_inference)

In [65]:
df

Unnamed: 0,season,episode,script,ners,ners_normalized,ners_filtered
0,1,1,"Oh, my god. Christ! Shit. Oh, God. Oh, my God....","[{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {...","[Walter White, Walter White, Ben, Lonic, Bogda...","[Walter White, Walter White, Bogdan, Walter Wh..."
1,1,2,"Are you okay? - You are a lifesaver. - Yeah, m...","[{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {...","[Walter White, Dan, We've, Lisa, Joan, Wonderb...","[Walter White, Skyler White, Walter White, Wal..."
2,1,3,<i>Let's break it down. </i> Hydrogen. What do...,"[{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {...","[Calcium, Walter White, Walter White, Marie Sc...","[Walter White, Walter White, Marie Schrader, M..."
3,1,4,Operation Icebreaker. - How we liking that? - ...,"[{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {...","[Gom, Domingo, a., Emilio, Walter White, Jesus...","[Emilio, Walter White, Jesus, Skyler White, Ha..."
4,1,5,"Here's my rsum. I mean, technically it says ""c...","[{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {...","[Jesse Pinkman, Walter White, Elliott, Elliott...","[Jesse Pinkman, Walter White, Elliott, Elliott..."
5,1,6,"Let's get something straight. This, the chemis...","[{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {...","[Walter White, Walter White, Elliott, i., Walt...","[Walter White, Walter White, Elliott, Walter W..."
6,1,7,"In closing, I can tell you we take this very s...","[{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {...","[Walter White, Walter White, Kjeldahl, Lex, Je...","[Walter White, Walter White, Tuco Salamanca, T..."
7,2,1,<i> Chemistry is the study of change. You unde...,"[{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {...","[Tuco Salamanca, Michvanilly, Tuco Salamanca, ...","[Tuco Salamanca, Tuco Salamanca, Walter White,..."
8,2,2,"Synch: Tyno, Michvanilly www.forom.com www.sub...","[{Michvanilly}, {}, {Tuco Salamanca}, {}, {}, ...","[Michvanilly, Tuco Salamanca, Tuco Salamanca, ...","[Tuco Salamanca, Tuco Salamanca, Tuco Salamanc..."
9,2,3,"It's a bold plan, Mr. White. You sure this is ...","[{Walter White}, {}, {Michvanilly}, {}, {}, {}...","[Walter White, Michvanilly, Walter White, Uncl...","[Walter White, Walter White, Marie Schrader, H..."


## character network

In [66]:
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
from pyvis.network import Network

In [None]:
def character_netowork_generator(df):
    windows = 10
    entity_relationship = []

    for row in df['ners']:
        previous_entities_in_window = []

        for sentence in row:
            # sentence is already a set of names, no need to wrap with list()
            current_entities = list(sentence)
            previous_entities_in_window.append(current_entities)
            previous_entities_in_window = previous_entities_in_window[-windows:]

            # Flatten 2D List into 1D List
            previous_entities_flattened = sum(previous_entities_in_window, [])

            for entity in current_entities:
                for entity_in_window in previous_entities_flattened:
                    if entity != entity_in_window:
                        entity_relationship.append(sorted([entity, entity_in_window]))

    relationship_df = pd.DataFrame({'value': entity_relationship})
    relationship_df['source'] = relationship_df['value'].apply(lambda x: x[0])
    relationship_df['target'] = relationship_df['value'].apply(lambda x: x[1])
    relationship_df = relationship_df.groupby(['source', 'target']).count().reset_index()
    relationship_df = relationship_df.sort_values('value', ascending=False)

    return relationship_df


In [70]:
relationship_df = generate_character_network(df)

In [71]:
relationship_df

Unnamed: 0,source,target,value
219,Skyler White,Walter White,22
80,Elliott,Walter White,20
122,Gonzo,Tuco,19
228,Tuco,Walter White,18
145,Hank Schrader,Walter White,16
...,...,...,...
240,Walter White,You're,1
242,Walter White,i. e.,1
241,Walter White,bin,1
243,Walter White,jack,1


In [74]:
relationship_df = relationship_df.sort_values('value', ascending=False)

In [75]:
G = nx.from_pandas_edgelist(
    relationship_df, 
    source='source', 
    target='target', 
    edge_attr='value',
    create_using=nx.Graph()
)

net = Network(notebook=True, width="1000px", height="700px", bgcolor="#222222", font_color="white", cdn_resources="remote")
node_degree = dict(G.degree)

nx.set_node_attributes(G, node_degree, 'size')
net.from_nx(G)
net.show("breaking_bad.html")

breaking_bad.html
