### Named Entity Recognition

In [1]:
import spacy
from nltk import sent_tokenize

In [2]:
!python -m spacy download en_core_web_trf

Collecting en-core-web-trf==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.8.0/en_core_web_trf-3.8.0-py3-none-any.whl (457.4 MB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m457.4/457.4 MB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:02[0m
[38;5;2m‚úî Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_trf')


### Load the model

In [69]:
def load_model():
    """
    Load the spaCy transformer model.
    """
    nlp = spacy.load("en_core_web_trf")
    return nlp

In [70]:
nlp_model = load_model()

### Load the datatset

In [71]:
import os 
import sys
import pathlib
folder_path = pathlib.Path().parent.resolve()
sys.path.append(os.path.join(folder_path, '../'))
from utils import load_subtitles_dataset

In [72]:
dataset_path = '../data/Subtitles/'
df = load_subtitles_dataset(dataset_path)

In [73]:
df.head()

Unnamed: 0,episode,script
0,94,We are Fighting Dreamers aiming high\n Fightin...
1,80,We are Fighting Dreamers aiming high\n Fightin...
2,32,"Press down hard on the gas\n That‚Äôs right, the..."
3,185,"Rock away your existence,\n Shouting that you ..."
4,191,"Rock away your existence,\n Shouting that you ..."


In [74]:
sample_script = df.iloc[0]['script']
sample_script

'We are Fighting Dreamers aiming high\n Fighting Dreamers don\'t care what people think about them\n Fighting Dreamers follow what they believe\n Oli Oli Oli Oh! Just go my way\n Right here right now (Bang) Hit it straight like a line drive!\n Right here right now (Burn)\n Down a difficult road filled with endless struggles\n Where do you think you are going following someone else\'s map?\n An insightful crow comes along to tear up the map\n Now open your eyes and take a look at the truth (Yeah!)\n There\'s nothing to lose, so let\'s GO!!!\n We are Fighting Dreamers aiming high\n Fighting Dreamers don\'t care what people think about them\n Fighting Dreamers follow what they believe\n Oli Oli Oli Oh! Just go my way\n Right here right now (Bang) Hit it straight like a line drive!\n Right here right now (Burn) We\'re gonna do it and do our best!\n Right here right now (Bang) Hit it straight like a line drive!\n Right here right now (Burn) We\'re gonna do it and do our best! BANG!\n My bod

In [75]:
sentences = sent_tokenize(sample_script)
sentences[60:90]

['To think I‚Äôd already have my handicap spotted.',
 'You are such a disgraceful guy as always‚Ä¶\n Dang it.',
 'It kind of ticks me off when the Pervy Sage is mocked.',
 'Summoning‚Ä¶Jutsu!',
 'The Nine-Tailed Fox kid from the Chunin Exam.',
 'I wonder if I should‚Äôve killed him then, after all.',
 'For the sake of the Akatsuki organization,\n I acknowledged his achievement‚Ä¶ and let him slide by.',
 'But the only ones who can break a Five-Pronged Spell\n are the Sannin members and the Third Hokage.',
 'Which means Jiraiya may have already taught it to Naruto.',
 'Perhaps.',
 'If he‚Äôs able to control the Nine-Tailed Fox‚Äôs power‚Ä¶\n Oh‚Ä¶\n Th-This is‚Ä¶\n It doesn‚Äôt quite look to be so.',
 'He isn‚Äôt endowed with Shinobi ability to begin with, so‚Ä¶\n Why?',
 'He‚Äôs still got a long way to go, sure enough.',
 'Why?!',
 'Hello.',
 'Hey, Gamatatsu.',
 'Why did you come out?',
 'Oh, Brother Gamakichi.',
 'It‚Äôs the first time I‚Äôve been Summoned.',
 'I-I wonder if I can do 

In [76]:
sentence = ".".join(sentences[60:90])
sentence

'To think I‚Äôd already have my handicap spotted..You are such a disgraceful guy as always‚Ä¶\n Dang it..It kind of ticks me off when the Pervy Sage is mocked..Summoning‚Ä¶Jutsu!.The Nine-Tailed Fox kid from the Chunin Exam..I wonder if I should‚Äôve killed him then, after all..For the sake of the Akatsuki organization,\n I acknowledged his achievement‚Ä¶ and let him slide by..But the only ones who can break a Five-Pronged Spell\n are the Sannin members and the Third Hokage..Which means Jiraiya may have already taught it to Naruto..Perhaps..If he‚Äôs able to control the Nine-Tailed Fox‚Äôs power‚Ä¶\n Oh‚Ä¶\n Th-This is‚Ä¶\n It doesn‚Äôt quite look to be so..He isn‚Äôt endowed with Shinobi ability to begin with, so‚Ä¶\n Why?.He‚Äôs still got a long way to go, sure enough..Why?!.Hello..Hey, Gamatatsu..Why did you come out?.Oh, Brother Gamakichi..It‚Äôs the first time I‚Äôve been Summoned..I-I wonder if I can do my best?.Idiot..Get some snacks or something and hide..What?.I can get snacks

### Running the model

In [77]:
doc = nlp_model(sentence)
doc.ents

(the Pervy Sage,
 the Chunin Exam,
 Akatsuki,
 Five,
 Sannin,
 Third,
 Jiraiya,
 Naruto,
 Shinobi,
 Gamatatsu,
 Gamakichi,
 first,
 Jiraiya,
 Jiraiya,
 Sannin,
 Naruto)

In [78]:
for entity in doc.ents:
    print(entity.text, entity.label_)

the Pervy Sage PERSON
the Chunin Exam EVENT
Akatsuki ORG
Five CARDINAL
Sannin NORP
Third ORDINAL
Jiraiya PERSON
Naruto PERSON
Shinobi NORP
Gamatatsu PERSON
Gamakichi PERSON
first ORDINAL
Jiraiya PERSON
Jiraiya PERSON
Sannin NORP
Naruto PERSON


In [79]:
def get_ners_inference(script):
    script_sentences = sent_tokenize(script)
    
    ner_output = []
    for sentence in script_sentences:
        doc = nlp_model(sentence)
        ners = set()
        for entity in doc.ents:
            if entity.label_ == 'PERSON':
                full_name = entity.text
                first_name = full_name.split(" ")[0]
                first_name = first_name.strip()
                ners.add(first_name)
        ner_output.append(ners)
        
    return ner_output

In [80]:
df =df.head(10)
df

Unnamed: 0,episode,script
0,94,We are Fighting Dreamers aiming high\n Fightin...
1,80,We are Fighting Dreamers aiming high\n Fightin...
2,32,"Press down hard on the gas\n That‚Äôs right, the..."
3,185,"Rock away your existence,\n Shouting that you ..."
4,191,"Rock away your existence,\n Shouting that you ..."
5,190,"Rock away your existence,\n Shouting that you ..."
6,184,"Rock away your existence,\n Shouting that you ..."
7,27,"Press down hard on the gas\n That‚Äôs right, the..."
8,33,"Press down hard on the gas\n That‚Äôs right, the..."
9,81,We are Fighting Dreamers aiming high\n Fightin...


In [81]:
df['ners'] = df['script'].apply(get_ners_inference)

In [82]:
df

Unnamed: 0,episode,script,ners
0,94,We are Fighting Dreamers aiming high\n Fightin...,"[{Oli}, {}, {}, {}, {}, {Oli}, {}, {Burn}, {},..."
1,80,We are Fighting Dreamers aiming high\n Fightin...,"[{Oli}, {}, {}, {}, {}, {Oli}, {}, {Burn}, {},..."
2,32,"Press down hard on the gas\n That‚Äôs right, the...","[{}, {}, {}, {}, {}, {}, {Lee}, {}, {}, {}, {}..."
3,185,"Rock away your existence,\n Shouting that you ...","[{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {..."
4,191,"Rock away your existence,\n Shouting that you ...","[{}, {}, {}, {}, {}, {}, {}, {}, {}, {Hinata},..."
5,190,"Rock away your existence,\n Shouting that you ...","[{}, {}, {}, {}, {}, {}, {}, {Naruto}, {}, {},..."
6,184,"Rock away your existence,\n Shouting that you ...","[{}, {}, {}, {}, {}, {Akamaru}, {}, {}, {Kiba}..."
7,27,"Press down hard on the gas\n That‚Äôs right, the...","[{}, {}, {}, {}, {}, {}, {}, {}, {Naruto}, {},..."
8,33,"Press down hard on the gas\n That‚Äôs right, the...","[{}, {}, {}, {}, {}, {}, {}, {}, {}, {Sasuke},..."
9,81,We are Fighting Dreamers aiming high\n Fightin...,"[{Oli}, {}, {}, {}, {}, {Oli}, {}, {Burn}, {},..."


### Character network

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
from pyvis.network import Network


In [84]:
def geenerate_character_network(df):
    
    windows=10
    entity_relationships = []
    for row in df['ners']:
        previous_entities_in_window = []
        for sentences in row:
            previous_entities_in_window.append(list(sentences))
            previous_entities_in_window = previous_entities_in_window[-windows:]
            
            #Flatten 2D list into a 1D list
            previous_entities_flattened= sum(previous_entities_in_window, [])
            
            for entity in sentences:
                for entity_in_window in previous_entities_flattened:
                    if entity != entity_in_window:
                        entity_relationships.append(sorted([entity, entity_in_window]))
                        
    relationship_df = pd.DataFrame({'value':entity_relationships})
    relationship_df['source'] = relationship_df['value'].apply(lambda x: x[0])
    relationship_df['target'] = relationship_df['value'].apply(lambda x: x[1])
    relationship_df = relationship_df.groupby(['source', 'target']).count().reset_index()
    relationship_df = relationship_df.sort_values(by='value', ascending=False)
    
    return relationship_df

In [85]:
relationship_df = geenerate_character_network(df)
relationship_df

Unnamed: 0,source,target,value
2,Akamaru,Kiba,121
97,Hinata,Naruto,76
147,Ino,Sakura,40
297,Sakura,Sasuke,34
3,Akamaru,Naruto,32
...,...,...,...
99,Hinata,Sakura,1
207,Kakashi,Shadow,1
208,Kakashi,Shinobi,1
209,Kakashi,Tsunade,1


In [86]:
relationship_df = relationship_df.sort_values(by='value', ascending=False) # just making shure üòÇ
relationship_df = relationship_df.head(200)
relationship_df

Unnamed: 0,source,target,value
2,Akamaru,Kiba,121
97,Hinata,Naruto,76
147,Ino,Sakura,40
297,Sakura,Sasuke,34
3,Akamaru,Naruto,32
...,...,...,...
110,Hokage,Kakashi,1
164,Janin,The,1
109,Hokage,Kages,1
108,Hokage,Kabuto,1


In [87]:
G = nx.from_pandas_edgelist(relationship_df, source='source', target='target', edge_attr='value', create_using=nx.DiGraph())
net = Network(notebook=True, height='800px', width='100%', bgcolor='#222222', font_color='white', cdn_resources='remote')
node_degree = dict(G.degree())
nx.set_node_attributes(G, node_degree, 'size')
net.from_nx(G)
net.show('naruto.html')

naruto.html
