In [1]:
import pandas as pd
import spacy
from spacy import displacy
import re
import time

In [2]:
! python -m spacy download en_core_web_sm -q

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [3]:
character_df = pd.read_csv('data/character_df_cleaned.csv')

In [4]:
def remane_names(name):
    if name == 'Walter White Jr.':
        return re.sub(name, 'Walt Jr.', name)
    return name

In [5]:
character_df.Characters = character_df.Characters.apply(remane_names)
character_df['Characters_firstname'] = character_df.Characters.apply(lambda x : re.sub(r"'s", '', x.split()[0]))
character_df['Characters_lastname'] = character_df.Characters.apply(lambda x : re.sub(r"'s", '', x.split()[-1]))

character_df.head()

Unnamed: 0,Season,Characters,Characters_firstname,Characters_lastname
0,Season_1,Walter White,Walter,White
1,Season_1,Skyler White,Skyler,White
2,Season_1,Jesse Pinkman,Jesse,Pinkman
3,Season_1,Hank Schrader,Hank,Schrader
4,Season_1,Marie Schrader,Marie,Schrader


In [6]:
NER = spacy.load('en_core_web_sm')

In [7]:
with open('data/summaries/Season_5B.txt', 'r') as f:
    sea1_txt = f.read()

In [8]:
sea1_txt = sea1_txt.replace('\n', '\n. ')

In [9]:
ner1 = NER(sea1_txt)

In [10]:
# https://stackoverflow.com/a/51219483
# https://nanonets.com/blog/named-entity-recognition-with-nltk-and-spacy/ : For NLTK

displacy.render(ner1[:500], style = 'ent', jupyter = True)

In [11]:
entity_df = []

for sentence in ner1.sents:
    entity_lis = [ent.text for ent in sentence.ents]
    entity_df.append({'sentence' : sentence, 'entities' : entity_lis})
    
entity_df = pd.DataFrame(entity_df)
entity_df.head()

Unnamed: 0,sentence,entities
0,"(In, a, flashforward, ,, following, his, purch...","[M60, Walt]"
1,"(He, grabs, a, tire, iron, from, the, trunk, o...",[]
2,"(Trash, and, graffiti, are, everywhere, ;, a, ...",[]
3,"(Walt, notices, a, single, ,, spray, -, painte...",[Walt]
4,"(He, reaches, the, bedroom, and, retrieves, th...",[]


In [12]:
character_df.head()

Unnamed: 0,Season,Characters,Characters_firstname,Characters_lastname
0,Season_1,Walter White,Walter,White
1,Season_1,Skyler White,Skyler,White
2,Season_1,Jesse Pinkman,Jesse,Pinkman
3,Season_1,Hank Schrader,Hank,Schrader
4,Season_1,Marie Schrader,Marie,Schrader


In [13]:
def filter_entities(entity_list, char_df):
    
    return [entity for entity in entity_list
            if entity in list(char_df.Characters)
            or entity in list(char_df.Characters_firstname)
            or entity in list(char_df.Characters_lastname)]

In [14]:
entity_df['char_entities'] = entity_df.entities.apply(lambda x: filter_entities(x, character_df))
entity_df.head()

Unnamed: 0,sentence,entities,char_entities
0,"(In, a, flashforward, ,, following, his, purch...","[M60, Walt]",[Walt]
1,"(He, grabs, a, tire, iron, from, the, trunk, o...",[],[]
2,"(Trash, and, graffiti, are, everywhere, ;, a, ...",[],[]
3,"(Walt, notices, a, single, ,, spray, -, painte...",[Walt],[Walt]
4,"(He, reaches, the, bedroom, and, retrieves, th...",[],[]


In [15]:
entity_df = entity_df[entity_df['char_entities'].map(len) > 0]
entity_df.reset_index(inplace = True, drop = True)
entity_df.head()

Unnamed: 0,sentence,entities,char_entities
0,"(In, a, flashforward, ,, following, his, purch...","[M60, Walt]",[Walt]
1,"(Walt, notices, a, single, ,, spray, -, painte...",[Walt],[Walt]
2,"(After, exiting, the, house, ,, Walt, notices,...","[Walt, Carol]","[Walt, Carol]"
3,"(Hank, suffers, another, panic, attack, \n, .)",[Hank],[Hank]
4,"(A, stunned, Hank, leaves, the, bathroom, afte...","[Hank, Walt, Walt Whitman's, Grass]","[Hank, Walt]"


In [16]:
# https://stackoverflow.com/a/716489

for idx in range(entity_df.shape[0]):
    e_idx = min(idx+5, entity_df.shape[0])
    char_list = sum(entity_df.loc[idx:e_idx].char_entities, [])
    
    if idx % 220 == 0:
        print(char_list)

['Walt', 'Walt', 'Walt', 'Carol', 'Hank', 'Hank', 'Walt', 'Marie', 'Marie']
['Jesse', 'Brock', 'Jesse', 'Jesse', 'Walt', 'Jesse', 'Walt', 'Walt', 'Jack', 'Jesse']
['Lydia', 'Walt', 'Jesse', 'Walt', 'Jesse', 'Todd', 'Walt', 'Walt']


In [17]:
relationship = []

for idx in range(entity_df.shape[0]):
    e_idx = min(idx+5, entity_df.shape[0])
    char_list = sum(entity_df.loc[idx:e_idx].char_entities, [])

    unique_char = [char_list[i] for i in range(len(char_list)) if char_list[i] != char_list[i-1]]
    
    if idx % 220 == 0:
        print(char_list)
        print(unique_char)
        print('\n')

    if len(unique_char) > 1:
        for idx, a in enumerate(unique_char[:-1]):
            b = unique_char[idx + 1]
            relationship.append({'Source' : a, 'Destination' : b})

['Walt', 'Walt', 'Walt', 'Carol', 'Hank', 'Hank', 'Walt', 'Marie', 'Marie']
['Walt', 'Carol', 'Hank', 'Walt', 'Marie']


['Jesse', 'Brock', 'Jesse', 'Jesse', 'Walt', 'Jesse', 'Walt', 'Walt', 'Jack', 'Jesse']
['Brock', 'Jesse', 'Walt', 'Jesse', 'Walt', 'Jack', 'Jesse']


['Lydia', 'Walt', 'Jesse', 'Walt', 'Jesse', 'Todd', 'Walt', 'Walt']
['Lydia', 'Walt', 'Jesse', 'Walt', 'Jesse', 'Todd', 'Walt']




In [18]:
relationship_df = pd.DataFrame(relationship)
relationship_df.head(50)

Unnamed: 0,Source,Destination
0,Walt,Carol
1,Carol,Hank
2,Hank,Walt
3,Walt,Marie
4,Walt,Carol
5,Carol,Hank
6,Hank,Walt
7,Walt,Marie
8,Marie,Hank
9,Walt,Carol
