In [254]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')
!python -m spacy download en

[nltk_data] Downloading package punkt to
[nltk_data]     /home/diegofvargas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/diegofvargas/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.



[93m    Linking successful[0m
    /home/diegofvargas/anaconda3/lib/python3.7/site-packages/en_core_web_sm
    -->
    /home/diegofvargas/anaconda3/lib/python3.7/site-packages/spacy/data/en

    You can now load the model via spacy.load('en')



In [225]:
# Utility function for standard text cleaning.
def text_cleaner(text):
    # Remove the location of each scene, they're in between parenthesis
    text = re.sub(r'[\([a-zA-Z]*.[a-zA-Z]*\)','',raw)
    #text = ' '.join(text.split())
    return text
    
# Load and clean the data.
The_Matrix = open('The Matrix Script.txt','r') 
raw=The_Matrix.read()
The_Matrix.close()
script = text_cleaner(raw)
tokens = nltk.word_tokenize(script)
text = nltk.Text(tokens)

In [228]:
script.splitlines()

['',
 '',
 'Cypher: Yeah.',
 'Trinity: Is everything in place?',
 "Cypher: You weren't supposed to relieve me.",
 'Trinity: I know, but I want to take your shift.',
 "Cypher: You like watching him, don't you? ",
 'Cypher: You like watching him.',
 "Trinity: Don't be ridiculous.",
 "Cypher: We're going to kill him, do you understand that?",
 'Trinity: Morpheus believes he is the one.',
 'Cypher: Do you?',
 "Trinity: It doesn't matter what I believe.",
 "Cypher: You don't, do you?",
 'Trinity: Did you hear that?',
 'Cypher: Hear what?',
 'Trinity: Are you sure this line is clean?',
 "Cypher: Yeah, of course I'm sure.",
 'Trinity: I better go.',
 '',
 '',
 'Cop: Freeze, Police. ',
 'Cop: Hands on your head. ',
 'Cop: Do it, Do it now.',
 '',
 '',
 'Agent Smith: Lieutenant',
 'Lieutenant: Oh shit.',
 'Agent Smith: Lieutenant, you were given specific orders.',
 "Lieutenant: Hey, I'm just doing my job. ",
 'Lieutenant: You give me juris- my diction crap, you can cram it up your ass.',
 'Agen

In [235]:
characters = []
sentences = []
for line in script.splitlines():
    if ':' in line:
        characters.append(line.split(':')[0])
        sentences.append(line.split(':')[1])

In [248]:
script_df = pd.DataFrame(np.column_stack([characters, sentences]), columns = ['character','sentences']) 

In [262]:
script_df.head()

Unnamed: 0,character,sentences,clean_sentences
0,Cypher,Yeah.,Yeah.
1,Trinity,Is everything in place?,Is everything place?
2,Cypher,You weren't supposed to relieve me.,You supposed relieve me.
3,Trinity,"I know, but I want to take your shift.","I know, I want take shift."
4,Cypher,"You like watching him, don't you?","You like watching him, you?"


In [266]:
print(script_df['character'].unique())
print(len(script_df['character'].unique()))

['Cypher' 'Trinity' 'Cop' 'Agent Smith' 'Lieutenant' 'Morpheus'
 'Agent Brown' 'Agent Jones' 'Neo' 'Choi' 'DuJour' 'Mr. Rhineheart'
 'FedEx man' 'Switch' 'Apoc' 'Dozer' 'Tank' 'Mouse' 'Priestess'
 'Spoon boy' 'Oracle' 'Police' 'Guard 1' 'Guard 2' 'Soldier' 'Pilot' 'Man'
 'The One']
28


In [256]:
stop_words = stopwords.words('english')

# tokenization
tokenized_script = script_df['sentences'].apply(lambda x: x.split())

# remove stop-words
tokenized_script = tokenized_script.apply(lambda x: [item for item in x if item not in stop_words])

# de-tokenization
detokenized_script = []
for i in range(len(script_df)):
    t = ' '.join(tokenized_script[i])
    detokenized_script.append(t)

script_df['clean_sentences'] = detokenized_script

In [258]:
script_df.head()

Unnamed: 0,character,sentences,clean_sentences
0,Cypher,Yeah.,Yeah.
1,Trinity,Is everything in place?,Is everything place?
2,Cypher,You weren't supposed to relieve me.,You supposed relieve me.
3,Trinity,"I know, but I want to take your shift.","I know, I want take shift."
4,Cypher,"You like watching him, don't you?","You like watching him, you?"


In [261]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english', 
max_features= 5000,
max_df = 0.5, 
smooth_idf=True)

X = vectorizer.fit_transform(script_df['clean_sentences'])

X.shape # check shape of the document-term matrix

(630, 1081)

In [280]:
from sklearn.decomposition import TruncatedSVD

# SVD represent terms in vectors 
svd_model = TruncatedSVD(n_components=28, algorithm='randomized', n_iter=1000, random_state=101)

svd_model.fit(X)

len(svd_model.components_)

28

In [281]:
terms = vectorizer.get_feature_names()

for i, comp in enumerate(svd_model.components_):
    terms_comp = zip(terms, comp)
    sorted_terms = sorted(terms_comp, key= lambda x:x[1], reverse=True)[:20]
    print("Character "+str(i)+": ")
    for t in sorted_terms:
        print(t[0])

Character 0: 
know
neo
morpheus
going
want
right
tell
trinity
hello
does
like
come
trying
matrix
looking
time
ve
told
ready
world
Character 1: 
neo
morpheus
come
hello
right
oracle
watching
like
easy
hurry
true
ready
fighting
matters
trust
yes
time
looking
tank
sacrificed
Character 2: 
morpheus
trinity
going
oracle
believe
don
alive
tank
believed
make
believes
told
got
ready
zion
gave
convinced
happened
sure
place
Character 3: 
trinity
believe
hello
help
real
focus
neo
matrix
hit
base
cracked
irs
right
ve
world
worry
oracle
yes
think
make
Character 4: 
yes
yeah
hell
right
beginning
old
mr
elevator
slowly
going
want
clear
perfectly
rhineheart
matrix
mean
tell
believe
time
anderson
Character 5: 
oh
shit
right
going
believe
matrix
tell
god
got
yeah
tank
cypher
come
need
ll
think
time
real
good
mr
Character 6: 
oh
shit
god
neo
yes
morpheus
lost
trinity
know
help
insane
happening
hello
phone
crick
squiddies
gonna
burn
ve
damn
Character 7: 
right
oh
shit
morpheus
trinity
hope
god
know
ll
yes