In [1]:
# Importing libraries
import pandas as pd
import gensim
from gensim.utils import simple_preprocess
import os
import string
from nltk.corpus import stopwords

In [2]:
# To show max columns and rows
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [3]:
# Function to remove punctuation
def remove_punc(text):
    return text.translate(str.maketrans('', '', string.punctuation))

In [4]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [5]:
# Function to remove Stop words
def remove_stopwords(text):
    new_text = []
    for word in text.split():
        if word in stopwords.words('english'):
            new_text.append('')
        else:
            new_text.append(word)
    x = new_text[:]
    new_text.clear()
    return " ".join(x)

In [6]:
# Iterating through each file in Harry potter data i.e. every novel and preprocessing every sentence. 
from nltk.tokenize import sent_tokenize
from gensim.utils import simple_preprocess
import os

story = []

# Iterate over each file in the folder
for filename in os.listdir('Harry Potter data'):
    # Open the file with the specified encoding
    with open(os.path.join('Harry Potter data', filename), 'r', encoding='latin1') as f:
        # Read the content of the file
        corpus = f.read()
        # Tokenize the text into sentences
        raw_sent = sent_tokenize(corpus)
        # Process each sentence as needed
        for sent in raw_sent:
            # Preprocess the sentence and append it to the story list
            sent = remove_stopwords(sent)
            sent = remove_punc(sent)
            story.append(simple_preprocess(sent))

In [7]:
# Library to train word2vec model
from gensim.utils import simple_preprocess
from gensim.models import word2vec

In [8]:
model = gensim.models.Word2Vec(vector_size=100, window=10, min_count = 1)

In [9]:
# Building Vocabulary
model.build_vocab(story)

In [10]:
# Training model
model.train(story, total_examples=model.corpus_count, epochs=model.epochs)

(3028349, 3263415)

## Asking model questions based on Story to see how accurate it is

In [11]:
# Who is different among them
model.wv.doesnt_match(['fred', 'ron', 'ginny', 'harry'])

'harry'

In [12]:
# Vector representation of Harry
model.wv['harry']

array([-1.3807906 , -0.87840754, -0.10859574, -0.7069842 , -1.0132128 ,
       -0.96100277, -0.04678435,  1.5401655 , -0.55400985, -0.5306542 ,
        0.8713431 , -0.29431477, -0.42856765, -0.07665351, -0.57377625,
       -0.6327165 , -0.1821947 ,  0.3719117 , -1.1960412 , -0.54509777,
        0.85018265,  0.91295433,  1.3300729 ,  0.49912092,  0.47307768,
       -1.4431894 ,  0.84249985, -0.070495  , -0.6052108 , -1.5551046 ,
        0.20113248,  0.37834695,  0.32277176, -0.7359194 , -0.7649109 ,
       -0.00401795,  0.30872506, -0.8950192 , -1.0432777 ,  0.0287376 ,
        0.43341905,  0.29970556, -1.2587788 , -0.13105097,  1.346365  ,
        0.08186484, -0.8972897 ,  1.2924412 , -0.41959348,  0.8722393 ,
       -0.41030177, -0.92061484, -0.0070767 ,  0.47849327, -0.6817308 ,
        0.02659691,  1.0542881 , -0.66538036, -0.9599943 ,  0.9026554 ,
        1.2595073 ,  0.5580195 ,  0.66046774, -0.7259993 , -0.21638218,
        0.8048429 ,  0.09437727,  1.353998  , -1.4428216 ,  0.56

In [13]:
# Words highly associated with Harry
model.wv.most_similar('harry')

[('he', 0.8366943597793579),
 ('her', 0.7732561230659485),
 ('again', 0.7491076588630676),
 ('she', 0.7474890947341919),
 ('moment', 0.7446354031562805),
 ('hermione', 0.7361045479774475),
 ('balefully', 0.7351807951927185),
 ('him', 0.7240976095199585),
 ('skeptical', 0.7202335000038147),
 ('wideeyed', 0.714824914932251)]

In [14]:
# Checking smilarity between 2 words
model.wv.similarity('ginny','weasley')

0.38190442

In [15]:
# Words highly associated with muggles
model.wv.most_similar('muggles')

[('friends', 0.9839052557945251),
 ('choose', 0.9818298816680908),
 ('learning', 0.9807787537574768),
 ('fer', 0.9803354740142822),
 ('weekends', 0.9789878129959106),
 ('realise', 0.9786287546157837),
 ('azkaban', 0.9779224395751953),
 ('information', 0.9775538444519043),
 ('protection', 0.9774662852287292),
 ('moren', 0.9757421016693115)]

In [16]:
# model.wv.get_normed_vectors()

In [17]:
# Converting  model back to word/key
y = model.wv.index_to_key

In [18]:
y

['harry',
 'said',
 'he',
 'ron',
 'it',
 'hermione',
 'the',
 'you',
 'back',
 'him',
 'dumbledore',
 'could',
 'one',
 'like',
 'looked',
 'and',
 'would',
 'know',
 'well',
 'around',
 'but',
 'got',
 'professor',
 'see',
 'though',
 'there',
 'hagrid',
 'think',
 'get',
 'still',
 'time',
 'them',
 'they',
 'eyes',
 'looking',
 'wand',
 'right',
 'snape',
 'face',
 'weasley',
 'going',
 'what',
 'voice',
 'look',
 'go',
 'again',
 'room',
 'come',
 'she',
 'harrys',
 'head',
 'now',
 'door',
 'thought',
 'mr',
 'saw',
 'no',
 'then',
 'im',
 'something',
 'that',
 'malfoy',
 'behind',
 'me',
 'never',
 'hand',
 'seemed',
 'away',
 'way',
 'asked',
 'told',
 'potter',
 'turned',
 'last',
 'two',
 'much',
 'hes',
 'dark',
 'us',
 'little',
 'knew',
 'sirius',
 'long',
 'even',
 'her',
 'want',
 'this',
 'oh',
 'voldemort',
 'tell',
 'good',
 'all',
 'first',
 'made',
 'heard',
 'so',
 'people',
 'we',
 'left',
 'really',
 'felt',
 'up',
 'mrs',
 'yes',
 'say',
 'fred',
 'moment',
 'b