<a href="https://colab.research.google.com/github/jackychh7878/Colab_AI_Project/blob/main/Word2Vec_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Training Word2Vec Model

In this project, I'm going to build a Word2Vec model using the game of thrones books

Data Source: https://www.kaggle.com/datasets/khulasasndh/game-of-thrones-books

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import nltk
import gensim
nltk.download("stopwords")
nltk.download('punkt')
nltk.download('punkt_tab')

from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from gensim.utils import simple_preprocess

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
dir="/content/data"
os.listdir(dir)

['002ssb.txt', '001ssb.txt', '003ssb.txt', '005ssb.txt', '004ssb.txt']

In [None]:
filename="004ssb.txt"
os.path.join(dir, filename)

'/content/data/004ssb.txt'

# Step 1: Preprocessing the sentence into a list of tokens

In [None]:
stop_words = stopwords.words("english")

def preprocess(sent):
  words=word_tokenize(sent)
  words=[word for word in words if word not in stop_words]
  sent=" ".join(words)
  return sent

In [None]:
story = []
for filename in os.listdir(dir):
  f = open(os.path.join(dir, filename), mode="r", encoding="latin-1")
  corpus = f.read()
  sents=sent_tokenize(corpus)
  for sent in sents:
    sent=preprocess(sent)
    story.append(simple_preprocess(sent)) #doc into a list of tokens.

In [None]:
story

[['clash',
  'kings',
  'book',
  'two',
  'song',
  'ice',
  'fire',
  'by',
  'george',
  'martin',
  'prologue',
  'the',
  'comet',
  'tail',
  'spread',
  'across',
  'dawn',
  'red',
  'slash',
  'bled',
  'crags',
  'dragonstone',
  'like',
  'wound',
  'pink',
  'purple',
  'sky'],
 ['the', 'maester', 'stood', 'windswept', 'balcony', 'outside', 'chambers'],
 ['it', 'ravens', 'came', 'long', 'flight'],
 ['their',
  'droppings',
  'speckled',
  'gargoyles',
  'rose',
  'twelve',
  'feet',
  'tall',
  'either',
  'side',
  'hellhound',
  'wyvern',
  'two',
  'thousand',
  'brooded',
  'walls',
  'ancient',
  'fortress'],
 ['when',
  'first',
  'came',
  'dragonstone',
  'army',
  'stone',
  'grotesques',
  'made',
  'uneasy',
  'years',
  'passed',
  'grown',
  'used'],
 ['now', 'thought', 'old', 'friends'],
 ['the', 'three', 'watched', 'sky', 'together', 'foreboding'],
 ['the', 'maester', 'believe', 'omens'],
 ['and', 'yet'],
 [],
 [],
 ['old',
  'cressen',
  'never',
  'seen',
 

# Step 2: Train the model

In [None]:
model=gensim.models.Word2Vec(
    window = 7, # number of words in a sequence
    min_count = 2, # Minimum number of allowed in a doc
    workers = 8,
    vector_size = 200, # 100 by default
    epochs = 10
)

In [None]:
model.build_vocab(story)

In [None]:
model.train(story, total_examples=model.corpus_count, epochs=model.epochs)

(9619091, 10171670)

Number of corpus trained is the same as the len of story

In [None]:
model.corpus_count

145020

In [None]:
len(story)

145020

In [None]:
model.epochs

10

# Step 3: Test the model

In [None]:
# Aray and Sansa are sisters
model.wv.similarity("arya","sansa")

0.7120205

In [None]:
# Daenerys is the queen
model.wv.similarity("queen","daenerys")

0.51908207

In [None]:
# Check the most similar words to the word King
model.wv.most_similar("king")

[('baratheon', 0.6106770038604736),
 ('realm', 0.5637972950935364),
 ('royal', 0.5213494300842285),
 ('tourney', 0.5060192346572876),
 ('beholden', 0.496696799993515),
 ('conciliator', 0.4957672953605652),
 ('usurper', 0.49108681082725525),
 ('nage', 0.48472994565963745),
 ('defiled', 0.4735596179962158),
 ('conqueror', 0.4723212718963623)]

In [None]:
# Check the does not match word
model.wv.doesnt_match(['jon', 'robb', 'arya', 'sansa', 'bran'])

'jon'