# Character Shifts in Harry Potter Fanfics

# Creation of Word Embedding Models

### Last updated: 19.01.2022

### 1. Required Libraries

In [1]:
import glob
import os
import nltk
import spacy
!python -m spacy download de_core_news_lg
nlp = spacy.load('de_core_news_lg',exclude=["ner"],disable=["tagger","parser"])
import string
import csv
from gensim.models import Word2Vec
import pickle

Collecting de-core-news-lg==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_lg-3.2.0/de_core_news_lg-3.2.0-py3-none-any.whl (572.3 MB)
[+] Download and installation successful
You can now load the package via spacy.load('de_core_news_lg')


You should consider upgrading via the 'C:\Users\LitLab\anaconda3\python.exe -m pip install --upgrade pip' command.


In [2]:
path_models = r'Z:\Fanfiction\HP_Character-Distribution\pamphlet_character_shifts\results\vector_models'
path_pickled = r'Z:\Fanfiction\HP_Character-Distribution\pamphlet_character_shifts\results\pickled'

### 2. Read in data

In [3]:
with open(path_pickled + '\\corpusHPoriginals_words.pkl', 'rb') as f:
    corpusHPoriginals = pickle.load(f)

In [4]:
with open(path_pickled + '\\corpusHPFFs_words.pkl', 'rb') as f:
    corpusHPFFs = pickle.load(f)

### 3. Preprocessing

already covered by "generaliseEntities":
- removal paragraph markers
- lowercase the corpus
- generalising entities

done here:
- merge texts in corpus into two large lists
- clean data (remove quotation marks)
- tokenizing sentences
- lemmatising
- removing punctuation (no "_"!!!)
- saving files

In [19]:
# merge texts in corpus into two large lists

corpusHPoriginals_joined = ' '.join(corpusHPoriginals)

corpusHPFFs_joined = ' '.join(corpusHPFFs)

In [20]:
# clean data (remove quotation marks)

corpusHPoriginals_clean1 = corpusHPoriginals_joined.replace('“', ' ')

corpusHPoriginals_clean2 = corpusHPoriginals_clean1.replace('„', ' ')

In [13]:
corpusHPFFs_clean1 = corpusHPFFs_joined.replace('“', ' ')

corpusHPFFs_clean2 = corpusHPFFs_clean1.replace('„', ' ')

In [21]:
# tokenizing sentences

corpusHPoriginals_sentences = nltk.sent_tokenize(corpusHPoriginals_clean2, language='german')

In [None]:
corpusHPFFs_sentences = nltk.sent_tokenize(corpusHPFFs_clean2, language='german')

In [None]:
# divide HPFFs corpus into more manageable chunks

len(corpusHPFFs_sentences)

In [None]:
corpusHPFFs_sentences1 = corpusHPFFs_sentences[:2776735]

In [None]:
corpusHPFFs_sentences2 = corpusHPFFs_sentences[2776735:5553470]

In [None]:
corpusHPFFs_sentences3 = corpusHPFFs_sentences[5553470:]

In [22]:
# lemmatising

# books

corpusHPoriginals_lemmatized = [0]*len(corpusHPoriginals_sentences)

for i in range(0, len(corpusHPoriginals_sentences)):
    words = nlp(corpusHPoriginals_sentences[i])
    interim = [0]*len(words)
    for j in range(0, len(interim)):
        interim[j] = words[j].lemma_
    corpusHPoriginals_lemmatized[i] = interim 

In [23]:
corpusHPoriginals_lemmatized[:10]

[['einen',
  'junge',
  'überleben',
  '   ',
  'mr',
  'und',
  'PETUNIA_DURSLEY',
  'im',
  'ligusterweg',
  'nummer',
  '4',
  'sein',
  'stolz',
  'darauf',
  ',',
  'ganz',
  'und',
  'gar',
  'normal',
  'zu',
  'mein',
  ',',
  'sehr',
  'stolz',
  'sogar',
  '.'],
 ['niemand',
  'sein',
  'auf',
  'der',
  'idee',
  'kommen',
  ',',
  'ich',
  'können',
  'sich',
  'in',
  'einen',
  'merkwürdig',
  'und',
  'geheimnisvolle',
  'geschichte',
  'verstricken',
  ',',
  'denn',
  'mit',
  'solch',
  'unsinn',
  'wollen',
  'ich',
  'nichts',
  'zu',
  'tun',
  'haben',
  '.'],
 ['VERNON_DURSLEY',
  'sein',
  'direktor',
  'einer',
  'firma',
  'namens',
  'grunnings',
  ',',
  'der',
  'bohrmaschinen',
  'herstellen',
  '.'],
 ['ich',
  'sein',
  'groß',
  'und',
  'bullig',
  'und',
  'haben',
  'fast',
  'kein',
  'hals',
  ',',
  'dafür',
  'aber',
  'ein',
  'sehr',
  'groß',
  'schnurrbart',
  '.'],
 ['PETUNIA_DURSLEY',
  'sein',
  'dünnen',
  'und',
  'blond',
  'und',
  'be

In [24]:
corpusHPoriginals_lemmatized[-10:]

[['»', 'ich', 'sein', 'wegen', 'sich', '.'],
 ['ich',
  'sein',
  'extrem',
  'berühmt',
  '.',
  '«',
  ' ',
  'ALBUS_DUMBLEDORE',
  ',',
  'ROSE_GRANGER_WEASLEY',
  ',',
  'HUGO_GRANGER-WEASLEY',
  'und',
  'LILY_POTTER',
  'lachen',
  '.'],
 ['der',
  'zug',
  'setzen',
  'sich',
  'in',
  'bewegung',
  ',',
  'und',
  'HARRY_POTTER',
  'gehen',
  'neben',
  'ich',
  'her',
  'und',
  'beobachten',
  'der',
  'schmal',
  'gesicht',
  'mein',
  'sohnes',
  ',',
  'der',
  'schon',
  'glühen',
  'vor',
  'aufregung',
  '.'],
 ['HARRY_POTTER',
  'lächeln',
  'und',
  'winken',
  'unentwegt',
  ',',
  'auch',
  'wenn',
  'ich',
  'wie',
  'einen',
  'klein',
  'schmerzlich',
  'verlust',
  'sein',
  ',',
  'seinen',
  'sohn',
  'von',
  'sich',
  'weggleiten',
  'zu',
  'sehen',
  '…',
  ' ',
  'der',
  'letzt',
  'dampfschwaden',
  'lösen',
  'sich',
  'in',
  'der',
  'herbstluft',
  'auf',
  '.'],
 ['der', 'zug', 'fahren', 'in', 'einen', 'kurve', '.'],
 ['HARRY_POTTER',
  'haben',
  

In [16]:
corpusHPFFs_lemmatized1 = [0]*len(corpusHPFFs_sentences1)

for i in range(0, len(corpusHPFFs_sentences1)):
    words = nlp(corpusHPFFs_sentences1[i])
    interim = [0]*len(words)
    for j in range(0, len(interim)):
        interim[j] = words[j].lemma_
    corpusHPFFs_lemmatized1[i] = interim 

In [17]:
corpusHPFFs_lemmatized2 = [0]*len(corpusHPFFs_sentences2)

for i in range(0, len(corpusHPFFs_sentences2)):
    words = nlp(corpusHPFFs_sentences2[i])
    interim = [0]*len(words)
    for j in range(0, len(interim)):
        interim[j] = words[j].lemma_
    corpusHPFFs_lemmatized2[i] = interim 

In [18]:
corpusHPFFs_lemmatized3 = [0]*len(corpusHPFFs_sentences3)

for i in range(0, len(corpusHPFFs_sentences3)):
    words = nlp(corpusHPFFs_sentences3[i])
    interim = [0]*len(words)
    for j in range(0, len(interim)):
        interim[j] = words[j].lemma_
    corpusHPFFs_lemmatized3[i] = interim 

In [19]:
corpusHPFFs_lemmatized1[:10]

[[' ',
  'when',
  'HERMINE_GRANGER',
  'fights',
  '   ',
  '666',
  '  ',
  'der',
  'sein',
  'der',
  'ziel',
  '!'],
 ['der', 'ende', 'sein', 'erreichen', '!'],
 ['hallo',
  'all',
  'zusammen',
  '!',
  'dies',
  'sein',
  'meinen',
  'erste',
  'ff',
  ',',
  'also',
  'sein',
  'nicht',
  'zu',
  'hart',
  '.'],
 ['besonder',
  'warnung',
  ':',
  'der',
  'story',
  'sein',
  'definitiv',
  'ab',
  '18',
  '!'],
 ['ich',
  'sein',
  'auch',
  'alt',
  'und',
  'schreiben',
  'für',
  'erwachsene.inhalt',
  ':',
  'ich',
  'heißen',
  ',',
  'man',
  'haben',
  'immer',
  'einen',
  'wahl',
  'und',
  'ich',
  'habe',
  'wählen',
  '.'],
 ['ich', 'werden', 'kämpfen', '!'],
 ['oh', 'ja', '!'],
 ['mit',
  'alle',
  'sich',
  'zur',
  'verfügung',
  'stehend',
  'mitteln',
  'und',
  'ohne',
  'rücksicht',
  'auf',
  'verluste',
  '.'],
 ['mit',
  'voll',
  'einsatz',
  ',',
  'ob',
  'ich',
  'sich',
  'gefallen',
  'oder',
  'nicht',
  '.'],
 ['aber',
  'immer',
  'im',
  'verbo

In [20]:
corpusHPFFs_lemmatized3[-10:]

[[',', 'lächeln', 'ich', 'süffisant', '.'],
 ['sehr', 'witzig', ',', 'nun', 'komm', '!'],
 [',',
  'drängen',
  'ich',
  'und',
  'ich',
  'erheben',
  'sich',
  'folgsam',
  ',',
  'wollen',
  'ich',
  'ja',
  'nicht',
  'zu',
  'sehr',
  'reizen',
  '.'],
 ['miss', 'greengrass', ',', 'mr', '.'],
 ['BLAISE_ZABINI', '…', ',', 'der', 'rote', 'pesen', '!'],
 [',',
  'verabschieden',
  'ich',
  'der',
  'runden',
  'in',
  'sich',
  'so',
  'eigen',
  ',',
  'charmant',
  'art',
  '.'],
 ['HERMINE_GRANGER',
  ',',
  'pass',
  'auf',
  'sich',
  'auf',
  ',',
  'melden',
  'sich',
  '!'],
 [',', 'verabschieden', 'sich', 'all', 'irgendwie', 'furchtsam', '.'],
 ['SEVERUS_SNAPE',
  'verdrehen',
  'nerven',
  'der',
  'augen',
  ',',
  'reichen',
  'sich',
  ',',
  'nun',
  'wohl',
  'am',
  'ende',
  'sich',
  'nerven',
  ',',
  'dennoch',
  'sehr',
  'galant',
  'seinen',
  'arm',
  'und',
  'ich',
  'legen',
  'artig',
  'meinen',
  'hand',
  'auf',
  'der',
  'in',
  'der',
  'schwarz',
  

In [21]:
# merge lemmatized HPFF corpus

corpusHPFFs_lemmatized = corpusHPFFs_lemmatized1 + corpusHPFFs_lemmatized2 + corpusHPFFs_lemmatized3

In [22]:
corpusHPFFs_lemmatized[:10]

[[' ',
  'when',
  'HERMINE_GRANGER',
  'fights',
  '   ',
  '666',
  '  ',
  'der',
  'sein',
  'der',
  'ziel',
  '!'],
 ['der', 'ende', 'sein', 'erreichen', '!'],
 ['hallo',
  'all',
  'zusammen',
  '!',
  'dies',
  'sein',
  'meinen',
  'erste',
  'ff',
  ',',
  'also',
  'sein',
  'nicht',
  'zu',
  'hart',
  '.'],
 ['besonder',
  'warnung',
  ':',
  'der',
  'story',
  'sein',
  'definitiv',
  'ab',
  '18',
  '!'],
 ['ich',
  'sein',
  'auch',
  'alt',
  'und',
  'schreiben',
  'für',
  'erwachsene.inhalt',
  ':',
  'ich',
  'heißen',
  ',',
  'man',
  'haben',
  'immer',
  'einen',
  'wahl',
  'und',
  'ich',
  'habe',
  'wählen',
  '.'],
 ['ich', 'werden', 'kämpfen', '!'],
 ['oh', 'ja', '!'],
 ['mit',
  'alle',
  'sich',
  'zur',
  'verfügung',
  'stehend',
  'mitteln',
  'und',
  'ohne',
  'rücksicht',
  'auf',
  'verluste',
  '.'],
 ['mit',
  'voll',
  'einsatz',
  ',',
  'ob',
  'ich',
  'sich',
  'gefallen',
  'oder',
  'nicht',
  '.'],
 ['aber',
  'immer',
  'im',
  'verbo

In [23]:
corpusHPFFs_lemmatized[-10:]

[[',', 'lächeln', 'ich', 'süffisant', '.'],
 ['sehr', 'witzig', ',', 'nun', 'komm', '!'],
 [',',
  'drängen',
  'ich',
  'und',
  'ich',
  'erheben',
  'sich',
  'folgsam',
  ',',
  'wollen',
  'ich',
  'ja',
  'nicht',
  'zu',
  'sehr',
  'reizen',
  '.'],
 ['miss', 'greengrass', ',', 'mr', '.'],
 ['BLAISE_ZABINI', '…', ',', 'der', 'rote', 'pesen', '!'],
 [',',
  'verabschieden',
  'ich',
  'der',
  'runden',
  'in',
  'sich',
  'so',
  'eigen',
  ',',
  'charmant',
  'art',
  '.'],
 ['HERMINE_GRANGER',
  ',',
  'pass',
  'auf',
  'sich',
  'auf',
  ',',
  'melden',
  'sich',
  '!'],
 [',', 'verabschieden', 'sich', 'all', 'irgendwie', 'furchtsam', '.'],
 ['SEVERUS_SNAPE',
  'verdrehen',
  'nerven',
  'der',
  'augen',
  ',',
  'reichen',
  'sich',
  ',',
  'nun',
  'wohl',
  'am',
  'ende',
  'sich',
  'nerven',
  ',',
  'dennoch',
  'sehr',
  'galant',
  'seinen',
  'arm',
  'und',
  'ich',
  'legen',
  'artig',
  'meinen',
  'hand',
  'auf',
  'der',
  'in',
  'der',
  'schwarz',
  

In [25]:
# removing punctuation (no "_"!!!)

punctuation = """!"#$%&'()*+,-./:;<=>?@[\]^`{|}~«»"""

In [26]:
for sent in corpusHPoriginals_lemmatized:
    for word in sent:
        if word in punctuation:
            sent.remove(word)

In [26]:
for sent in corpusHPFFs_lemmatized:
    for word in sent:
        if word in punctuation:
            sent.remove(word)

In [27]:
# saving files

with open(path_pickled + '\\hp_originals_texts_lemmatized.pkl', 'wb') as f:
    pickle.dump(corpusHPoriginals_lemmatized, f)
    
#with open('hp_originals_texts_lemmatized.pkl', 'rb') as f:
#    corpusHPoriginals_lemmatized = pickle.load(f)

In [6]:
#with open('hp_ffs_texts_lemmatized.pkl', 'wb') as f:
#    pickle.dump(corpusHPFFs_lemmatized, f)
    
with open(path_pickled + '\\hp_ffs_texts_lemmatized.pkl', 'rb') as f:
    corpusHPFFs_lemmatized = pickle.load(f)

In [29]:
# saving files as csv

with open(path_results + '\\hp_originals_texts_lemmatized.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(corpusHPoriginals_lemmatized)

In [30]:
with open(path_results + '\\hp_ffs_texts_lemmatized.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(corpusHPFFs_lemmatized)

## 3. Creating Models

In [31]:
# training models

modelHPoriginalsA = Word2Vec(corpusHPoriginals_lemmatized, vector_size=300, window=5, min_count=50, sg=0, epochs=5)

modelHPoriginalsB = Word2Vec(corpusHPoriginals_lemmatized, vector_size=300, window=5, min_count=50, sg=0, epochs=10)

modelHPoriginalsC = Word2Vec(corpusHPoriginals_lemmatized, vector_size=300, window=5, min_count=50, sg=1, epochs=5)

modelHPoriginalsD = Word2Vec(corpusHPoriginals_lemmatized, vector_size=300, window=5, min_count=50, sg=1, epochs=10)

modelHPoriginalsE = Word2Vec(corpusHPoriginals_lemmatized, vector_size=300, window=5, min_count=50, sg=1, epochs=15)

modelHPoriginalsF = Word2Vec(corpusHPoriginals_lemmatized, vector_size=300, window=5, min_count=50, sg=1, epochs=20)

modelHPoriginalsG = Word2Vec(corpusHPoriginals_lemmatized, vector_size=300, window=5, min_count=50, sg=1, epochs=25)

modelHPoriginalsH = Word2Vec(corpusHPoriginals_lemmatized, vector_size=200, window=5, min_count=50, sg=1, epochs=5)

modelHPoriginalsI = Word2Vec(corpusHPoriginals_lemmatized, vector_size=200, window=5, min_count=50, sg=1, epochs=10)

modelHPoriginalsJ = Word2Vec(corpusHPoriginals_lemmatized, vector_size=200, window=5, min_count=50, sg=1, epochs=15)

modelHPoriginalsK = Word2Vec(corpusHPoriginals_lemmatized, vector_size=200, window=5, min_count=50, sg=1, epochs=20)

modelHPoriginalsL = Word2Vec(corpusHPoriginals_lemmatized, vector_size=200, window=5, min_count=50, sg=1, epochs=25)

modelHPoriginalsM = Word2Vec(corpusHPoriginals_lemmatized, vector_size=100, window=5, min_count=50, sg=1, epochs=5)

modelHPoriginalsN = Word2Vec(corpusHPoriginals_lemmatized, vector_size=100, window=5, min_count=50, sg=1, epochs=10)

modelHPoriginalsO = Word2Vec(corpusHPoriginals_lemmatized, vector_size=100, window=5, min_count=50, sg=1, epochs=15)

modelHPoriginalsP = Word2Vec(corpusHPoriginals_lemmatized, vector_size=100, window=5, min_count=50, sg=1, epochs=20)

modelHPoriginalsQ = Word2Vec(corpusHPoriginals_lemmatized, vector_size=100, window=5, min_count=50, sg=1, epochs=25)


In [32]:
# saving models

modelHPoriginalsA.wv.save(path_models + '\\modelHPoriginalsA_vectors.kv')

modelHPoriginalsB.wv.save(path_models + '\\modelHPoriginalsB_vectors.kv')

modelHPoriginalsC.wv.save(path_models + '\\modelHPoriginalsC_vectors.kv')

modelHPoriginalsD.wv.save(path_models + '\\modelHPoriginalsD_vectors.kv')

modelHPoriginalsE.wv.save(path_models + '\\modelHPoriginalsE_vectors.kv')

modelHPoriginalsF.wv.save(path_models + '\\modelHPoriginalsF_vectors.kv')

modelHPoriginalsG.wv.save(path_models + '\\modelHPoriginalsG_vectors.kv')

modelHPoriginalsH.wv.save(path_models + '\\modelHPoriginalsH_vectors.kv')

modelHPoriginalsI.wv.save(path_models + '\\modelHPoriginalsI_vectors.kv')

modelHPoriginalsJ.wv.save(path_models + '\\modelHPoriginalsJ_vectors.kv')

modelHPoriginalsK.wv.save(path_models + '\\modelHPoriginalsK_vectors.kv')

modelHPoriginalsL.wv.save(path_models + '\\modelHPoriginalsL_vectors.kv')

modelHPoriginalsM.wv.save(path_models + '\\modelHPoriginalsM_vectors.kv')

modelHPoriginalsN.wv.save(path_models + '\\modelHPoriginalsN_vectors.kv')

modelHPoriginalsO.wv.save(path_models + '\\modelHPoriginalsO_vectors.kv')

modelHPoriginalsP.wv.save(path_models + '\\modelHPoriginalsP_vectors.kv')

modelHPoriginalsQ.wv.save(path_models + '\\modelHPoriginalsQ_vectors.kv')


In [33]:
# training models

modelHPFFsA = Word2Vec(corpusHPFFs_lemmatized, vector_size=300, window=5, min_count=50, sg=0, epochs=5)

In [34]:
modelHPFFsB = Word2Vec(corpusHPFFs_lemmatized, vector_size=300, window=5, min_count=50, sg=0, epochs=10)

In [35]:
modelHPFFsC = Word2Vec(corpusHPFFs_lemmatized, vector_size=300, window=5, min_count=50, sg=1, epochs=5)

In [36]:
modelHPFFsD = Word2Vec(corpusHPFFs_lemmatized, vector_size=300, window=5, min_count=50, sg=1, epochs=10)

In [37]:
modelHPFFsE = Word2Vec(corpusHPFFs_lemmatized, vector_size=300, window=5, min_count=50, sg=1, epochs=15)

In [38]:
modelHPFFsF = Word2Vec(corpusHPFFs_lemmatized, vector_size=300, window=5, min_count=50, sg=1, epochs=20)

In [39]:
modelHPFFsG = Word2Vec(corpusHPFFs_lemmatized, vector_size=300, window=5, min_count=50, sg=1, epochs=25)

In [40]:
modelHPFFsH = Word2Vec(corpusHPFFs_lemmatized, vector_size=200, window=5, min_count=50, sg=1, epochs=5)

In [None]:
modelHPFFsI = Word2Vec(corpusHPFFs_lemmatized, vector_size=200, window=5, min_count=50, sg=1, epochs=10)

In [None]:
modelHPFFsJ = Word2Vec(corpusHPFFs_lemmatized, vector_size=200, window=5, min_count=50, sg=1, epochs=15)

In [None]:
modelHPFFsK = Word2Vec(corpusHPFFs_lemmatized, vector_size=200, window=5, min_count=50, sg=1, epochs=20)

In [None]:
modelHPFFsL = Word2Vec(corpusHPFFs_lemmatized, vector_size=200, window=5, min_count=50, sg=1, epochs=25)

In [None]:
modelHPFFsM = Word2Vec(corpusHPFFs_lemmatized, vector_size=100, window=5, min_count=50, sg=1, epochs=5)

In [None]:
modelHPFFsN = Word2Vec(corpusHPFFs_lemmatized, vector_size=100, window=5, min_count=50, sg=1, epochs=10)

In [None]:
modelHPFFsO = Word2Vec(corpusHPFFs_lemmatized, vector_size=100, window=5, min_count=50, sg=1, epochs=15)

In [None]:
modelHPFFsP = Word2Vec(corpusHPFFs_lemmatized, vector_size=100, window=5, min_count=50, sg=1, epochs=20)

In [None]:
modelHPFFsQ = Word2Vec(corpusHPFFs_lemmatized, vector_size=100, window=5, min_count=50, sg=1, epochs=25)

In [None]:
# saving models

modelHPFFsA.wv.save(path_models + '\\modelHPFFsA_vectors.kv')

modelHPFFsB.wv.save(path_models + '\\modelHPFFsB_vectors.kv')

modelHPFFsC.wv.save(path_models + '\\modelHPFFsC_vectors.kv')

In [None]:
modelHPFFsD.wv.save(path_models + '\\modelHPFFsD_vectors.kv')

modelHPFFsE.wv.save(path_models + '\\modelHPFFsE_vectors.kv')

modelHPFFsF.wv.save(path_models + '\\modelHPFFsF_vectors.kv')

In [None]:
modelHPFFsG.wv.save(path_models + '\\modelHPFFsG_vectors.kv')

modelHPFFsH.wv.save(path_models + '\\modelHPFFsH_vectors.kv')

modelHPFFsI.wv.save(path_models + '\\modelHPFFsI_vectors.kv')

In [None]:
modelHPFFsJ.wv.save(path_models + '\\modelHPFFsJ_vectors.kv')

modelHPFFsK.wv.save(path_models + '\\modelHPFFsK_vectors.kv')

modelHPFFsL.wv.save(path_models + '\\modelHPFFsL_vectors.kv')

In [None]:
modelHPFFsM.wv.save(path_models + '\\modelHPFFsM_vectors.kv')

modelHPFFsN.wv.save(path_models + '\\modelHPFFsN_vectors.kv')

modelHPFFsO.wv.save(path_models + '\\modelHPFFsO_vectors.kv')

In [None]:
modelHPFFsP.wv.save(path_models + '\\modelHPFFsP_vectors.kv')

modelHPFFsQ.wv.save(path_models + '\\modelHPFFsQ_vectors.kv')

### 4. Analyzing Corpus

In [18]:
# amount of tokens

i = sum([len(sent) for sent in corpusHPoriginals_lemmatized])

print(i)

1175381


In [20]:
# amount of types (in models)

len(modelHPoriginalsA)

1806

In [21]:
# amount of tokens

i = sum([len(sent) for sent in corpusHPFFs_lemmatized])

print(i)

110693619


In [22]:
# amount of types (in models)

len(modelHPFFsA)

30029

In [23]:
# create word lists

HPoriginals_words = [word for sent in corpusHPoriginals_lemmatized for word in sent]

In [24]:
HPFFs_words = [word for sent in corpusHPFFs_lemmatized for word in sent]

In [None]:
# saving files as csv

with open(path_results + '\\hp_originals_texts_lemmatized_words.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows([HPoriginals_words[index]] for index in range(0, len(HPoriginals_words)))

In [None]:
with open(path_results + '\\hp_ffs_texts_lemmatized_words.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows([HPFFs_words[index]] for index in range(0, len(HPFFs_words)))

In [25]:
HPoriginals_words.count('HARRY_POTTER')

18886

In [26]:
HPFFs_words.count('HARRY_POTTER')

610407

### 5. Analyzing Models

In [28]:
modelHPoriginalsA.wv.most_similar('HARRY_POTTER')

[('HERMINE_GRANGER', 0.6897292733192444),
 ('RUBEUS_HAGRID', 0.6699568033218384),
 ('NEVILLE_LONGBOTTOM', 0.608005702495575),
 ('VIKTOR_KRUM', 0.6076350212097168),
 ('MYRTE', 0.5934661626815796),
 ('CHO_CHANG', 0.566474199295044),
 ('MINERVA_MCGONAGALL', 0.5595462918281555),
 ('ARTHUR_WEASLEY', 0.5574615001678467),
 ('RUFUS_SCRIMGEOUR', 0.5480220317840576),
 ('SEVERUS_SNAPE', 0.5377324223518372)]

In [29]:
modelHPFFsA.wv.most_similar('HARRY_POTTER')

[('HERMINE_GRANGER', 0.8018375039100647),
 ('DRACO_MALFOY', 0.6502677798271179),
 ('ich', 0.622545063495636),
 ('SEVERUS_SNAPE', 0.6139376759529114),
 ('NEVILLE_LONGBOTTOM', 0.5550857186317444),
 ('GINNY_WEASLEY', 0.5460676550865173),
 ('LUCIUS_MALFOY', 0.5442930459976196),
 ('REMUS_LUPIN', 0.5366066694259644),
 ('LILY_POTTER', 0.5361157655715942),
 ('ALBUS_DUMBLEDORE', 0.5178130865097046)]