# Character Shifts in Harry Potter Fanfics

# Character Name Co-occurrences

### Last updated: 19.01.2022

### 1. Required Libraries

In [1]:
import glob
import string
import os
import nltk
from nltk import FreqDist
from string import punctuation
import numpy as np
import pandas as pd
import ast
import re
import matplotlib.pyplot as plt
import seaborn as sns
import csv
from sklearn.feature_extraction.text import CountVectorizer
import pickle
import spacy
from spacy.lang.de import German
import codecs

In [2]:
# Project folders on the Z: drive. The cells below build file names by
# appending Windows-style '\\...' suffixes to these strings.
# path_data: the entity dictionaries are read from its `entities` subfolder.
path_data = r'Z:\Fanfiction\HP_Character-Distribution\pamphlet_character_shifts\data'
# path_corpora: not referenced in the visible part of this notebook.
path_corpora = r'Z:\Fanfiction\HP_Character-Distribution\pamphlet_character_shifts\corpora'
# path_pickled: pickled corpora are loaded from here and intermediate results are dumped here.
path_pickled = r'Z:\Fanfiction\HP_Character-Distribution\pamphlet_character_shifts\results\pickled'
# path_cooccurrences: CSV tables and per-pair sentence files are written here.
# NOTE(review): this path contains `results\results` while path_pickled has a
# single `results` segment - confirm the doubled folder is intended.
path_cooccurrences = r'Z:\Fanfiction\HP_Character-Distribution\pamphlet_character_shifts\results\results\cooccurrences'

### 2. Read in Dictionary

In [3]:
# Load the full-name dictionary. Despite the .csv extension the file content is
# parsed as a Python literal; the sorting cell below iterates it with .items(),
# so it is expected to evaluate to a dict of name -> collection of synonyms.
# The context manager closes the handle, which the original left open.
with open(path_data + '\\entities\\full_names.csv') as file:
    contents = file.read()
full_names_dict = ast.literal_eval(contents)

In [4]:
# Load the entity dictionary. Despite the .csv extension the file content is
# parsed as a Python literal; the sorting cell below iterates it with .items(),
# so it is expected to evaluate to a dict of name -> collection of synonyms.
# The context manager closes the handle, which the original left open.
with open(path_data + '\\entities\\entities.csv') as file:
    contents = file.read()
names_dict = ast.literal_eval(contents)

In [5]:
# For every entity, order its synonyms from longest to shortest and lowercase
# them, so that a longer form such as "Albus Dumbledore" is checked before the
# shorter "Albus".
# Note: proper names can be tricky, because some of them are used ambiguously
# in the text (e.g. Barty Crouch Junior & Senior).
sorted_names_full_names = {
    name: [synonym.lower() for synonym in sorted(synonyms, key=len, reverse=True)]
    for name, synonyms in full_names_dict.items()
}

In [6]:
# Same preparation for the entity dictionary: per entity, synonyms ordered
# from longest to shortest and lowercased.
sorted_names = {
    name: [synonym.lower() for synonym in sorted(synonyms, key=len, reverse=True)]
    for name, synonyms in names_dict.items()
}

### 3. Co-occurrences

Specify spaCy model

In [7]:
# German spaCy pipeline; only the 'sentencizer' component is added here and
# it is what provides `doc.sents` in the sentence-splitting cells below.
nlp = German()
nlp.add_pipe('sentencizer')
# Raise spaCy's per-text length limit so that whole texts can be passed to
# nlp() in one call.
nlp.max_length = 1_500_000_000

Read in pickled tokenised texts with generalised entities

Originals

In [8]:
# Load the pickled corpus of the original texts (one item per text; each item
# is later passed to nlp()).
originals_words_path = path_pickled + '\\corpusHPoriginals_words.pkl'
with open(originals_words_path, 'rb') as pickle_file:
    corpusHPoriginals_words = pickle.load(pickle_file)

In [20]:
# Split every original text into sentences; each sentence is stored as the
# list of its token strings.
corpusHPoriginals_sentences = []
for raw_text in corpusHPoriginals_words:
    parsed = nlp(raw_text)
    corpusHPoriginals_sentences.append(
        [[token.text for token in sentence] for sentence in parsed.sents]
    )

# Total number of sentences over all originals; used later to turn raw
# co-occurrence counts into relative values.
num_sentences = sum(len(text_sentences) for text_sentences in corpusHPoriginals_sentences)

In [21]:
# One placeholder slot per original text; the next cell fills each slot by index.
originals_cooccurring_entities = [0 for _ in corpusHPoriginals_sentences]

In [22]:
# For every sentence of every original text, collect the entity keys that
# occur among its tokens, as an alphabetically sorted list without duplicates.
# Sentences without any entity yield an empty list.
#
# The set of entity keys does not change inside the loops, so it is built once
# here instead of once per sentence.
entity_keys = set(sorted_names_full_names)

for i, texts in enumerate(corpusHPoriginals_sentences):
    originals_cooccurring_entities[i] = [
        # Keys that are exactly one character long are dropped.
        # NOTE(review): `entry` is a key string, so len(entry) is the length
        # of the name, not a number of entities - confirm this is intended.
        [entry for entry in sorted(entity_keys.intersection(sentences)) if len(entry) != 1]
        for sentences in texts
    ]

In [23]:
# Persist the per-sentence entity lists (including the empty ones) for the originals.
all_sentences_path = path_pickled + '\\originals_cooccurring_entities_all_sentences.pkl'
with open(all_sentences_path, 'wb') as pickle_file:
    pickle.dump(originals_cooccurring_entities, pickle_file)

In [24]:
# Number of detected entities per sentence, in the same nested layout
# (text -> sentence) as originals_cooccurring_entities.
originals_cooccurring_entities_count = [
    [len(entities) for entities in text]
    for text in originals_cooccurring_entities
]

In [25]:
# For every original text, keep the sentences that contain more than one
# entity, each stored as [entities_in_sentence, sentence_tokens].
#
# The three nested lists are aligned sentence by sentence, so this has to run
# before empty entries are filtered out of originals_cooccurring_entities.
# Walking them in lockstep with zip() replaces the original index arithmetic,
# whose manual `i += 1` / `j += 1` inside `range` loops had no effect.
originals_cooccurring_sentences = []
for counts, entities, token_sentences in zip(originals_cooccurring_entities_count,
                                             originals_cooccurring_entities,
                                             corpusHPoriginals_sentences):
    originals_cooccurring_sentences.append([
        [found, tokens]
        for count, found, tokens in zip(counts, entities, token_sentences)
        if count > 1
    ])

In [26]:
# Drop the sentences in which no entity was found (empty lists) from every text.
originals_cooccurring_entities = [
    [entities for entities in text if entities]
    for text in originals_cooccurring_entities
]

In [27]:
# Square entity x entity table of co-occurrence counts, initialised with zeros.
# The frame is created directly from the scalar 0 so it holds integers from
# the start; the original built an all-NaN object frame and relied on an
# in-place fillna(0) to convert it.
entity_labels = list(sorted_names_full_names)
cooccurrences_originals_df = pd.DataFrame(0, index = entity_labels, columns = entity_labels)

In [28]:
# Count every unordered pair of entities that share a sentence. Both
# [a, b] and [b, a] are incremented, so the table stays symmetric.
for text in originals_cooccurring_entities:
    for entry in text:
        for position, first in enumerate(entry):
            # Pair `first` with every entity listed after it in the sentence.
            for second in entry[position + 1:]:
                cooccurrences_originals_df.loc[first, second] += 1
                cooccurrences_originals_df.loc[second, first] += 1

In [29]:
# Export the absolute counts, then derive and export the relative table
# (counts scaled by 100 / total number of sentences).
counts_csv = path_cooccurrences + '\\cooccurrences_originals_count_df_sentences.csv'
cooccurrences_originals_df.to_csv(counts_csv, sep = ';', encoding = 'utf-8')

per_hundred_sentences = 100/num_sentences
cooccurrences_originals_rel_df = per_hundred_sentences*cooccurrences_originals_df

relative_csv = path_cooccurrences + '\\cooccurrences_originals_rel_df_sentences.csv'
cooccurrences_originals_rel_df.to_csv(relative_csv, sep = ';', encoding = 'utf-8')

In [30]:
# Pickle both the absolute and the relative co-occurrence tables of the originals.
for pickle_name, frame in (
    ('\\cooccurrences_originals_df_sentences.pkl', cooccurrences_originals_df),
    ('\\cooccurrences_originals_rel_df_sentences.pkl', cooccurrences_originals_rel_df),
):
    with open(path_pickled + pickle_name, 'wb') as pickle_file:
        pickle.dump(frame, pickle_file)

In [31]:
# Flatten the relative co-occurrence table into [pair, value] rows, where the
# pair is the set of the two entity names. Every off-diagonal pair therefore
# appears twice (once per cell of the symmetric table), and a diagonal cell
# yields a one-element set.
names = cooccurrences_originals_rel_df.columns.values
list_links = []

n_rows, n_cols = cooccurrences_originals_rel_df.shape
for i in range(n_rows):
    for j in range(n_cols):
        # Single positional lookup; the original chained `.iloc[i][j]`, which
        # indexes a label-indexed row Series by integer position.
        value = cooccurrences_originals_rel_df.iloc[i, j]
        list_links.append([{names[i], names[j]}, value])

In [32]:
# Turn the flattened links into a table ordered from the most to the least
# frequent pair and export it.
df_links = (
    pd.DataFrame(list_links, columns =['Pairs', 'Co-occurrences'])
    .sort_values(by=['Co-occurrences'], ascending = False)
)
links_csv = path_cooccurrences + '\\df_originals_links_sentences.csv'
df_links.to_csv(links_csv, sep = ';', encoding = 'utf-8')

In [33]:
# Take the 1000 highest-ranked rows and reduce them to distinct pairs, keeping
# their ranking order.
#
# Every pair occurs twice in df_links (once per cell of the symmetric table).
# The original kept every second row, which is only correct when the two
# copies of a pair are adjacent after sorting. Pairs with equal values can be
# interleaved, which produced duplicated and missing pairs. Tracking the pairs
# already seen removes that dependency on row order.
top_links = df_links.iloc[:1000]
top_pairs = list(top_links['Pairs'])
pairs = []

seen_pairs = set()
for pair in top_pairs:
    pair_key = frozenset(pair)  # sets are unhashable; frozenset is usable as a set member
    if pair_key not in seen_pairs:
        seen_pairs.add(pair_key)
        pairs.append(pair)

In [34]:
# For every selected pair, gather the token lists of all sentences (from all
# original texts) whose detected entities include both members of the pair.
# Result: one [pair, sentences] entry per pair.
originals_pairs_sentences = []
for pair in pairs:
    matching_sentences = []
    for text in originals_cooccurring_sentences:
        for entities, tokens in text:
            if pair.issubset(set(entities)):
                matching_sentences.append(tokens)
    originals_pairs_sentences.append([pair, matching_sentences])

In [35]:
# Spot check: display the second stored sentence (a list of tokens) of the first pair.
originals_pairs_sentences[0][1][1]

['fragte',
 'HARRY_POTTER',
 ',',
 'der',
 'RON_WEALSEY',
 'genauso',
 'interessant',
 'fand',
 'wie',
 'RON_WEALSEY',
 'ihn',
 '.']

In [37]:
# Write one text file per pair: every sentence on its own line, tokens
# separated by single spaces, encoded as UTF-8.
for pair, sentences in originals_pairs_sentences:
    # `pair` is a set, so its iteration order is not fixed; sorting the names
    # makes the file name reproducible between runs.
    file_name = '_'.join(sorted(pair))
    file_name = path_cooccurrences + '\\originals_pairs_sentences\\' + file_name + '.txt'
    # Build the content in one pass (and without a local called `string`,
    # which would shadow the imported `string` module).
    file_content = ''.join(' '.join(sent) + '\n' for sent in sentences)
    with open(file_name, 'wb') as f:
        f.write(file_content.encode("UTF-8"))

In [38]:
# Persist the [pair, sentences] entries of the originals.
pairs_pickle_path = path_pickled + '\\originals_pairs_with_sentences.pkl'
with open(pairs_pickle_path, 'wb') as pickle_file:
    pickle.dump(originals_pairs_sentences, pickle_file)

Fanfics

In [39]:
# Load the pickled fanfic corpus (one item per text; each item is later
# passed to nlp()).
ffs_words_path = path_pickled + '\\corpusHPFFs_words.pkl'
with open(ffs_words_path, 'rb') as pickle_file:
    corpusHPFFs_words = pickle.load(pickle_file)

In [40]:
# Split every fanfic text into sentences; each sentence is stored as the list
# of its token strings.
corpusHPFFs_sentences = []
for raw_text in corpusHPFFs_words:
    parsed = nlp(raw_text)
    corpusHPFFs_sentences.append(
        [[token.text for token in sentence] for sentence in parsed.sents]
    )

# Total number of sentences over all fanfics. This overwrites the value
# computed for the originals and is used below for the relative table.
num_sentences = sum(len(text_sentences) for text_sentences in corpusHPFFs_sentences)

In [41]:
# One placeholder slot per fanfic text; the next cell fills each slot by index.
ffs_cooccurring_entities = [0 for _ in corpusHPFFs_sentences]

In [42]:
# For every sentence of every fanfic text, collect the entity keys that occur
# among its tokens, as an alphabetically sorted list without duplicates.
# Sentences without any entity yield an empty list.
#
# The set of entity keys does not change inside the loops, so it is built once
# here instead of once per sentence.
entity_keys = set(sorted_names_full_names)

for i, texts in enumerate(corpusHPFFs_sentences):
    ffs_cooccurring_entities[i] = [
        # Keys that are exactly one character long are dropped.
        # NOTE(review): `entry` is a key string, so len(entry) is the length
        # of the name, not a number of entities - confirm this is intended.
        [entry for entry in sorted(entity_keys.intersection(sentences)) if len(entry) != 1]
        for sentences in texts
    ]

In [43]:
# Persist the per-sentence entity lists (including the empty ones) for the fanfics.
all_sentences_path = path_pickled + '\\ffs_cooccurring_entities_all_sentences.pkl'
with open(all_sentences_path, 'wb') as pickle_file:
    pickle.dump(ffs_cooccurring_entities, pickle_file)

In [44]:
# Number of detected entities per sentence, in the same nested layout
# (text -> sentence) as ffs_cooccurring_entities.
ffs_cooccurring_entities_count = [
    [len(entities) for entities in text]
    for text in ffs_cooccurring_entities
]

In [45]:
# For every fanfic text, keep the sentences that contain more than one entity,
# each stored as [entities_in_sentence, sentence_tokens].
#
# The three nested lists are aligned sentence by sentence, so this has to run
# before empty entries are filtered out of ffs_cooccurring_entities.
# Walking them in lockstep with zip() replaces the original index arithmetic,
# whose manual `i += 1` / `j += 1` inside `range` loops had no effect.
ffs_cooccurring_sentences = []
for counts, entities, token_sentences in zip(ffs_cooccurring_entities_count,
                                             ffs_cooccurring_entities,
                                             corpusHPFFs_sentences):
    ffs_cooccurring_sentences.append([
        [found, tokens]
        for count, found, tokens in zip(counts, entities, token_sentences)
        if count > 1
    ])

In [46]:
# Drop the sentences in which no entity was found (empty lists) from every text.
ffs_cooccurring_entities = [
    [entities for entities in text if entities]
    for text in ffs_cooccurring_entities
]

In [47]:
# Square entity x entity table of co-occurrence counts, initialised with zeros.
# The frame is created directly from the scalar 0 so it holds integers from
# the start; the original built an all-NaN object frame and relied on an
# in-place fillna(0) to convert it.
entity_labels = list(sorted_names_full_names)
cooccurrences_ffs_df = pd.DataFrame(0, index = entity_labels, columns = entity_labels)

In [48]:
# Count every unordered pair of entities that share a sentence. Both
# [a, b] and [b, a] are incremented, so the table stays symmetric.
for text in ffs_cooccurring_entities:
    for entry in text:
        for position, first in enumerate(entry):
            # Pair `first` with every entity listed after it in the sentence.
            for second in entry[position + 1:]:
                cooccurrences_ffs_df.loc[first, second] += 1
                cooccurrences_ffs_df.loc[second, first] += 1

In [49]:
# Export the absolute counts, then derive and export the relative table
# (counts scaled by 100 / total number of sentences).
counts_csv = path_cooccurrences + '\\cooccurrences_ffs_count_df_sentences.csv'
cooccurrences_ffs_df.to_csv(counts_csv, sep = ';', encoding = 'utf-8')

per_hundred_sentences = 100/num_sentences
cooccurrences_ffs_rel_df = per_hundred_sentences*cooccurrences_ffs_df

relative_csv = path_cooccurrences + '\\cooccurrences_ffs_rel_df_sentences.csv'
cooccurrences_ffs_rel_df.to_csv(relative_csv, sep = ';', encoding = 'utf-8')

In [50]:
# Pickle both the absolute and the relative co-occurrence tables of the fanfics.
for pickle_name, frame in (
    ('\\cooccurrences_ffs_df_sentences.pkl', cooccurrences_ffs_df),
    ('\\cooccurrences_ffs_rel_df_sentences.pkl', cooccurrences_ffs_rel_df),
):
    with open(path_pickled + pickle_name, 'wb') as pickle_file:
        pickle.dump(frame, pickle_file)

In [51]:
# Flatten the relative co-occurrence table into [pair, value] rows, where the
# pair is the set of the two entity names. Every off-diagonal pair therefore
# appears twice (once per cell of the symmetric table), and a diagonal cell
# yields a one-element set.
names = cooccurrences_ffs_rel_df.columns.values
list_links = []

n_rows, n_cols = cooccurrences_ffs_rel_df.shape
for i in range(n_rows):
    for j in range(n_cols):
        # Single positional lookup; the original chained `.iloc[i][j]`, which
        # indexes a label-indexed row Series by integer position.
        value = cooccurrences_ffs_rel_df.iloc[i, j]
        list_links.append([{names[i], names[j]}, value])

In [52]:
# Turn the flattened links into a table ordered from the most to the least
# frequent pair and export it.
df_links = (
    pd.DataFrame(list_links, columns =['Pairs', 'Co-occurrences'])
    .sort_values(by=['Co-occurrences'], ascending = False)
)
links_csv = path_cooccurrences + '\\df_ffs_links_sentences.csv'
df_links.to_csv(links_csv, sep = ';', encoding = 'utf-8')

In [53]:
# Take the 1000 highest-ranked rows and reduce them to distinct pairs, keeping
# their ranking order.
#
# Every pair occurs twice in df_links (once per cell of the symmetric table).
# The original kept every second row, which is only correct when the two
# copies of a pair are adjacent after sorting. Pairs with equal values can be
# interleaved, which produced duplicated and missing pairs. Tracking the pairs
# already seen removes that dependency on row order.
top_links = df_links.iloc[:1000]
top_pairs = list(top_links['Pairs'])
pairs = []

seen_pairs = set()
for pair in top_pairs:
    pair_key = frozenset(pair)  # sets are unhashable; frozenset is usable as a set member
    if pair_key not in seen_pairs:
        seen_pairs.add(pair_key)
        pairs.append(pair)

In [54]:
# For every selected pair, gather the token lists of all sentences (from all
# fanfic texts) whose detected entities include both members of the pair.
# Result: one [pair, sentences] entry per pair.
ffs_pairs_sentences = []
for pair in pairs:
    matching_sentences = []
    for text in ffs_cooccurring_sentences:
        for entities, tokens in text:
            if pair.issubset(set(entities)):
                matching_sentences.append(tokens)
    ffs_pairs_sentences.append([pair, matching_sentences])

In [55]:
# Write one text file per pair: every sentence on its own line, tokens
# separated by single spaces, encoded as UTF-8.
for pair, sentences in ffs_pairs_sentences:
    # `pair` is a set, so its iteration order is not fixed; sorting the names
    # makes the file name reproducible between runs.
    file_name = '_'.join(sorted(pair))
    file_name = path_cooccurrences + '\\ffs_pairs_sentences\\' + file_name + '.txt'
    # Build the content in one pass (and without a local called `string`,
    # which would shadow the imported `string` module).
    file_content = ''.join(' '.join(sent) + '\n' for sent in sentences)
    with open(file_name, 'wb') as f:
        f.write(file_content.encode("UTF-8"))

In [56]:
# Persist the [pair, sentences] entries of the fanfics.
pairs_pickle_path = path_pickled + '\\ffs_pairs_with_sentences.pkl'
with open(pairs_pickle_path, 'wb') as pickle_file:
    pickle.dump(ffs_pairs_sentences, pickle_file)