# Setup
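Install the required packages, import the libraries, load the spaCy model and read the input texts before running any of the analyses below.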

In [None]:
# %pip install https://huggingface.co/easyh/de_fnhd_nerdh/resolve/main/de_fnhd_nerdh-any-py3-none-any.whl
!python -m spacy download "de_core_news_lg"
%pip install pyldavis
%pip install gensim

In [None]:
import spacy
from pathlib import Path
import gensim
from gensim.models.ldamulticore import LdaMulticore
import pyLDAvis
from pyLDAvis import gensim_models as gsv
from constants import STOPWORDS as stops
import matplotlib.pyplot as plt
from gensim.models import CoherenceModel
from collections import Counter
import nltk
import math
import pandas as pd
import itertools
from helpers import leven_worker, pre_cleaning
%matplotlib inline

In [None]:
# Load the large German spaCy pipeline and raise the maximum text length it
# accepts; later cells pass complete books through it in a single call.
nlp = spacy.load("de_core_news_lg")
nlp.max_length = 1_500_000

In [None]:
# Collect every .txt file below ./input/ (recursively).
# rglob() hands back a one-shot generator, so the result is materialised into
# a sorted list: the loading cell can then be re-run without silently
# producing an empty corpus, and the order of the books no longer depends on
# the order in which the file system lists the directory.
input_files = sorted(Path("./input/").rglob("*.txt"))

In [None]:
# Read every input file, run it through pre_cleaning() and store the result
# under the file name up to its first dot.
full_corpus = {}

for txt_path in input_files:
    # Read-only mode is enough here ("r+" additionally required write
    # permission on the file). The encoding is pinned so that the texts
    # decode identically regardless of the platform's locale default.
    # NOTE(review): assumes the input files are UTF-8 encoded - confirm.
    with open(txt_path, "r", encoding="utf-8") as f:
        txt = f.read()
    txt = pre_cleaning(txt)
    # Path.name is independent of the path separator; splitting str(path) on
    # "/" left the whole path in the key on Windows.
    book_name = txt_path.name.split(".")[0]
    full_corpus[book_name] = txt
print(len(full_corpus))

# Teil 1: Topic Modelling

In [None]:
# Cut every book into pieces at the literal marker "Capitel" and store each
# piece under the key "<book>-<index>-Kap". Index 0 is whatever text precedes
# the first marker.
by_chapters = {}

for book_name, book_text in full_corpus.items():
    for chap_num, chapter_text in enumerate(book_text.split("Capitel")):
        by_chapters[f"{book_name}-{chap_num}-Kap"] = chapter_text
print(len(by_chapters))

In [None]:
# Run all chapters through the spaCy pipeline (7 worker processes) and keep
# the lower-cased lemma of every token that is neither in the project stop
# list, nor a spaCy stop word, nor contains a line break.
docs_list: list[list[str]] = [
    [
        tok.lemma_.lower()
        for tok in chapter_doc
        if not (tok.lower_ in stops or "\n" in tok.lower_ or tok.is_stop)
    ]
    for chapter_doc in nlp.pipe(by_chapters.values(), n_process=7)
]

In [None]:
# Learn frequent two- and three-token collocations, merge them into single
# tokens, then build the dictionary and bag-of-words corpus for LDA.
bigram_phrases = gensim.models.Phrases(docs_list, min_count=5, threshold=2)
trigram_phrases = gensim.models.Phrases(bigram_phrases[docs_list], threshold=2)
bigrams = gensim.models.phrases.Phraser(bigram_phrases)
trigrams = gensim.models.phrases.Phraser(trigram_phrases)
# Applying a phrase model to a whole corpus yields a lazy wrapper that re-runs
# the models on every iteration and every index access. docs_list is iterated
# many times further down (dictionary, bag-of-words, one coherence model per
# topic count, concordances, frequency counts), so it is materialised once.
docs_list = list(trigrams[bigrams[docs_list]])
corpus_dictionary = gensim.corpora.Dictionary(docs_list)
analysis_corpus = [corpus_dictionary.doc2bow(doc) for doc in docs_list]

In [None]:
# Train one LDA model for each topic count from 1 to 39, record its c_v
# coherence and plot coherence against the number of topics.
topics = []
score = []
for num_topics in range(1, 40):
    lda_model = LdaMulticore(
        corpus=analysis_corpus,
        id2word=corpus_dictionary,
        num_topics=num_topics,
        iterations=75,
        passes=12,
        workers=7,
        random_state=100,
    )
    coherence_model = CoherenceModel(
        model=lda_model,
        texts=docs_list,
        corpus=analysis_corpus,
        dictionary=corpus_dictionary,
        coherence="c_v",
    )
    topics.append(num_topics)
    score.append(coherence_model.get_coherence())

plt.plot(topics, score)
plt.xlabel("Number of Topics")
plt.ylabel("Coherence Score")
plt.show()

In [None]:

# Train the model that is visualised below, using 25 topics.
# NOTE(review): 25 is presumably the value read off the coherence plot - confirm.
# The seed matches the one used in the coherence sweep; without it this model
# was initialised differently from the runs it was selected from and changed
# on every re-run of the cell.
lda_model = LdaMulticore(
    corpus=analysis_corpus,
    id2word=corpus_dictionary,
    iterations=75,
    num_topics=25,
    workers=7,
    passes=12,
    random_state=100,
)


In [None]:
# Prepare the pyLDAvis data for the trained model and render it in the notebook.
lda_display = gsv.prepare(
    topic_model=lda_model,
    corpus=analysis_corpus,
    dictionary=corpus_dictionary,
)
pyLDAvis.display(lda_display)

#### Concordances

In [None]:
# Print a keyword-in-context window for every occurrence of `searchword`.
searchword = "könig"

# (document index, token index) of every exact match in the processed corpus.
locations: list[tuple[int, int]] = [
    (doc_no, tok_no)
    for doc_no, doc_toks in enumerate(docs_list)
    for tok_no, tok in enumerate(doc_toks)
    if tok == searchword
]

for doc_no, tok_no in locations:
    # Clamp the start at 0: a negative start index counts from the END of the
    # list, so hits within the first five tokens of a document used to print
    # an empty (or wrong) window.
    tok_scope_from = max(tok_no - 5, 0)
    # The slice end is exclusive: the window holds up to five tokens before
    # the hit, the hit itself and up to four tokens after it.
    tok_scope_to = tok_no + 5
    print(docs_list[doc_no][tok_scope_from:tok_scope_to])



#### Frequenzanalyse

In [None]:
# Count how often `searchword` occurs as a token across all processed documents.
searchword = "abend"
count = sum(doc.count(searchword) for doc in docs_list)
print(count)

# Teil 2: Stilometrie

In [None]:
# Author label -> keys of the books attributed to that author. The keys are
# used further down to look the books up in the per-book token dictionaries.
author_dict: dict[str, list[str]] = {
    "Montalvo": ["4__Buch", "1__Buch"],
    "Silva": ["7__Buch", "10__Buch"],
    "Unknown": ["22__Buch"],
    "Roseo": ["19__Buch"],
}
# Left empty here; nothing in the visible cells fills or reads it.
translator_dict: dict[str, str] = {}
# Candidate authors that the "Unknown" text is compared against.
known = ["Montalvo", "Silva", "Roseo"]

## Basic: Wort- und Satzlängen

### Wortlängen

In [None]:
# Per book: lower-cased lemmas of all tokens that are neither in the project
# stop list, nor spaCy stop words, nor contain a line break.
whole_books_clean: dict[str, list[str]] = {}
for book_name, book_text in full_corpus.items():
    kept_lemmas = []
    for tok in nlp(book_text):
        if tok.lower_ in stops or "\n" in tok.lower_ or tok.is_stop:
            continue
        kept_lemmas.append(tok.lemma_.lower())
    whole_books_clean[book_name] = kept_lemmas

In [None]:
# Per book: how many of the kept lemmas have each character length.
length_by_book = {
    book_name: Counter(len(lemma) for lemma in lemmas)
    for book_name, lemmas in whole_books_clean.items()
}


In [None]:
# One bar chart per book: word length on the x-axis, frequency on the y-axis.
for book_name, length_counts in length_by_book.items():
    plt.bar(length_counts.keys(), length_counts.values())
    plt.title(book_name)
    plt.show()

### Satzlängen

In [None]:
# Per book: the text of every sentence found by the spaCy pipeline.
books_by_sents: dict[str, list[str]] = {
    book_name: [sentence.text for sentence in nlp(book_text).sents]
    for book_name, book_text in full_corpus.items()
}

In [None]:
# Per book: how many sentences have each length, measured in
# whitespace-separated words. This rebinds length_by_book, replacing the
# word-length counts computed earlier.
length_by_book = {
    book_name: Counter(len(sentence.split()) for sentence in sentences)
    for book_name, sentences in books_by_sents.items()
}

In [None]:
# One bar chart per book: sentence length on the x-axis, frequency on the y-axis.
for book_name in length_by_book:
    sentence_length_counts = length_by_book[book_name]
    plt.bar(sentence_length_counts.keys(), sentence_length_counts.values())
    plt.title(book_name)
    plt.show()

## Advanced: Burrows Delta

In [None]:
# Per book: lower-cased lemmas of ALL tokens (stop words are kept here),
# skipping only tokens that contain a line break or "--".
whole_books_burrows: dict[str, list[str]] = {}
for book_name, book_text in full_corpus.items():
    whole_books_burrows[book_name] = [
        tok.lemma_.lower()
        for tok in nlp(book_text)
        if not ("\n" in tok.lower_ or "--" in tok.lower_)
    ]

In [None]:
# Concatenate the token lists of all books into one flat list.
complete_corpus = [
    token for book_tokens in whole_books_burrows.values() for token in book_tokens
]
len(complete_corpus)

In [None]:
# The 30 most frequent tokens of the whole corpus as (token, count) pairs;
# the first ten are shown as the cell output.
whole_corpus_freq_dist = nltk.FreqDist(complete_corpus).most_common(30)
whole_corpus_freq_dist[:10]

In [None]:
# Relative frequency of each feature word in the combined texts of every
# known author: feature_freqs[author][feature] = occurrences / total tokens.
features = [word for word, _ in whole_corpus_freq_dist]
feature_freqs = {}

for author in known:
    # All tokens of all books attributed to this author, in one list.
    personal_complete = [
        token
        for book in author_dict[author]
        for token in whole_books_burrows[book]
    ]
    token_count = len(personal_complete)

    # Tally the tokens once instead of calling list.count() for each feature,
    # which scanned the complete token list once per feature word.
    token_counts = Counter(personal_complete)
    feature_freqs[author] = {
        feature: token_counts[feature] / token_count for feature in features
    }

In [None]:
# For every feature word: mean and sample standard deviation (n - 1 in the
# denominator) of its relative frequency across the known authors.
corpus_features = {}

for feature in features:
    per_author = [feature_freqs[person][feature] for person in known]

    feature_avg = 0
    for freq in per_author:
        feature_avg += freq
    feature_avg /= len(known)

    squared_dev_total = 0
    for freq in per_author:
        deviation = freq - feature_avg
        squared_dev_total += deviation * deviation
    squared_dev_total /= (len(known) - 1)
    feature_stdev = math.sqrt(squared_dev_total)

    corpus_features[feature] = {"Mean": feature_avg, "StdDev": feature_stdev}

In [None]:
# z-score of every feature for every known author:
# (author's relative frequency - mean across authors) / standard deviation.
feature_zscores = {
    person: {
        feature: (
            (feature_freqs[person][feature] - corpus_features[feature]["Mean"])
            / corpus_features[feature]["StdDev"]
        )
        for feature in features
    }
    for person in known
}


In [None]:
# Relative feature frequencies and z-scores for the text of unknown authorship.
# NOTE(review): the result variables are rebound on every iteration, so with
# more than one "Unknown" book only the last one would be compared below.
# The "unkown" spelling is kept because the next cell reads unkown_zscores.
for book in author_dict["Unknown"]:
    unkown_text = whole_books_burrows[book]
    unkown_count = len(unkown_text)

    # Tally the tokens once instead of calling list.count() for each feature,
    # which scanned the complete token list once per feature word.
    token_counts = Counter(unkown_text)
    unkown_freqs = {
        feature: token_counts[feature] / unkown_count for feature in features
    }

    # Standardise against the mean / standard deviation of the known authors.
    unkown_zscores = {
        feature: (
            (unkown_freqs[feature] - corpus_features[feature]["Mean"])
            / corpus_features[feature]["StdDev"]
        )
        for feature in features
    }

In [None]:
# Delta per candidate: mean absolute difference between the candidate's and
# the unknown text's z-scores over all feature words.
for candidate in known:
    candidate_zscores = feature_zscores[candidate]
    delta = 0
    for feature in features:
        delta += abs(unkown_zscores[feature] - candidate_zscores[feature])
    delta /= len(features)
    print(f"Delta score for candidate {candidate} is {delta}")


## Bonus: Text reuse

### Simple approach: exact sentence matches

In [None]:
# All sentences of all books that have more than four whitespace-separated words.
all_sents = []
for book_sentences in books_by_sents.values():
    for sentence in book_sentences:
        if len(sentence.split()) > 4:
            all_sents.append(sentence)

In [None]:
# Count identical sentences and keep those that occur more than once, as two
# parallel lists (sentence text / number of occurrences).
reoccurences = Counter(all_sents)
repeated = [(sent, n) for sent, n in reoccurences.items() if n > 1]
most_freq_sents = [sent for sent, _ in repeated]
most_freq_counts = [int(n) for _, n in repeated]

In [None]:
# Table of the repeated sentences, displayed with the most frequent first.
df_dict = dict(Sentence=most_freq_sents, Count=most_freq_counts)
freqs_df = pd.DataFrame(df_dict)
freqs_df.sort_values(by=["Count"], ascending=False)

### Levenshtein

In [None]:
# Feed every unordered pair of the first 6000 sentences to leven_worker and
# collect what it yields.
# NOTE(review): leven_worker comes from helpers and is not visible here;
# presumably it yields indexable items for pairs it judges similar by
# Levenshtein distance - confirm in helpers.
sent_combinations = itertools.combinations(all_sents[:6000], 2)
matches_dict = {}
matches_list = []
for match in leven_worker(sent_combinations):
    first_item = match[0]
    matches_dict[first_item] = match[1]
    matches_list.append(first_item)
    print(match)


In [None]:
# Number of entries in matches_list (one per item yielded by leven_worker).
len(matches_list)