## Install Packages

### python -m venv .venv -> source .venv/Scripts/activate -> pip install -r ./requirements.txt

In [None]:
import os
import json
import string
from nltk.tokenize import word_tokenize
import pandas as pd
import numpy as np
from wordcloud import WordCloud
from matplotlib import pyplot as plt
from nltk.util import ngrams
from nltk.corpus import PlaintextCorpusReader,stopwords
from nltk.stem import PorterStemmer
from scipy.cluster.hierarchy import dendrogram
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.metrics.pairwise import cosine_similarity,euclidean_distances
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

Before the first run, download the required NLTK data once from a Python shell:
`import nltk`, then `nltk.download("punkt_tab")` and `nltk.download("stopwords")`.

## Create documents corpus

In [None]:
# Load every ``.txt`` file found under the corpus folder.
corpus_dir = "./Literature-original"
# Raw string: "\." inside a normal string literal is an invalid escape
# sequence and triggers a warning on recent Python versions.
corpus = PlaintextCorpusReader(corpus_dir, r".*\.txt")
file_names = corpus.fileids()

### Corpus documents preprocessing

In [None]:
# Raw text of each corpus file, keyed by its file name.
documents = {file_name: corpus.raw(file_name) for file_name in file_names}
print(json.dumps(documents, indent=4, ensure_ascii=False))

In [None]:
# Token count of every document before any cleaning (stored under "pre").
lengths = {
    file_name: {"pre": len(word_tokenize(text))}
    for file_name, text in documents.items()
}
print(json.dumps(lengths, indent=4, ensure_ascii=False))

In [None]:
# Porter stemmer instance, created once and reused by the preprocessing loop.
ps=PorterStemmer()

In [None]:
# Normalise every document in place: lower-case, strip punctuation and
# digits, drop English stop words, then stem what is left.
# The stop-word list is loaded once into a set; it used to be re-read and
# rebuilt as a list for every single token.
stop_words = set(stopwords.words("english"))
for file_name, text in documents.items():
    text = text.lower()
    text = "".join(
        char for char in text
        if char not in string.punctuation and not char.isdigit()
    )
    # Stop words are filtered BEFORE stemming: the Porter stemmer turns e.g.
    # "this" into "thi" and "was" into "wa", which no longer match the
    # stop-word list and therefore used to survive the filter.
    kept_words = [word for word in word_tokenize(text) if word not in stop_words]
    documents[file_name] = " ".join(ps.stem(word) for word in kept_words)
print(json.dumps(documents, indent=4, ensure_ascii=False))
    

In [None]:
# Token count of every document after cleaning (stored under "post").
for file_name, text in documents.items():
    lengths[file_name]["post"] = len(word_tokenize(text))
print(json.dumps(lengths, indent=4, ensure_ascii=False))

In [None]:
# Convert the nested length dict into a table; orient="index" makes the outer
# keys (file names) the rows and the inner keys the columns.
lengths=pd.DataFrame.from_dict(lengths,orient="index")

In [None]:
# Tokens removed by the cleaning: absolute count and share of the original.
lengths["diff"] = lengths["pre"].sub(lengths["post"])
lengths["pct"] = lengths["diff"].div(lengths["pre"])
lengths

### Create frequency matrix

In [None]:
# One row per file (index = file name) with the cleaned text in "content".
docs = pd.DataFrame.from_dict(documents, orient="index").set_axis(
    ["content"], axis="columns"
)
docs

In [None]:
# Term-count matrix built from the cleaned documents.
cv = CountVectorizer()
matrix_tf = cv.fit_transform(docs["content"])
# Sparsity: share of matrix cells that hold no stored value.
sparsity_tf = 1 - matrix_tf.nnz / (matrix_tf.shape[0] * matrix_tf.shape[1])
sparsity_tf

In [None]:
# TF-IDF weighted matrix built from the same cleaned documents.
tv = TfidfVectorizer()
matrix_tfidf = tv.fit_transform(docs["content"])
# Sparsity: share of matrix cells that hold no stored value.
sparsity_tfidf = 1 - matrix_tfidf.nnz / (
    matrix_tfidf.shape[0] * matrix_tfidf.shape[1]
)
sparsity_tfidf

### Directories for results

In [None]:
# Output folders for everything the notebook saves. makedirs(exist_ok=True)
# creates missing parents ("./topic_modeling") and is a no-op when the folder
# already exists, replacing the repeated check-then-create pairs.
for result_dir in (
    "./wordclouds",
    "./topic_modeling/topics",
    "./topic_modeling/documents",
    "./clustering",
    "./ngrams",
):
    os.makedirs(result_dir, exist_ok=True)

### Wordclouds

In [None]:
# Shared word-cloud renderer, reused for every document in the loop below.
# NOTE(review): per the wordcloud docs the contour_* options are only drawn
# when a mask is supplied; none is passed here — confirm they are intended.
wordcloud = WordCloud(
    background_color="white",
    max_words=5000,
    contour_width=3,
    contour_color="steelblue"
)

In [None]:
# Render one word cloud per document and save it as ./wordclouds/<name>.png.
for index, row in docs.iterrows():
    # Strip only the real extension: str.replace("txt", "png") would also
    # rewrite any "txt" that occurs inside the file name itself.
    doc_name = os.path.splitext(index)[0]
    wordcloud.generate(row['content'])
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.title(doc_name)
    plt.savefig(f"./wordclouds/{doc_name}.png")
    plt.close()


### Topic Modeling

In [None]:
def plot_top_words(model, feature_names, n_top_words, title, size):
    """Plot the highest-weighted terms of every topic and save the figure.

    One horizontal bar chart is drawn per row of ``model.components_``; the
    figure is written to ``./topic_modeling/topics/<title>.png``.

    Args:
        model: Fitted topic model exposing ``components_``.
        feature_names: NumPy array of vocabulary terms; it is indexed with an
            integer index array.
        n_top_words: Number of terms shown per topic.
        title: Figure title, also used as the output file name.
        size: ``(rows, columns)`` of the subplot grid; it must provide at
            least one axis per topic.
    """
    # NOTE(review): 'snow' and 'white' bars are close to invisible on a white
    # figure background — consider replacing them.
    colors = ['forestgreen','snow','white','salmon','gold','skyblue','olivedrab','cyan','violet','wheat']
    # squeeze=False keeps ``axes`` a 2-D array even for a 1x1 grid, so
    # flatten() always works.
    fig, axes = plt.subplots(
        *size, figsize=(30, 15), sharex=True, squeeze=False
    )
    axes = axes.flatten()
    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending, so the last entries are the heaviest terms.
        top_features_ind = topic.argsort()[-n_top_words:]
        top_features = feature_names[top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        # Cycle through the palette so more topics than colors cannot raise
        # an IndexError.
        ax.barh(
            top_features,
            weights,
            height=0.7,
            color=colors[topic_idx % len(colors)],
        )
        ax.set_title(f"Topic {topic_idx + 1}", fontdict={"fontsize": 30})
        ax.tick_params(axis="both", which="major", labelsize=20)
        for i in "top right left".split():
            ax.spines[i].set_visible(False)

    # The figure title does not depend on the topic, so set it once.
    fig.suptitle(title, fontsize=40)
    fig.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    fig.savefig(f"./topic_modeling/topics/{title}.png")
    plt.close(fig)

In [None]:
def plot_documents(model, matrix, n_topics, title):
    """Plot each document's topic weights as a stacked bar and save the chart.

    The figure is written to ``./topic_modeling/documents/<title>.png``.
    Row labels are taken from the module-level ``file_names`` list, so
    ``matrix`` must contain one row per entry of that list.

    Args:
        model: Fitted topic model; ``model.transform(matrix)`` supplies the
            document-topic weights.
        matrix: Document-term matrix passed to ``model.transform``.
        n_topics: Number of topics, i.e. columns returned by ``transform``.
        title: Chart title, also used as the output file name.
    """
    colors = ['forestgreen','snow','white','salmon','gold','skyblue','olivedrab','cyan','violet','wheat']
    # 1-based names, matching the "Topic N" panel titles of plot_top_words.
    topic_names = [f"Topic {x + 1}" for x in range(n_topics)]
    docs_topics = pd.DataFrame(model.transform(matrix), columns=topic_names)
    docs_topics.index = [os.path.splitext(file_name)[0] for file_name in file_names]
    fig = plt.figure(figsize=(7, 4))
    # Running left edge of the next bar segment for every document.
    left = np.zeros(len(docs_topics))
    for i, col in enumerate(docs_topics.columns):
        plt.barh(
            docs_topics.index,
            docs_topics[col],
            left=left,
            label=col,
            # Cycle the palette so more than ten topics cannot raise IndexError.
            color=colors[i % len(colors)],
        )
        left = left + docs_topics[col].to_numpy()
    plt.title(title)
    # The bars carry labels, but without a legend the colors cannot be read.
    plt.legend(loc="center left", bbox_to_anchor=(1.0, 0.5))
    # bbox_inches="tight" keeps the legend placed outside the axes in the image.
    plt.savefig(f"./topic_modeling/documents/{title}.png", bbox_inches="tight")
    plt.close(fig)

In [None]:
# Settings shared by the topic-model cells that follow.
n_topics, n_top_words = 10, 20
size = (2, 5)  # subplot grid passed to plot_top_words
featured_names = cv.get_feature_names_out()  # vocabulary of the fitted ``cv``

In [None]:
# Latent Dirichlet Allocation fitted on the term-count matrix.
lda = LatentDirichletAllocation(
    n_components=n_topics,
    learning_method='online',
    learning_offset=50,
    max_iter=5,
    random_state=0,
).fit(matrix_tf)
plot_top_words(lda, featured_names, n_top_words, "Topics in LDA", size)
plot_documents(lda, matrix_tf, n_topics, "Topics in LDA")

In [None]:
# NMF with its default loss, fitted on the TF-IDF matrix.
nmf_fn = NMF(
    n_components=n_topics,
    random_state=1,
    alpha_H=.00005,
    alpha_W=.00005,
    l1_ratio=.5
)
nmf_fn.fit(matrix_tfidf)
# Use the vocabulary of ``tv``, the vectorizer that produced matrix_tfidf.
# ``featured_names`` comes from the CountVectorizer and only lines up with
# these columns while both vectorizers keep identical settings.
plot_top_words(nmf_fn, tv.get_feature_names_out(), n_top_words,"Topics in NMF (FN)", size)
plot_documents(nmf_fn, matrix_tfidf,n_topics, "Topics in NMF (FN)")

In [None]:
# NMF with Kullback-Leibler loss (multiplicative-update solver), fitted on
# the TF-IDF matrix.
nmf_kl = NMF(
    n_components=n_topics,
    random_state=1,
    alpha_H=.00005,
    alpha_W=.00005,
    l1_ratio=.5,
    beta_loss="kullback-leibler",
    solver='mu',
    max_iter=1000
)
nmf_kl.fit(matrix_tfidf)
# Use the vocabulary of ``tv``, the vectorizer that produced matrix_tfidf.
# ``featured_names`` comes from the CountVectorizer and only lines up with
# these columns while both vectorizers keep identical settings.
plot_top_words(nmf_kl, tv.get_feature_names_out(), n_top_words,"Topics in NMF (KL)", size)
plot_documents(nmf_kl, matrix_tfidf,n_topics, "Topics in NMF (KL)")

### Clustering

In [None]:
def plot_dendrogram(model, **kwargs):
    """Draw a SciPy dendrogram for a fitted agglomerative clustering model.

    Builds a linkage matrix (children, merge distance, number of samples
    under each merge) from the model and hands it to
    ``scipy.cluster.hierarchy.dendrogram``.

    Args:
        model: Fitted model exposing ``children_``, ``distances_`` and
            ``labels_``.
        **kwargs: Forwarded unchanged to ``dendrogram``.

    Returns:
        The value returned by ``dendrogram``. It used to be discarded;
        returning it lets callers reuse the computed layout.
    """
    n_samples = len(model.labels_)
    counts = np.zeros(model.children_.shape[0])
    for i, merge in enumerate(model.children_):
        # Indices below n_samples are original samples (count 1); larger
        # indices refer to the cluster created by merge (index - n_samples),
        # whose size was already stored in ``counts``.
        counts[i] = sum(
            1 if child_idx < n_samples else counts[child_idx - n_samples]
            for child_idx in merge
        )

    linkage_matrix = np.column_stack(
        [model.children_, model.distances_, counts]
    ).astype(float)
    return dendrogram(linkage_matrix, **kwargs)

In [None]:
# Pairwise cosine similarity between all documents (TF-IDF vectors).
# cosine_similarity already returns a dense (n_docs, n_docs) array, so the
# former flatten()/reshape() round-trip was only a redundant copy.
cs = cosine_similarity(matrix_tfidf)

In [None]:
# Ward-linkage clustering that uses each row of the cosine-similarity matrix
# as the feature vector of the corresponding document.
clustering_cs_ward = AgglomerativeClustering(
    n_clusters=6,
    linkage='ward',
    metric='euclidean',
    compute_distances=True,
).fit(cs)
print(clustering_cs_ward.labels_)
plot_dendrogram(
    clustering_cs_ward,
    truncate_mode="level",
    orientation="right",
    labels=file_names,
)
plt.savefig("./clustering/cs_ward")
plt.close()

In [None]:
# Pairwise Euclidean distance between all documents (term-count vectors).
# euclidean_distances already returns a dense (n_docs, n_docs) array, so the
# former flatten()/reshape() round-trip was only a redundant copy.
ed = euclidean_distances(matrix_tf)

In [None]:
# Complete-linkage clustering on the precomputed Euclidean distance matrix.
# NOTE(review): the variable name says "ward" but the linkage is 'complete';
# the name is kept because later cells may refer to it.
clustering_ed_ward = AgglomerativeClustering(
    n_clusters=6,
    linkage='complete',
    metric='precomputed',
    compute_distances=True,
).fit(ed)
print(clustering_ed_ward.labels_)
plot_dendrogram(
    clustering_ed_ward,
    truncate_mode="level",
    orientation="right",
    labels=file_names,
)
plt.savefig("./clustering/ed_complete")
plt.close()

### N-grams

In [None]:
# Token list of every cleaned document, keyed by file name.
documents_tokenized = {
    key: word_tokenize(text, language='english')
    for key, text in documents.items()
}
print(json.dumps(documents_tokenized, indent=4, ensure_ascii=False))

In [None]:
# Plot the most frequent 1-, 2- and 3-grams of every document and save each
# chart as ./ngrams/<n>_<file name>.png.
# ``max_n`` replaces the old ``n = 3``, which was dead code: the loop
# variable ``n`` overwrote it immediately.
max_n = 3
tokens = 5  # how many of the most frequent n-grams each chart shows
for n in range(1, max_n + 1):
    for title, words in documents_tokenized.items():
        n_gram = pd.Series(ngrams(words, n)).value_counts()
        n_gram.head(tokens).plot.barh()
        plt.savefig(f"./ngrams/{n}_{title}.png")
        plt.close()