In [139]:
import nltk
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

import numpy as np
import math
import sys

In [2]:
metadata_df = pd.read_json("../data/metadata_rich_df.json")


In [3]:
sourcepath = "../data/large_files/trigrams_lemmatized/"
with open(sourcepath + "trigrams_lemmata_id_0.txt".format(str(id)), "r") as f:
    trigrams_list = f.readlines()

In [4]:
[el.strip() for el in trigrams_list][:10]

['religious matter',
 'divine hope',
 'divine hope',
 'leave unresolved',
 'leave unresolved',
 'racial equality',
 'racial equality',
 'joint action',
 'joint action',
 'joint action']

# Load vocabulary

In [5]:
bidecades_vocabs_counts_df = pickle.load(open("../data/bidecades_vocabs_counts_df.pickle", "rb"))
bidecades_vocabs_counts_df.head(5)

Unnamed: 0,1900-1919,1920-1939,1940-1959,1960-1979,1980-1999,2000-2019
man,11218.0,8342.0,24108.0,39199.0,48435.0,100670.0
new,10183.0,10192.0,23469.0,42092.0,97421.0,293457.0
Jesus,9650.0,4911.0,12134.0,23416.0,49765.0,215687.0
et,9325.0,11818.0,20805.0,35969.0,49326.0,73464.0
God,9248.0,6276.0,26328.0,45751.0,99505.0,293240.0


In [6]:
vocabulary = dict([(el[1], el[0]) for el in enumerate(bidecades_vocabs_counts_df.index)])

# Load trigrams

In [7]:
bidecades_strs = sorted([el for el in list(set(metadata_df["bidecade"])) if el != None])
bidecades_strs

['1900-1919', '1920-1939', '1940-1959', '1960-1979', '1980-1999', '2000-2019']

In [8]:
bidecade = "1900-1919"
ids = metadata_df[metadata_df["bidecade"]==bidecade]["id_kase"]
len(ids)

782

In [23]:
with open("../data/large_files/bidecade_trigrams_{}.txt".format(bidecade), "r") as f:
    subcorpus_ngrams = f.readlines()

In [24]:
[el.strip() for el in subcorpus_ngrams[:10]]

['Michigan Ann Arbor',
 'Bonner University',
 'Bonner University',
 'Iowa Iowa',
 'Iowa Iowa',
 'Kansas Lawrence',
 'Ginn Co',
 'Elliott University',
 'Elliott University',
 'Kansas Lawrence Kansas']

In [120]:
vectorizer = TfidfVectorizer(vocabulary=vocabulary, lowercase=False)
X = vectorizer.fit_transform(subcorpus_ngrams)

In [121]:
X.shape

(722798, 4804)

In [122]:
cooc = X.T * X

In [123]:
cooc.max()

6277.248125246523

In [124]:
cooc.todense().max()

6277.248125246523

In [125]:
cooc_norm_dict = {}
for bidecade in bidecades_strs:
    with open("../data/large_files/bidecade_trigrams_{}.txt".format(bidecade), "r") as f:
        subcorpus_ngrams = f.readlines()
    vectorizer = TfidfVectorizer(vocabulary=vocabulary, lowercase=False)
    X = vectorizer.fit_transform(subcorpus_ngrams)
    cooc = X.T * X
    cooc_norm_dict[bidecade] = cooc

In [127]:
pickle.dump(cooc_norm_dict, open("../data/cooc_dict.pickle", "wb"))

In [39]:
cooc_norm_dict["1980-1999"]["Paul"].sort_values(ascending=False)

Paul        1.000000
letter      0.052684
Apostle     0.025385
use         0.021093
epistle     0.016693
              ...   
opposite    0.000000
Simeon      0.000000
withdraw    0.000000
vote        0.000000
less        0.000000
Name: Paul, Length: 4804, dtype: float64

# Vectorization example

In [360]:
corpus = [("apostle Paul be"), ("apostle Paul be"), ("scholar write be"), ("scholar be article")]
vocabulary = list(set([w for doc in [doc.split() for doc in corpus] for w in doc]))
vectorizer_tfidf = TfidfVectorizer(vocabulary=vocabulary, lowercase=False)
doc_term_tfidf = vectorizer_tfidf.fit_transform(corpus)

vectorizer_count = CountVectorizer(vocabulary=vocabulary, lowercase=False)
doc_term_count = vectorizer_count.fit_transform(corpus)

corpus = [("apostle Paul be"),
          ("apostle Paul be"),
          ("scholar write be"),
          ("scholar be article")]


In [361]:
doc_term_matrix_count = pd.DataFrame(doc_term_count.todense(), columns=vocabulary, index=["doc1", "doc2", "doc3", "doc4"])
doc_term_matrix_count

Unnamed: 0,Paul,apostle,scholar,be,write,article
doc1,1,1,0,1,0,0
doc2,1,1,0,1,0,0
doc3,0,0,1,1,1,0
doc4,0,0,1,1,0,1


In [362]:
doc_term_matrix_tfidf = pd.DataFrame(doc_term_tfidf.todense(), columns=vocabulary, index=["doc1", "doc2", "doc3", "doc4"])
doc_term_matrix_tfidf

Unnamed: 0,Paul,apostle,scholar,be,write,article
doc1,0.640434,0.640434,0.0,0.423897,0.0,0.0
doc2,0.640434,0.640434,0.0,0.423897,0.0,0.0
doc3,0.0,0.0,0.572892,0.379192,0.726641,0.0
doc4,0.0,0.0,0.572892,0.379192,0.0,0.726641


In [382]:
# TfIdf of Paul in doc1:
TF = 1 # term frequency
N = 4 # number of documents
DF = 2 # document frequency
IDF = np.log((N + 1) / (DF + 1)) + 1
doc1_Paul_tfidf = TF * IDF

1.5108256237659907


In [364]:
doc_term_matrix_tfidf.loc["doc1"]

Paul       0.640434
apostle    0.640434
scholar    0.000000
be         0.423897
write      0.000000
article    0.000000
Name: doc1, dtype: float64

In [372]:
def tfidf(corpus, vocabulary):
    # Compute document frequency (df) for each term in the vocabulary
    df = {word: 0 for word in vocabulary}
    for doc in corpus:
        words = doc.split()
        for word in set(words):
            if word in vocabulary:
                df[word] += 1

    # Compute inverse document frequency (idf) for each term in the vocabulary
    idf = {}
    N = len(corpus)
    for word in vocabulary:
        idf[word] = np.log((N + 1) / (df[word] + 1)) + 1

    # Compute term frequency (tf) for each document
    tf = np.zeros((N, len(vocabulary)))
    for i, doc in enumerate(corpus):
        words = doc.split()
        for j, word in enumerate(vocabulary):
            tf[i, j] = words.count(word)

    # Compute tf-idf for each document-term pair
    tfidf_matrix = tf * np.array(list(idf.values()))
    tfidf_manually = pd.DataFrame(tfidf_matrix, columns=vocabulary, index=["doc1", "doc2", "doc3", "doc4"])
    return tfidf_manually

In [373]:
tfidf_manually = tfidf(corpus, vocabulary)
tfidf_manually

Unnamed: 0,Paul,apostle,scholar,be,write,article
doc1,1.510826,1.510826,0.0,1.0,0.0,0.0
doc2,1.510826,1.510826,0.0,1.0,0.0,0.0
doc3,0.0,0.0,1.510826,1.0,1.916291,0.0
doc4,0.0,0.0,1.510826,1.0,0.0,1.916291


doc1_Paul_tfidf = 1.51
eucl_norm = sqrt((1.51^2)+(1.51^2)+(0.0^2)+(1.0^2)+(0.0^2)+(0.0^2)) = 2.36
doc1_Paul_tfidf_norm = 1.51 / 2.36 = 0.64



In [384]:
doc1_Paul_tfidf = 1.51
eucl_norm = sum(np.sqrt((1.51^2)+(1.51^2)+(0.0^2)+(1.0^2)+(0.0^2)+(0.0^2))) # = 2.36
doc1_Paul_tfidf_norm = 1.51 / 2.36 # = 0.64

TypeError: unsupported operand type(s) for ^: 'float' and 'float'

In [374]:
def exponent_as_string(val):
    return "(" + str(np.round(val,2)) + "^2)"

In [375]:
"eucl_norm = sqrt(" + "+".join(tfidf_manually.loc["doc1"].apply(exponent_as_string).tolist()) + ")"


'eucl_norm = sqrt((1.51^2)+(1.51^2)+(0.0^2)+(1.0^2)+(0.0^2)+(0.0^2))'

In [338]:
pd.DataFrame(tfidf_manually.loc["doc1"].apply(exponent_as_string)).T

Unnamed: 0,Paul,apostle,scholar,be,write,article
doc1,1.51*1.51,1.51*1.51,0.0*0.0,1.51*1.51,0.0*0.0,0.0*0.0


In [376]:
vector = tfidf_manually.loc["doc1"]
eucl_norm = np.sqrt(sum([el*el for el in vector]))
eucl_norm

2.359065096782153

In [377]:
# L2 normalization, also known as Euclidean normalization, is a technique used to scale the vector length to a unit length. In the case of the TF-IDF matrix, L2 normalization is applied to each row vector (corresponding to a document) of the matrix.

# The normalization process involves dividing each element of the row vector by the Euclidean norm of the vector. The Euclidean norm of a vector v is defined as the square root of the sum of the squares of its components.
def euclidean_normalization(vector, order=2):
    eucl_norm = np.sqrt(sum([el*el for el in vector])) # The Euclidean norm of a vector v is defined as the square root of the sum of the squares of its components.
    vector_norm = vector / eucl_norm
    return vector_norm

In [389]:
tfidf_manually.apply(euclidean_normalization, axis=1)

Unnamed: 0,Paul,apostle,scholar,be,write,article
doc1,0.640434,0.640434,0.0,0.423897,0.0,0.0
doc2,0.640434,0.640434,0.0,0.423897,0.0,0.0
doc3,0.0,0.0,0.572892,0.379192,0.726641,0.0
doc4,0.0,0.0,0.572892,0.379192,0.0,0.726641


# document-term to term-term matrix

In [391]:
term_doc_matrix_tfidf.T

Unnamed: 0,doc1,doc2,doc3,doc4
Paul,0.640434,0.640434,0.0,0.0
apostle,0.640434,0.640434,0.0,0.0
scholar,0.0,0.0,0.572892,0.572892
be,0.423897,0.423897,0.379192,0.379192
write,0.0,0.0,0.726641,0.0
article,0.0,0.0,0.0,0.726641


In [395]:
cooc = pd.DataFrame((doc_term_tfidf.todense().T * doc_term_tfidf.todense()), columns=vocabulary, index=vocabulary)
cooc

Unnamed: 0,Paul,apostle,scholar,be,write,article
Paul,0.820312,0.820312,0.0,0.542956,0.0,0.0
apostle,0.820312,0.820312,0.0,0.542956,0.0,0.0
scholar,0.0,0.0,0.656412,0.434472,0.416287,0.416287
be,0.542956,0.542956,0.434472,0.64695,0.275536,0.275536
write,0.0,0.0,0.416287,0.275536,0.528008,0.0
article,0.0,0.0,0.416287,0.275536,0.0,0.528008


In [397]:
cooc.sort_values("Paul", ascending=False)["Paul"]

Paul       0.820312
apostle    0.820312
be         0.542956
scholar    0.000000
write      0.000000
article    0.000000
Name: Paul, dtype: float64