In [6]:
import itertools
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import networkx as nx
import json
from community import community_louvain
from infomap import Infomap
from tqdm.auto import tqdm

In [7]:
class SemanticNetworkAnalysis:
    def __init__(self, data_file):
        self.data = self.load_data(data_file)
        self.corpus, self.tokens = self.prepare_corpus()
        self.vectorizer, self.tfidf_matrix = self.calculate_tfidf(min_df=0)
        self.vocab = self.vectorizer.get_feature_names_out()

    def load_data(self, data_file):
        with open(data_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
        return data

    def prepare_corpus(self):
        corpus = []
        tokens = []
        for post in tqdm(self.data, desc="Preparing corpus"):
            corpus.append(' '.join(post['title_tokens']))
            tokens.append(post['title_tokens'])
            for comment in post['comments']:
                corpus.append(' '.join(comment['body_tokens']))
                tokens.append(comment['body_tokens'])
        return corpus, tokens

    def calculate_tfidf(self, min_df=10):
        vectorizer = TfidfVectorizer(min_df=min_df)
        tfidf_matrix = vectorizer.fit_transform(tqdm(self.corpus, desc="Calculating TF-IDF"))
        return vectorizer, tfidf_matrix

    def co_occurrence_matrix_with_tfidf(self, window_size=5):
        vocab_size = len(self.vocab)
        matrix = np.zeros((vocab_size, vocab_size), dtype=np.float32)
        word_to_index = {word: i for i, word in enumerate(self.vocab)}

        for token_list, row_tfidf in tqdm(zip(self.tokens, self.tfidf_matrix), total=len(self.tokens), desc="Computing co-occurrence matrix"):
            for i in range(len(token_list) - window_size + 1):
                pairs = itertools.combinations(token_list[i:i + window_size], 2)
                for u, v in pairs:
                    if u in word_to_index and v in word_to_index:
                        index_u = word_to_index[u]
                        index_v = word_to_index[v]
                        tfidf_u = row_tfidf[0, index_u]
                        tfidf_v = row_tfidf[0, index_v]
                        matrix[index_u][index_v] += (tfidf_u * tfidf_v)
                        matrix[index_v][index_u] += (tfidf_u * tfidf_v)

        return matrix

    def create_semantic_network(self, co_occurrence_matrix, min_weight=6):
        G = nx.Graph()

        for i, word1 in tqdm(enumerate(self.vocab), desc="Creating semantic network", total=len(self.vocab)):
            for j, word2 in enumerate(self.vocab):
                if i != j and co_occurrence_matrix[i][j] >= min_weight:
                    G.add_edge(word1, word2, weight=co_occurrence_matrix[i][j])

        return G

    def normalize_edge_weights(self, G):
        max_edge_weight = max([d['weight'] for u, v, d in G.edges(data=True)])
        for u, v, d in tqdm(G.edges(data=True), desc="Normalizing edge weights"):
            d['normalized_weight'] = d['weight'] / max_edge_weight

    def export_to_gephi(self, G, filename):
        nx.write_gexf(G, filename)


In [8]:
'''if __name__ == '__main__':
    analysis = SemanticNetworkAnalysis('serialkillers_cleaned.json')
    co_occurrence_with_tfidf = analysis.co_occurrence_matrix_with_tfidf(window_size=6)
    semantic_network = analysis.create_semantic_network(co_occurrence_with_tfidf, min_weight=8)
    analysis.normalize_edge_weights(semantic_network)
    analysis.export_to_gephi(semantic_network, "serialkillers_tfidf.gexf")'''

'if __name__ == \'__main__\':\n    analysis = SemanticNetworkAnalysis(\'serialkillers_cleaned.json\')\n    co_occurrence_with_tfidf = analysis.co_occurrence_matrix_with_tfidf(window_size=6)\n    semantic_network = analysis.create_semantic_network(co_occurrence_with_tfidf, min_weight=8)\n    analysis.normalize_edge_weights(semantic_network)\n    analysis.export_to_gephi(semantic_network, "serialkillers_tfidf.gexf")'

In [9]:
'''if __name__ == '__main__':
    analysis = SemanticNetworkAnalysis('serialkillers_cleaned.json')
    co_occurrence_with_tfidf = analysis.co_occurrence_matrix_with_tfidf(window_size=8)
    semantic_network = analysis.create_semantic_network(co_occurrence_with_tfidf, min_weight=4)
    analysis.normalize_edge_weights(semantic_network)
    analysis.export_to_gephi(semantic_network, "serialkillers_tfidf.gexf")'''

'if __name__ == \'__main__\':\n    analysis = SemanticNetworkAnalysis(\'serialkillers_cleaned.json\')\n    co_occurrence_with_tfidf = analysis.co_occurrence_matrix_with_tfidf(window_size=8)\n    semantic_network = analysis.create_semantic_network(co_occurrence_with_tfidf, min_weight=4)\n    analysis.normalize_edge_weights(semantic_network)\n    analysis.export_to_gephi(semantic_network, "serialkillers_tfidf.gexf")'

In [10]:
if __name__ == '__main__':
    analysis = SemanticNetworkAnalysis('unresolved_cleaned.json')
    co_occurrence_with_tfidf = analysis.co_occurrence_matrix_with_tfidf(window_size=4)
    semantic_network = analysis.create_semantic_network(co_occurrence_with_tfidf, min_weight=4)
    analysis.normalize_edge_weights(semantic_network)
    analysis.export_to_gephi(semantic_network, "unresolved_tfidf2.gexf")

Preparing corpus: 100%|██████████| 965/965 [00:00<00:00, 20899.94it/s]
Calculating TF-IDF: 100%|██████████| 79102/79102 [00:00<00:00, 135650.74it/s]
Computing co-occurrence matrix: 100%|██████████| 79102/79102 [02:37<00:00, 502.46it/s]
Creating semantic network: 100%|██████████| 44527/44527 [29:09<00:00, 25.46it/s]
Normalizing edge weights: 100%|██████████| 2967/2967 [00:00<00:00, 2202566.37it/s]
