In [1]:
import numpy as np
import pandas as pd
import nltk
import re
import os
import math
import json
import sys
import time
import json
import re
import csv
from tqdm.notebook import tqdm
from itertools import combinations
from PorterStemmer import *
ps = PorterStemmer()

In [2]:
delim = '''[ ',<>(){}.:;"`\n]'''
def getTokensFromText(text):
    words = re.split(delim, text)
    res = []
    for w in words:
        if(len(w) > 2):
            temp = ps.stem(w.lower(), 0, len(w)-1)
            temp = w.lower()
            if not re.search('[0-9]+', temp):
                res.append(temp)
    return res

In [3]:
data_dir = r"C:\Files\a3data\20news-bydate-test"

In [4]:
def process_data():
    data = {}
    
    for topic in tqdm(os.listdir(data_dir)):
        topic_path = os.path.join(data_dir,topic)
        for uid in os.listdir(topic_path):
            full_path = os.path.join(topic_path,uid)
            with open(full_path) as filedata:
                filedata = filedata.read()
                tokens = getTokensFromText(filedata)
                data[uid] = tokens
        break
    return data
data = process_data()

HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))




In [5]:
def get_idf_dict_new():
    idf_dict = {}
    for uid in tqdm(data):
        tokens = data[uid]
        for token in tokens:
            if token not in idf_dict:
                idf_dict[token] = 1
            else:
                idf_dict[token] += 1
    return idf_dict
idf_dict = get_idf_dict_new()

HBox(children=(FloatProgress(value=0.0, max=319.0), HTML(value='')))




In [6]:
def create_vocab_dict(idf_dict):
    vocab = {}
    for i, word in enumerate(idf_dict.keys()):
        vocab[word] = i
    return vocab

In [7]:
N = len(data)
vocab = create_vocab_dict(idf_dict)

In [8]:
def get_idf(term):
    return math.log(1+N/idf_dict[term], 2)

In [9]:
def get_tf_dict(docid):
    tf_dict = {}
    tokens = data[docid]
    for token in tokens:
        if token not in tf_dict:
            tf_dict[token] = 1
        else:
            tf_dict[token] += 1
    return tf_dict

In [10]:
def getDocVector(docid):
    tf_dict = get_tf_dict(docid)
    vec = np.zeros(len(vocab))
    for word, tf in tf_dict.items():
        if word in vocab:
            pos = vocab[word]
            normtf = 1 + math.log(tf, 2)
            normidf = get_idf(word)
            tfidf = normtf * normidf
            vec[pos] = tfidf
    return vec

In [11]:
def precomputeDocVecs():
    vecs = {}
    alluids = list(data.keys())
    for uid in tqdm(alluids):
        vec = getDocVector(uid)
        vecs[uid] = vec
    return vecs
uid_to_vec = precomputeDocVecs()

HBox(children=(FloatProgress(value=0.0, max=319.0), HTML(value='')))




In [14]:
def getCosineSim(uid1, uid2):
    docVec1 = uid_to_vec[uid1]
    docVec2 = uid_to_vec[uid2]
    if np.linalg.norm(docVec1) == 0 or np.linalg.norm(docVec2) == 0:
        return 0.0
    return np.dot(docVec1, docVec2)/(np.linalg.norm(docVec1) * np.linalg.norm(docVec2))

In [15]:
def getJacobianSim(uid1, uid2):
    tokens1 = set(data[uid1])
    tokens2 = set(data[uid2])
    intersection = tokens1.intersection(tokens2)
    union = tokens1.union(tokens2)
    return len(intersection)/len(union)

In [16]:
def getWeightedEdges(similarity):
    alluids = list(data.keys())
    edges = list(combinations(alluids, 2))
    edges = edges[:20000]
    weighted_edges = []
    for e in tqdm(edges):
        v1 = e[0]
        v2 = e[1]
        if similarity == 1:
            w = getCosineSim(v1,v2)
        else:
            w = getJacobianSim(v1,v2)
        weighted_edges.append((v1,v2,w))
        weighted_edges.append((v2,v1,w))
    return weighted_edges

In [17]:
alluids = list(data.keys())
edges = list(combinations(alluids, 2))
len(edges)

50721

In [18]:
weights = getWeightedEdges(2)

HBox(children=(FloatProgress(value=0.0, max=20000.0), HTML(value='')))




In [19]:
import networkx as nx

In [20]:
G = nx.Graph()

In [21]:
G.add_nodes_from(list(data.keys()))

In [22]:
G.add_weighted_edges_from(weights)

In [23]:
pr = nx.pagerank(G, alpha=0.85)

In [24]:
dict(sorted(pr.items(), key=lambda item: item[1],reverse=True))

{'53261': 0.010227675542109883,
 '53280': 0.009622442985388844,
 '53322': 0.009333126862441034,
 '53343': 0.009309251274469247,
 '53319': 0.009236913632321556,
 '53342': 0.009217498374517349,
 '53350': 0.008823798580121755,
 '53068': 0.00869421406923838,
 '53409': 0.008685147830630686,
 '53397': 0.008675722878272962,
 '53316': 0.008650493387893856,
 '53328': 0.008587084070073199,
 '53408': 0.008520180222355235,
 '53265': 0.008504505583868591,
 '53346': 0.008450724042225827,
 '53326': 0.00836562782705504,
 '53404': 0.008352468778677485,
 '53332': 0.008313891589754315,
 '53313': 0.008298996571050007,
 '53398': 0.008248389324332615,
 '53302': 0.008238776974870757,
 '53355': 0.008217748447912258,
 '53320': 0.00817133689728981,
 '53340': 0.008142533983119416,
 '53333': 0.008134368685616207,
 '53396': 0.008124991835212751,
 '53338': 0.008107349253255104,
 '53329': 0.008102167629566429,
 '53358': 0.008050171222438233,
 '53257': 0.008045547827847308,
 '53260': 0.008044538585003506,
 '53359': 0

In [25]:
weights = getWeightedEdges(1)

HBox(children=(FloatProgress(value=0.0, max=20000.0), HTML(value='')))




In [26]:
G = nx.Graph()
G.add_nodes_from(list(data.keys()))
G.add_weighted_edges_from(weights)
pr = nx.pagerank(G, alpha=0.85)
dict(sorted(pr.items(), key=lambda item: item[1],reverse=True))

{'53398': 0.013626424760508215,
 '53280': 0.013098410606109128,
 '53332': 0.0123423032080349,
 '53261': 0.012302099738228073,
 '53320': 0.012131744230562446,
 '53328': 0.012013263530877784,
 '53340': 0.01183944486796705,
 '53404': 0.011023104133878763,
 '53322': 0.011016322158570875,
 '53396': 0.010951325882721909,
 '53397': 0.01086018821118871,
 '53343': 0.010554052058050318,
 '53293': 0.010143267180174921,
 '53257': 0.01011855092927956,
 '53342': 0.010046633198056903,
 '53346': 0.009604576407198582,
 '53265': 0.009458231335364076,
 '53324': 0.009377018872518872,
 '53355': 0.00930669231421712,
 '53327': 0.009207032570490381,
 '53260': 0.009152131756504598,
 '53408': 0.008804891430140653,
 '53406': 0.008707652927961612,
 '53409': 0.008647432408378359,
 '53350': 0.008555400895496446,
 '53319': 0.008462076972426586,
 '53313': 0.008434045648341769,
 '53358': 0.00833361781959852,
 '53278': 0.008275263423891928,
 '53325': 0.00812548542965706,
 '53394': 0.008028119055747648,
 '53333': 0.0080