In [50]:
import numpy as np
import pandas as pd
import nltk
import re
import os
import math
import json
import sys
import time
import json
import re
import csv
from tqdm.notebook import tqdm
from itertools import combinations
from PorterStemmer import *
ps = PorterStemmer()

In [51]:
#delim = '''[ ',<>(){}.:;"`\n]'''
delim = '''[ ',(){}.@#+!_~&*%^=`|$:;"`\n]'''
def getTokensFromText(text):
    text = text.replace('[',':')
    text = text.replace(']',':')
    text = text.replace('\t',':')
    words = re.split(delim, text)
    res = []
    for w in words:
        if(len(w) > 0):
            temp = ps.stem(w.lower(), 0, len(w)-1)
            temp = w.lower()
            if not re.search('[0-9]+', temp):
                res.append(temp)
    return res

In [52]:
data_dir = r"C:\Files\a3data\20news-bydate-test"

In [53]:
def process_data():
    data = {}
    
    for topic in tqdm(os.listdir(data_dir)):
        if topic == 'talk.politics.mideast':
            topic_path = os.path.join(data_dir,topic)
            for uid in os.listdir(topic_path):
                full_path = os.path.join(topic_path,uid)
                with open(full_path) as filedata:
                    filedata = filedata.read()
                    tokens = getTokensFromText(filedata)
                    data[topic+"/"+uid] = tokens
            break
    return data
data = process_data()

HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))




In [54]:
def get_idf_dict_new():
    idf_dict = {}
    for uid in tqdm(data):
        tokens = data[uid]
        for token in tokens:
            if token not in idf_dict:
                idf_dict[token] = 1
            else:
                idf_dict[token] += 1
    return idf_dict
idf_dict = get_idf_dict_new()

HBox(children=(FloatProgress(value=0.0, max=376.0), HTML(value='')))




In [55]:
def create_vocab_dict(idf_dict):
    vocab = {}
    for i, word in enumerate(idf_dict.keys()):
        vocab[word] = i
    return vocab

In [56]:
N = len(data)
vocab = create_vocab_dict(idf_dict)

In [57]:
def get_idf(term):
    return math.log(1+N/idf_dict[term], 2)

In [58]:
def get_tf_dict(docid):
    tf_dict = {}
    tokens = data[docid]
    for token in tokens:
        if token not in tf_dict:
            tf_dict[token] = 1
        else:
            tf_dict[token] += 1
    return tf_dict

In [59]:
def getDocVector(docid):
    tf_dict = get_tf_dict(docid)
    vec = np.zeros(len(vocab))
    for word, tf in tf_dict.items():
        if word in vocab:
            pos = vocab[word]
            #normtf = 1 + math.log(tf, 2)
            normtf = 1+ math.log(tf, 2)
            normidf = get_idf(word)
            tfidf = normtf * normidf
            vec[pos] = tfidf
    return vec

In [60]:
def precomputeDocVecs():
    vecs = {}
    alluids = list(data.keys())
    for uid in tqdm(alluids):
        vec = getDocVector(uid)
        vecs[uid] = vec
    return vecs
uid_to_vec = precomputeDocVecs()

HBox(children=(FloatProgress(value=0.0, max=376.0), HTML(value='')))




In [61]:
def precomputeNorms():
    norms = {}
    alluids = list(data.keys())
    for uid in tqdm(alluids):
        norm = np.linalg.norm(uid_to_vec[uid])
        norms[uid] = norm
    return norms
uid_to_norms = precomputeNorms()

HBox(children=(FloatProgress(value=0.0, max=376.0), HTML(value='')))




In [62]:
def createUIDtoNumMapping():
    alluids = list(data.keys())
    mapping = {}
    for i,uid in enumerate(alluids):
        mapping[uid] = i
    return mapping
uid_to_num = createUIDtoNumMapping()

In [90]:
def getDotProdMat():
    M = []
    alluids = list(data.keys())
    for uid in tqdm(alluids):
        M.append(uid_to_vec[uid])
    M = np.array(M)
    a = M
    b = M.T
    cos_mat = np.dot(a,b)
    p1 = 1/np.diag(cos_mat)
    norm = np.sqrt(p1)
    ans = cos_mat * norm
    ans = ans.T * norm
    return ans
dotProdMat = getDotProdMat()

HBox(children=(FloatProgress(value=0.0, max=376.0), HTML(value='')))




In [89]:
uid_to_norms['talk.politics.mideast/76355']

50.641088062807704

In [91]:
def getCosineSim(uid1, uid2):
#     norm1 = uid_to_norms[uid1]
#     norm2 = uid_to_norms[uid2]
#     if norm1 == 0 or norm2 == 0:
#         return 0.0
#     return dotProdMat[uid_to_num[uid1]][uid_to_num[uid2]]/norm1 * norm2
    return dotProdMat[uid_to_num[uid1]][uid_to_num[uid2]]

In [63]:
def getJacobianSim(uid1, uid2):
    tokens1 = set(data[uid1])
    tokens2 = set(data[uid2])
    intersection = tokens1.intersection(tokens2)
    union = tokens1.union(tokens2)
    return len(intersection)/len(union)

In [92]:
def getWeightedEdges(similarity):
    alluids = list(data.keys())
    edges = list(combinations(alluids, 2))
    weighted_edges = []
    for e in tqdm(edges):
        v1 = e[0]
        v2 = e[1]
        if similarity == 1:
            w = getCosineSim(v1,v2)
        else:
            w = getJacobianSim(v1,v2)
        weighted_edges.append((v1,v2,w))
        #weighted_edges.append((v2,v1,w))
    return weighted_edges

In [66]:
alluids = list(data.keys())
edges = list(combinations(alluids, 2))
len(edges)

70500

In [70]:
weights = getWeightedEdges(2)


HBox(children=(FloatProgress(value=0.0, max=70500.0), HTML(value='')))




In [21]:
import networkx as nx

In [71]:
G = nx.Graph()

In [72]:
G.add_nodes_from(list(data.keys()))

In [73]:
G.add_weighted_edges_from(weights)

In [74]:
pr = nx.pagerank(G, alpha=0.85)

In [75]:
(sorted(pr.items(), key=lambda item: item[1],reverse=True))

[('talk.politics.mideast/77192', 0.0033064317385378046),
 ('talk.politics.mideast/76562', 0.0032994871896049245),
 ('talk.politics.mideast/76452', 0.0032811338275078664),
 ('talk.politics.mideast/77286', 0.00326528134538172),
 ('talk.politics.mideast/77196', 0.0032546994669033387),
 ('talk.politics.mideast/77276', 0.003243427171411387),
 ('talk.politics.mideast/77805', 0.003236057840889324),
 ('talk.politics.mideast/77242', 0.0032231731961810204),
 ('talk.politics.mideast/76444', 0.003213294851193184),
 ('talk.politics.mideast/77271', 0.003206738327224792),
 ('talk.politics.mideast/76542', 0.003203664116953406),
 ('talk.politics.mideast/77201', 0.0032017806890300904),
 ('talk.politics.mideast/76446', 0.003186673396326752),
 ('talk.politics.mideast/77215', 0.003180727449365584),
 ('talk.politics.mideast/77293', 0.003175257265072441),
 ('talk.politics.mideast/77253', 0.0031703951894759513),
 ('talk.politics.mideast/77375', 0.003169097411019068),
 ('talk.politics.mideast/76455', 0.0031648

In [93]:
weights = getWeightedEdges(1)

HBox(children=(FloatProgress(value=0.0, max=70500.0), HTML(value='')))




In [94]:
G = nx.Graph()
G.add_nodes_from(list(data.keys()))
G.add_weighted_edges_from(weights)
pr = nx.pagerank(G, alpha=0.85)
dict(sorted(pr.items(), key=lambda item: item[1],reverse=True))

{'talk.politics.mideast/77198': 0.004366318820926407,
 'talk.politics.mideast/77186': 0.0042636864548584955,
 'talk.politics.mideast/77195': 0.004236349624314801,
 'talk.politics.mideast/76539': 0.004122567785446187,
 'talk.politics.mideast/77230': 0.004109217278250145,
 'talk.politics.mideast/76479': 0.004096109059119693,
 'talk.politics.mideast/77276': 0.004072352133625169,
 'talk.politics.mideast/77249': 0.00400255849699887,
 'talk.politics.mideast/77364': 0.00400231397704169,
 'talk.politics.mideast/76475': 0.003925656734623501,
 'talk.politics.mideast/77257': 0.003886128109759034,
 'talk.politics.mideast/77397': 0.0038776929762634068,
 'talk.politics.mideast/77192': 0.003844142992827525,
 'talk.politics.mideast/77201': 0.0038360816414900944,
 'talk.politics.mideast/77378': 0.003782065622255457,
 'talk.politics.mideast/76465': 0.0037695340943159857,
 'talk.politics.mideast/77379': 0.0037397704214683584,
 'talk.politics.mideast/77319': 0.003688517385678402,
 'talk.politics.mideast/7

In [64]:
def getCosineSim(uid1, uid2):
    docVec1 = uid_to_vec[uid1]
    docVec2 = uid_to_vec[uid2]
    if np.linalg.norm(docVec1) == 0 or np.linalg.norm(docVec2) == 0:
        return 0.0
    return np.dot(docVec1, docVec2)/(np.linalg.norm(docVec1) * np.linalg.norm(docVec2))

In [68]:
weights = getWeightedEdges(1)

HBox(children=(FloatProgress(value=0.0, max=70500.0), HTML(value='')))




In [69]:
G = nx.Graph()
G.add_nodes_from(list(data.keys()))
G.add_weighted_edges_from(weights)
pr = nx.pagerank(G, alpha=0.85)
dict(sorted(pr.items(), key=lambda item: item[1],reverse=True))

{'talk.politics.mideast/77198': 0.004366318820926407,
 'talk.politics.mideast/77186': 0.0042636864548584955,
 'talk.politics.mideast/77195': 0.004236349624314801,
 'talk.politics.mideast/76539': 0.00412256778544619,
 'talk.politics.mideast/77230': 0.004109217278250144,
 'talk.politics.mideast/76479': 0.004096109059119694,
 'talk.politics.mideast/77276': 0.004072352133625168,
 'talk.politics.mideast/77249': 0.004002558496998869,
 'talk.politics.mideast/77364': 0.00400231397704169,
 'talk.politics.mideast/76475': 0.003925656734623502,
 'talk.politics.mideast/77257': 0.0038861281097590343,
 'talk.politics.mideast/77397': 0.0038776929762634076,
 'talk.politics.mideast/77192': 0.0038441429928275247,
 'talk.politics.mideast/77201': 0.0038360816414900944,
 'talk.politics.mideast/77378': 0.003782065622255456,
 'talk.politics.mideast/76465': 0.0037695340943159853,
 'talk.politics.mideast/77379': 0.003739770421468358,
 'talk.politics.mideast/77319': 0.003688517385678401,
 'talk.politics.mideast/