In [1]:
import json
from collections import Counter

import networkx as nx
import numpy as np
import pandas as pd
import scipy
import scipy.sparse
from networkx.readwrite import json_graph

In [2]:
# Event identifier; it is interpolated into the names of the input files read below.
event_name = '[TEDxNations]'

In [3]:
# Both inputs are tab-separated, UTF-8 text files with a header row; their names
# are derived from event_name.
features_path = 'data/%s_data_features.txt' % event_name
clusters_path = 'data/%s_data_clusters.txt' % event_name

# Features table: keeps the default positional index.
event_data = pd.read_table(features_path, sep='\t', header=0, encoding='utf-8')
# Clusters table: its first column is used as the row index.
event_data_clusters = pd.read_table(
    clusters_path,
    sep='\t',
    header=0,
    encoding='utf-8',
    index_col=0,
)

In [4]:
# Attach the cluster columns to the features by joining on the row index
# (inner join, the merge default: only index values present in both tables survive).
# Note: event_data is rebound here, so running this cell a second time merges the
# cluster columns in again.
event_data = event_data.merge(
    event_data_clusters,
    how='inner',
    left_index=True,
    right_index=True,
)

In [76]:
#compute top tokens

# Number of most frequent tokens kept in top_tokens (passed to Counter.most_common below).
topk = 50

def gather_tokens(data):
    """Return one flat list holding the tokens of every document in ``data``.

    ``data`` is an iterable of strings. Each string is split on whitespace and
    the resulting tokens are concatenated in document order; repeated tokens
    are kept.
    """
    return [token for doc in data for token in doc.split()]

# Flatten the tokens of all documents into a single list ...
all_tokens = gather_tokens(event_data['text_clean_tokens'])
# ... and tally how many times each token occurs.
tokens_cntr = Counter(all_tokens)

# Keep only the token strings of the topk most frequent entries (counts are dropped;
# they stay available through tokens_cntr).
top_tokens = [entry[0] for entry in tokens_cntr.most_common(topk)]

In [28]:
#tokens_cntr.most_common(50)

In [77]:
#manually construct scipy sparse matrix - alternative to sklearn vectorizer
# The matrix is documents x terms and is assembled in COO (triplet) form: one
# (row, col, 1) entry is emitted per token occurrence. Per the scipy documentation,
# repeated (row, col) entries are summed when the matrix is converted to CSR/CSC.
vocabulary = {} #term -> column index, assigned in order of first appearance
data = [] #the value 1 for every token occurrence (uniform weights)
row = [] #document position of each occurrence
col = [] #term index of each occurrence

# Rows are numbered by position, not by DataFrame index label, so the matrix has
# exactly one row per document whatever the index looks like after the merge.
# Iterating the column directly also avoids building a full Series per row.
for i, text in enumerate(event_data['text_clean_tokens']):
    for term in text.split():
        # get column index, adding the term to the vocabulary if needed
        j = vocabulary.setdefault(term, len(vocabulary))
        data.append(1)
        row.append(i)
        col.append(j)

# Pass the shape explicitly instead of letting scipy infer it from the largest
# index: inference cannot work when there are no entries at all, and it would
# silently drop trailing documents that contain no tokens.
A = scipy.sparse.coo_matrix(
    (data, (row, col)),
    shape=(len(event_data), len(vocabulary)),
)

In [78]:
#compute co occurrences matrix (terms by terms)
# Matrix product of A transposed with A: entry (a, b) is the sum, over all rows
# of A, of the product of column a and column b.
X = A.T.dot(A)

In [83]:
#create network of top terms cooccurrences
# One node per top token (carrying its corpus frequency); an edge joins two tokens
# whose entry in the co-occurrence matrix X is non-zero, weighted by that entry.
G = nx.Graph()

for pos, term1 in enumerate(top_tokens):
    # direct lookup: a token missing from the vocabulary fails loudly with KeyError
    # instead of passing None on as a matrix index
    index1 = vocabulary[term1]
    G.add_node(term1, node_type='term', frequency=tokens_cntr[term1])
    # X is the product of A transposed with A, hence symmetric, and the graph is
    # undirected, so each unordered pair only needs one visit. top_tokens holds
    # distinct tokens (they are Counter keys), so the slice never repeats term1.
    for term2 in top_tokens[pos + 1:]:
        cooccurrences = X[index1, vocabulary[term2]]  # read the sparse entry once
        if cooccurrences != 0:
            G.add_edge(term1, term2, weight=cooccurrences)

# Drop terms that co-occur with no other top term. nx.isolates yields the
# degree-zero nodes without relying on G.degree() returning a dict, which only
# holds on networkx 1.x; list() snapshots them before the graph is mutated.
G.remove_nodes_from(list(nx.isolates(G)))

G_nld = json_graph.node_link_data(G)

# write json
# default=int converts values json cannot serialise natively (e.g. numpy integer
# weights); the with-block guarantees the file is flushed and closed.
with open('data/d3force_terms_network_data.json', 'w') as json_file:
    json.dump(G_nld, json_file, default=int, sort_keys=True, indent=4)

In [49]:
#create network of random sample tweets and top terms
# Nodes are the top tokens plus a random sample of tweets; an edge links a tweet
# to every top token that appears in its cleaned text.

samplek = 1000 #maximum number of tweets to sample
sample_seed = 42 #fixed seed so the sample, and therefore the exported graph, is reproducible

G = nx.Graph()

for term1 in top_tokens:
    G.add_node(term1, node_type='term', frequency=tokens_cntr[term1], group=0)

# Never request more rows than exist: DataFrame.sample raises ValueError when
# asked for a larger sample than the population (sampling without replacement).
tweets = event_data.sample(min(samplek, len(event_data)), random_state=sample_seed)

for _, doc in tweets.iterrows():
    # tweet nodes get a constant frequency of 10 and their cluster as group
    G.add_node(doc['id'], node_type='tweet', frequency=10, group=doc['cluster'])
    # split the text once per tweet; set membership replaces re-splitting the
    # same text for every top term
    doc_tokens = set(doc['text_clean_tokens'].split())
    for term in top_tokens:
        if term in doc_tokens:
            G.add_edge(doc['id'], term)

# Drop terms and tweets left without any edge. nx.isolates yields the degree-zero
# nodes without relying on G.degree() returning a dict, which only holds on
# networkx 1.x; list() snapshots them before the graph is mutated.
G.remove_nodes_from(list(nx.isolates(G)))

G_nld = json_graph.node_link_data(G)

# write json
# default=int converts values json cannot serialise natively (e.g. numpy integers);
# the with-block guarantees the file is flushed and closed.
with open('data/d3force_tweets_network_data.json', 'w') as json_file:
    json.dump(G_nld, json_file, default=int, sort_keys=True, indent=4)