## The Matrix

In [23]:
import numpy
import pandas
import string
from stop_words import get_stop_words


# Load descriptions and titles
colnames = ['title', 'views' , 'descr']
data = pandas.read_csv('./data/descriptions-2.csv', names=colnames)
titles = data.title.tolist()
views = data.views.tolist()
descriptions = data.descr.tolist()

# Load deswordlists from CSV
colnames = ['descriptions', 'words']
data = pandas.read_csv('./data/desPlusWords-2.csv', names=colnames)
descriptions = data.descriptions.tolist()
words = data.words.tolist()


# A couple of functions

def ourtokens(ourstring):
    
    stoplist = set(get_stop_words('en'))
    finalList = []
    
    wordList = ourstring.lower().split()
    for i in range(len(wordList)):
        #wordList[i] = re.sub('[^a-zA-Z\']', '', wordList[i]).strip(chr(8212)) 
        #NOTE: the above left spaces and added empty strings
        no_punc = wordList[i].strip(string.punctuation) #remove most punctuation
        no_emphwhatever = no_punc.strip(chr(8212)) # remove that weirdness
        no_num = no_emphwhatever.strip(string.digits) #remove numbers
        if (len(no_num) > 0) and (no_num not in stoplist): # Requires stop_words
            # First conditional stops empty strings from being added
            finalList.append(no_num)            
    return finalList

def jaccard_similarity(query, document):
    intersection = set(query).intersection(set(document))
    #print(intersection)
    union = set(query).union(set(document))
    #print(union)
    return len(intersection)/len(union)

# Create des_word_lists 
des_word_lists = []
for i in range(len(descriptions)):
    # Create list of words for each description
    words = ourtokens(descriptions[i])
    des_word_lists.append({'descriptions': descriptions[i], 'words': words})
    
    # Tells you where you are in the rows
    if (i % 100) == 0:
        print(str(i) + " rows completed")
        
# Create JACCARD MATRIX
# From http://stackoverflow.com/questions/568962/how-do-i-create-an-empty-array-matrix-in-numpy
Ndes = len(des_word_lists)
full_jac_mat = numpy.zeros(shape=(Ndes,Ndes))
thresh_JM = numpy.zeros(shape=(Ndes,Ndes))
jac_lst = []
for i in range(Ndes):
    if (i % 100) == 0:
        print(str(i) + " rows completed")    
    # Start the pairwise computations
    for j in range((i+1),Ndes):
        # Pull the ith and jth document
        doc_i = des_word_lists[i]['words']
        doc_j = des_word_lists[j]['words']
        # Get the Jaccard similarity
        jac_ij = jaccard_similarity(doc_i, doc_j)
        # Since the Jaccard will be the same between i and j as it will between
        # j and i, we set JAC_MAT[i,j] and JAC_MAT[j,i] to be the same value
        jac_mat[i,j] = jac_ij
        jac_mat[j,i] = jac_ij
        
        if jac_ij > 0.01:
            thresh_JM[i,j] = jac_ij
            thresh_JM[j,i] = jac_ij
        
        # Get all the non-zero Jaccard values
        #if jac_ij != 0:
            #jac_lst.append(jac_ij)

0 rows completed
100 rows completed
200 rows completed
300 rows completed
400 rows completed
500 rows completed
600 rows completed
700 rows completed
800 rows completed
900 rows completed
1000 rows completed
1100 rows completed
1200 rows completed
1300 rows completed
1400 rows completed
1500 rows completed
1600 rows completed
1700 rows completed
1800 rows completed
1900 rows completed
2000 rows completed
2100 rows completed
2200 rows completed
0 rows completed
100 rows completed
200 rows completed
300 rows completed
400 rows completed
500 rows completed
600 rows completed
700 rows completed
800 rows completed
900 rows completed
1000 rows completed
1100 rows completed
1200 rows completed
1300 rows completed
1400 rows completed
1500 rows completed
1600 rows completed
1700 rows completed
1800 rows completed
1900 rows completed
2000 rows completed
2100 rows completed
2200 rows completed


In [22]:
import numpy as np

sort_jac_lst = sorted(jac_lst)
np.histogram(jac_lst, bins = 5)

(array([969863,   9310,    114,     19,      6]),
 array([ 0.00813008,  0.06737363,  0.12661718,  0.18586073,  0.24510428,
         0.30434783]))

In [32]:
#jac_mat[0:8, 0:10]
thresh_JM[0:8, 0:10]
numpy.savetxt("./testThreshJM.csv",thresh_JM, fmt = '%1.5f', delimiter=",")

In [18]:
# This block find the maximum for the matrix

# Initialize the max to be zero. 
mat_max = 0

# Loop over all the rows
for i in range(Ndes):
    # Find the maximum for each row
    row_max = max(jac_mat[i])
    
    # Check if the current row's maximum is higher than the current MAT_MAX.
    # If the row maximum is bigger, then set MAT_MAX to the row maximum.
    if row_max > mat_max:
        mat_max = row_max

print(mat_max)

0.304347826087


## Network

In [26]:
import networkx as nx
import matplotlib.pyplot as plt

# Create a network from the matrix
G = nx.from_numpy_matrix(thresh_JM)

In [27]:
# Draw the graph

pos = nx.spring_layout(G)
fig = plt.figure(figsize=(18,18))
nx.draw(G, 
        pos)
plt.savefig('./outputs/descriptions-thresh.png')

In [None]:
# Save edge list
# write_edgelist(G, path, delimiter=',', data=True, encoding='utf-8')
nx.write_weighted_edgelist(G, './outputs/desc-net.csv', comments='#', delimiter=',', encoding='utf-8')

In [14]:
# Save JSON graph
import json
import networkx as nx

G_in_json = json_graph.node_link_data(G)

# with open('./outputs/desc_network.json', 'w') as myoutfile:
#    myoutfile.write(json.dumps(nx.json_graph.node_link_data(G)))

NameError: name 'json_graph' is not defined

### Prune the Network

In [None]:
# SCANNING
# Gdegree = nx.average_degree_connectivity(G)
print(Gdegree)

# PRUNING                            
# remove = [node for node, degree in G.degree().items() if degree <= 4]
print(len(remove))
# gmt2 = G.remove_nodes_from(remove)
# print(len(gmt2.nodes()), len(gmt2.edges()))

In [27]:
import csv

with open('./outputs/desc-labels.csv', 'w', newline='\n') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
    wr.writerow(titles)

In [23]:
type(titles)

list

In [1]:
# Example of writing JSON format graph data and using the D3 Javascript library 
# to produce an HTML/Javascript drawing.
#    Copyright (C) 2011-2012 by
#    Aric Hagberg <hagberg@lanl.gov>
#    Dan Schult <dschult@colgate.edu>
#    Pieter Swart <swart@lanl.gov>
#    All rights reserved.
#    BSD license.

__author__ = """Aric Hagberg <aric.hagberg@gmail.com>"""
import json
import networkx as nx
from networkx.readwrite import json_graph
import http_server

G = nx.barbell_graph(6,3)
# this d3 example uses the name attribute for the mouse-hover value,
# so add a name to each node
for n in G:
    G.node[n]['name'] = n
# write json formatted data
d = json_graph.node_link_data(G) # node-link format to serialize
# write json
json.dump(d, open('force/force.json','w'))
print('Wrote node-link JSON data to force/force.json')
# open URL in running web browser
http_server.load_url('force/force.html')
print('Or copy all files in force/ to webserver and load force/force.html')