## Graph Based Recommendations With Neo4j, NLP, and Python Data Science Tools

In [None]:
!pip install py2neo
!pip 
!pip install summa
!pip install python-igraph

## Import data
- Show the domain
- Show the CSV heads

In [2]:
from py2neo import Graph
# Create the database handle used by the graph.run(...) calls in this notebook.
# No arguments are passed, so py2neo's default connection settings apply.
# NOTE(review): presumably a local Neo4j instance with default credentials — confirm.
graph = Graph()

### Import Groups and Topics
![](http://guides.neo4j.com/bostonmeetup/img/group_has_topic.png)

In [None]:
# Require Group.id to be unique (Group nodes are merged by id during import).
graph.run("CREATE CONSTRAINT ON (g:Group) ASSERT g.id IS UNIQUE;")

In [None]:
# Require Topic.id to be unique (Topic nodes are merged by id during import).
graph.run("CREATE CONSTRAINT ON (t:Topic) ASSERT t.id IS UNIQUE;")

In [None]:
# Index Group.name; later queries filter groups by name.
graph.run("CREATE INDEX ON :Group(name)")

In [None]:
# Index Topic.name; later queries match and filter topics by name.
graph.run("CREATE INDEX ON :Topic(name)")

In [None]:
# Import groups.csv: MERGE one Group node per row id. Properties are written
# only when the node is first created (ON CREATE SET), with rating and created
# converted to integers via toInt().
graph.run('''
LOAD CSV WITH HEADERS
FROM "https://raw.githubusercontent.com/johnymontana/harvard-bar/master/data/groups.csv"
AS row
MERGE (group:Group { id:row.id })
ON CREATE SET
  group.name = row.name,
  group.urlname = row.urlname,
  group.rating = toInt(row.rating),
  group.created = toInt(row.created)
''')

In [None]:
# First pass over groups_topics.csv: MERGE one Topic node per row id, setting
# name and urlkey only when the node is first created.
graph.run('''
LOAD CSV WITH HEADERS FROM "https://raw.githubusercontent.com/johnymontana/harvard-bar/master/data/groups_topics.csv"  AS row
MERGE (topic:Topic {id: row.id})
ON CREATE SET topic.name = row.name, topic.urlkey = row.urlkey
''')

In [None]:
# Second pass over groups_topics.csv: look up the existing Topic (row.id) and
# Group (row.groupId) and MERGE a HAS_TOPIC relationship between them. Rows
# whose topic or group is not found produce no relationship (MATCH, not MERGE).
graph.run('''
LOAD CSV WITH HEADERS FROM "https://raw.githubusercontent.com/johnymontana/harvard-bar/master/data/groups_topics.csv"  AS row
MATCH (topic:Topic {id: row.id})
MATCH (group:Group {id: row.groupId})
MERGE (group)-[:HAS_TOPIC]->(topic)
''')

### Find groups similar to Graph Database Boston
By looking at topics, can we find groups that have similar topics to Graph Database Boston?

In [4]:
# Rank other groups by the number of topics they share with any group whose
# name contains "Graph Database", collecting the shared topic names as well.
result = graph.run(
    '''
MATCH (group:Group)-[:HAS_TOPIC]->(topic)<-[:HAS_TOPIC]-(otherGroup)
WHERE group.name CONTAINS "Graph Database"
RETURN otherGroup.name, COUNT(topic) AS topicsInCommon,
       COLLECT(topic.name) AS topics
ORDER BY topicsInCommon DESC, otherGroup.name
LIMIT 10
'''
)

# Show the (at most ten) returned records, one per line.
for record in result:
    print(record)

(u'otherGroup.name': u'Analytics.Club Boston', u'topicsInCommon': 6, u'topics': [u'Data Mining', u'NoSQL', u'Data Analytics', u'Data Visualization', u'Hadoop', u'Big Data'])
(u'otherGroup.name': u'AnalyticsClub MetroWest, MA', u'topicsInCommon': 6, u'topics': [u'Data Mining', u'NoSQL', u'Data Analytics', u'Data Visualization', u'Hadoop', u'Big Data'])
(u'otherGroup.name': u'Big Data Developers in Boston', u'topicsInCommon': 6, u'topics': [u'Data Mining', u'NoSQL', u'Data Analytics', u'Data Visualization', u'Hadoop', u'Big Data'])
(u'otherGroup.name': u'Boston Data Engineering Meetup', u'topicsInCommon': 6, u'topics': [u'Data Management', u'Data Mining', u'Data Analytics', u'Data Visualization', u'Hadoop', u'Big Data'])
(u'otherGroup.name': u'Data Science Professional Development Boston', u'topicsInCommon': 6, u'topics': [u'Data Management', u'Data Mining', u'Data Analytics', u'Data Visualization', u'Hadoop', u'Big Data'])
(u'otherGroup.name': u'Boston Analytics Professionals', u'topics

## Topic Similarity
Clustering topics based on similarity

In [5]:
from igraph import Graph as IGraph

Find all pairs of topics and, for each pair, count the number of groups that have both topics. We'll use this count as the edge weight to build a "virtual graph" of the form `(Topic)-[:OCCURS_WITH {weight}]-(Topic)`

In [7]:

# For every pair of topics reachable through a shared HAS_TOPIC source node,
# count the connecting paths as `weight`. The ID(topic) < ID(other) filter
# keeps each unordered pair once. Only the ten heaviest pairs are returned.
query = '''
MATCH (topic:Topic)<-[:HAS_TOPIC]-()-[:HAS_TOPIC]->(other:Topic)
WHERE ID(topic) < ID(other)
RETURN topic.name, other.name, COUNT(*) AS weight
ORDER BY weight DESC
LIMIT 10
'''

result = graph.run(query)

# Preview the returned records, one per line.
for record in result:
    print(record)


(u'topic.name': u'Software Development', u'other.name': u'Computer programming', u'weight': 101)
(u'topic.name': u'Web Development', u'other.name': u'Software Development', u'weight': 88)
(u'topic.name': u'Open Source', u'other.name': u'Software Development', u'weight': 77)
(u'topic.name': u'New Technology', u'other.name': u'Software Development', u'weight': 72)
(u'topic.name': u'Big Data', u'other.name': u'Data Analytics', u'weight': 68)
(u'topic.name': u'Web Technology', u'other.name': u'Software Development', u'weight': 65)
(u'topic.name': u'Web Development', u'other.name': u'Computer programming', u'weight': 60)
(u'topic.name': u'Big Data', u'other.name': u'Big Data Analytics', u'weight': 59)
(u'topic.name': u'Open Source', u'other.name': u'Computer programming', u'weight': 54)
(u'topic.name': u'Big Data', u'other.name': u'Data Science', u'weight': 51)


Now let's run this query again and build an igraph instance from the results:

In [8]:
# Topic co-occurrence query with no ORDER BY / LIMIT, so every pair is returned.
query = '''
MATCH (topic:Topic)<-[:HAS_TOPIC]-()-[:HAS_TOPIC]->(other:Topic)
WHERE ID(topic) < ID(other)
RETURN topic.name, other.name, COUNT(*) AS weight
'''

# Each returned row (topic name, other topic name, weight) becomes one edge;
# weights=True tells igraph to keep the third column as the edge weight.
ig = IGraph.TupleList(
    graph.run(query),
    weights=True,
)
ig

<igraph.Graph at 0x107ec9148>

Now we'll run the Walktrap community detection algorithm to find clusters / communities:

In [9]:
# Run Walktrap on the weighted topic graph and convert its result into a
# flat clustering in one chained expression; then report the cluster count.
clusters = ig.community_walktrap(weights="weight").as_clustering()
len(clusters)

39

Let's inspect the results:

In [10]:

# Build one record per topic vertex:
#   "id" / "label" -> the vertex's name (the topic name)
#   "group"        -> the cluster assigned to that vertex
# clusters.membership is indexed by vertex index, so each vertex's own index
# is used directly instead of looking the vertex up again by name for every
# node (which made the original loop quadratic in the number of topics).
nodes = [
    {
        "id": vertex["name"],
        "label": vertex["name"],
        "group": clusters.membership[vertex.index],
    }
    for vertex in ig.vs
]

# Show the first few records as the cell output.
nodes[:5]

[{'group': 0,
  'id': u'Software QA and Testing',
  'label': u'Software QA and Testing'},
 {'group': 0, 'id': u'Test Automation', 'label': u'Test Automation'},
 {'group': 1, 'id': u'Game Development', 'label': u'Game Development'},
 {'group': 2, 'id': u'Games', 'label': u'Games'},
 {'group': 3,
  'id': u'MicroStrategy Best Practices',
  'label': u'MicroStrategy Best Practices'}]

Now we'll write the results back to Neo4j, extending our graph model:
![](http://guides.neo4j.com/bostonmeetup/img/cluster_datamodel.png)

In [11]:
# For each node record passed in `params`: find the Topic whose name equals
# the record's "id", MERGE a Cluster node named after the record's "group",
# and MERGE an IN_CLUSTER relationship from the topic to that cluster.
query = '''
UNWIND {params} AS p 
MATCH (t:Topic {name: p.id}) 
MERGE (cluster:Cluster {name: p.group})
MERGE (t)-[:IN_CLUSTER]->(cluster)
'''

# The list of node dicts is bound to the {params} query parameter.
graph.run(query, params=nodes)

<py2neo.database.Cursor at 0x108786790>

We can see which clusters the Python-related topics end up in:
![](http://guides.neo4j.com/bostonmeetup/img/python_cluster.png)

In [None]:
# Return every topic whose name contains "Python" together with its cluster
# and the IN_CLUSTER relationship joining them (RETURN * returns all three).
graph.run('''
MATCH (cluster:Cluster)<-[inCluster:IN_CLUSTER]-(topic)
WHERE topic.name CONTAINS "Python"
RETURN *
''')

## My Similar Groups

We need to add Member data in order to build more relevant
recommendations:
![](http://guides.neo4j.com/bostonmeetup/img/group_has_topic_member_of.png)

In [12]:
# Require Member.id to be unique (Member nodes are merged by id during import).
graph.run('''
CREATE CONSTRAINT ON (m:Member)
ASSERT m.id IS UNIQUE''')

<py2neo.database.Cursor at 0x108786750>

In [13]:
# First pass over members.csv: reduce the rows to distinct (id, name) pairs and
# MERGE one Member node per id, setting the name only on creation. The import
# is batched with USING PERIODIC COMMIT 10000.
graph.run('''
USING PERIODIC COMMIT 10000
LOAD CSV WITH HEADERS
FROM "https://raw.githubusercontent.com/johnymontana/harvard-bar/master/data/members.csv" AS row
WITH DISTINCT row.id AS id, row.name AS name
MERGE (member:Member {id: id})
ON CREATE SET member.name = name
''')

<py2neo.database.Cursor at 0x108786490>

In [14]:
# Second pass over members.csv: skip rows whose `joined` value is null, look up
# the existing Member (row.id) and Group (row.groupId), and MERGE a MEMBER_OF
# relationship between them. `joined` is stored as an integer on the
# relationship, only when the relationship is first created.
graph.run('''
USING PERIODIC COMMIT 10000
LOAD CSV WITH HEADERS
FROM "https://raw.githubusercontent.com/johnymontana/harvard-bar/master/data/members.csv" AS row
WITH row WHERE NOT row.joined is null
MATCH (member:Member {id: row.id})
MATCH (group:Group {id: row.groupId})
MERGE (member)-[membership:MEMBER_OF]->(group)
ON CREATE SET membership.joined=toInt(row.joined);
''')

<py2neo.database.Cursor at 0x108786510>

In [15]:
# Sanity check: fetch up to ten (member, group, membership) triples to confirm
# the MEMBER_OF relationships were created.
graph.run('''
MATCH (member:Member)-[membership:MEMBER_OF]->(group)
RETURN member, group, membership
LIMIT 10
''')

<py2neo.database.Cursor at 0x1087863d0>

![](http://guides.neo4j.com/bostonmeetup/img/group_members.png)

In [17]:
# Index Member.name; the recommendation query below looks a member up by name.
graph.run("CREATE INDEX ON :Member(name)")

<py2neo.database.Cursor at 0x109866a90>

### Find my similar groups

In [18]:
# Starting from the member named "Will Lyon": follow MEMBER_OF to their groups,
# HAS_TOPIC to those groups' topics, and back along HAS_TOPIC to other groups.
# Groups the member already belongs to are excluded; the rest are ranked by
# the number of matching paths (reported as topicsInCommon).
results = graph.run(
    '''MATCH (member:Member {name: "Will Lyon"})-[:MEMBER_OF]->()-[:HAS_TOPIC]->()<-[:HAS_TOPIC]-(otherGroup:Group)
WHERE NOT (member)-[:MEMBER_OF]->(otherGroup)
RETURN otherGroup.name,
       COUNT(*) AS topicsInCommon
ORDER BY topicsInCommon DESC
LIMIT 10'''
)

# Show the (at most ten) recommended groups, one record per line.
for record in results:
    print(record)

(u'otherGroup.name': u'Data Science Professional Development Boston', u'topicsInCommon': 6)
(u'otherGroup.name': u'AnalyticsClub MetroWest, MA', u'topicsInCommon': 6)
(u'otherGroup.name': u'Analytics.Club Boston', u'topicsInCommon': 6)
(u'otherGroup.name': u'Big Data Developers in Boston', u'topicsInCommon': 6)
(u'otherGroup.name': u'Boston Data Engineering Meetup', u'topicsInCommon': 6)
(u'otherGroup.name': u'Social Data and Analytics Meetup - Boston', u'topicsInCommon': 5)
(u'otherGroup.name': u'Boston Deep Learning', u'topicsInCommon': 5)
(u'otherGroup.name': u'Boston Analytics Professionals', u'topicsInCommon': 5)
(u'otherGroup.name': u'New England MicroStrategy User Group', u'topicsInCommon': 5)
(u'otherGroup.name': u'Boston Smart Data Meetup Group', u'topicsInCommon': 5)


## Events
![](http://guides.neo4j.com/bostonmeetup/img/event_datamodel.png)

In [19]:
# Require Event.id to be unique (Event nodes are merged by id during import).
graph.run("CREATE CONSTRAINT ON (e:Event) ASSERT e.id IS UNIQUE")

<py2neo.database.Cursor at 0x10984d0d0>

In [20]:
# Index Event.time; later queries filter and sort events by time.
graph.run("CREATE INDEX ON :Event(time)")

<py2neo.database.Cursor at 0x10984d3d0>

In [21]:
# First pass over events.csv: MERGE one Event node per row id. name and
# description are stored as-is; time and utc_offset are converted to integers.
# Properties are written only when the node is first created.
graph.run('''USING PERIODIC COMMIT 10000
LOAD CSV WITH HEADERS FROM "https://raw.githubusercontent.com/johnymontana/harvard-bar/master/data/events.csv" AS row
MERGE (event:Event {id: row.id})
ON CREATE SET event.name = row.name,
              event.description = row.description,
              event.time = toInt(row.time),
              event.utcOffset = toInt(row.utc_offset)
''')
              

<py2neo.database.Cursor at 0x10984d490>

In [22]:
# Second pass over events.csv: reduce the rows to distinct (group_id, id)
# pairs, look up the existing Group and Event nodes, and MERGE a HOSTED_EVENT
# relationship from the group to the event.
graph.run('''
USING PERIODIC COMMIT 10000
LOAD CSV WITH HEADERS FROM "https://raw.githubusercontent.com/johnymontana/harvard-bar/master/data/events.csv" AS row

WITH distinct row.group_id as groupId, row.id as eventId
MATCH (group:Group {id: groupId})
MATCH (event:Event {id: eventId})
MERGE (group)-[:HOSTED_EVENT]->(event)
''')

<py2neo.database.Cursor at 0x10984d110>

In [23]:
# Fetch the ten most recent events (event.time earlier than the current
# timestamp()) hosted by groups whose name contains "Graph Database".
graph.run('''
MATCH (group:Group)-[hosted:HOSTED_EVENT]->(event)
WHERE group.name CONTAINS "Graph Database" AND event.time < timestamp()
RETURN event, group, hosted
ORDER BY event.time DESC
LIMIT 10
''')

<py2neo.database.Cursor at 0x1082db390>

![](http://guides.neo4j.com/bostonmeetup/img/graph_database_events.png)

### Extracting keywords from event descriptions
Note that we have topics for groups, but not for events. We can use some NLP techniques to extract keywords from event descriptions and extend our data model to take those keywords into account in our recommendation queries.

![](http://guides.neo4j.com/bostonmeetup/img/keyword_datamodel.png)

In [None]:
# Helper for stripping HTML markup, leaving only the text content.
try:
    from html.parser import HTMLParser  # Python 3 location
except ImportError:
    from HTMLParser import HTMLParser  # Python 2 location


class MLStripper(HTMLParser):
    """HTML parser that discards markup and collects the text between tags."""

    def __init__(self):
        # Run the base initializer rather than calling reset() alone: it
        # performs the reset itself and, on Python 3, also sets parser
        # attributes that feed() relies on. The explicit base-class call
        # (instead of super()) works on both Python 2 and Python 3.
        HTMLParser.__init__(self)
        # Text chunks seen so far, in document order.
        self.fed = []

    def handle_data(self, d):
        """Parser callback: remember one chunk of text content."""
        self.fed.append(d)

    def get_data(self):
        """Return every collected text chunk concatenated into one string."""
        return ''.join(self.fed)


def strip_tags(html):
    """Return the text content of ``html`` with all tags removed.

    Args:
        html: A string that may contain HTML markup, or ``None``.

    Returns:
        The text found outside of tags, concatenated in document order.
        ``None`` yields an empty string, so missing values can be passed
        through without a special case at the call site.
    """
    if html is None:
        return ''
    s = MLStripper()
    s.feed(html)
    return s.get_data()