## Notebook to Identify Communities from Tweet Text
This notebook uses the entities and relations extracted previously to build a knowledge graph and uses Label Propagation to identify communities within the graph.

In [1]:
from functools import reduce
from pyspark.sql.functions import col, lit, when, desc
from graphframes import *

from pyspark.sql import SparkSession

from nltk.corpus import stopwords
import string
import pickle

In [2]:
# Notebook configuration: which dataset to analyse and where its graph is stored.
# Only `topic` has to change to switch datasets; both collection names are
# derived from it so the three values can no longer drift apart.
database = "analysisDB"  # MongoDB database name
topic = "covid"  # blm | covid
collection = f"{topic}_vertices"  # vertex collection: blm_vertices | covid_vertices
collection_2 = f"{topic}_edges"  # edge collection: blm_edges | covid_edges

In [3]:
# Create (or reuse) the Spark session used by the notebook.
# Builder.config expects the option name and its value as two separate
# arguments. Passing one "key=value" string registers the whole string as the
# option name, so the MongoDB input/output URIs were never actually set.
spark = (
    SparkSession.builder
    .appName("myApp")
    .config("spark.mongodb.input.uri", "mongodb://mongo:27017/" + database + "." + collection)
    .config("spark.mongodb.output.uri", "mongodb://mongo:27017/" + database + "." + collection)
    # Pulls the MongoDB Spark connector package for the session.
    .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.11:2.3.2')
    .getOrCreate()
)

sc = spark.sparkContext

In [4]:
# Load the vertex collection from MongoDB into a Spark DataFrame.
# The collection to read is selected by the explicit "uri" option.
vert_df = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .option("uri", "mongodb://mongo:27017/" + database + "." + collection)
    .load()
)
# Display the column names of the loaded collection.
vert_df.columns

['_id', 'vertex']

In [5]:
# Point the session's default MongoDB URIs at the edge collection.
# A session was already created in an earlier cell, so getOrCreate() returns
# that session instead of building a new one.
# Builder.config expects the option name and its value as two separate
# arguments; a single "key=value" string would be registered as an option
# name with no usable value.
# NOTE(review): the read in the next cell passes its own "uri" option, so this
# cell may be redundant; confirm before removing.
spark = (
    SparkSession.builder
    .appName("myApp")
    .config("spark.mongodb.input.uri", "mongodb://mongo:27017/" + database + "." + collection_2)
    .config("spark.mongodb.output.uri", "mongodb://mongo:27017/" + database + "." + collection_2)
    .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.11:2.3.2')
    .getOrCreate()
)

sc = spark.sparkContext

In [6]:
# Load the edge collection from MongoDB into a Spark DataFrame.
# The collection to read is selected by the explicit "uri" option.
edge_df = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .option("uri", "mongodb://mongo:27017/" + database + "." + collection_2)
    .load()
)
# Display the column names of the loaded collection.
edge_df.columns

['_id', 'edge']

In [7]:
# Collect the distinct `edge` values to the driver, converting each to a tuple.
edge_list = [
    tuple(row.edge)
    for row in edge_df.select('edge').distinct().collect()
]
# Preview the first few edges.
edge_list[:5]

[('infections rate', 'floor', 'has dropped through'),
 ('it', 'level of normality', 'may seem'),
 ('our rwchr', 'online conference', 'host'),
 ('adrian miedema',
  'canada begins to returntowork via @canlawmag',
  'discusses employment obligations'),
 ('life', 'covid19', 'eliminating')]

In [8]:
# Collect the distinct `vertex` values to the driver, converting each to a tuple.
vert_list = [
    tuple(row.vertex)
    for row in vert_df.select('vertex').distinct().collect()
]
# Preview the first few vertices.
vert_list[:5]

[('important', 'important'),
 ('expert speakers', 'expert speakers'),
 ('keeping', 'keeping'),
 ('covid19 recovery', 'covid19 recovery'),
 ('march against injustice', 'march against injustice')]

In [9]:
# Turn the collected tuples back into DataFrames with the column names the
# graph is built from: "src"/"dst"/"relationship" for edges, "id"/"text" for vertices.
edges = spark.createDataFrame(edge_list, schema=["src", "dst", "relationship"])
vertices = spark.createDataFrame(vert_list, schema=["id", "text"])

# Assemble the knowledge graph and print its schema summary.
g = GraphFrame(vertices, edges)
print(g)

GraphFrame(v:[id: string, text: string], e:[src: string, dst: string ... 1 more field])


In [10]:
# Detect communities with Label Propagation (LPA).
# maxIter=5 because the number of communities plateaus at 5 iterations.
communities = g.labelPropagation(maxIter=5)

# Cache the result (later cells query it repeatedly), then preview 10 rows.
communities.persist()
communities.show(10)

+--------------------+--------------------+-------------+
|                  id|                text|        label|
+--------------------+--------------------+-------------+
|           126 cases|           126 cases|1657857377929|
|        160,350 sets|        160,350 sets|1657857377929|
|      covid19 update|      covid19 update| 979252543748|
|      many come home|      many come home|          964|
|your professional...|your professional...| 670014899496|
|                  🌟|                  🌟|  85899347343|
|300 families in d...|300 families in d...|   8589934658|
|   brand new product|   brand new product| 395136991297|
|died in worldwaro...|died in worldwaro...|   8589935171|
|         disinfected|         disinfected|1090921693608|
+--------------------+--------------------+-------------+
only showing top 10 rows



In [11]:
# Number of distinct labels, i.e. how many communities LPA produced.
# NOTE(review): the figure 60455 was recorded in a comment here; it does not
# match the saved output of this cell and is presumably from an earlier run.
n_communities = communities.select('label').distinct().count()
print(f"There are {n_communities} communities in sample graph.")

There are 213372 communities in sample graph.


In [12]:
# Member count per community label, largest community first.
communities_df = (
    communities
    .groupBy('label')
    .count()
    .sort(desc('count'))
)
communities_df.show()

+-------------+-----+
|        label|count|
+-------------+-----+
|1657857377929|14827|
| 214748366473| 7507|
| 816043787198| 6150|
|1030792151530| 3674|
|1108101563177| 3555|
| 412316861910| 2511|
| 738734376112| 1442|
|  68719478317| 1205|
| 532575946237| 1174|
|1692217115374| 1140|
|1520418423213|  850|
| 326417515393|  607|
|1288490190460|  597|
|  85899346393|  560|
|1486058685990|  544|
|1047972021602|  453|
|1460288881331|  425|
| 137438954652|  396|
| 609885357075|  372|
|1451698946414|  370|
+-------------+-----+
only showing top 20 rows



In [13]:
# Community labels as a plain Python list, in the row order of communities_df.
communities_list = [row.label for row in communities_df.collect()]
# Preview the first five labels.
communities_list[:5]

[1657857377929, 214748366473, 816043787198, 1030792151530, 1108101563177]

In [24]:
# Inspect a single community and gather the edges that touch it.
# communities_list is ordered by community size (descending), so index 200
# selects the 201st-largest community.
sample_comm = communities.where(communities['label'] == communities_list[200])
print(sample_comm.count())
sample_comm.show()

# Edge endpoints (src/dst) reference the vertex `id` column, so the community
# members are matched by `id` rather than by the `text` attribute.
comm_nodes_list = [row['id'] for row in sample_comm.collect()]
# Keep every edge with at least one endpoint in the community, as [src, dst] pairs.
touches_community = g.edges['src'].isin(comm_nodes_list) | g.edges['dst'].isin(comm_nodes_list)
comm_edges_list = [[row.src, row.dst] for row in g.edges.where(touches_community).collect()]

60
+--------------------+--------------------+------------+
|                  id|                text|       label|
+--------------------+--------------------+------------+
|rising covid19 nu...|rising covid19 nu...|910533067362|
|@dwuhlfelderlaw s...|@dwuhlfelderlaw s...|910533067362|
|homestead-miami s...|homestead-miami s...|910533067362|
|  badly hit by virus|  badly hit by virus|910533067362|
|       nearly $ 900m|       nearly $ 900m|910533067362|
|second wave of co...|second wave of co...|910533067362|
|          water park|          water park|910533067362|
|response to recor...|response to recor...|910533067362|
|current trend of ...|current trend of ...|910533067362|
|     new daily cases|     new daily cases|910533067362|
|19 states with up...|19 states with up...|910533067362|
|wealthiest commun...|wealthiest commun...|910533067362|
|increase of 1,096...|increase of 1,096...|910533067362|
|        ill children|        ill children|910533067362|
|             avoided|      

In [25]:
# Persist comm_edges_list to disk with pickle.
# The file name is derived from `topic` so that running the notebook for a
# different dataset does not overwrite this one's output; for topic == "covid"
# the path is the same as before.
# NOTE(review): the file holds binary pickle data despite the .txt extension.
data_file_dir = f'data/{topic}_edges_6.txt'
with open(data_file_dir, 'wb') as fp:
    pickle.dump(comm_edges_list, fp)