This notebook finds occurrences of COVID-19 related entities and relations in the Black Lives Matter tweets and vice versa.

In [1]:
from functools import reduce
from pyspark.sql.functions import col, lit, when, desc
from graphframes import *

from pyspark.sql import SparkSession

from nltk.corpus import stopwords
import string
import pickle

In [2]:
# MongoDB source for the Black Lives Matter graph data.
topic = "blm"
database = "analysisDB"      # MongoDB database name
collection = "blm_vertices"  # collection read into vert_df
collection_2 = "blm_edges"   # collection read into edge_df

In [3]:
# Build (or reuse) the Spark session with the MongoDB connector's default
# input/output URIs pointing at the vertex collection.
#
# Builder.config() expects the option key and its value as two separate
# arguments. Passing one fused "key=value" string registers a bogus option
# name with no value, so the connector URIs were never actually configured.
vertex_uri = "mongodb://mongo:27017/"+database+"."+collection
spark = SparkSession \
    .builder \
    .appName("myApp") \
    .config("spark.mongodb.input.uri", vertex_uri) \
    .config("spark.mongodb.output.uri", vertex_uri) \
    .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.11:2.3.2') \
    .getOrCreate()

sc = spark.sparkContext

In [4]:
# Read the vertex collection from MongoDB; the explicit "uri" option selects
# the database and collection for this read.
vert_df = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .option("uri", "mongodb://mongo:27017/" + database + "." + collection)
    .load()
)
vert_df.columns

['_id', 'vertex']

In [6]:
# Point the MongoDB connector's default input/output URIs at the edge
# collection. Per the PySpark docs, getOrCreate() reuses an already running
# session and applies the options given here to it.
#
# Builder.config() expects the option key and its value as two separate
# arguments. Passing one fused "key=value" string registers a bogus option
# name with no value, so the connector URIs were never actually configured.
edge_uri = "mongodb://mongo:27017/"+database+"."+collection_2
spark = SparkSession \
    .builder \
    .appName("myApp") \
    .config("spark.mongodb.input.uri", edge_uri) \
    .config("spark.mongodb.output.uri", edge_uri) \
    .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.11:2.3.2') \
    .getOrCreate()

sc = spark.sparkContext

In [7]:
# Read the edge collection from MongoDB; the explicit "uri" option selects
# the database and collection for this read.
edge_df = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .option("uri", "mongodb://mongo:27017/" + database + "." + collection_2)
    .load()
)
edge_df.columns

['_id', 'edge']

In [8]:
# Pull the distinct 'edge' values to the driver and turn each one into a
# plain tuple so it can be fed to createDataFrame later.
edge_list = [
    tuple(row.edge)
    for row in edge_df.select('edge').distinct().collect()
]
edge_list[:5]

[('birthplace', 'black 🖤 blacklivesmatter https://t.co/fjtsmgh1nk', 'is'),
 ('black lives', 'country', 'matter in'),
 ('woman', 'failure signal', 'was arrested for'),
 ('@bts_twt', 'donation', 'has'),
 ('chicago pd', 'restr', 'arrested')]

In [9]:
# Pull the distinct 'vertex' values to the driver and turn each one into a
# plain tuple so it can be fed to createDataFrame later.
vert_list = [
    tuple(row.vertex)
    for row in vert_df.select('vertex').distinct().collect()
]
vert_list[:5]

[('75 year old guy', '75 year old guy'),
 ('kid listening', 'kid listening'),
 ('irish lad', 'irish lad'),
 ('fascism', 'fascism'),
 ('fight', 'fight')]

In [10]:
# Rebuild Spark DataFrames from the collected tuples using the column names
# the GraphFrame below is given: id for vertices, src/dst for edges.
edges = spark.createDataFrame(
    edge_list,
    ["src", "dst", "relationship"],
)
vertices = spark.createDataFrame(
    vert_list,
    ["id", "text"],
)

# Combine both frames into the graph and print its schema summary.
g = GraphFrame(vertices, edges)
print(g)

GraphFrame(v:[id: string, text: string], e:[src: string, dst: string ... 1 more field])


In [30]:
# Terms that mark an entity as COVID-related (matched exactly by isin).
search_list = ['covid', 'corona']

# Keep every edge of the current graph whose source or destination vertex is
# exactly one of the search terms.
covid_in_blm_df = g.edges.filter(
    col('src').isin(search_list) | col('dst').isin(search_list)
)
covid_in_blm_df.show()

+--------------+--------------------+--------------------+
|           src|                 dst|        relationship|
+--------------+--------------------+--------------------+
|         covid|               trump|          kinda like|
|    @fti_us ft|               covid|           is amc by|
|            we|               covid|          had due to|
|            it|               covid|             come to|
| many unknowns|               covid|             is with|
|    datesheets|              corona| increasing cases of|
|        corona|_ opposition rest...|             restart|
|         covid|       inevitability|                 was|
|         covid|       manmade virus|                  is|
|        corona|              strong|                  is|
|           one|               covid|  would fully expect|
|        corona|waziristan waziri...|are again spreadi...|
|         covid|       our door step|   has come right at|
|         covid|                rise|           has give

In [31]:
# Number of edges whose src or dst is exactly one of the COVID search terms.
covid_in_blm_df.count()

1234

In [14]:
# MongoDB source for the COVID-19 graph data.
topic = "covid"
database = "analysisDB"        # MongoDB database name
collection = "covid_vertices"  # collection read into vert_df
collection_2 = "covid_edges"   # collection read into edge_df

In [15]:
# Point the MongoDB connector's default input/output URIs at the vertex
# collection. Per the PySpark docs, getOrCreate() reuses an already running
# session and applies the options given here to it.
#
# Builder.config() expects the option key and its value as two separate
# arguments. Passing one fused "key=value" string registers a bogus option
# name with no value, so the connector URIs were never actually configured.
vertex_uri = "mongodb://mongo:27017/"+database+"."+collection
spark = SparkSession \
    .builder \
    .appName("myApp") \
    .config("spark.mongodb.input.uri", vertex_uri) \
    .config("spark.mongodb.output.uri", vertex_uri) \
    .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.11:2.3.2') \
    .getOrCreate()

sc = spark.sparkContext

# Read the vertex collection; the explicit "uri" option selects the database
# and collection for this read.
vert_df = spark.read.format("com.mongodb.spark.sql.DefaultSource").option("uri", vertex_uri).load()
vert_df.columns

['_id', 'vertex']

In [16]:
# Point the MongoDB connector's default input/output URIs at the edge
# collection. Per the PySpark docs, getOrCreate() reuses an already running
# session and applies the options given here to it.
#
# Builder.config() expects the option key and its value as two separate
# arguments. Passing one fused "key=value" string registers a bogus option
# name with no value, so the connector URIs were never actually configured.
edge_uri = "mongodb://mongo:27017/"+database+"."+collection_2
spark = SparkSession \
    .builder \
    .appName("myApp") \
    .config("spark.mongodb.input.uri", edge_uri) \
    .config("spark.mongodb.output.uri", edge_uri) \
    .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.11:2.3.2') \
    .getOrCreate()

sc = spark.sparkContext

# Read the edge collection; the explicit "uri" option selects the database
# and collection for this read.
edge_df = spark.read.format("com.mongodb.spark.sql.DefaultSource").option("uri", edge_uri).load()
edge_df.columns

['_id', 'edge']

In [17]:
# Pull the distinct 'edge' values to the driver and turn each one into a
# plain tuple so it can be fed to createDataFrame later.
edge_list = [
    tuple(row.edge)
    for row in edge_df.select('edge').distinct().collect()
]
edge_list[:5]

[('infections rate', 'floor', 'has dropped through'),
 ('it', 'level of normality', 'may seem'),
 ('our rwchr', 'online conference', 'host'),
 ('adrian miedema',
  'canada begins to returntowork via @canlawmag',
  'discusses employment obligations'),
 ('life', 'covid19', 'eliminating')]

In [18]:
# Pull the distinct 'vertex' values to the driver and turn each one into a
# plain tuple so it can be fed to createDataFrame later.
vert_list = [
    tuple(row.vertex)
    for row in vert_df.select('vertex').distinct().collect()
]
vert_list[:5]

[('important', 'important'),
 ('expert speakers', 'expert speakers'),
 ('keeping', 'keeping'),
 ('covid19 recovery', 'covid19 recovery'),
 ('march against injustice', 'march against injustice')]

In [19]:
# Rebuild Spark DataFrames from the collected tuples using the column names
# the GraphFrame below is given: id for vertices, src/dst for edges.
edges = spark.createDataFrame(
    edge_list,
    ["src", "dst", "relationship"],
)
vertices = spark.createDataFrame(
    vert_list,
    ["id", "text"],
)

# Combine both frames into the graph and print its schema summary.
g = GraphFrame(vertices, edges)
print(g)

GraphFrame(v:[id: string, text: string], e:[src: string, dst: string ... 1 more field])


In [28]:
# Terms that mark an entity as Black-Lives-Matter-related (matched exactly by isin).
search_list = ['blm', 'blacklivesmatter']

# Keep every edge of the current graph whose source or destination vertex is
# exactly one of the search terms.
blm_in_covid_df = g.edges.filter(
    col('src').isin(search_list) | col('dst').isin(search_list)
)
blm_in_covid_df.show()

+----------------+--------------------+--------------------+
|             src|                 dst|        relationship|
+----------------+--------------------+--------------------+
|             blm|     political cause|          is seen as|
|             blm|            protests|               is in|
|            they|                 blm|     're sneering at|
|            fuck|                 blm|                 are|
|blacklivesmatter|           political|                  is|
|    noui @ca_edd|    blacklivesmatter|         broke alone|
|             blm|             traitor| is corporate funded|
|             blm|             version|                  is|
|             blm|     group of people|       are infecting|
|             blm|               today|ca has brushfires...|
|             blm|     mostly opinions|      of coverage is|
|              we|                 blm|         moved on to|
|blacklivesmatter|                  it|           cancelled|
|             blm|      

In [29]:
# Number of edges whose src or dst is exactly one of the BLM search terms.
blm_in_covid_df.count()

96

In [26]:
# Choose which result set to export: covid_in_blm_df | blm_in_covid_df
df = covid_in_blm_df

# Bring the matching edges to the driver and keep only the endpoint pair of
# each one as a two-element [src, dst] list.
comm_edges_list = [
    [edge_row.src, edge_row.dst]
    for edge_row in df.collect()
]

In [27]:
# Serialise the [src, dst] pairs with pickle; the file is binary despite the
# .txt extension, so it is opened in 'wb' mode.
with open('data/covid_in_blm_edges.txt', 'wb') as out_file:
    pickle.dump(comm_edges_list, out_file)