In [None]:
import sparknlp
import json
import os
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from sparknlp.annotator import *
from sparknlp.base import *
from pyspark.sql.functions import from_unixtime
from pyspark.sql.functions import unix_timestamp
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.stat import Correlation
from pyspark.ml.clustering import BisectingKMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.sql.functions import col, to_timestamp,date_format
from pyspark import StorageLevel
import pyspark.sql.functions as F
from sparknlp.pretrained import PretrainedPipeline
from collections import Counter
from wordcloud import WordCloud
from sparknlp.base import Finisher, DocumentAssembler
from sparknlp.annotator import (Tokenizer, Normalizer,LemmatizerModel, StopWordsCleaner)
from pyspark.ml import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns

def start():
    """Build (or reuse) the SparkSession used by this notebook.

    The session runs with master ``local[*]`` under the application name
    "Spark NLP Licensed". It requests the Spark NLP package, switches to the
    Kryo serializer, and registers the Google Cloud Storage filesystem
    classes for the ``gs`` scheme.

    Returns:
        The session returned by ``getOrCreate()`` on the configured builder.
    """
    # (key, value) pairs applied to the builder in exactly this order.
    settings = [
        ("spark.driver.memory", "24G"),
        ("spark.serializer", "org.apache.spark.serializer.KryoSerializer"),
        ("spark.kryoserializer.buffer.max", "2040M"),
        ("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.5.1"),
        ("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem"),
        ("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS"),
    ]

    session_builder = SparkSession.builder.appName("Spark NLP Licensed").master("local[*]")
    for key, value in settings:
        session_builder = session_builder.config(key, value)

    return session_builder.getOrCreate()
# Create the session object that every later cell refers to as `spark`.
spark = start()
# NOTE: a bare expression is only echoed when it is the last statement of a
# cell, so this line does not display the version here.
spark.version
# Shell escape: list the running processes whose command line mentions "spark".
!ps -ef | grep spark


In [None]:
# Environment check: Spark version, Spark-related processes, and memory.
# NOTE: spark.version is not the last statement of the cell, so it is not echoed.
spark.version
# Shell escape: list the running processes whose command line mentions "spark".
!ps -ef | grep spark
# Shell escape: report system memory via `free -g`.
!free -g

In [None]:
# Ingestion of the 2020-04-15 files into a local Parquet copy, kept for
# reference. Presumably disabled after the first run so that re-runs reuse the
# saved copy instead of re-reading from GCS (TODO confirm).
#sourceData1 = spark.read.format("json").load("gs://covid19-tweets/2020-04/coronavirus-tweet-id-2020-04-15*.jsonl.gz")
#sourceData1.repartition(100).write.save("parquetFile1.parquet")
# Load the saved Parquet data and expose it to Spark SQL under the name `tweetView`.
parquetFile1 = spark.read.parquet("parquetFile1.parquet")
parquetFile1.createOrReplaceTempView("tweetView")

In [None]:
# Ingest every gzipped file under the 2020-04 prefix and persist it as Parquet.
sourceData2 = spark.read.format("json").load("gs://covid19-tweets/2020-04/*.gz")
# mode("overwrite") keeps this cell re-runnable: with the default save mode a
# second run aborts because parquetFile2.parquet already exists, and the two
# statements below never execute.
# NOTE: every run re-reads the source files and replaces the saved copy.
sourceData2.repartition(200).write.mode("overwrite").save("parquetFile2.parquet")
# Query the Parquet copy (not the raw JSON) through the `tweetView2` view.
parquetFile2 = spark.read.parquet("parquetFile2.parquet")
parquetFile2.createOrReplaceTempView("tweetView2")

In [None]:
# Count tweets per hashtag list and keep the 50 most frequent lists.
# `entities.hashtags.text` is selected as a single `hashtags` value per tweet,
# and that value is the group key (so different combinations or spellings are
# counted separately).
tempDF1 = spark.sql("""
SELECT entities.hashtags.text AS hashtags, COUNT(*) as cnt
FROM tweetView
GROUP BY hashtags
ORDER BY cnt DESC limit 50
""")
# Print the leading rows of the result.
tempDF1.show()

In [7]:
# Display the hashtag counts. The frame built by the hashtag query is named
# `tempDF1`; `tempDF` is not defined anywhere in this notebook and raised a
# NameError on a fresh kernel.
tempDF1.show()

+--------------------+------+
|            hashtags|   cnt|
+--------------------+------+
|                  []|720335|
|           [COVID19]| 26611|
|       [coronavirus]|  9560|
|           [Covid19]|  3742|
|       [Coronavirus]|  3348|
|             [China]|  2379|
|[BaksosTniPolriut...|  1998|
|           [covid19]|  1596|
|          [StayHome]|  1566|
|             [COVID]|  1510|
| [DapurUmumTNIPolri]|  1190|
|[SuperM, Together...|  1185|
|          [BREAKING]|  1049|
|          [lockdown]|   940|
|         [COVIDー19]|   902|
|[COVID19, coronav...|   701|
|            [Corona]|   658|
|          [Covid_19]|   622|
|         [เขื่อนโขง]|   609|
|        [StayAtHome]|   607|
+--------------------+------+
only showing top 20 rows



In [19]:
# Count tweets per value of `entities.user_mentions.name` (aliased
# `mentioned_user`) and keep the 20 most frequent values.
# Only defines the query; a separate cell displays it.
tempDF2 = spark.sql('''
SELECT entities.user_mentions.name AS mentioned_user, COUNT(*) as cnt
FROM tweetView
GROUP BY mentioned_user
ORDER BY cnt DESC limit 20
''')

In [None]:
# Run the mentioned-user query and print its leading rows.
tempDF2.show()

In [96]:
# English-language tweets flagged possibly_sensitive whose hashtag texts
# (joined with spaces and lower-cased) match one of the keywords below.
# Because the WHERE clause already requires possibly_sensitive = true,
# sensitive_count is 1 on every returned row; it serves as a per-tweet counter
# for the per-user sums computed in later cells.
# NOTE(review): rlike is a regex match against the joined string, so short
# alternatives such as 'who' or 'stay' presumably also match inside longer
# hashtags — confirm that is intended.
df3  = spark.sql('''SELECT user.name as username,   
 CASE WHEN possibly_sensitive = true THEN 1 ELSE 0 END AS sensitive_count,
 retweet_count , favorite_count, user.followers_count as followers_count
from tweetView where possibly_sensitive = true and lang = 'en' and lower(concat_ws(' ',entities.hashtags.text)) rlike 'corona|covid|stay|wuhan|virus|who|cdc|trump|epidemic|pandemic|outbreak' ''')

# Show 10 rows without truncating long column values.
df3.show(10,False)



+-------------------+---------------+-------------+--------------+---------------+
|username           |sensitive_count|retweet_count|favorite_count|followers_count|
+-------------------+---------------+-------------+--------------+---------------+
|Richard gibb       |1              |108          |0             |2922           |
|Lean Cartel Dreular|1              |0            |0             |15             |
|Toni Tannoury 🇱🇧 |1              |5            |9             |860            |
|Nnamdi UK          |1              |70           |0             |712            |
|FXL                |1              |194          |0             |347            |
|FEMBEAUTIES        |1              |6            |32            |13077          |
|REBEL CITY RECORDS |1              |16           |65            |365            |
|freedom            |1              |2543         |0             |16             |
|Occupy Schagen     |1              |6            |0             |8955           |
|Disgr

In [97]:
# Register the keyword-filtered sensitive tweets so they can be aggregated with SQL.
df3.createOrReplaceTempView("selective_sensitive_tweets")
                 
# Per-username totals of sensitive tweets, retweets and favourites; the 50
# usernames with the most sensitive tweets are kept.
# NOTE(review): rows are grouped by `username` (user.name), which is presumably
# not a unique account id, and first(followers_count) takes the value from
# whichever row of the group comes first — confirm both are acceptable.
df6= spark.sql('''SELECT username, 
sum(sensitive_count) as sensitive_count, (sum(retweet_count)) as retweet_count, 
(sum(favorite_count)) as favourite_count, first(followers_count) as followers_count 
from selective_sensitive_tweets group by username order by sensitive_count desc limit 50''')

# Print the leading rows of the aggregate.
df6.show()

+--------------------+---------------+-------------+---------------+---------------+
|            username|sensitive_count|retweet_count|favourite_count|followers_count|
+--------------------+---------------+-------------+---------------+---------------+
|         V. D. निखिल|              3|            0|              0|             12|
|            Cheer-Up|              3|            1|              4|           8186|
|       Kenlam274🇭🇰|              2|            1|              3|            471|
|🦠️🗣ERADICATE CO...|              2|            0|              0|           1711|
|Erotic Art Photog...|              2|            1|              3|            398|
|        Tammy Searle|              2|            0|              0|           3651|
|           Nico Gaia|              2|            8|              8|          35473|
|          Sabah Alam|              2|            0|              2|           3285|
|  SNAP : FREAKYGUY_X|              2|            0|              0| 

In [82]:
# NOTE(review): the only registration of the `sensitive_tweets` view in this
# notebook is the disabled line below, so this cell fails on a fresh kernel.
# The recorded output differs from the `selective_sensitive_tweets` aggregate,
# which suggests the view was built from a different (since-removed) version of
# df3 — restore that definition before relying on this cell.
#df3.createOrReplaceTempView("sensitive_tweets")
                 
# Per-username totals from `sensitive_tweets`, most sensitive tweets first (top 50).
df4= spark.sql('''SELECT username, 
sum(sensitive_count) as sensitive_count, (sum(retweet_count)) as retweet_count, 
(sum(favorite_count)) as favourite_count, first(followers_count) as followers_count 
from sensitive_tweets group by username order by sensitive_count desc limit 50''')

# Print the leading rows of the aggregate.
df4.show()

+--------------------+---------------+-------------+---------------+---------------+
|            username|sensitive_count|retweet_count|favourite_count|followers_count|
+--------------------+---------------+-------------+---------------+---------------+
| Somsirsa Chatterjee|             19|            0|              0|            796|
|   Against Ignorance|             10|            0|              0|            157|
|Prince Neal_Agniv...|              9|            0|              0|            318|
|ดกนปหด🇻🇳🇺🇸🇯?...|              8|            9|              0|           2775|
|Francesca BaiMuDa...|              8|           12|              0|           4926|
|              George|              8|            0|              0|            434|
|      Kim Kardashian|              8|            0|              0|           1030|
|      uMbhali Wodumo|              7|            0|              0|           3645|
|            James Wu|              7|            7|              7|  

In [83]:
# Same per-username aggregate as the previous df4 cell, but ordered ascending,
# i.e. the 50 usernames with the fewest sensitive tweets. Rebinds `df4`.
# NOTE(review): depends on the `sensitive_tweets` view, which no active line in
# this notebook registers.
df4= spark.sql('''SELECT username, 
sum(sensitive_count) as sensitive_count, (sum(retweet_count)) as retweet_count, 
(sum(favorite_count)) as favourite_count, first(followers_count) as followers_count 
from sensitive_tweets group by username order by sensitive_count asc limit 50''')


In [84]:

# Print the leading rows of whichever query `df4` was last bound to.
df4.show()

+--------------------+---------------+-------------+---------------+---------------+
|            username|sensitive_count|retweet_count|favourite_count|followers_count|
+--------------------+---------------+-------------+---------------+---------------+
|             cubby💫|              1|         5399|              0|              9|
|         Covfefe4EVA|              1|           48|              0|           1940|
|        Poorvi Sapra|              1|            1|             12|             19|
|          คุณนายพลอย|              1|         3388|              0|              5|
|                  🥵|              1|          542|              0|           3520|
|          Bill Navvy|              1|            0|              0|             15|
|    Bolu Oluwagbesan|              1|            0|              1|          21689|
|  MILF & SHEMALE FUN|              1|           52|              0|            299|
|  PeopleSearches.com|              1|           21|              0

In [61]:
# Disabled column expression kept for reference (not part of the query below):
#DATE(FROM_UNIXTIME(created_at)) AS created_at,
# Per user.name: one favorite_count value (taken with first()) and the mean
# retweet_count across that name's tweets.
query = '''
SELECT user.name, first(favorite_count), avg(retweet_count) FROM tweetView group by user.name
'''
# Show 5 rows; truncate=8 shortens displayed values longer than 8 characters.
spark.sql(query).show(5,truncate=8)

+--------+----------------------------+------------------+
|    name|first(favorite_count, false)|avg(retweet_count)|
+--------+----------------------------+------------------+
| Ronwood|                           0|           11750.0|
|Mr. V...|                           0|          5713....|
|    Hina|                           0|            1401.7|
| APERIRÉ|                           0|               0.0|
|Amber...|                           0|               6.0|
+--------+----------------------------+------------------+
only showing top 5 rows



In [None]:
# Sample up to 10 English tweets flagged possibly_sensitive whose lower-cased
# text matches 'social distancing', 'who' or 'cdc', printed without truncation.
# The `is not null` test is redundant with `= true` but harmless.
# NOTE(review): 'who' is matched as a regex against the whole text, so it
# presumably also hits the ordinary English word — confirm that is intended.
spark.sql('''SELECT full_text from tweetView where possibly_sensitive is not null and possibly_sensitive = true and lang = 'en' and lower(full_text) rlike 'social distancing|who|cdc' limit 10''').show(10,False)


In [13]:
# Shell escape: list the gzipped 2020-01-21 tweet files in the bucket.
!gsutil ls "gs://bucket-covid/TweetData/COVID-19-TweetIDs-master/2020-01/coronavirus-tweet-id-2020-01-21*.gz"

gs://bucket-covid/TweetData/COVID-19-TweetIDs-master/2020-01/coronavirus-tweet-id-2020-01-21-22.jsonl.gz
gs://bucket-covid/TweetData/COVID-19-TweetIDs-master/2020-01/coronavirus-tweet-id-2020-01-21-23.jsonl.gz


In [None]:
# Ingest a single 2020-01-21 file, persist it as Parquet, and expose it as `tweetView2`.
# NOTE(review): this cell reuses the output path "parquetFile2.parquet", the
# variables `sourceData2` / `parquetFile2` and the view name `tweetView2` from
# the earlier 2020-04 ingestion cell. write.save() is called without an explicit
# save mode, which by default refuses to write to an existing path, so on a
# top-to-bottom run this presumably fails once the earlier cell has written
# there. Decide whether this data should replace the April copy (explicit
# overwrite) or live under its own path and view name.
sourceData2 = spark.read.format("json").load("gs://bucket-covid/TweetData/COVID-19-TweetIDs-master/2020-01/coronavirus-tweet-id-2020-01-21-22.jsonl.gz")
sourceData2.repartition(10).write.save("parquetFile2.parquet")
parquetFile2 = spark.read.parquet("parquetFile2.parquet")
parquetFile2.createOrReplaceTempView("tweetView2")