# Advanced Analytics: NLP

In [1]:
!python -m pip install pandas



In [2]:
!pip install spark-nlp==1.7.3



In [3]:
import pandas as pd
pd.set_option('max_colwidth', 800)

# Create a spark contecxt that includes a 3rd party jar for NLP

In [4]:
#jarPath = "spark-nlp-assembly-1.7.3.jar"

from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .config("spark.jars.packages", "JohnSnowLabs:spark-nlp:1.8.2") \
    .getOrCreate()

spark

# Read json files in a dir as one DataFrame

In [5]:
data_path = "./data/reddit/*.json"

df = spark.read.json(data_path)

In [6]:
print(df.count())

100


In [7]:
df.printSchema()

root
 |-- data: struct (nullable = true)
 |    |-- approved_at_utc: string (nullable = true)
 |    |-- approved_by: string (nullable = true)
 |    |-- archived: boolean (nullable = true)
 |    |-- author: string (nullable = true)
 |    |-- author_flair_background_color: string (nullable = true)
 |    |-- author_flair_css_class: string (nullable = true)
 |    |-- author_flair_richtext: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- author_flair_template_id: string (nullable = true)
 |    |-- author_flair_text: string (nullable = true)
 |    |-- author_flair_text_color: string (nullable = true)
 |    |-- author_flair_type: string (nullable = true)
 |    |-- author_fullname: string (nullable = true)
 |    |-- author_patreon_flair: boolean (nullable = true)
 |    |-- banned_at_utc: string (nullable = true)
 |    |-- banned_by: string (nullable = true)
 |    |-- can_gild: boolean (nullable = true)
 |    |-- can_mod_post: boolean (nullable = true)
 |  

# Deal with Struct type to query subfields

In [8]:
title = "data.title"
author = "data.author"

dfAuthorTitle = df.select(title, author)
dfAuthorTitle.limit(10).toPandas()

Unnamed: 0,title,author
0,"Microsoft Corp said it has discovered hacking targeting democratic institutions, think tanks, and non-profit organizations in Europe.",jaykirsch
1,Deutsche Bank reportedly planned to extend the dates of $340 million in loans to Trump Organization to avoid a potential nightmare of chasing a sitting president for cash,canuck_burger
2,"Iranian ""morality police"" were forced to fire warning shots when a crowd intervened to prevent them from arresting two women for not wearing a hijab. The incident occurred in Tehran's northeastern Narmak neighbourhood on Friday night, and ended with a mob tearing the door off a police vehicle.",honolulu_oahu_mod
3,"Trump administration 'pushing Saudi nuclear deal' which could benefit company linked to Jared Kushner - Senior Trump administration officials pushed a project to share nuclear power technology with Saudi Arabia over the objections of ethics officials, according to a congressional report",madam1
4,"NASA Happily Reports the Earth is Greener, With More Trees Than 20 Years Ago–and It's Thanks to China, India",purplexxx
5,"President Vladimir Putin threatened the United States with an expanded array of strategic weapons on Wednesday, announcing a new hypersonic missile, and the early deployment of new nuclear submarines equipped with long-distance underwater nuclear drones.",madam1
6,33 students at 2 Vancouver schools ordered to stay home until they can prove they've had measles vaccine,littlebossman
7,Giant tortoises believed to have been extinct for more than 100 years found alive and well in The Galápagos Islands,joakinzz99
8,The use of marijuana for medical purposes in Thailand became officially legal late Monday through a royal decree.,cskarfors
9,Three Tory MPs join Labour breakaway group,carpie101


In [9]:
import pyspark.sql.functions as F

dfWordCount =                                                  \
    df.select(F.explode(F.split(title, "\\s+")).alias("word")) \
        .groupBy("word")                                       \
        .count()                                               \
        .orderBy(F.desc("count"))

In [10]:
dfWordCount.limit(10).toPandas()

Unnamed: 0,word,count
0,to,58
1,the,46
2,of,42
3,in,41
4,a,25
5,for,20
6,and,19
7,from,12
8,on,11
9,with,10


# Use an NLP library to do part-of-speech tagging

In [11]:
from com.johnsnowlabs.nlp.pretrained.pipeline.en import BasicPipeline as bp

dfAnnotated = bp.annotate(dfAuthorTitle, "title")
dfAnnotated.printSchema()

root
 |-- text: string (nullable = true)
 |-- author: string (nullable = true)
 |-- document: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |-- token: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |-- normal: array (nullable = true)
 |    |-- element: struct (contains

# Deal with Map type to query subfields

In [12]:
dfPos = dfAnnotated.select("text", "pos.metadata", "pos.result")

dfPos.limit(10).toPandas()

Unnamed: 0,text,metadata,result
0,"Microsoft Corp said it has discovered hacking targeting democratic institutions, think tanks, and non-profit organizations in Europe.","[{'word': 'Microsoft'}, {'word': 'Corp'}, {'word': 'said'}, {'word': 'it'}, {'word': 'has'}, {'word': 'discovered'}, {'word': 'hacking'}, {'word': 'targeting'}, {'word': 'democratic'}, {'word': 'institutions'}, {'word': 'think'}, {'word': 'tanks'}, {'word': 'and'}, {'word': 'nonprofit'}, {'word': 'organizations'}, {'word': 'in'}, {'word': 'Europe'}]","[NNP, NNP, VBD, PRP, VBZ, VBN, VBG, VBG, JJ, NNS, VBP, NNS, CC, NN, NNS, IN, NNP]"
1,Deutsche Bank reportedly planned to extend the dates of $340 million in loans to Trump Organization to avoid a potential nightmare of chasing a sitting president for cash,"[{'word': 'Deutsche'}, {'word': 'Bank'}, {'word': 'reportedly'}, {'word': 'planned'}, {'word': 'to'}, {'word': 'extend'}, {'word': 'the'}, {'word': 'dates'}, {'word': 'of'}, {'word': 'million'}, {'word': 'in'}, {'word': 'loans'}, {'word': 'to'}, {'word': 'Trump'}, {'word': 'Organization'}, {'word': 'to'}, {'word': 'avoid'}, {'word': 'a'}, {'word': 'potential'}, {'word': 'nightmare'}, {'word': 'of'}, {'word': 'chasing'}, {'word': 'a'}, {'word': 'sitting'}, {'word': 'president'}, {'word': 'for'}, {'word': 'cash'}]","[NNP, NNP, RB, VBD, TO, VB, DT, NNS, IN, CD, IN, NNS, TO, NNP, NNP, TO, VB, DT, JJ, NN, IN, VBG, DT, VBG, NN, IN, NN]"
2,"Iranian ""morality police"" were forced to fire warning shots when a crowd intervened to prevent them from arresting two women for not wearing a hijab. The incident occurred in Tehran's northeastern Narmak neighbourhood on Friday night, and ended with a mob tearing the door off a police vehicle.","[{'word': 'Iranian'}, {'word': 'morality'}, {'word': 'police'}, {'word': 'were'}, {'word': 'forced'}, {'word': 'to'}, {'word': 'fire'}, {'word': 'warning'}, {'word': 'shots'}, {'word': 'when'}, {'word': 'a'}, {'word': 'crowd'}, {'word': 'intervened'}, {'word': 'to'}, {'word': 'prevent'}, {'word': 'them'}, {'word': 'from'}, {'word': 'arresting'}, {'word': 'two'}, {'word': 'women'}, {'word': 'for'}, {'word': 'not'}, {'word': 'wearing'}, {'word': 'a'}, {'word': 'hijab'}, {'word': 'The'}, {'word': 'incident'}, {'word': 'occurred'}, {'word': 'in'}, {'word': 'Tehran'}, {'word': 's'}, {'word': 'northeastern'}, {'word': 'Narmak'}, {'word': 'neighbourhood'}, {'word': 'on'}, {'word': 'Friday'}, {'word': 'night'}, {'word': 'and'}, {'word': 'ended'}, {'word': 'with'}, {'word': 'a'}, {'word': 'mob'...","[JJ, NN, NN, VBD, VBN, TO, VB, NN, NNS, WRB, DT, NN, VBD, TO, VB, PRP, IN, VBG, CD, NNS, IN, RB, VBG, DT, NN, DT, NN, VBD, IN, NNP, VBZ, JJ, NNP, NN, IN, NNP, NN, CC, VBD, IN, DT, NN, VBG, DT, NN, RP, DT, NN, NN]"
3,"Trump administration 'pushing Saudi nuclear deal' which could benefit company linked to Jared Kushner - Senior Trump administration officials pushed a project to share nuclear power technology with Saudi Arabia over the objections of ethics officials, according to a congressional report","[{'word': 'Trump'}, {'word': 'administration'}, {'word': 'pushing'}, {'word': 'Saudi'}, {'word': 'nuclear'}, {'word': 'deal'}, {'word': 'which'}, {'word': 'could'}, {'word': 'benefit'}, {'word': 'company'}, {'word': 'linked'}, {'word': 'to'}, {'word': 'Jared'}, {'word': 'Kushner'}, {'word': 'Senior'}, {'word': 'Trump'}, {'word': 'administration'}, {'word': 'officials'}, {'word': 'pushed'}, {'word': 'a'}, {'word': 'project'}, {'word': 'to'}, {'word': 'share'}, {'word': 'nuclear'}, {'word': 'power'}, {'word': 'technology'}, {'word': 'with'}, {'word': 'Saudi'}, {'word': 'Arabia'}, {'word': 'over'}, {'word': 'the'}, {'word': 'objections'}, {'word': 'of'}, {'word': 'ethics'}, {'word': 'officials'}, {'word': 'according'}, {'word': 'to'}, {'word': 'a'}, {'word': 'congressional'}, {'word': 're...","[NNP, NN, VBG, NNP, NN, NN, WDT, MD, VB, NN, VBN, TO, NNP, NNP, NNP, NNP, NN, NNS, VBD, DT, NN, TO, VB, JJ, NN, NN, IN, NNP, NNP, IN, DT, NNS, IN, NNS, NNS, VBG, TO, DT, JJ, NN]"
4,"NASA Happily Reports the Earth is Greener, With More Trees Than 20 Years Ago–and It's Thanks to China, India","[{'word': 'NASA'}, {'word': 'Happily'}, {'word': 'Reports'}, {'word': 'the'}, {'word': 'Earth'}, {'word': 'is'}, {'word': 'Greener'}, {'word': 'With'}, {'word': 'More'}, {'word': 'Trees'}, {'word': 'Than'}, {'word': 'Years'}, {'word': 'Agoand'}, {'word': 'It'}, {'word': 's'}, {'word': 'Thanks'}, {'word': 'to'}, {'word': 'China'}, {'word': 'India'}]","[NNP, NNP, NNS, DT, NNP, VBZ, NNP, IN, JJR, NNP, IN, NNS, NNP, PRP, VBZ, NNS, TO, NNP, NNP]"
5,"President Vladimir Putin threatened the United States with an expanded array of strategic weapons on Wednesday, announcing a new hypersonic missile, and the early deployment of new nuclear submarines equipped with long-distance underwater nuclear drones.","[{'word': 'President'}, {'word': 'Vladimir'}, {'word': 'Putin'}, {'word': 'threatened'}, {'word': 'the'}, {'word': 'United'}, {'word': 'States'}, {'word': 'with'}, {'word': 'an'}, {'word': 'expanded'}, {'word': 'array'}, {'word': 'of'}, {'word': 'strategic'}, {'word': 'weapons'}, {'word': 'on'}, {'word': 'Wednesday'}, {'word': 'announcing'}, {'word': 'a'}, {'word': 'new'}, {'word': 'hypersonic'}, {'word': 'missile'}, {'word': 'and'}, {'word': 'the'}, {'word': 'early'}, {'word': 'deployment'}, {'word': 'of'}, {'word': 'new'}, {'word': 'nuclear'}, {'word': 'submarines'}, {'word': 'equipped'}, {'word': 'with'}, {'word': 'longdistance'}, {'word': 'underwater'}, {'word': 'nuclear'}, {'word': 'drones'}]","[NNP, NNP, NNP, VBD, DT, NNP, NNPS, IN, DT, JJ, NN, IN, JJ, NNS, IN, NNP, VBG, DT, JJ, JJ, NN, CC, DT, JJ, NN, IN, JJ, JJ, NNS, VBN, IN, NN, JJ, JJ, NNS]"
6,33 students at 2 Vancouver schools ordered to stay home until they can prove they've had measles vaccine,"[{'word': 'students'}, {'word': 'at'}, {'word': 'Vancouver'}, {'word': 'schools'}, {'word': 'ordered'}, {'word': 'to'}, {'word': 'stay'}, {'word': 'home'}, {'word': 'until'}, {'word': 'they'}, {'word': 'can'}, {'word': 'prove'}, {'word': 'they'}, {'word': 've'}, {'word': 'had'}, {'word': 'measles'}, {'word': 'vaccine'}]","[NNS, IN, NNP, NNS, VBD, TO, VB, NN, IN, PRP, MD, VB, PRP, NN, VBD, NNS, NN]"
7,Giant tortoises believed to have been extinct for more than 100 years found alive and well in The Galápagos Islands,"[{'word': 'Giant'}, {'word': 'tortoises'}, {'word': 'believed'}, {'word': 'to'}, {'word': 'have'}, {'word': 'been'}, {'word': 'extinct'}, {'word': 'for'}, {'word': 'more'}, {'word': 'than'}, {'word': 'years'}, {'word': 'found'}, {'word': 'alive'}, {'word': 'and'}, {'word': 'well'}, {'word': 'in'}, {'word': 'The'}, {'word': 'Galápagos'}, {'word': 'Islands'}]","[NNP, NNS, VBN, TO, VB, VBN, JJ, IN, JJR, IN, NNS, VBD, JJ, CC, RB, IN, DT, NNP, NNP]"
8,The use of marijuana for medical purposes in Thailand became officially legal late Monday through a royal decree.,"[{'word': 'The'}, {'word': 'use'}, {'word': 'of'}, {'word': 'marijuana'}, {'word': 'for'}, {'word': 'medical'}, {'word': 'purposes'}, {'word': 'in'}, {'word': 'Thailand'}, {'word': 'became'}, {'word': 'officially'}, {'word': 'legal'}, {'word': 'late'}, {'word': 'Monday'}, {'word': 'through'}, {'word': 'a'}, {'word': 'royal'}, {'word': 'decree'}]","[DT, NN, IN, NN, IN, JJ, NNS, IN, NNP, VBD, RB, JJ, RB, NNP, IN, DT, JJ, NN]"
9,Three Tory MPs join Labour breakaway group,"[{'word': 'Three'}, {'word': 'Tory'}, {'word': 'MPs'}, {'word': 'join'}, {'word': 'Labour'}, {'word': 'breakaway'}, {'word': 'group'}]","[CD, NNP, NNP, VB, NNP, NN, NN]"


In [13]:
dfPos2 = dfAnnotated.select(F.explode("pos").alias("pos"))

In [14]:
dfPos2.printSchema()

root
 |-- pos: struct (nullable = true)
 |    |-- annotatorType: string (nullable = true)
 |    |-- begin: integer (nullable = false)
 |    |-- end: integer (nullable = false)
 |    |-- result: string (nullable = true)
 |    |-- metadata: map (nullable = true)
 |    |    |-- key: string
 |    |    |-- value: string (valueContainsNull = true)



In [15]:
dfPos2.toPandas()

Unnamed: 0,pos
0,"(pos, 0, 8, NNP, {'word': 'Microsoft'})"
1,"(pos, 10, 13, NNP, {'word': 'Corp'})"
2,"(pos, 15, 18, VBD, {'word': 'said'})"
3,"(pos, 20, 21, PRP, {'word': 'it'})"
4,"(pos, 23, 25, VBZ, {'word': 'has'})"
...,...
1624,"(pos, 60, 61, IN, {'word': 'of'})"
1625,"(pos, 63, 74, JJ, {'word': 'unapologetic'})"
1626,"(pos, 76, 87, JJ, {'word': 'antisemitic'})"
1627,"(pos, 89, 95, NNS, {'word': 'attacks'})"


# Keep only proper nouns NNP or NNPS

In [16]:
nnpFilter = "pos.result = 'NNP' or pos.result = 'NNPS'"
dfNNP = dfPos2.where(nnpFilter)

dfNNP.limit(10).toPandas()

Unnamed: 0,pos
0,"(pos, 0, 8, NNP, {'word': 'Microsoft'})"
1,"(pos, 10, 13, NNP, {'word': 'Corp'})"
2,"(pos, 126, 131, NNP, {'word': 'Europe'})"
3,"(pos, 0, 7, NNP, {'word': 'Deutsche'})"
4,"(pos, 9, 12, NNP, {'word': 'Bank'})"
5,"(pos, 81, 85, NNP, {'word': 'Trump'})"
6,"(pos, 87, 98, NNP, {'word': 'Organization'})"
7,"(pos, 175, 180, NNP, {'word': 'Tehran'})"
8,"(pos, 197, 202, NNP, {'word': 'Narmak'})"
9,"(pos, 221, 226, NNP, {'word': 'Friday'})"


In [17]:
dfWordTag = dfNNP.selectExpr("pos.metadata['word'] as word", "pos.result as tag")

dfWordTag.limit(10).toPandas()

Unnamed: 0,word,tag
0,Microsoft,NNP
1,Corp,NNP
2,Europe,NNP
3,Deutsche,NNP
4,Bank,NNP
5,Trump,NNP
6,Organization,NNP
7,Tehran,NNP
8,Narmak,NNP
9,Friday,NNP


In [18]:
from pyspark.sql.functions import asc, desc

dfWordTag                   \
    .groupBy('word')        \
    .count()                \
    .orderBy(desc('count')) \
    .toPandas()

Unnamed: 0,word,count
0,US,14
1,Trump,9
2,Saudi,8
3,Putin,7
4,Russia,6
...,...,...
223,Apollo,1
224,City,1
225,Multiple,1
226,Netherlands,1


# Do the same for common nouns

In [19]:
nnFilter = "pos.result = 'NN' or pos.result = 'NNS'"
dfNN = dfPos2.filter(nnFilter)

dfNN.limit(10).toPandas()

Unnamed: 0,pos
0,"(pos, 67, 78, NNS, {'word': 'institutions'})"
1,"(pos, 87, 91, NNS, {'word': 'tanks'})"
2,"(pos, 98, 107, NN, {'word': 'nonprofit'})"
3,"(pos, 109, 121, NNS, {'word': 'organizations'})"
4,"(pos, 47, 51, NNS, {'word': 'dates'})"
5,"(pos, 72, 76, NNS, {'word': 'loans'})"
6,"(pos, 121, 129, NN, {'word': 'nightmare'})"
7,"(pos, 152, 160, NN, {'word': 'president'})"
8,"(pos, 166, 169, NN, {'word': 'cash'})"
9,"(pos, 9, 16, NN, {'word': 'morality'})"


In [20]:
dfWordTagNN = dfNN.selectExpr("pos.metadata['word'] as word", "pos.result as tag")

dfWordTagNN.limit(10).toPandas()

Unnamed: 0,word,tag
0,institutions,NNS
1,tanks,NNS
2,nonprofit,NN
3,organizations,NNS
4,dates,NNS
5,loans,NNS
6,nightmare,NN
7,president,NN
8,cash,NN
9,morality,NN


In [21]:
dfWordTagNN                 \
    .groupBy('word')        \
    .count()                \
    .orderBy(desc('count')) \
    .toPandas()

Unnamed: 0,word,count
0,climate,5
1,change,5
2,missiles,5
3,report,4
4,police,4
...,...,...
345,nations,1
346,start,1
347,homes,1
348,ward,1
