In [6]:
# Make sure that you've installed spark-nlp to run this notebook
import pandas as pd
from pyspark.sql import SparkSession

pd.set_option("max_colwidth", 800)

# Create a spark session

spark = SparkSession \
        .builder \
        .config("spark.jars.packages", "JohnSnowLabs:spark-nlp:1.8.2") \
        .getOrCreate()
spark

In [10]:
import os

# get the current path

path = os.getcwd() + "/comments-reddit/*.csv"
df = spark.read.csv(path)

In [13]:
df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)



In [14]:
df.limit(5).toPandas()

Unnamed: 0,_c0,_c1,_c2,_c3
0,Author,Comment,Score,ID
1,MuffinMedic,This sounds interesting! By any chance is the bot open source? I'd be interested in running this locally and collecting some data.,,
2,Also,"have you compared this to or looked into the Perspective API at all?""",3,ek6kzos
3,reseph,"You may want to get in touch with https://civilservant.io/ too, just to inform them about this neat thing. AI Moderation was one of the topics discussed at the summit.",2,ek6lqbn
4,shaggorama,"""Define """"bad comments""""""",2,ek6mled


In [17]:
# As we can see, the header of the csv is counter as a row, this is wrong so let fix this.

df = spark.read.format("csv").option("header", "true").load(path)
df.printSchema()

root
 |-- Author: string (nullable = true)
 |-- Comment: string (nullable = true)
 |-- Score: string (nullable = true)
 |-- ID: string (nullable = true)



In [18]:
df.limit(5).toPandas()

Unnamed: 0,Author,Comment,Score,ID
0,MuffinMedic,This sounds interesting! By any chance is the bot open source? I'd be interested in running this locally and collecting some data.,,
1,Also,"have you compared this to or looked into the Perspective API at all?""",3.0,ek6kzos
2,reseph,"You may want to get in touch with https://civilservant.io/ too, just to inform them about this neat thing. AI Moderation was one of the topics discussed at the summit.",2.0,ek6lqbn
3,shaggorama,"""Define """"bad comments""""""",2.0,ek6mled
4,FreeSpeechWarrior,If this is trained on a per subreddit basis I'd be interested in using this in a report/modmail only mod on r/WatchRedditDie and r/subredditcancer,,


In [20]:
# We want to show the most relevant data from our data frame, in this case is Author and Comment

df.select('Author', 'Comment').limit(5).toPandas()

Unnamed: 0,Author,Comment
0,MuffinMedic,This sounds interesting! By any chance is the bot open source? I'd be interested in running this locally and collecting some data.
1,Also,"have you compared this to or looked into the Perspective API at all?"""
2,reseph,"You may want to get in touch with https://civilservant.io/ too, just to inform them about this neat thing. AI Moderation was one of the topics discussed at the summit."
3,shaggorama,"""Define """"bad comments"""""""
4,FreeSpeechWarrior,If this is trained on a per subreddit basis I'd be interested in using this in a report/modmail only mod on r/WatchRedditDie and r/subredditcancer


In [27]:
# Now lets move on to our comment column. Let's split every word in a column. 
## For this we're going to use some functions of Spark

import pyspark.sql.functions as F

dfWordCount = df.select(F.explode(F.split('Comment', '\\s+')).alias("Word")).groupBy("Word").count().orderBy(F.desc("Word"))


In [28]:
dfWordCount.printSchema()

root
 |-- Word: string (nullable = true)
 |-- count: long (nullable = false)



In [31]:
dfWordCount.limit(10).toPandas()

Unnamed: 0,Word,count
0,😊,1
1,“inline,1
2,‘,1
3,ಠ_ಠ,1
4,~~pretty~~,1
5,~~and,1
6,~40k,1
7,},1
8,zzz,1
9,yup.,1


In [33]:
# As we can see, our aproache is not working very well... That's why we are going to use a library 
## called NLP (Natural Languague Processing) who recognize every single word ans classified like nouns, pronouns, verbs, etc.

from com.johnsnowlabs.nlp.pretrained.pipeline.en import BasicPipeline as bp

dfAnnotated = bp.annotate(df, 'Comment')


In [34]:
dfAnnotated.printSchema()

root
 |-- Author: string (nullable = true)
 |-- text: string (nullable = true)
 |-- Score: string (nullable = true)
 |-- ID: string (nullable = true)
 |-- document: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |-- token: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 

In [35]:
# Select what we want for our schema

dfPos = dfAnnotated.select("text", "pos.metadata", "pos.result")
dfPos.limit(5).toPandas()

Unnamed: 0,text,metadata,result
0,This sounds interesting! By any chance is the bot open source? I'd be interested in running this locally and collecting some data.,"[{'word': 'This'}, {'word': 'sounds'}, {'word': 'interesting'}, {'word': 'By'}, {'word': 'any'}, {'word': 'chance'}, {'word': 'is'}, {'word': 'the'}, {'word': 'bot'}, {'word': 'open'}, {'word': 'source'}, {'word': 'I'}, {'word': 'd'}, {'word': 'be'}, {'word': 'interested'}, {'word': 'in'}, {'word': 'running'}, {'word': 'this'}, {'word': 'locally'}, {'word': 'and'}, {'word': 'collecting'}, {'word': 'some'}, {'word': 'data'}]","[DT, VBZ, JJ, IN, DT, NN, VBZ, DT, NN, JJ, NN, PRP, SYM, VB, VBN, IN, VBG, DT, RB, CC, VBG, DT, NNS]"
1,"have you compared this to or looked into the Perspective API at all?""","[{'word': 'have'}, {'word': 'you'}, {'word': 'compared'}, {'word': 'this'}, {'word': 'to'}, {'word': 'or'}, {'word': 'looked'}, {'word': 'into'}, {'word': 'the'}, {'word': 'Perspective'}, {'word': 'API'}, {'word': 'at'}, {'word': 'all'}]","[VBP, PRP, VBD, DT, TO, CC, VBD, IN, DT, NNP, NNP, IN, DT]"
2,"You may want to get in touch with https://civilservant.io/ too, just to inform them about this neat thing. AI Moderation was one of the topics discussed at the summit.","[{'word': 'You'}, {'word': 'may'}, {'word': 'want'}, {'word': 'to'}, {'word': 'get'}, {'word': 'in'}, {'word': 'touch'}, {'word': 'with'}, {'word': 'httpscivilservantio'}, {'word': 'too'}, {'word': 'just'}, {'word': 'to'}, {'word': 'inform'}, {'word': 'them'}, {'word': 'about'}, {'word': 'this'}, {'word': 'neat'}, {'word': 'thing'}, {'word': 'AI'}, {'word': 'Moderation'}, {'word': 'was'}, {'word': 'one'}, {'word': 'of'}, {'word': 'the'}, {'word': 'topics'}, {'word': 'discussed'}, {'word': 'at'}, {'word': 'the'}, {'word': 'summit'}]","[PRP, MD, VB, TO, VB, IN, NN, IN, NN, RB, RB, TO, VB, PRP, IN, DT, JJ, NN, NNP, NNP, VBD, CD, IN, DT, NNS, VBD, IN, DT, NN]"
3,"""Define """"bad comments""""""","[{'word': 'Define'}, {'word': 'bad'}, {'word': 'comments'}]","[NNP, JJ, NNS]"
4,If this is trained on a per subreddit basis I'd be interested in using this in a report/modmail only mod on r/WatchRedditDie and r/subredditcancer,"[{'word': 'If'}, {'word': 'this'}, {'word': 'is'}, {'word': 'trained'}, {'word': 'on'}, {'word': 'a'}, {'word': 'per'}, {'word': 'subreddit'}, {'word': 'basis'}, {'word': 'I'}, {'word': 'd'}, {'word': 'be'}, {'word': 'interested'}, {'word': 'in'}, {'word': 'using'}, {'word': 'this'}, {'word': 'in'}, {'word': 'a'}, {'word': 'reportmodmail'}, {'word': 'only'}, {'word': 'mod'}, {'word': 'on'}, {'word': 'rWatchRedditDie'}, {'word': 'and'}, {'word': 'rsubredditcancer'}]","[IN, DT, VBZ, VBN, IN, DT, IN, NN, NN, PRP, SYM, VB, VBN, IN, VBG, DT, IN, DT, NN, RB, NN, IN, NN, CC, NN]"


In [38]:
# Now is better, as we can see in the result column, every words is categorized as verb, noun, pronoun, etc.

dfPos = dfAnnotated.select(F.explode("pos").alias("pos"))
dfPos.limit(10).toPandas()

Unnamed: 0,pos
0,"(pos, 0, 3, DT, {'word': 'This'})"
1,"(pos, 5, 10, VBZ, {'word': 'sounds'})"
2,"(pos, 12, 22, JJ, {'word': 'interesting'})"
3,"(pos, 25, 26, IN, {'word': 'By'})"
4,"(pos, 28, 30, DT, {'word': 'any'})"
5,"(pos, 32, 37, NN, {'word': 'chance'})"
6,"(pos, 39, 40, VBZ, {'word': 'is'})"
7,"(pos, 42, 44, DT, {'word': 'the'})"
8,"(pos, 46, 48, NN, {'word': 'bot'})"
9,"(pos, 50, 53, JJ, {'word': 'open'})"


In [39]:
dfPos.printSchema()

root
 |-- pos: struct (nullable = true)
 |    |-- annotatorType: string (nullable = true)
 |    |-- begin: integer (nullable = false)
 |    |-- end: integer (nullable = false)
 |    |-- result: string (nullable = true)
 |    |-- metadata: map (nullable = true)
 |    |    |-- key: string
 |    |    |-- value: string (valueContainsNull = true)



In [43]:
# With selectExpr we can select the word of our data frame

dfPos.selectExpr("pos.metadata['word']").limit(5).toPandas()



Unnamed: 0,pos.metadata AS `metadata`[word]
0,This
1,sounds
2,interesting
3,By
4,any


In [45]:
dfPos.selectExpr("pos.result").limit(5).toPandas()

Unnamed: 0,result
0,DT
1,VBZ
2,JJ
3,IN
4,DT


In [47]:
# Now lets filter the data after create a new DataFrame

# List of NLP tags https://cs.nyu.edu/grishman/jet/guide/PennPOS.html
# We want to count the tags NNP and NNPs

NNPfilter = "pos.result = 'NNP' or pos.result = 'NNPs'"
dfFilter = dfPos.filter(NNPfilter)
dfFilter.limit(5).toPandas()

Unnamed: 0,pos
0,"(pos, 45, 55, NNP, {'word': 'Perspective'})"
1,"(pos, 57, 59, NNP, {'word': 'API'})"
2,"(pos, 107, 108, NNP, {'word': 'AI'})"
3,"(pos, 110, 119, NNP, {'word': 'Moderation'})"
4,"(pos, 1, 6, NNP, {'word': 'Define'})"


In [69]:
# lets create new columns with our tag and word and then make a count! 

dfNNP = dfFilter.selectExpr("pos.metadata['word'] as word", "pos.result as tag")

In [70]:
dfNNP.limit(5).toPandas()

Unnamed: 0,word,tag
0,Perspective,NNP
1,API,NNP
2,AI,NNP
3,Moderation,NNP
4,Define,NNP


In [73]:
# Now that we finally have our dataframe as we want, lets make a count and figure out 
## What are the most popular words in Reddit comments.
from pyspark.sql.functions import desc

dfNNP.groupBy("word").count().orderBy(desc("count"))

DataFrame[word: string, count: bigint]

In [74]:
dfNNP.createOrReplaceTempView('NLP_count')