In [1]:
# Import Spark NLP
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline
import sparknlp
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

In [2]:
spark = SparkSession.builder \
    .appName("Spark NLP")\
    .config("spark.driver.memory","32G")\
    .config("spark.driver.maxResultSize", "2G") \
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:4.1.0")\
    .config("spark.kryoserializer.buffer.max", "1000M")\
    .getOrCreate()


:: loading settings :: url = jar:file:/home/3147567/.conda/envs/reddit_env/lib/python3.9/site-packages/pyspark/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/3147567/.ivy2/cache
The jars for the packages stored in: /home/3147567/.ivy2/jars
com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-b904c7fb-b336-4e0e-9b95-5310b787e05c;1.0
	confs: [default]
	found com.johnsnowlabs.nlp#spark-nlp_2.12;4.1.0 in central
	found com.typesafe#config;1.4.2 in central
	found org.rocksdb#rocksdbjni;6.29.5 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.828 in central
	found com.github.universal-automata#liblevenshtein;3.0.0 in central
	found com.google.code.findbugs#annotations;3.0.1 in central
	found net.jcip#jcip-annotations;1.0 in central
	found com.google.code.findbugs#jsr305;3.0.1 in central
	found com.google.protobuf#protobuf-java-util;3.0.0-beta-3 in central
	found com.google.protobuf#protobuf-java;3.0.0-beta-3 in central
	found com.google.code.gson#gson;2.3 in central
	found it.unimi.dsi#fastutil;7.0.12 in central
	found org.projectlombok#l

In [3]:
spark.version

'3.1.2'

In [3]:
NeutralFile = spark.read.parquet("../../Files/Submissions/score/done/Neutr_vac.parquet")
ProFile = spark.read.parquet("../../Files/Submissions/score/done/Pro_vac.parquet")
AntiFile = spark.read.parquet("../../Files/Submissions/score/done/Anti_vac.parquet")

                                                                                

In [18]:
type(ProFile)

pyspark.sql.dataframe.DataFrame

In [4]:
import functools
def unionAll(dfs):
    return functools.reduce(lambda df1, df2: df1.union(df2.select(df1.columns)), dfs)

In [13]:
Total = unionAll([NeutralFile, ProFile, AntiFile])

In [110]:
sample = Total.sampleBy("pred_1", fractions={
    0.0: 0.10,
    1.0: 0.10,
    2.0: 0.10
}, seed=42)

In [145]:
sample_p = ProFile.sample(0.5)

In [155]:
sample_n.count()

1057838

In [111]:
from sparknlp.base.document_assembler import DocumentAssembler
from sparknlp.base.finisher import Finisher
from sparknlp.annotator.stop_words_cleaner import StopWordsCleaner
from sparknlp.annotator.normalizer import Normalizer
from sparknlp.annotator.token import Tokenizer


In [137]:

# remove stopwords
document_assembler = DocumentAssembler() \
    .setInputCol("cleanText") \
    .setOutputCol("document") \
    .setCleanupMode("disabled")
# Split sentence to tokens(array)
tokenizer = Tokenizer() \
  .setInputCols(["document"]) \
  .setOutputCol("token")
# clean unwanted characters and garbage
normalizer = Normalizer() \
    .setInputCols(["token"]) \
    .setOutputCol("normalized")

stopwords_cleaner = StopWordsCleaner()\
      .setInputCols("cleanText") \
      .setOutputCol("cleanTokens")\
      .setCaseSensitive(False)

finisher = Finisher() \
    .setInputCols(["cleanText"]) \
    .setOutputCols(["tokens"]) \
    .setOutputAsArray(True) \
    .setCleanAnnotations(False)

nlp_pipeline = Pipeline(
    stages=[
        # document_assembler,
        #     tokenizer,
        #     normalizer,
            # stopwords_cleaner,  
            finisher])

In [147]:
# train the pipeline
nlp_model = nlp_pipeline.fit(sample_p)

In [148]:
# apply the pipeline to transform dataframe.
processed_df  = nlp_model.transform(sample_p)

In [149]:
tokens_df = processed_df.select('pred_1','cleanText')
tokens_df.count()

558582

In [150]:
from pyspark.ml.feature import CountVectorizer
cv = CountVectorizer(inputCol="cleanText", outputCol="features", vocabSize=500, minDF=3.0)
# train the model
cv_model = cv.fit(tokens_df)
# transform the data. Output column name will be features.
vectorized_tokens = cv_model.transform(tokens_df)

                                                                                

In [152]:
from pyspark.ml.clustering import LDA
num_topics = 7
lda = LDA(k=num_topics, maxIter=10)
model4 = lda.fit(vectorized_tokens)
ll = model4.logLikelihood(vectorized_tokens)
lp = model4.logPerplexity(vectorized_tokens)
print("The lower bound on the log likelihood of the entire corpus: " + str(ll))
print("The upper bound on perplexity: " + str(lp))



The lower bound on the log likelihood of the entire corpus: -25437380.973684043
The upper bound on perplexity: 5.620890868953256


                                                                                

In [154]:
# extract vocabulary from CountVectorizer
vocab = cv_model.vocabulary
topics = model4.describeTopics(maxTermsPerTopic = 30)   
topics_rdd = topics.rdd
topics_words = topics_rdd\
       .map(lambda row: row['termIndices'])\
       .map(lambda idx_list: [vocab[idx] for idx in idx_list])\
       .collect()
for idx, topic in enumerate(topics_words):
    print("topic: {}".format(idx))
    print("*"*25)
    for word in topic:
       print(word)
    print("*"*25)

topic: 0
*************************
not
people
like
think
bad
thing
time
s
life
want
know
say
m
good
feel
well
work
way
person
need
opinion
kid
child
make
world
live
ve
talk
go
understand
*************************
topic: 1
*************************
[NUM]
vaccine
covid
test
covid[NUM]
[num]
coronavirus
delete
dose
day
covid[num]
gt
vaccinate
week
vaccination
study
case
report
virus
get
positive
testing
pfizer
pillar
month
datum
uk
say
new
death
*************************
topic: 2
*************************
[NUM]
not
year
like
m
people
[num]
good
know
time
think
company
man
s
woman
need
stock
look
go
high
day
pay
feel
remove
market
work
long
money
ve
job
*************************
topic: 3
*************************
remove
[url]
help
post
stock
company
short
market
look
[NUM]
thank
ampx[num]b
news
need
money
good
go
china
like
great
[num]
retard
dd
people
price
[num]k
see
not
wsb
new
*************************
topic: 4
*************************
not
m
like
shit
go
know
time
s
want
get
game
come

In [120]:
model.save('../../Files/models/topic_p_all.pickle')

In [130]:
model2.save('../../Files/models/topic_a_all.pickle')

In [151]:
model3.save('../../Files/models/topic_p_n.pickle')