In [97]:
# Import Spark NLP
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline
import sparknlp
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline

In [98]:
spark = SparkSession.builder \
    .appName("Spark NLP")\
    .config("spark.driver.memory","32G")\
    .config("spark.driver.maxResultSize", "2G") \
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:4.1.0")\
    .config("spark.kryoserializer.buffer.max", "1000M")\
    .getOrCreate()


In [3]:
spark.version

'3.1.2'

In [99]:
NeutralFile = spark.read.parquet("../../Files/Submissions/score/done/Neutr_vacc_d.parquet")
ProFile = spark.read.parquet("../../Files/Submissions/score/done/Pro_vacc_d.parquet")
AntiFile = spark.read.parquet("../../Files/Submissions/score/done/Anti_vacc_d.parquet")

In [100]:
from sparknlp.base.document_assembler import DocumentAssembler
from sparknlp.base.finisher import Finisher
from sparknlp.annotator.stop_words_cleaner import StopWordsCleaner
from sparknlp.annotator.normalizer import Normalizer
from sparknlp.annotator.token import Tokenizer
from pyspark.ml.clustering import LDA

In [101]:
import functools
def unionAll(dfs):
    return functools.reduce(lambda df1, df2: df1.union(df2.select(df1.columns)), dfs)

In [102]:
Total = unionAll([NeutralFile, ProFile, AntiFile])

In [84]:
sample_n = NeutralFile.sample(0.1)

In [103]:
sample = Total.sampleBy("class_II", fractions={
    0.0: 0.10,
    1.0: 0.10,
    2.0: 0.10
}, seed=42)

In [104]:

# remove stopwords
document_assembler = DocumentAssembler() \
    .setInputCol("cleanText") \
    .setOutputCol("document") \
    .setCleanupMode("disabled")
# Split sentence to tokens(array)
tokenizer = Tokenizer() \
  .setInputCols(["document"]) \
  .setOutputCol("token")
# clean unwanted characters and garbage
normalizer = Normalizer() \
    .setInputCols(["token"]) \
    .setOutputCol("normalized")

stopwords_cleaner = StopWordsCleaner()\
      .setInputCols("normalized") \
      .setOutputCol("cleanTokens")\
      .setCaseSensitive(False)

finisher = Finisher() \
    .setInputCols(["cleanTokens"]) \
    .setOutputCols(["tokens"]) \
    .setOutputAsArray(True) \
    .setCleanAnnotations(False)

nlp_pipeline = Pipeline(
    stages=[
        document_assembler,
            tokenizer,
            normalizer,
            stopwords_cleaner,  
            finisher])

In [105]:
# train the pipeline
nlp_model = nlp_pipeline.fit(sample)

In [106]:
# apply the pipeline to transform dataframe.
processed_df  = nlp_model.transform(sample)

In [107]:
processed_df

DataFrame[cleanText: string, score: bigint, subreddit: string, created_utc: bigint, class_II: bigint, document: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>, token: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>, normalized: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>, cleanTokens: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>, tokens: array<string>]

In [108]:
tokens_df = processed_df.select('subreddit', 'score', 'created_utc','tokens')
tokens_df.count()

298255

In [109]:
from pyspark.ml.feature import CountVectorizer
cv = CountVectorizer(inputCol="tokens", outputCol="features", vocabSize=1000, minDF=3.0)
# train the model
cv_model = cv.fit(tokens_df)
# transform the data. Output column name will be features.
vectorized_tokens = cv_model.transform(tokens_df)

                                                                                

In [112]:
k=22
lda = LDA(k=k, maxIter=10, optimizer='em')
model = lda.fit(vectorized_tokens)
ll = model.logLikelihood(vectorized_tokens)
lp = model.logPerplexity(vectorized_tokens)

print( f"ALL, NUMBER OF TOPICS: {k}")
print("The lower bound on the log likelihood of the entire corpus: " + str(ll))

print("The upper bound on perplexity: " + str(lp))

vocab = cv_model.vocabulary
topics = model.describeTopics(maxTermsPerTopic = 30)   



ALL, NUMBER OF TOPICS: 22
The lower bound on the log likelihood of the entire corpus: -18170112.909964096
The upper bound on perplexity: 6.5659815286261995


                                                                                

In [93]:
transformed = model.transform(vectorized_tokens)

In [94]:
out = transformed[['subreddit', 'score','created_utc', 'topicDistribution']]

In [95]:
import pandas as pd
pdf = out.coalesce(1).toPandas()

                                                                                

In [96]:
pdf.to_pickle("../../Files/models/topics_n_7_distr.pickle")

In [55]:
topic_range = [4,5,6,7,8,9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20]

In [81]:
with open("../../Files/models/topic_an.txt", "w") as output:
    output.write(results)

In [90]:
# extract vocabulary from CountVectorizer

topics_rdd = topics.rdd
topics_words = topics_rdd\
       .map(lambda row: row['termIndices'])\
       .map(lambda idx_list: [vocab[idx] for idx in idx_list])\
       .collect()
for idx, topic in enumerate(topics_words):
    print("topic: {}".format(idx))
    print("*"*25)
    for word in topic:
       print(word)
    print("*"*25)

topic: 0
*************************
NUM
people
vaccine
like
m
know
think
covid
go
want
get
time
say
work
thing
feel
good
life
need
ve
year
way
come
tell
vaccinate
death
day
right
try
die
*************************
topic: 1
*************************
NUM
people
vaccine
like
m
know
think
covid
go
want
get
time
say
work
thing
need
life
good
feel
year
ve
way
come
death
tell
vaccinate
day
right
die
try
*************************
topic: 2
*************************
NUM
people
vaccine
like
m
know
think
covid
go
want
get
time
say
work
thing
life
feel
good
ve
year
need
way
come
tell
day
vaccinate
death
try
right
die
*************************


In [126]:
model4.save('../../Files/models/topic_a_7_d.pickle')

                                                                                

In [8]:
from pyspark.ml.clustering import DistributedLDAModel

In [9]:
model5 = DistributedLDAModel.load('../../Files/models/topic_a_7_d.pickle')

                                                                                

In [15]:
topics = model5.describeTopics()

                                                                                

In [21]:
vocab = cv_model.vocabulary

In [19]:
topics_rdd = topics.rdd

In [22]:
topics_words = topics_rdd\
       .map(lambda row: row['termIndices'])\
       .map(lambda idx_list: [vocab[idx] for idx in idx_list])\
       .collect()

In [23]:
topics_rdd = topics.rdd
topics_words = topics_rdd\
       .map(lambda row: row['termIndices'])\
       .map(lambda idx_list: [vocab[idx] for idx in idx_list])\
       .collect()
for idx, topic in enumerate(topics_words):
    print("topic: {}".format(idx))
    print("*"*25)
    for word in topic:
       print(word)
    print("*"*25)

topic: 0
*************************
NUM
people
vaccine
like
m
know
think
covid
go
want
*************************
topic: 1
*************************
NUM
people
vaccine
like
m
know
think
covid
go
want
*************************
topic: 2
*************************
NUM
people
vaccine
like
m
know
think
covid
go
want
*************************


In [130]:
model2.save('../../Files/models/topic_a_all.pickle')

In [151]:
model3.save('../../Files/models/topic_p_n.pickle')

In [12]:
lda = LDA(k=7, maxIter=10)