In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

from pyspark.sql.functions import lit

In [0]:
# Load the three review tables and tag every row with the product category it came from
df1 = spark.sql("select * from default.video_games_5")
df1 = df1.withColumn('category', lit("video_games"))

df2 = spark.sql("select * from default.home_and_kitchen_5_small")
df2 = df2.withColumn('category', lit("home_and_kitchen"))

df3 = spark.sql("select * from default.books_5_small")
df3 = df3.withColumn('category', lit("books"))

# unionByName aligns columns by name. A plain union() matches columns by position and
# would silently mix them up if the tables list their columns in a different order.
df = df1.unionByName(df2).unionByName(df3)

# Take a sample (useful for code development purposes)
# df = df.sample(False, 0.15, seed=0)

# Cache the combined frame because the cells below reuse it repeatedly
df = df.cache()

# (row count, column count); the count() action also materialises the cache
print((df.count(), len(df.columns)))

In [0]:
df.count()

In [0]:
df.show()

In [0]:
# Let's look at some quick summary statistics
df.describe().show()

In [0]:
display(df.groupBy("category").count().orderBy("category"))

category,count
books,149667
home_and_kitchen,299513
video_games,72809


In [0]:
display(df.groupBy("overall").count().orderBy("overall"))

overall,count
1.0,26590
2.0,23086
3.0,43741
4.0,91287
5.0,337285


# Label distributions by overall rating

In [0]:
# Sort on every grouping column so the row order is stable between runs
display(df.groupBy("Label", "overall", "verified").count().orderBy("Label", "overall", "verified"))

Label,overall,verified,count
0,1.0,True,79637
0,3.0,True,178513
0,5.0,True,1716914
0,2.0,True,81631
0,5.0,False,220590
0,4.0,False,94577
0,4.0,True,404427
0,2.0,False,21071
0,1.0,False,20979
0,3.0,False,42826


In [0]:
# Sort on every grouping column so the row order is stable between runs
display(df.groupBy("Label", "overall", "category").count().orderBy("Label", "overall", "category"))

Label,overall,category,count
0,5.0,video_games,245934
0,1.0,video_games,16025
0,1.0,home_and_kitchen,72179
0,4.0,video_games,69348
0,4.0,home_and_kitchen,260700
0,3.0,video_games,33958
0,3.0,home_and_kitchen,120379
0,2.0,video_games,14784
0,5.0,books,503038
0,1.0,books,12412


# Look at labels by verified status

In [0]:
# Sort on both grouping columns so the row order is stable between runs
display(df.groupBy("Label", "verified").count().orderBy("Label", "verified"))

Label,verified,count
0,False,400043
0,True,2461122
1,False,255621
1,True,370545


# Look at labels by text word length

In [0]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer
from pyspark.ml import Pipeline

In [0]:
# Tokenizer turns reviewText into a list of tokens ('tokenized');
# StopWordsRemover then writes that list without stop words to 'stopwordremoval'.
tokenizer = Tokenizer(
    inputCol='reviewText',
    outputCol='tokenized',
)
stopwordsRemover = StopWordsRemover(
    inputCol="tokenized",
    outputCol="stopwordremoval",
)

In [0]:
#Create a Data Transformation Pipeline

data_pipeline = Pipeline(stages=[tokenizer, stopwordsRemover])


In [0]:
data_pipelineFit = data_pipeline.fit(df)
df_preprocessed = data_pipelineFit.transform(df)

In [0]:
df_preprocessed.show(5)

In [0]:
from pyspark.sql import functions as f

# Token frequencies after stop-word removal, most common first.
# Kept as a DataFrame: collect() would copy one row per distinct token onto the
# driver, and display() renders a DataFrame directly.
counts = (
    df_preprocessed
    .select(f.explode('stopwordremoval').alias('col'))
    .groupBy('col')
    .count()
    .sort(f.desc('count'))
)
display(counts)

col,count
,4887489
one,1050306
like,905077
book,858635
great,790138
game,727372
good,662377
get,582179
love,550339
really,543759


# Look at reviewText by label

In [0]:
# Book reviews split by label, keeping only the columns needed to read them side by side
_review_columns = ["category", "verified", "summary", "overall", "reviewText", "label"]
_is_book = df["category"] == "books"

helpful_review = df.filter((df["label"] == 1) & _is_book).select(*_review_columns)
unhelpful_review = df.filter((df["label"] == 0) & _is_book).select(*_review_columns)

In [0]:
display(helpful_review)
display(unhelpful_review)

category,verified,summary,overall,reviewText,label
books,False,Let there be Light!,5.0,"While A NIGHT IN THE LONESOME OCTOBER may be Zelazny's most fun book, LORD OF LIGHT is probably the greatest. It's science fiction written in the vocabulary of fantasy. Basically, settlers on an alien world, using advanced technology, create a world corresponding to Hindu mythology, with members of the crew taking on the aspects and powers of Hindu deities. There is even a means of electronically providing for reincarnation. The battles between the gods are epic and the verbal depictions are as good as a mural. The ending is also some of Zelazny's most touching writing, without ever dipping into the maudlin.",1
books,False,learned alot,4.0,This is a biography for kids on Jackie Robinson. He was a famous African american baseball player. He had to work really hard and ignore rude comment from many fans. Most people in his days did not like black and whites playing together. He took the bad and pushed it aside and became a better person and player! The book was not too long and not too short. It contained the right amount of information for kids. I would recommend this book to kids who are learning about the Jackie Robinson. I learned a lot from reading this book. He is a great role model for kids today.,1
books,False,"""What it takes to survive,""",5.0,"The old saying that ""no good deed goes unpunished"" is proven once again in Laura Lippman's new Tess Monaghan novel. Tess is a Baltimore-based private investigator and former reporter who lives in harmony with her boyfriend, Edgar ""Crow"" Ransome, until Crow unexpectedly brings home a troubled teenager named Lloyd Jupiter. Crow is a soft touch and a do-gooder, and even after Lloyd perpetrates a scam on Crow, the kindhearted man decides to give the hungry and homeless boy a decent meal and a roof over his head. Tess is less than thrilled with this arrangement, and she is even more ticked off when Lloyd leaves in the middle of the night, planning to steal her laptop, digital camera, Lexus SUV, and other assorted items. Although Lloyd's scheme doesn't play out exactly as planned, he manages to get away and promptly disappears into the streets of Baltimore. As it turns out, Lloyd may possess key information about the unsolved murder of Gregory Youssef, an Assistant United States Attorney who disappeared and was later found dead after having been stabbed dozens of times. Since he had been assigned to antiterrorism cases and had also prosecuted drug cases, the authorities assumed that Youssef must have been kidnapped and killed by one of the many enemies he made as a prosecutor. However, the murder remains unsolved and soon fades from the headlines. Later, rumors began to surface that Youssef may have been involved in a gay love tryst gone bad. This does not sit well with Youssef's wife, who rejects the idea as absurd. However, on the night that Lloyd stays in her home, Tess notices that when Youssef's name comes up in passing, the young man apparently recognizes it and appears very frightened and guilt-ridden. 
""No Good Deeds"" is a well-crafted story that seamlessly combines a number of themes: the plight of homeless and neglected children, the corrosive nature of police corruption, and the manner in which secrets and lies destroy relationships and lives. The characters are varied, lively, and well delineated. They include Gabe Delasio, a shallow and ambitious Assistant United States Attorney who desperately wants to climb through the ranks, Barry Jenkins, a washed up FBI agent, and Mike ""Bully"" Collins, a tough-as-nails African-American DEA agent who is bitter about having been hung out to dry after he shot a civilian during a drug bust. Each of these men has his own agenda, and when all three join forces to nail Tess and Crow, the two wind up in grave danger. As always, Monaghan brings her native Baltimore to life, with its beauty and ugliness as well as its tranquility and violence, and there are also evocative and atmospheric scenes at the Delaware shore. Lippman's descriptive writing and dialogue are as sharp as ever, and she skillfully integrates the many disparate elements of her intricate plot into a satisfying whole. Lippman does what few suspense writers have the ability to accomplish. She creates intriguing scenarios with recognizable people whose back stories make their behavior, if not commendable, at least understandable. The best mysteries are never formulaic whodunits, but fully realized novels, and ""No Good Deeds"" continues the fine Lippman tradition of respecting her audience's intelligence enough to avoid predictable and hackneyed writing in favor of complex and riveting storytelling.",1
books,True,Copied from a whiteboard in a 1980's Howard Johnson?,2.0,"Don't normally write book reviews but the existing reviews were so far off base, I couldn't stay silent. First of all the book painfully shows its age being written in 1986 when all the business ""gurus"" were touting the success of McDonalds, Coke and IBM as their own. These references were outdated even then. I thought that ""revisited"" (1995 publish date) might include a reference to the internet at least, but I was wrong. The book feels like a 268 page infomercial for the author's consulting company. Once per chapter you can count on: ""we at GERBER Corp"" have the answer to this problem. We'll give an overgeneralized and useless thumbnail of the solution here, in the hopes that you might become a client"" I appreciate the attempt at allegorical style, but it is poorly done. I don't believe for a minute that there ever was a conversation with a ""Sarah"" that runs through the book. With this obvious falsification, the author loses authenticity. Obviously the author is a career consultant and it is painfully obvious: for example the points he makes are always bunched in threes. (because thats the only way stupid people comprehend at hotel seminars) Well what if the discoveries take you in a direction that leads to 5 or 13 points of knowledge? There is no hard data, empirical evidence, test results, or original knowledge - just anecdotes and the author's unsupported claims, likely culled from a morass of 1970s business books and spun into a different yarn. Despite all this, there are some interesting points raised about the qualities needed for starting a business being different from those to run it. Some colorful descriptions here and there which elevate it to two stars bordering on three. In summary: If you want to grow a business larger than a mom and pop and stay profitable over time, you must build systems and processes. The author thinks a lot of small business owners don't know this. 
If you do know this, move on.",1
books,False,"I heard colleagues recommend ""The E-Myth Revisited"" time and time again",5.0,"When I ran a small photography business, I heard colleagues recommend ""The E-Myth Revisited"" time and time again. Yet, years went by, I closed my business due to low revenue, and still, I had not read the book. Until this past week, when I remembered that the book was collecting dust under my bed. I got down on my hands and knees to fish it out and give it a read. I could not put it down. This book has forever changed how I view business, whether it's a small one-person shop or a large Fortune 500 corporation. My favorite concept in the book is the idea of building a prototype. Imagine you are looking to franchise your business. You need to create the structure and systems so that anyone could buy the franchise and successfully run the business. Create an organization chart with the various positions that are necessary for your business to run (e.g. CEO, CFO, VP Marketing, VP Operations, etc.). Each of those positions needs defined responsibilities and expectations as well as an Operating Manual so that anyone could jump into that position and be successful. ""But wait,"" you say. ""I'm the only person who works in my business. This won't help me."" Yes, it will. Instead of arriving at work and just ""getting to work,"" having distinct positions can help you be intentional about your time. Instead of running around haggard only to feel like you got nothing done at the end of the day, you will intentionally put on the appropriate ""hat"" and work on a specific goal before moving on. Why did my photography business close up shop? Because I never wore my marketing hat and would only wear my CFO hat when I scrambled to find documents for taxes. I never wore my CEO hat and took the time to strategically plan the trajectory of my business. The only hat I wore was that of a photographer, assuming that if my pictures were good enough, they would sell themselves. 
So I spent all of my time and money focused on that vague goal. I worked 50-60 hours every week, often with nothing to show by the end of the week. As Zig Ziglar said, If you aim at nothing, you will hit it every time. Having defined roles in a one-person business aids with focus. As you hire people, they can step into these defined roles to lighten your load. With clearly stated goals, responsibilities and expectations, you are giving a new employee the tools to succeed. This concept applies to any type of organization, even at a volunteer level. I've been thinking about setting up a playgroup for my son to get some play time and for me to network with other parents and make friends. My first thought was to run head first and immediately schedule events. But after reading ""The E-Myth,"" I've realized I should first create defined positions, even if I serve in every position at the beginning. It gives other parents the opportunity to step up and host activities that suit their interests. By having set positions and defined expectations for each position, the group can grow to a point of having several events per week without me feeling frazzled and overbooked. I would recommend this book to anyone who leads a business or organization. Seriously. No need for years to pass and you to go out of business before reading it. :) It could completely shatter and rebuild how you define and operate a business.",1
books,True,EXCELLENT BOOK FOR ALL LEVELS OF COOKS,5.0,"Not suprisingly, Donna Hay has done it again. Every one of her books is clear, concise, innovative, and very doable for all skill levels. This was my second purchase of the same book, to give to my 24yr.old son for christmas. It is excellent.",1
books,False,Really superior fantasy adventure,5.0,"Lord Ingrey kin Wolfcliff knows about animal possession. Still, when he arrives at the deathscene of a prince, he is shocked to find the accused killer to be a beautiful woman--who happens to be possessed by a leopard. It's his duty to bring Lady Ijada to the capital where, perhaps, she can find justice. But some compulsion, almost impossible to resist, is urging him to kill her at once. A very different compulsion draws him to her. In a meeting with a priestess, the wolf that dwells within Ingrey becomes active--opening his eyes to the magic that is around him and hinting at one of the ancient heresies--that animal-ridden people have a special path to the deities. But Ingrey and Ijada are not the only people with the forbidden animal inhabitation. Ingrey's cousin, Earl Horseriver, seems inhabited by a stalion--and something more. Whether that something more is good or evil, Ingrey has a hard time determining. Author Lois McMaster Bujold combines fascinating world-building, intriguing and multi-dimensional characters, and plenty of magic and action to deliver a truly superior fantasy. Bujold explains the history of the tortured land, and then weaves that history into the present, and into the task that Ingrey and Ijada must fulfil. THE HALLOWED HUNT is set in the same world as Bujold's earlier (and excellent) CURSE OF CHALION and PALADIN OF SOULS. Although I enjoyed the earlier books, I think HUNT surpasses even those.",1
books,False,Whodunit,2.0,"Ok story, wasn't as funny as I'd hoped. Characters were pretty run of the mill. Too bad, I was young it would be better.",1
books,True,Powerful Anti-War Statement,4.0,"After reading a few of his short stories, I decided to try one of Vonnegut's novels and I was not disappointed. Billy Pilgrim is a character that you won't soon forget. The gruesome war that made him the way he is becomes the central force of the novel as all of Billy's experiences are seen through the prism of his witnessing the bombing of Dresden at the conclusion of WWII. In his mind, he travels through time, inventing an alter life in wihich he escapes from Earth, but perhaps the novel's most clear theme is the innescapable nature of war memories. They will even follow you to other worlds. The language can be rough at times, so I can't recommend this book to younger readers, but for adults, it's worth reading and contemplating on why we wage war with one another. In my opinion, this bests ""Catch-22"" as the ultimate anti-war statement in contemporary literature.",1
books,True,The best Israeli love story ever,5.0,"This is an absolutely wonderful novel. The love story itself is beautiful and real. And the political subtext conveys perfectly what the situation in Israel is today. For anyone who cares about Israel, this will make a painful read. For me, it made me all the more determined to strengthen the wonderful people in Israel (the peace activists) who are trying so hard to save Israel (and the Palestinians) from the current nightmare. If you care about Israel, read this book and learn.",1


category,verified,summary,overall,reviewText,label
books,True,"Had to read it twice, loved it!",5.0,My toddler made me read this 2 times in a row when he got it. Then he pretended to read it to himself. He's a train fanatic but the Berenstein Bears books always seem to have the right amount of story line & language to delight him. I loved adding it to his collection :),0
books,False,"An Average ""I Can Read"" book -- Was feeling nostalgic when I bought it, but my kids don't really enjoy it...",3.0,"Pros: 1. Good for a kid who's a train lover. They not only are taking a train trip, but also get to tour the train and learn about the various parts/people on a train. 2. Age appropriate, family friend story line. Cons: 1. As a kid, I liked the Berenstain bears, but now, as a parent and adult, I feel as though there are many better illustrated options out there. They're illustrated the same way they always have been, but I can tell that my kids aren't as drawn in by these drawings compared to other books produced under the I Can Read series. 2. The reading level feels a little off to me. It's definitely not a ""My First"" level, but it's borderline Level 2? And some of the words introduced aren't common words (like ""caboose"" which is great for this book, probably not going to be useful outside of this book). Though I'm not disappointed to have bought this book, I think there are definitely better ""I Can Read Level 1"" books out there...",0
books,True,Five Stars,5.0,:),0
books,True,Five Stars,5.0,Best book ever.,0
books,True,what a revelation,5.0,"I had no idea what torment Jackie Robinson had to endure to play baseball. A real eye-opener, a valuable lesson in courage and patience. Young people need to know the history of the first African American in the major leagues",0
books,False,love!,5.0,"Enjoyable from start to finish. It seems choppy at times, but that's nearly exactly the point. Would recommend to anyone. The humor can be dry, which is a good thing as far as I'm concerned.",0
books,True,Three Stars,3.0,I expected more from this than I got.,0
books,True,ok,4.0,"This book started out so great, and interesting and then kind of goes on and on",0
books,True,Too weird for me,2.0,"This wasn't a smoothe read for me . In fact it hurt my brain trying to figure out all the weird things going on. I do undertsand the symbolism and all that, but it just wasn't that captivating for me .",0
books,False,Magical mystery tour of eastern religion meets fantasy!,5.0,"Luckily for me, I had a truly adventurous Oriental Religions instructor at San Jose State who made this amazing book required reading. The book unfolds on many levels, and through many readings I am still uncovering new revelations. Roger Zelazny was the best of many gifted sci fi and fantasy writers emerging in the 60's and 70's. I had the profound pleasure of meeting him at a book signing and found him to be bright, modest and interested in his readers. I still have my signed, tattered copy and can recommend this book along with Creatures of Light and Darkness, Jack of Shadows, Eye of Cat, Doorways In Sand, Dream Master et al. Other novels, including the Amber Series, A Night In the Lonesome October, This Immortal and many others, can be enjoyed, shared and collected...thank heavens for the prolific out-pouring! I truly hope this wondrous tale is reprinted, as well as many others. I almost envy those who have never been touched, nay, transported to the multi-layered worlds of Zelazny! They have so much reading (and thinking) pleasure in store...",0


# Compare Labels by ReviewText Length

In [0]:
# display(df_preprocessed.select("stopwordremoval").count())

# 'length' = number of tokens left in each review after stop-word removal.
# A Column has no .length() method, so the array size comes from f.size().
# show() returns None, so it is called separately instead of being assigned.
df_length = df_preprocessed.withColumn('length', f.size(f.col('stopwordremoval')))
df_length.show()



In [0]:
# Review count and mean token count per label
display(df_length.groupBy("Label").agg({'label':'count', 'length':'avg'}))

In [0]:
display(df_length.groupBy("Label", "category").agg({'label':'count', 'length':'avg'}))

In [0]:
df_length.show(5)

# Labels by noun count

In [0]:
df.show(5)

In [0]:
import spacy

# Download the package under the same name that spacy.load() is called with;
# the bare "en" shortcut is not accepted by spaCy 3.
spacy.cli.download("en_core_web_sm")

In [0]:
nlp=spacy.load("en_core_web_sm")

In [0]:
def _entity_texts(text):
    """Return the text of each named entity spaCy finds in `text` (None for null input)."""
    if text is None:
        return None
    # A spaCy Doc cannot be returned from a UDF; convert it to a plain list of
    # strings so the result matches the declared array<string> type.
    return [ent.text for ent in nlp(text).ents]


entity_rec = spark.udf.register("entity_rec", _entity_texts, "array<string>")

In [0]:
# A registered UDF is called with its input column and returns a Column expression;
# it does not take inputCol/outputCol keywords the way an ML transformer does.
# Use it as e.g. df.select("*", detect_entity).
detect_entity = entity_rec("reviewText").alias("entity_text")

In [0]:
data_pipeline = Pipeline(stages=[tokenizer, stopwordsRemover])


In [0]:
data_pipelineFit = data_pipeline.fit(df)
df_preprocessed = data_pipelineFit.transform(df)

In [0]:
from pyspark.sql.functions import *
import spacy
nlp = spacy.load('en_core_web_sm')
# detect_langs = udf(lambda x: detect(x), StringType())

In [0]:
# The UDF is declared as StringType, so return text rather than the spaCy Doc itself:
# the named entities found in the review, comma-separated (null input stays null).
detect_entity = udf(
    lambda x: ", ".join(ent.text for ent in nlp(x).ents) if x is not None else None,
    StringType(),
)

In [0]:
doc = nlp("Tesla, Inc. is an American electric vehicle and clean energy company based in Palo Alto, California")

In [0]:
doc.ents

# Labels by avg. character length

In [0]:
df_length.show(5)

In [0]:
# df_length = df.withColumn('length', f.size(f.split(f.col('reviewText'), ' ')))

# Character count of the raw review text, added next to the existing 'length' column
df_lazy_complexity = df_length.withColumn(
    'char_length',
    f.length('reviewText'),
)

In [0]:
# lazy_complexity = characters per token: char_length divided by the 'length' column
_chars_per_token = f.col('char_length') / f.col('length')
df_lazy_complexity = df_lazy_complexity.withColumn('lazy_complexity', _chars_per_token)

In [0]:
df_lazy_complexity.show()

In [0]:
display(df_lazy_complexity.groupBy("Label").agg({'label':'count', 'lazy_complexity':'avg'}))

Label,avg(lazy_complexity),count(label)
1,5.372918761182195,94177
0,5.336645721928205,427812


In [0]:
display(df_lazy_complexity.groupBy("Label", "category").agg({'label':'count', 'lazy_complexity':'avg'}))

Label,category,avg(lazy_complexity),count(label)
0,video_games,5.302033684938883,56805
1,video_games,5.352146561241189,16004
0,home_and_kitchen,5.291023243624575,255342
1,home_and_kitchen,5.240831494041932,44171
1,books,5.55428647380817,34002
0,books,5.454360429196475,115665


# Create a Data Transformation Pipeline

In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover
from pyspark.sql import functions as f

# We'll tokenize the text using a simple RegexTokenizer (split on non-word characters)
tokenizer = RegexTokenizer(inputCol="reviewText", outputCol="words", pattern="\\W")

# Remove standard Stopwords
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered")

pipeline = Pipeline(stages=[tokenizer, stopwordsRemover])

# This cell overwrites df, so drop any 'words'/'filtered' columns left by a previous
# run first; otherwise re-running the cell fails because the output columns already
# exist. drop() ignores columns that are not present, so the first run is unchanged.
_df_pipeline_input = df.drop("words", "filtered")
pipelineFit = pipeline.fit(_df_pipeline_input)
df = pipelineFit.transform(_df_pipeline_input)

In [0]:
# 80/20 train/test split; the fixed seed keeps the split reproducible
trainingData, testingData = df.randomSplit([0.8, 0.2], seed=47)
print(f"Training Dataset Count: {trainingData.count()}")
print(f"Test Dataset Count: {testingData.count()}")



In [0]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import CountVectorizer

# LogisticRegression reads a vector column called "features", which the tokenised
# frame does not have yet, so build bag-of-words vectors from the stop-word-filtered
# tokens first. The vocabulary is learned from the training split only.
# NOTE(review): vocabSize/minDF are starting values, not tuned — adjust as needed.
lrFeaturizer = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5.0)
lrFeaturizerModel = lrFeaturizer.fit(trainingData)
trainingFeatures = lrFeaturizerModel.transform(trainingData)

# Ridge-regularised logistic regression (elasticNetParam=0 means pure L2)
lr = LogisticRegression(featuresCol="features", labelCol="label", maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingFeatures)

# Get Term Frequencies

In [0]:
# Term frequencies over the stop-word-filtered tokens, most common first.
# Left as a DataFrame (no collect()) so the full vocabulary is not copied to the driver.
counts = (
    df.select(f.explode('filtered').alias('col'))
    .groupBy('col')
    .count()
    .sort(f.desc('count'))
)
display(counts)

In [0]:
from pyspark.ml.feature import RegexTokenizer

# Split reviewText on non-word characters into the 'reviewWord' column
regexTokenizer = RegexTokenizer(
    inputCol="reviewText",
    outputCol="reviewWord",
    pattern="\\W",
)

# Null review text is replaced with "" before tokenising
_reviews_without_null_text = df.fillna("", subset=["reviewText"])
amazon_review_tokenized = regexTokenizer.transform(_reviews_without_null_text)

In [0]:
amazon_review_tokenized.select('reviewWord').show(5)

In [0]:
from pyspark.ml.feature import CountVectorizer

# Fit a CountVectorizerModel from the review corpus.
# vocabSize was 3, which only suits a toy corpus with three distinct tokens; on real
# reviews it keeps just the three most frequent tokens. minDF=2.0 drops tokens that
# appear in fewer than two reviews.
# NOTE(review): 10000 is a starting value, not tuned.
cv = CountVectorizer(inputCol="reviewWord", outputCol="features", vocabSize=10000, minDF=2.0)

model = cv.fit(amazon_review_tokenized)

result = model.transform(amazon_review_tokenized)
result.show(truncate=False)

In [0]:
result.show(5)

In [0]:
interpret_tfidf.select("TF-IDF").show(5)

In [0]:
from pyspark.ml.clustering import LDA

num_topics = 3
max_iterations = 10

# Pass the settings defined above instead of repeating the literals, so changing
# num_topics / max_iterations actually changes the model.
# NOTE(review): interpret_tfidf (with a "TF-IDF" vector column) is not created in the
# cells shown here — confirm it is defined before this cell runs.
lda = LDA(k=num_topics, seed=1024, optimizer="em", featuresCol="TF-IDF", maxIter=max_iterations)
model = lda.fit(interpret_tfidf)

In [0]:
topics = model.describeTopics()

In [0]:
topics.show(5)

In [0]:
vocab = tf_model.vocabulary
vocab_broadcast = sc.broadcast(vocab)

def map_termID_to_Word(termIndices):
    """Look up each term index in the broadcast vocabulary and return the words, in order."""
    return [vocab_broadcast.value[term_id] for term_id in termIndices]

udf_map_termID_to_Word = udf(map_termID_to_Word , ArrayType(StringType()))
ldatopics_mapped = topics.withColumn("topic_desc", udf_map_termID_to_Word(topics.termIndices))

In [0]:
ldatopics_mapped.show(truncate=False)

In [0]:
model.describeTopics(maxTermsPerTopic = 10).show()

In [0]:
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer
from transformers import pipeline

import pyspark.sql.functions as func
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.functions import udf
# StringType lives in pyspark.sql.types; importing it from pyspark.sql.functions only
# works where that module happens to re-export it.
from pyspark.sql.types import DoubleType, StringType

# NOTE(review): the transformers import rebinds the name `pipeline`, which an earlier
# cell uses for a pyspark.ml Pipeline object.
# Default Hugging Face sentiment-analysis pipeline.
classifier = pipeline('sentiment-analysis')


def polarity_detection(text):
    """Return TextBlob's polarity score for `text`, or None when `text` is null."""
    # TextBlob raises TypeError on non-string input, which would fail the whole Spark
    # job on the first null review; returning None yields a null in the output column.
    if text is None:
        return None
    return TextBlob(text).sentiment.polarity


def subjectivity_detection(text):
    """Return TextBlob's subjectivity score for `text`, or None when `text` is null."""
    if text is None:
        return None
    return TextBlob(text).sentiment.subjectivity


def text_classification(words):
    """Add 'polarity' and 'subjectivity' columns computed from 'reviewText'.

    Args:
        words: Spark DataFrame with a string 'reviewText' column.

    Returns:
        The input DataFrame with two extra DoubleType columns.
    """
    # polarity detection
    polarity_detection_udf = udf(polarity_detection, DoubleType())
    words = words.withColumn("polarity", polarity_detection_udf("reviewText"))
    # subjectivity detection
    subjectivity_detection_udf = udf(subjectivity_detection, DoubleType())
    words = words.withColumn("subjectivity", subjectivity_detection_udf("reviewText"))
    return words


In [0]:
semtimentResult = text_classification(df)
semtimentResult.show(5)

In [0]:
display(semtimentResult.select("Label", "polarity"))


Label,polarity
0,0.0361111111111111
0,0.0898148148148147
0,0.0499999999999999
0,-0.3333333333333333
0,0.3
0,0.0499999999999999
0,-0.1166666666666666
0,-0.1085434173669467
0,0.0319444444444444
0,0.2670454545454545


In [0]:
display(semtimentResult.select("Label", "subjectivity"))

Label,subjectivity
0,0.5638888888888889
0,0.4157407407407408
0,0.45
0,0.6
0,0.5833333333333334
0,0.5806122448979593
0,0.628888888888889
0,0.525280112044818
0,0.3731481481481482
0,0.3818181818181818


In [0]:
# Subjectivity scores for reviews with label 1 (filter first, then project)
display(semtimentResult.filter(col("label") == 1).select("subjectivity"))

subjectivity
0.4105158730158729
0.4316368734510328
0.519426406926407
0.6675000000000001
0.3166666666666666
0.5583333333333333
0.4360294117647059
0.4875000000000001
0.55
0.5555555555555556


In [0]:
# Subjectivity scores for reviews with label 0 (filter first, then project)
display(semtimentResult.filter(col("label") == 0).select("subjectivity"))

subjectivity
0.45
0.3818181818181818
0.6624999999999999
0.5375
0.4053030303030303
0.75
1.0
0.6000000000000001
0.7799999999999999
0.6883333333333334
