# Pyspark NLP

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer, StopWordsRemover
from pyspark import SparkContext

In [2]:
sc=SparkContext()

In [3]:
sc

In [4]:
spark=SparkSession.builder.master("local[1]").getOrCreate()

In [5]:
spark

In [6]:
df_review=spark.read.option("inferSchema",True).option("header",True).csv("c:/csv-ml/sentiment/K8 Reviews v0.2.csv")

In [7]:
df_review.show()

+---------+--------------------+
|sentiment|              review|
+---------+--------------------+
|        1|Good but need upd...|
|        0|Worst mobile i ha...|
|        1|when I will get m...|
|        1|                Good|
|        0|The worst phone e...|
|        0|Only I'm telling ...|
|        1|Phone is awesome....|
|        0|The battery level...|
|        0|It's over hitting...|
|        0|A lot of glitches...|
|        0|               Wrost|
|        1|Good phone but ch...|
|        0|Don't purchase th...|
|        0|I have faced the ...|
|        1|Very good phone s...|
|        0|headset is not av...|
|        0|every time automa...|
|        1|Best product acco...|
|        0|Battery draining ...|
|        1|     Good smartphone|
+---------+--------------------+
only showing top 20 rows



In [8]:
df_review.show(5)

+---------+--------------------+
|sentiment|              review|
+---------+--------------------+
|        1|Good but need upd...|
|        0|Worst mobile i ha...|
|        1|when I will get m...|
|        1|                Good|
|        0|The worst phone e...|
+---------+--------------------+
only showing top 5 rows



In [None]:
df_review.select("review").show(10)

In [9]:
from pyspark.ml.feature import Tokenizer

In [10]:
from pyspark.ml.feature import StopWordsRemover

# removing punctuations

In [11]:

from pyspark.sql.functions import regexp_replace


In [12]:
regpat=r"""[!\"#$%&'()*+,\-.\/:;<=>?@\[\\\]^_`{|}~]"""

In [13]:
df_nopunct=df_review.withColumn("review_nopunct",regexp_replace("review",regpat,""))

In [15]:
df_nopunct.select("review").collect()

[Row(review='Good but need updates and improvements'),
 Row(review="Worst mobile i have bought ever, Battery is draining like hell, backup is only 6 to 7 hours with internet uses, even if I put mobile idle its getting discharged.This is biggest lie from Amazon & Lenove which is not at all expected, they are making full by saying that battery is 4000MAH & booster charger is fake, it takes at least 4 to 5 hours to be fully charged.Don't know how Lenovo will survive by making full of us.Please don;t go for this else you will regret like me."),
 Row(review='when I will get my 10% cash back.... its already 15 January..'),
 Row(review='Good'),
 Row(review='The worst phone everThey have changed the last phone but the problem is still same and the amazon is not returning the phone .Highly disappointing of amazon'),
 Row(review="Only I'm telling don't buyI'm totally disappointedPoor batteryPoor cameraWaste of money"),
 Row(review='Phone is awesome. But while charging, it heats up allot..Really 

In [156]:
df_nopunct.select("review_nopunct").collect()[0]

Row(review_nopunct='Good but need updates and improvements')

# removing digits

In [18]:
regpat1=r"""[\d+]"""

In [19]:
df_nodigit=df_nopunct.withColumn("review_nodigit",regexp_replace("review_nopunct",regpat1,""))

In [20]:
df_nodigit.show()

+---------+--------------------+--------------------+--------------------+
|sentiment|              review|      review_nopunct|      review_nodigit|
+---------+--------------------+--------------------+--------------------+
|        1|Good but need upd...|Good but need upd...|Good but need upd...|
|        0|Worst mobile i ha...|Worst mobile i ha...|Worst mobile i ha...|
|        1|when I will get m...|when I will get m...|when I will get m...|
|        1|                Good|                Good|                Good|
|        0|The worst phone e...|The worst phone e...|The worst phone e...|
|        0|Only I'm telling ...|Only Im telling d...|Only Im telling d...|
|        1|Phone is awesome....|Phone is awesome ...|Phone is awesome ...|
|        0|The battery level...|The battery level...|The battery level...|
|        0|It's over hitting...|Its over hitting ...|Its over hitting ...|
|        0|A lot of glitches...|A lot of glitches...|A lot of glitches...|
|        0|              

In [157]:
df_nodigit.select("review_nodigit").collect()[0:10]

[Row(review_nodigit='Good but need updates and improvements'),
 Row(review_nodigit='Worst mobile i have bought ever Battery is draining like hell backup is only  to  hours with internet uses even if I put mobile idle its getting dischargedThis is biggest lie from Amazon  Lenove which is not at all expected they are making full by saying that battery is MAH  booster charger is fake it takes at least  to  hours to be fully chargedDont know how Lenovo will survive by making full of usPlease dont go for this else you will regret like me'),
 Row(review_nodigit='when I will get my  cash back its already  January'),
 Row(review_nodigit='Good'),
 Row(review_nodigit='The worst phone everThey have changed the last phone but the problem is still same and the amazon is not returning the phone Highly disappointing of amazon'),
 Row(review_nodigit='Only Im telling dont buyIm totally disappointedPoor batteryPoor cameraWaste of money'),
 Row(review_nodigit='Phone is awesome But while charging it heats

# convert to lowercase

In [22]:
from pyspark.sql.functions import lower

In [23]:
df_lower=df_nodigit.withColumn("review_lower",lower(df_nodigit["review_nodigit"]))

In [158]:
df_lower.select("review","review_lower").show(10)

+--------------------+--------------------+
|              review|        review_lower|
+--------------------+--------------------+
|Good but need upd...|good but need upd...|
|Worst mobile i ha...|worst mobile i ha...|
|when I will get m...|when i will get m...|
|                Good|                good|
|The worst phone e...|the worst phone e...|
|Only I'm telling ...|only im telling d...|
|Phone is awesome....|phone is awesome ...|
|The battery level...|the battery level...|
|It's over hitting...|its over hitting ...|
|A lot of glitches...|a lot of glitches...|
+--------------------+--------------------+
only showing top 10 rows



# tokenization

In [25]:
tokenized_model=Tokenizer(inputCol="review_lower",outputCol="tokenized_review")

In [26]:
df_tokenized=tokenized_model.transform(df_lower)

In [27]:
df_tokenized.show(10)

+---------+--------------------+--------------------+--------------------+--------------------+--------------------+
|sentiment|              review|      review_nopunct|      review_nodigit|        review_lower|    tokenized_review|
+---------+--------------------+--------------------+--------------------+--------------------+--------------------+
|        1|Good but need upd...|Good but need upd...|Good but need upd...|good but need upd...|[good, but, need,...|
|        0|Worst mobile i ha...|Worst mobile i ha...|Worst mobile i ha...|worst mobile i ha...|[worst, mobile, i...|
|        1|when I will get m...|when I will get m...|when I will get m...|when i will get m...|[when, i, will, g...|
|        1|                Good|                Good|                Good|                good|              [good]|
|        0|The worst phone e...|The worst phone e...|The worst phone e...|the worst phone e...|[the, worst, phon...|
|        0|Only I'm telling ...|Only Im telling d...|Only Im tel

In [159]:
df_tokenized.select("review","tokenized_review").show(10)

+--------------------+--------------------+
|              review|    tokenized_review|
+--------------------+--------------------+
|Good but need upd...|[good, but, need,...|
|Worst mobile i ha...|[worst, mobile, i...|
|when I will get m...|[when, i, will, g...|
|                Good|              [good]|
|The worst phone e...|[the, worst, phon...|
|Only I'm telling ...|[only, im, tellin...|
|Phone is awesome....|[phone, is, aweso...|
|The battery level...|[the, battery, le...|
|It's over hitting...|[its, over, hitti...|
|A lot of glitches...|[a, lot, of, glit...|
+--------------------+--------------------+
only showing top 10 rows



# stopwords removal

In [29]:
stopword_rem_model=StopWordsRemover(inputCol="tokenized_review",outputCol="swr_review")

In [30]:
df_swr_review=stopword_rem_model.transform(df_tokenized)

In [31]:
df_swr_review.show(5)

+---------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|sentiment|              review|      review_nopunct|      review_nodigit|        review_lower|    tokenized_review|          swr_review|
+---------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|        1|Good but need upd...|Good but need upd...|Good but need upd...|good but need upd...|[good, but, need,...|[good, need, upda...|
|        0|Worst mobile i ha...|Worst mobile i ha...|Worst mobile i ha...|worst mobile i ha...|[worst, mobile, i...|[worst, mobile, b...|
|        1|when I will get m...|when I will get m...|when I will get m...|when i will get m...|[when, i, will, g...|[get, , cash, bac...|
|        1|                Good|                Good|                Good|                good|              [good]|              [good]|
|        0|The worst phone e...|Th

In [32]:
df_swr_review.select("review","swr_review").show()

+--------------------+--------------------+
|              review|          swr_review|
+--------------------+--------------------+
|Good but need upd...|[good, need, upda...|
|Worst mobile i ha...|[worst, mobile, b...|
|when I will get m...|[get, , cash, bac...|
|                Good|              [good]|
|The worst phone e...|[worst, phone, ev...|
|Only I'm telling ...|[im, telling, don...|
|Phone is awesome....|[phone, awesome, ...|
|The battery level...|[battery, level, ...|
|It's over hitting...|[hitting, problem...|
|A lot of glitches...|[lot, glitches, d...|
|               Wrost|             [wrost]|
|Good phone but ch...|[good, phone, cha...|
|Don't purchase th...|[dont, purchase, ...|
|I have faced the ...|[faced, battery, ...|
|Very good phone s...|[good, phone, sli...|
|headset is not av...|[headset, available]|
|every time automa...|[every, time, aut...|
|Best product acco...|[best, product, a...|
|Battery draining ...|[battery, drainin...|
|     Good smartphone|  [good, s

# lemmatisation

In [33]:
import nltk
from nltk.stem.wordnet import WordNetLemmatizer

In [34]:
df_pandas_lemma=df_swr_review.toPandas()

  from pandas.core import (


In [35]:
df_pandas_lemma.head()

Unnamed: 0,sentiment,review,review_nopunct,review_nodigit,review_lower,tokenized_review,swr_review
0,1,Good but need updates and improvements,Good but need updates and improvements,Good but need updates and improvements,good but need updates and improvements,"[good, but, need, updates, and, improvements]","[good, need, updates, improvements]"
1,0,"Worst mobile i have bought ever, Battery is dr...",Worst mobile i have bought ever Battery is dra...,Worst mobile i have bought ever Battery is dra...,worst mobile i have bought ever battery is dra...,"[worst, mobile, i, have, bought, ever, battery...","[worst, mobile, bought, ever, battery, drainin..."
2,1,when I will get my 10% cash back.... its alrea...,when I will get my 10 cash back its already 15...,when I will get my cash back its already Jan...,when i will get my cash back its already jan...,"[when, i, will, get, my, , cash, back, its, al...","[get, , cash, back, already, , january]"
3,1,Good,Good,Good,good,[good],[good]
4,0,The worst phone everThey have changed the last...,The worst phone everThey have changed the last...,The worst phone everThey have changed the last...,the worst phone everthey have changed the last...,"[the, worst, phone, everthey, have, changed, t...","[worst, phone, everthey, changed, last, phone,..."


In [36]:
lemma=[]
for i in df_pandas_lemma["swr_review"]:
    lemma_row=[]
    for j in i:
        lemma_row.append(WordNetLemmatizer().lemmatize(j))
    lemma.append(lemma_row)
    

In [161]:
lemma[0:5]

[['good', 'need', 'update', 'improvement'],
 ['worst',
  'mobile',
  'bought',
  'ever',
  'battery',
  'draining',
  'like',
  'hell',
  'backup',
  '',
  '',
  'hour',
  'internet',
  'us',
  'even',
  'put',
  'mobile',
  'idle',
  'getting',
  'dischargedthis',
  'biggest',
  'lie',
  'amazon',
  '',
  'lenove',
  'expected',
  'making',
  'full',
  'saying',
  'battery',
  'mah',
  '',
  'booster',
  'charger',
  'fake',
  'take',
  'least',
  '',
  '',
  'hour',
  'fully',
  'chargeddont',
  'know',
  'lenovo',
  'survive',
  'making',
  'full',
  'usplease',
  'dont',
  'go',
  'else',
  'regret',
  'like'],
 ['get', '', 'cash', 'back', 'already', '', 'january'],
 ['good'],
 ['worst',
  'phone',
  'everthey',
  'changed',
  'last',
  'phone',
  'problem',
  'still',
  'amazon',
  'returning',
  'phone',
  'highly',
  'disappointing',
  'amazon']]

In [38]:
df_pandas_lemma["lemmatized_review"]=lemma

In [39]:
df_pandas_lemma.head()

Unnamed: 0,sentiment,review,review_nopunct,review_nodigit,review_lower,tokenized_review,swr_review,lemmatized_review
0,1,Good but need updates and improvements,Good but need updates and improvements,Good but need updates and improvements,good but need updates and improvements,"[good, but, need, updates, and, improvements]","[good, need, updates, improvements]","[good, need, update, improvement]"
1,0,"Worst mobile i have bought ever, Battery is dr...",Worst mobile i have bought ever Battery is dra...,Worst mobile i have bought ever Battery is dra...,worst mobile i have bought ever battery is dra...,"[worst, mobile, i, have, bought, ever, battery...","[worst, mobile, bought, ever, battery, drainin...","[worst, mobile, bought, ever, battery, drainin..."
2,1,when I will get my 10% cash back.... its alrea...,when I will get my 10 cash back its already 15...,when I will get my cash back its already Jan...,when i will get my cash back its already jan...,"[when, i, will, get, my, , cash, back, its, al...","[get, , cash, back, already, , january]","[get, , cash, back, already, , january]"
3,1,Good,Good,Good,good,[good],[good],[good]
4,0,The worst phone everThey have changed the last...,The worst phone everThey have changed the last...,The worst phone everThey have changed the last...,the worst phone everthey have changed the last...,"[the, worst, phone, everthey, have, changed, t...","[worst, phone, everthey, changed, last, phone,...","[worst, phone, everthey, changed, last, phone,..."


In [40]:
df_pandas_lemma.head()

Unnamed: 0,sentiment,review,review_nopunct,review_nodigit,review_lower,tokenized_review,swr_review,lemmatized_review
0,1,Good but need updates and improvements,Good but need updates and improvements,Good but need updates and improvements,good but need updates and improvements,"[good, but, need, updates, and, improvements]","[good, need, updates, improvements]","[good, need, update, improvement]"
1,0,"Worst mobile i have bought ever, Battery is dr...",Worst mobile i have bought ever Battery is dra...,Worst mobile i have bought ever Battery is dra...,worst mobile i have bought ever battery is dra...,"[worst, mobile, i, have, bought, ever, battery...","[worst, mobile, bought, ever, battery, drainin...","[worst, mobile, bought, ever, battery, drainin..."
2,1,when I will get my 10% cash back.... its alrea...,when I will get my 10 cash back its already 15...,when I will get my cash back its already Jan...,when i will get my cash back its already jan...,"[when, i, will, get, my, , cash, back, its, al...","[get, , cash, back, already, , january]","[get, , cash, back, already, , january]"
3,1,Good,Good,Good,good,[good],[good],[good]
4,0,The worst phone everThey have changed the last...,The worst phone everThey have changed the last...,The worst phone everThey have changed the last...,the worst phone everthey have changed the last...,"[the, worst, phone, everthey, have, changed, t...","[worst, phone, everthey, changed, last, phone,...","[worst, phone, everthey, changed, last, phone,..."


In [41]:
df_swr_review.show()

+---------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|sentiment|              review|      review_nopunct|      review_nodigit|        review_lower|    tokenized_review|          swr_review|
+---------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|        1|Good but need upd...|Good but need upd...|Good but need upd...|good but need upd...|[good, but, need,...|[good, need, upda...|
|        0|Worst mobile i ha...|Worst mobile i ha...|Worst mobile i ha...|worst mobile i ha...|[worst, mobile, i...|[worst, mobile, b...|
|        1|when I will get m...|when I will get m...|when I will get m...|when i will get m...|[when, i, will, g...|[get, , cash, bac...|
|        1|                Good|                Good|                Good|                good|              [good]|              [good]|
|        0|The worst phone e...|Th

In [42]:
 df_swr_review.count()

14675

In [43]:
 df_swr_review.select("swr_review").show(5)

+--------------------+
|          swr_review|
+--------------------+
|[good, need, upda...|
|[worst, mobile, b...|
|[get, , cash, bac...|
|              [good]|
|[worst, phone, ev...|
+--------------------+
only showing top 5 rows



# df_cleaned not null


In [44]:
#df_cleaned not null

df_cleaned= df_swr_review.filter( df_swr_review["swr_review"].isNotNull())

In [45]:
df_cleaned.count()

14675

In [46]:
col=df_cleaned.select("swr_review").rdd.flatMap(lambda x: x).collect()

In [162]:
col[0:5]

[['good', 'need', 'updates', 'improvements'],
 ['worst',
  'mobile',
  'bought',
  'ever',
  'battery',
  'draining',
  'like',
  'hell',
  'backup',
  '',
  '',
  'hours',
  'internet',
  'uses',
  'even',
  'put',
  'mobile',
  'idle',
  'getting',
  'dischargedthis',
  'biggest',
  'lie',
  'amazon',
  '',
  'lenove',
  'expected',
  'making',
  'full',
  'saying',
  'battery',
  'mah',
  '',
  'booster',
  'charger',
  'fake',
  'takes',
  'least',
  '',
  '',
  'hours',
  'fully',
  'chargeddont',
  'know',
  'lenovo',
  'survive',
  'making',
  'full',
  'usplease',
  'dont',
  'go',
  'else',
  'regret',
  'like'],
 ['get', '', 'cash', 'back', 'already', '', 'january'],
 ['good'],
 ['worst',
  'phone',
  'everthey',
  'changed',
  'last',
  'phone',
  'problem',
  'still',
  'amazon',
  'returning',
  'phone',
  'highly',
  'disappointing',
  'amazon']]

In [48]:
lemmatizer=WordNetLemmatizer()

In [49]:
lemma_col=[]
for i in col:
    row=[]
    for j in i:
        row.append(WordNetLemmatizer().lemmatize(j))
    lemma_col.append(row)

In [163]:
lemma_col[0:5]

[['good', 'need', 'update', 'improvement'],
 ['worst',
  'mobile',
  'bought',
  'ever',
  'battery',
  'draining',
  'like',
  'hell',
  'backup',
  '',
  '',
  'hour',
  'internet',
  'us',
  'even',
  'put',
  'mobile',
  'idle',
  'getting',
  'dischargedthis',
  'biggest',
  'lie',
  'amazon',
  '',
  'lenove',
  'expected',
  'making',
  'full',
  'saying',
  'battery',
  'mah',
  '',
  'booster',
  'charger',
  'fake',
  'take',
  'least',
  '',
  '',
  'hour',
  'fully',
  'chargeddont',
  'know',
  'lenovo',
  'survive',
  'making',
  'full',
  'usplease',
  'dont',
  'go',
  'else',
  'regret',
  'like'],
 ['get', '', 'cash', 'back', 'already', '', 'january'],
 ['good'],
 ['worst',
  'phone',
  'everthey',
  'changed',
  'last',
  'phone',
  'problem',
  'still',
  'amazon',
  'returning',
  'phone',
  'highly',
  'disappointing',
  'amazon']]

In [51]:
df_cleaned_pd=df_cleaned.toPandas()

In [52]:
df_cleaned_pd["lemma_col"]=lemma_col

In [53]:
df_cleaned_lemma=spark.createDataFrame(df_cleaned_pd)

In [54]:
df_cleaned_lemma.show()

+---------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|sentiment|              review|      review_nopunct|      review_nodigit|        review_lower|    tokenized_review|          swr_review|           lemma_col|
+---------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|        1|Good but need upd...|Good but need upd...|Good but need upd...|good but need upd...|[good, but, need,...|[good, need, upda...|[good, need, upda...|
|        0|Worst mobile i ha...|Worst mobile i ha...|Worst mobile i ha...|worst mobile i ha...|[worst, mobile, i...|[worst, mobile, b...|[worst, mobile, b...|
|        1|when I will get m...|when I will get m...|when I will get m...|when i will get m...|[when, i, will, g...|[get, , cash, bac...|[get, , cash, bac...|
|        1|                Good|              

In [55]:
df_cleaned_final=df_cleaned_lemma.select("lemma_col","sentiment")

In [56]:
df_cleaned_final.show()

+--------------------+---------+
|           lemma_col|sentiment|
+--------------------+---------+
|[good, need, upda...|        1|
|[worst, mobile, b...|        0|
|[get, , cash, bac...|        1|
|              [good]|        1|
|[worst, phone, ev...|        0|
|[im, telling, don...|        0|
|[phone, awesome, ...|        1|
|[battery, level, ...|        0|
|[hitting, problem...|        0|
|[lot, glitch, don...|        0|
|             [wrost]|        0|
|[good, phone, cha...|        1|
|[dont, purchase, ...|        0|
|[faced, battery, ...|        0|
|[good, phone, sli...|        1|
|[headset, available]|        0|
|[every, time, aut...|        0|
|[best, product, a...|        1|
|[battery, drainin...|        0|
|  [good, smartphone]|        1|
+--------------------+---------+
only showing top 20 rows



# Countvectorizer

In [110]:
from pyspark.ml.feature import CountVectorizer

In [111]:
cv_model=CountVectorizer(inputCol="lemma_col",outputCol="features")

In [112]:
df_countvector=cv_model.fit(df_cleaned_final)

In [113]:
df_countvector=df_countvector.transform(df_cleaned_final)

In [114]:
df_countvector=df_countvector.withColumnRenamed("sentiment","label")

In [115]:
train_cv,test_cv=df_countvector.randomSplit([.7,.3])

In [116]:
lr = LogisticRegression(labelCol="label", featuresCol="features")
model = lr.fit(train_cv)
print ("Training Done")

Training Done


In [117]:
train_cv.show()

+--------------------+-----+--------------------+
|           lemma_col|label|            features|
+--------------------+-----+--------------------+
|                  []|    0|       (16320,[],[])|
|[, month, started...|    0|(16320,[1,3,17,39...|
|[, primary, camer...|    0|(16320,[0,1,4,5,7...|
|[, touch, tacky, ...|    0|(16320,[0,1,6,14,...|
|[absolutely, nons...|    0|(16320,[1,24,138,...|
|[accept, phone, e...|    0|(16320,[0,120,222...|
|        [affordable]|    1| (16320,[858],[1.0])|
|[almost, , week, ...|    1|(16320,[0,1,2,3,4...|
|[almost, k, note,...|    0|(16320,[1,2,4,6,7...|
|[amazing, experie...|    1|(16320,[2,4,8,12,...|
|[amazing, mbl, ca...|    1|(16320,[0,2,4,61,...|
|[amazing, perform...|    1|(16320,[25,109],[...|
|    [amazing, phone]|    1|(16320,[0,109],[1...|
|[amazing, product...|    1|(16320,[2,5,6,26,...|
|[amazing, speed, ...|    1|(16320,[0,1,2,3,4...|
|   [apple, se, best]|    1|(16320,[28,557,29...|
|      [asome, phone]|    0|(16320,[0,4821],[...|


In [118]:
prediction_cv = model.transform(test_cv)


In [119]:
prediction_cv.show()

+--------------------+-----+--------------------+--------------------+--------------------+----------+
|           lemma_col|label|            features|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+--------------------+----------+
|[, day, usage, fi...|    0|(16320,[0,1,7,8,1...|[119.152690457252...|           [1.0,0.0]|       0.0|
|[, great, phone, ...|    1|(16320,[0,1,3,4,1...|[5.42344747328488...|[0.99560746899715...|       0.0|
|[able, download, ...|    0|(16320,[243,505,5...|[13.2880219449424...|[0.99999830533188...|       0.0|
|[absolutely, usel...|    0|(16320,[0,3,18,27...|[40.9442125789744...|           [1.0,0.0]|       0.0|
|[almost, average,...|    0|(16320,[0,1,2,4,1...|[-18.424286897949...|[9.96400330439610...|       1.0|
|[always, trusted,...|    0|(16320,[0,3,21,24...|[66.6716932117345...|           [1.0,0.0]|       0.0|
|           [amazing]|    1| (16320,[109],[1.0])|[-29.067439178758...|[2.

In [120]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [121]:
eval1=BinaryClassificationEvaluator(labelCol="label",rawPredictionCol="prediction",metricName="areaUnderROC")

In [122]:
eval1.evaluate(prediction_cv)

0.7729334541422306