In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer
from pyspark.ml.feature import RegexTokenizer, CountVectorizer, Tokenizer
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import  IDF
from pyspark.ml import Pipeline
from pyspark.ml.feature import StopWordsRemover
from pyspark.sql.functions import desc
from pyspark.ml.clustering import KMeans
from pyspark.ml.clustering import LDA
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.clustering import BisectingKMeans
from pyspark.ml.classification import LogisticRegression
import pyspark.sql.functions as f
from pyspark.sql.functions import countDistinct
from pyspark.sql.functions import lit
from pyspark.ml.linalg import Vectors, VectorUDT

In [2]:
%time
spark = SparkSession \
    .builder \
    .master('local[*]') \
    .appName("Text_Classification") \
    .config("spark.executor.memory", "8g") \
    .getOrCreate()

spark.sparkContext.setLogLevel('WARN')

print(spark.version)

Wall time: 0 ns
2.3.2


In [3]:
path = '/Users/User/PycharmProjects/spark/animals_comments.csv'
data = spark.read.csv([path])
print("Number of documents read in is:", data.count())

Number of documents read in is: 5820036


In [4]:
%%time
data = data.withColumnRenamed('_c0', 'chan_owner').withColumnRenamed('_c1', 'id_comentator').withColumnRenamed('_c2', 'text')
data.show(5, truncate = False)

+-------------+-------------+-----------------------------------------------------------------------------------------------------------------------+
|chan_owner   |id_comentator|text                                                                                                                   |
+-------------+-------------+-----------------------------------------------------------------------------------------------------------------------+
|creator_name |userid       |comment                                                                                                                |
|Doug The Pug | 87          |I shared this to my friends and mom the were lol                                                                       |
|Doug The Pug | 87          |Super cute  😀🐕🐶                                                                                                     |
|bulletproof  | 530         |stop saying get em youre literally dumb . have some common sense or dont o

In [5]:
data.groupBy("id_comentator") \
    .count() \
    .orderBy(col("count").desc()) \
    .show(n=50, truncate=40)

+-------------+-----+
|id_comentator|count|
+-------------+-----+
|      2036522| 2571|
|       455571| 2159|
|       569313| 1452|
|      1727704| 1339|
|      2041593| 1288|
|      2288680| 1247|
|       954873|  888|
|       479268|  775|
|      1568280|  757|
|       575285|  742|
|      2184324|  740|
|       170012|  725|
|       797741|  697|
|      1367232|  671|
|      1293328|  644|
|      2056517|  642|
|      1705977|  627|
|      2071628|  605|
|      1448539|  601|
|       717385|  596|
|      2427846|  596|
|      2396540|  586|
|       576447|  581|
|       239250|  578|
|      1766767|  573|
|      1194456|  566|
|         null|  565|
|      2399121|  563|
|       416516|  558|
|      2042389|  544|
|      2427873|  541|
|      1810444|  541|
|       620667|  538|
|      2207685|  523|
|      1377368|  517|
|      2076074|  512|
|      1288301|  509|
|       635762|  508|
|      2451022|  503|
|      1007044|  498|
|      1078410|  487|
|       248498|  487|
|       20

In [18]:
countDistinctDF = data.groupby("id_comentator").agg(f.concat_ws(", ", f.collect_list(data.text)))
countDistinctDF.show()

+-------------+---------------------------------+
|id_comentator|concat_ws(, , collect_list(text))|
+-------------+---------------------------------+
|       100005|             Can you do blitz ...|
|      1000095|             teoría por demás ...|
|      1000928|             Life really took ...|
|      1000955|             I love you Jungko...|
|      1000967|             OK COOL  FISHES  ...|
|      1001237|                            Quack|
|      1001262|             la de la flor de ...|
|      1001299|                               Hi|
|        10013|              no quería caerse :C|
|       100141|             Awwwww They Are S...|
|      1001452|             I love these vlog...|
|      1001603|             After watching vi...|
|      1002165|                               D:|
|      1002329|             HonestlyI LOVE th...|
|      1002348|             1:20Well at Walgr...|
|      1002567|                   quem é  toddy?|
|      1002904|             Omg sew that guy ...|


In [19]:
countDistinctDF = countDistinctDF.withColumnRenamed('concat_ws(, , collect_list(text))', 'text')


In [20]:
countDistinctDF.withColumn('word', f.explode(f.split(f.col('text'), ' ')))\
    .groupBy('word')\
    .count()\
    .sort('count', ascending=False)\
    .show(20)

+----+-------+
|word|  count|
+----+-------+
| the|2362893|
|   I|2155003|
|   a|1851088|
| and|1705804|
|  to|1697617|
| you|1361504|
|    |1239857|
|  is|1033276|
|  of| 911444|
|  in| 744448|
| for| 727594|
|that| 726554|
|  it| 659242|
|  so| 648062|
|  my| 621289|
|your| 582033|
|have| 560468|
| are| 546201|
|this| 521092|
|like| 503676|
+----+-------+
only showing top 20 rows



In [21]:
words_list_cat = ['cat owner', 'cats owner', "кот", 'i have cat', 'my cat', 'cat', 'cats']

cat = countDistinctDF.filter(f.col('text')
                            .rlike('(^|\s)(' + '|'
                            .join(words_list_cat) + ')(\s|$)'))

cat.describe().show()

+-------+-----------------+--------------------+
|summary|    id_comentator|                text|
+-------+-----------------+--------------------+
|  count|            84058|               84058|
|   mean|1269945.150110638|                null|
| stddev|731914.3214415313|                null|
|    min|          1000034| *KOVU* no!  remi...|
|    max|           999990|🧀🧀, Her editing...|
+-------+-----------------+--------------------+



In [22]:
words_list_dog = ['dog', 'dogs', "собака", 'i have dog', 'dog owner', "my dog", "mine dog"]

dog = countDistinctDF.filter(f.col('text')
                            .rlike('(^|\s)(' + '|'
                            .join(words_list_dog) + ')(\s|$)'))

dog.describe().show()

+-------+-----------------+--------------------+
|summary|    id_comentator|                text|
+-------+-----------------+--------------------+
|  count|           143452|              143452|
|   mean|1269379.064739425|                null|
| stddev|733192.8865919772|                null|
|    min|          1000014| *Thirsty* , i di...|
|    max|           999976|🧟‍♂️ I like turt...|
+-------+-----------------+--------------------+



In [23]:
dog = dog.withColumn('category', lit("dog"))
cat = cat.withColumn('category', lit("cat"))


In [24]:
cat_dog = dog.unionAll(cat)

In [25]:
cat_dog.show(25)

+-------------+--------------------+--------+
|id_comentator|                text|category|
+-------------+--------------------+--------+
|      1004717|hi taylor! I love...|     dog|
|      1006412|My friends I see ...|     dog|
|      1006679|Name him Bryan th...|     dog|
|      1011010|People adopted hi...|     dog|
|       101207|fuck that fish! b...|     dog|
|      1025022|accent sounded en...|     dog|
|       102640|My neighbor just ...|     dog|
|      1026537|i looooove mice t...|     dog|
|      1028988|I wish I was with...|     dog|
|      1028999|I hope u are ok a...|     dog|
|      1035275|Oh my God fucking...|     dog|
|      1036191|Great love to see...|     dog|
|      1044107|The way Coyote po...|     dog|
|      1044800|I love gohan, I w...|     dog|
|        10485|I wish all dog tr...|     dog|
|      1049861|Nagini was the na...|     dog|
|      1058469|Gone to the snow ...|     dog|
|      1058822|Who needs TV when...|     dog|
|      1059832|у меня дома собак..

In [26]:
regexTokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="\\W")
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered")
countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5)
label_stringIdx = StringIndexer(inputCol = "category", outputCol = "label")
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])
pipelineFit = pipeline.fit(cat_dog)
dataset = pipelineFit.transform(cat_dog)
dataset.show(5)


+-------------+--------------------+--------+--------------------+--------------------+--------------------+-----+
|id_comentator|                text|category|               words|            filtered|            features|label|
+-------------+--------------------+--------+--------------------+--------------------+--------------------+-----+
|      1004717|hi taylor! I love...|     dog|[hi, taylor, i, l...|[hi, taylor, love...|(10000,[0,1,2,4,9...|  0.0|
|      1006412|My friends I see ...|     dog|[my, friends, i, ...|[friends, see, no...|(10000,[2,4,14,16...|  0.0|
|      1006679|Name him Bryan th...|     dog|[name, him, bryan...|[name, bryan, gat...|(10000,[2,26,95,3...|  0.0|
|      1011010|People adopted hi...|     dog|[people, adopted,...|[people, adopted,...|(10000,[0,2,4,5,1...|  0.0|
|       101207|fuck that fish! b...|     dog|[fuck, that, fish...|[fuck, fish, boil...|(10000,[2,6,113,1...|  0.0|
+-------------+--------------------+--------+--------------------+--------------

In [27]:
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 159361
Test Dataset Count: 68149


In [28]:
lr = LogisticRegression(maxIter=20,
                        regParam=0.3,
                        elasticNetParam=0)

lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)

predictions.filter(predictions['prediction'] == 0) \
    .select("id_comentator","text","category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+-------------+------------------------------+--------+------------------------------+-----+----------+
|id_comentator|                          text|category|                   probability|label|prediction|
+-------------+------------------------------+--------+------------------------------+-----+----------+
|      2017407|Place human-dog friendship ...|     cat|  [1.0,3.8912470427199477E-25]|  1.0|       0.0|
|       843061|The air shark was so cool!,...|     dog|   [1.0,2.712536412252654E-44]|  0.0|       0.0|
|      1316036|French and English bulldogs...|     cat|[0.9999999999971767,2.82340...|  1.0|       0.0|
|       636273|Hehe I thought that was dwa...|     dog|[0.9999999993466913,6.53308...|  0.0|       0.0|
|      2302437|I love gohan, I love you Go...|     dog|[0.9999999958074084,4.19259...|  0.0|       0.0|
|      2126410|Such a great video!  Thank ...|     cat|[0.9999999865016185,1.34983...|  1.0|       0.0|
|       252989|Good morning brian 😁What e...|     dog|[0.9999999

In [29]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.5530393116905363

In [30]:
import matplotlib.pyplot as plt
import numpy as np

trainingSummary = lrModel.summary
roc = trainingSummary.roc.toPandas()

In [31]:
plt.plot(roc['FPR'], roc['TPR'])
plt.ylabel('False Positive Rate')
plt.xlabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()
print('Training set areaUnderROC: ' + str(trainingSummary.areaUnderROC))


Training set areaUnderROC: 0.8722257300628284


In [32]:
from pyspark.ml.classification import NaiveBayes

nb = NaiveBayes(smoothing=1)
model_nb = nb.fit(trainingData)
predictionsnb = model_nb.transform(testData)
predictionsnb.filter(predictionsnb['prediction'] == 0) \
    .select("id_comentator","text","category","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)


+-------------+------------------------------+--------+---------------------------+-----+----------+
|id_comentator|                          text|category|                probability|label|prediction|
+-------------+------------------------------+--------+---------------------------+-----+----------+
|      1718781|Dont Forget Subscribe VIRAL...|     dog|[1.0,1.015409805219536E-16]|  0.0|       0.0|
|      1161308|Hello Brian!! Im on vacatio...|     dog|[1.0,9.973488263617031E-17]|  0.0|       0.0|
|      1679441|You are my dog trainer and ...|     dog|[1.0,9.904250025978762E-17]|  0.0|       0.0|
|      1044567|omg too cute in car while w...|     dog|[1.0,9.668966872316635E-17]|  0.0|       0.0|
|      2170919|Rick please remember these ...|     cat|[1.0,9.301893088451775E-17]|  1.0|       0.0|
|       229408|What kind of stupid people ...|     cat|[1.0,7.708844664600006E-17]|  1.0|       0.0|
|       229408|What kind of stupid people ...|     dog|[1.0,7.708844664600006E-17]|  0.0|  

In [33]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictionsnb)

0.8789944410998412

In [56]:
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(labelCol="label", \
                            featuresCol="features", \
                            numTrees = 100, \
                            maxDepth = 4, \
                            maxBins = 32)

rfModel = rf.fit(trainingData)

In [57]:
predictions_rf = rfModel.transform(testData)
predictions_rf.filter(predictions_rf['prediction'] == 0) \
    .select("id_comentator","text","category","probability","label","prediction")\
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+-------------+------------------------------+--------+------------------------------+-----+----------+
|id_comentator|                          text|category|                   probability|label|prediction|
+-------------+------------------------------+--------+------------------------------+-----+----------+
|         8460|A bullmastiff is a form of ...|     dog|[0.7439669895522577,0.25603...|  0.0|       0.0|
|      2230282|I have a beautiful male Rid...|     dog|[0.7245044172289337,0.27549...|  0.0|       0.0|
|       602073|You know I like how this vi...|     dog|[0.7234606942531768,0.27653...|  0.0|       0.0|
|      1599417|All of these breeds are alr...|     dog|[0.7231087793344182,0.27689...|  0.0|       0.0|
|      2487142|dam that mutherfucker bit t...|     dog|[0.7220139497321232,0.27798...|  0.0|       0.0|
|      1599417|Awesome video.  This is wit...|     dog|[0.7206074195630413,0.27939...|  0.0|       0.0|
|      2230282|Not a bad list but I must s...|     dog|[0.719923

In [59]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions_rf)

0.4599925919799822

In [49]:
regexTokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="\\W")
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered")
countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5)
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors])
pipelineFit = pipeline.fit(countDistinctDF)
dataset = pipelineFit.transform(countDistinctDF)
dataset.show(5)

+-------------+--------------------+--------------------+--------------------+--------------------+
|id_comentator|                text|               words|            filtered|            features|
+-------------+--------------------+--------------------+--------------------+--------------------+
|       100005|Can you do blitz ...|[can, you, do, bl...|[blitz, bull, 125...|(10000,[879,5932]...|
|      1000095|teoría por demás ...|[teor, a, por, de...|[teor, por, dem, ...|(10000,[29,58,60,...|
|      1000928|Life really took ...|[life, really, to...|[life, really, to...|(10000,[16,36,80,...|
|      1000955|I love you Jungko...|[i, love, you, ju...|[love, jungkook, ...|(10000,[0,29,121]...|
|      1000967|OK COOL  FISHES  ...|[ok, cool, fishes...|[ok, cool, fishes...|(10000,[1,3,7,10,...|
+-------------+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [50]:
predictions_all = model_nb.transform(dataset)

In [51]:
predictions_all.show()

+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|id_comentator|                text|               words|            filtered|            features|       rawPrediction|         probability|prediction|
+-------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|       100005|Can you do blitz ...|[can, you, do, bl...|[blitz, bull, 125...|(10000,[879,5932]...|[-20.271947072500...|[0.56649480294087...|       0.0|
|      1000095|teoría por demás ...|[teor, a, por, de...|[teor, por, dem, ...|(10000,[29,58,60,...|[-313.22484819330...|[0.80878949064078...|       0.0|
|      1000928|Life really took ...|[life, really, to...|[life, really, to...|(10000,[16,36,80,...|[-81.935000407094...|[0.78860546667384...|       0.0|
|      1000955|I love you Jungko...|[i, love, you, ju...|[love, jungkook, ...|(100

In [None]:
#0 - dog, 1 - cat

In [71]:
data.head(3)

[Row(chan_owner='creator_name', id_comentator='userid', text='comment'),
 Row(chan_owner='Doug The Pug', id_comentator=' 87', text='I shared this to my friends and mom the were lol'),
 Row(chan_owner='Doug The Pug', id_comentator=' 87', text='Super cute  😀🐕🐶')]

In [72]:
predictions_all.head(3)

[Row(id_comentator=' 100005', text='Can you do blitz bull 125 wd', words=['can', 'you', 'do', 'blitz', 'bull', '125', 'wd'], filtered=['blitz', 'bull', '125', 'wd'], features=SparseVector(10000, {879: 1.0, 5932: 1.0}), rawPrediction=DenseVector([-20.2719, -20.5395]), probability=DenseVector([0.5665, 0.4335]), prediction=0.0),
 Row(id_comentator=' 1000095', text='teoría por demás absurda recordar que el reactor arc que Tony Stark tenia en su pecho le fue retirado en Iron Man 3 en Los Vengadores el cetro de Loki no hizo efecto en Tony porque técnicamente no tocó su pecho hasta un niño de 5 años entiende eso', words=['teor', 'a', 'por', 'dem', 's', 'absurda', 'recordar', 'que', 'el', 'reactor', 'arc', 'que', 'tony', 'stark', 'tenia', 'en', 'su', 'pecho', 'le', 'fue', 'retirado', 'en', 'iron', 'man', '3', 'en', 'los', 'vengadores', 'el', 'cetro', 'de', 'loki', 'no', 'hizo', 'efecto', 'en', 'tony', 'porque', 't', 'cnicamente', 'no', 'toc', 'su', 'pecho', 'hasta', 'un', 'ni', 'o', 'de', '5',

In [73]:
x = predictions_all.join(data, predictions_all.id_comentator == data.id_comentator, 'inner').drop(data.id_comentator)
x = pred_data.dropDuplicates(["text", "id_comentator"])
x.head(5)

[Row(text='', words=[], filtered=[], features=SparseVector(10000, {}), rawPrediction=DenseVector([-0.4605, -0.9969]), probability=DenseVector([0.631, 0.369]), prediction=0.0, chan_owner='Mermaid Melissa', id_comentator=' 2527324', text=None),
 Row(text='', words=[], filtered=[], features=SparseVector(10000, {}), rawPrediction=DenseVector([-0.4605, -0.9969]), probability=DenseVector([0.631, 0.369]), prediction=0.0, chan_owner='Brave Wilderness', id_comentator=' 439837', text=None),
 Row(text='', words=[], filtered=[], features=SparseVector(10000, {}), rawPrediction=DenseVector([-0.4605, -0.9969]), probability=DenseVector([0.631, 0.369]), prediction=0.0, chan_owner='ViralBe', id_comentator=' 558547', text=None),
 Row(text=' Main Aisa Kyun Hoon one of best Bollywood video song. Please react to this.', words=['main', 'aisa', 'kyun', 'hoon', 'one', 'of', 'best', 'bollywood', 'video', 'song', 'please', 'react', 'to', 'this'], filtered=['main', 'aisa', 'kyun', 'hoon', 'one', 'best', 'bollywoo

In [75]:
cv = CountVectorizer(inputCol="words", outputCol="z")
cv = cv_tmp.fit(x)

top100 = list(cv.vocabulary[0:100])

AttributeError: 'list' object has no attribute 'show'

In [76]:
more_then_3_charachters = [word for word in cv.vocabulary if len(word) <= 3]
contains_digits = [word for word in cv.vocabulary if any(char.isdigit() for char in word)]
stopwords = [] 
stopwords = stopwords + top100 + more_then_3_charachters + contains_digits
remover = StopWordsRemover(inputCol="words", outputCol="words_for_count", stopWords = stopwords)


+-------------+--------------------+--------------------+--------------------+--------------------+
|id_comentator|                text|               words|            filtered|     words_for_count|
+-------------+--------------------+--------------------+--------------------+--------------------+
|       100005|Can you do blitz ...|[can, you, do, bl...|       [blitz, bull]|       [blitz, bull]|
|      1000095|teoría por demás ...|[teor, a, por, de...|[teor, absurda, r...|[teor, absurda, r...|
|      1000928|Life really took ...|[life, really, to...|[life, really, to...|[life, took, mass...|
|      1000955|I love you Jungko...|[i, love, you, ju...|          [jungkook]|          [jungkook]|
|      1000967|OK COOL  FISHES  ...|[ok, cool, fishes...|[cool, fishes, ha...|[cool, fishes, ha...|
|      1001237|               Quack|             [quack]|             [quack]|             [quack]|
|      1001262|la de la flor de ...|[la, de, la, flor...|[flor, hojas, tam...|[flor, hojas, tam...|


In [78]:
x = remover.transform(x)
x.head(3)

[Row(text='', words=[], filtered=[], features=SparseVector(10000, {}), rawPrediction=DenseVector([-0.4605, -0.9969]), probability=DenseVector([0.631, 0.369]), prediction=0.0, chan_owner='Mermaid Melissa', id_comentator=' 2527324', text=None, words_for_count=[]),
 Row(text='', words=[], filtered=[], features=SparseVector(10000, {}), rawPrediction=DenseVector([-0.4605, -0.9969]), probability=DenseVector([0.631, 0.369]), prediction=0.0, chan_owner='Brave Wilderness', id_comentator=' 439837', text=None, words_for_count=[]),
 Row(text='', words=[], filtered=[], features=SparseVector(10000, {}), rawPrediction=DenseVector([-0.4605, -0.9969]), probability=DenseVector([0.631, 0.369]), prediction=0.0, chan_owner='ViralBe', id_comentator=' 558547', text=None, words_for_count=[])]

In [84]:
x.groupBy('words_for_count')\
    .count()\
    .sort('count', ascending=False)\
    .show(50)

+--------------------+------+
|     words_for_count| count|
+--------------------+------+
|                  []|421066|
|              [cool]|  5259|
|             [first]|  4218|
|            [coyote]|  3662|
|              [want]|  2571|
|              [asmr]|  2358|
|             [funny]|  2092|
|              [keep]|  2076|
|[looking, back, t...|  2015|
|          [adorable]|  1858|
|              [fake]|  1734|
|            [crying]|  1616|
|[fabulous, zapped...|  1553|
|          [birthday]|  1529|
|              [dogs]|  1525|
|             [bless]|  1514|
|    [raptor, fossil]|  1511|
|              [made]|  1464|
|[scarlet, johanss...|  1452|
|             [crazy]|  1440|
|             [thats]|  1422|
|              [guys]|  1414|
|             [hello]|  1315|
|    [coyote, statue]|  1305|
|[theres, dogs, di...|  1287|
|              [awww]|  1285|
|              [best]|  1283|
|[nope, still, eve...|  1246|
|      [raptor, bone]|  1226|
|             [gohan]|  1212|
|         

In [85]:
x = x.na.drop(subset=["words_for_count"])

In [86]:
dog_owners = x.filter(x["prediction"] == 0)
dog_owners.describe().show()

+-------+--------+----------+--------------------+------------------+--------------------+
|summary|    text|prediction|          chan_owner|     id_comentator|                text|
+-------+--------+----------+--------------------+------------------+--------------------+
|  count| 4182775|   4182775|             4161182|           4182775|             4182336|
|   mean|Infinity|       0.0|1.018439742824547...|1270007.2489434045|            Infinity|
| stddev|     NaN|       0.0|1.005104405673807...| 733881.6607035523|                 NaN|
|    min|        |       0.0|#CameraLord™ • Ko...|                 1|                 ...|
|    max|      🧡|       0.0|🐾 Life Is Better...|            userid|🧡🧡💛🧡💛🧡💛🧡?...|
+-------+--------+----------+--------------------+------------------+--------------------+



In [88]:
cat_owners = x.filter(x["prediction"] == 1)
cat_owners.describe().show()

+-------+--------------------+----------+--------------------+------------------+--------------------+
|summary|                text|prediction|          chan_owner|     id_comentator|                text|
+-------+--------------------+----------+--------------------+------------------+--------------------+
|  count|             1564885|   1564885|             1554894|           1564885|             1564855|
|   mean|   274.5550769230769|       1.0|1.835060233047752...|1269655.4293401751|1.285245763906350...|
| stddev|  408.42268784588794|       0.0|1.348430557917316E13| 733556.5605698468|3.995521631830355...|
|    min| *BUY MY MERCH* ,...|       1.0|#CameraLord™ • Ko...|                10|                    |
|    max|🧟‍♂️ I like turt...|       1.0|🐾 Life Is Better...|            999997|🧟‍♂️ I like turt...|
+-------+--------------------+----------+--------------------+------------------+--------------------+



In [None]:
dog_owners.groupBy('words_for_count')\
    .count()\
    .sort('count', ascending=False)\
    .show(50)

In [None]:
cat_owners.groupBy('words_for_count')\
    .count()\
    .sort('count', ascending=False)\
    .show(50)

In [93]:
mean = x.groupBy().avg("prediction").take(1)[0][0]
x = x.withColumn("mean_category", lit(mean))