In [1]:
import os
import sys
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 3 pyspark-shell'
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))


In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

conf = SparkConf()
conf.set("spark.app.name", "natasha pritykovskaya clustering app") 

spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [4]:
spark

![kmeans](pics/kmeans.png)

![kmeans_algo](pics/kmeans_algo.png)

In [5]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [6]:
schema = StructType([
    StructField("id", StringType()),
    StructField("comment_text", StringType()),
    StructField("toxic", IntegerType()),
    StructField("severe_toxic", IntegerType()),
    StructField("obscene", IntegerType()),
    StructField("threat", IntegerType()),
    StructField("insult", IntegerType()),
    StructField("identity_hate", IntegerType())
])

In [7]:
dataset = spark.read.csv("/lectures/lecture03/data/train.csv", schema=schema, header=True, multiLine=True, escape='"')

In [8]:
dataset = dataset.repartition(3).cache()

In [9]:
dataset

DataFrame[id: string, comment_text: string, toxic: int, severe_toxic: int, obscene: int, threat: int, insult: int, identity_hate: int]

In [10]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer

In [11]:
tokenizer = Tokenizer(inputCol="comment_text", outputCol="words")

In [12]:
stop_words = StopWordsRemover.loadDefaultStopWords("english")

In [13]:
swr = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="words_filtered", stopWords=stop_words)

In [14]:
count_vectorizer = CountVectorizer(inputCol=swr.getOutputCol(), outputCol="word_vector", vocabSize=20000)

In [15]:
from pyspark.ml import Pipeline

In [16]:
preprocessing = Pipeline(stages=[
    tokenizer,
    swr,
    count_vectorizer
])

In [17]:
preprocessing_model = preprocessing.fit(dataset)

In [18]:
preprocessed_dataset = preprocessing_model.transform(dataset)

In [19]:
preprocessed_dataset.select(["word_vector"]).show(5, truncate=False)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|word_vector                                 

In [20]:
dataset

DataFrame[id: string, comment_text: string, toxic: int, severe_toxic: int, obscene: int, threat: int, insult: int, identity_hate: int]

In [21]:
from pyspark.ml.clustering import KMeans

In [22]:
kmeans = KMeans(featuresCol="word_vector", k=7, seed=5757)

In [23]:
kmeans_model = kmeans.fit(preprocessed_dataset)

In [24]:
clustering = kmeans_model.transform(preprocessed_dataset)

In [25]:
clustering[clustering.columns[2:8] + ["prediction"]].take(10)

[Row(toxic=1, severe_toxic=0, obscene=1, threat=0, insult=1, identity_hate=0, prediction=0),
 Row(toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=0),
 Row(toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=0),
 Row(toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=4),
 Row(toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=0),
 Row(toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=0),
 Row(toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=0),
 Row(toxic=1, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=0),
 Row(toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=4),
 Row(toxic=0, severe_toxic=0, obscene=0, threat=0, insult=0, identity_hate=0, prediction=0)]

### Silhouette score

https://en.wikipedia.org/wiki/Silhouette_(clustering)

In [26]:
from pyspark.ml.evaluation import ClusteringEvaluator

In [27]:
evaluator = ClusteringEvaluator(featuresCol="word_vector")

In [28]:
evaluator.evaluate(clustering)

0.3807178722067213

In [29]:
clustering.filter(clustering.prediction == 1)[["comment_text"]].show(5, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [30]:
kmeans = KMeans(featuresCol="word_vector", k=2, seed=1234)

In [31]:
kmeans_model = kmeans.fit(preprocessed_dataset)

In [32]:
clustering = kmeans_model.transform(preprocessed_dataset)

In [33]:
evaluator.evaluate(clustering)

0.9910851831316501

In [34]:
kmeans_model.clusterCenters()

[array([2.78586869e+00, 5.11587573e-01, 2.42170219e-01, ...,
        1.38246531e-04, 1.38246531e-04, 1.38246531e-04]),
 array([127.92183908,   0.59310345,   1.09885057, ...,   0.        ,
          0.        ,   0.        ])]

In [35]:
import numpy as np

In [36]:
kmeans_model.clusterCenters()[1]

array([127.92183908,   0.59310345,   1.09885057, ...,   0.        ,
         0.        ,   0.        ])

In [37]:
np.argsort(kmeans_model.clusterCenters()[1])

array([19999,  6473, 12597, ...,     7,   370,     0])

In [38]:
preprocessing_model.stages[2].vocabulary

['',
 '"',
 'article',
 'page',
 'please',
 'like',
 'one',
 '-',
 'wikipedia',
 'talk',
 'think',
 'see',
 'also',
 'know',
 'may',
 'edit',
 'people',
 'use',
 'get',
 'even',
 'make',
 'articles',
 'good',
 'want',
 'time',
 'it.',
 'need',
 'new',
 'thank',
 'go',
 'first',
 'information',
 'many',
 'made',
 'find',
 'page.',
 'name',
 'really',
 'thanks',
 'say',
 'fuck',
 'much',
 'used',
 'since',
 'article.',
 'user',
 'add',
 'way',
 'take',
 'help',
 'sources',
 'look',
 'someone',
 'still',
 'read',
 'section',
 'pages',
 'going',
 'two',
 'deletion',
 'you.',
 'source',
 'edits',
 'without',
 'discussion',
 'well',
 'editing',
 'wikipedia.',
 'point',
 'deleted',
 'back',
 'might',
 'work',
 'something',
 'image',
 'another',
 'added',
 'never',
 'put',
 'link',
 'seems',
 'stop',
 ',',
 'blocked',
 'feel',
 '.',
 'list',
 'block',
 'right',
 'said',
 '(utc)',
 'using',
 'ask',
 'personal',
 'fact',
 'sure',
 'article,',
 'believe',
 'hope',
 'page,',
 'note',
 'actually',


In [39]:
for i in np.argsort(kmeans_model.clusterCenters()[1])[:40]:
    print(preprocessing_model.stages[2].vocabulary[i])

conflicted
compatible
manufactured
wikipedia).
itsuck
proof,
intuitive
koreans
irresponsible
cougar!you
would.
edit:
 (talk)
manga
alpha
tackle
metro
(please
infobox,
undeletion
nl33ers
import
777
slogan
nights
savage
analyze
(actually
rightfully
invest
estimates
galaxy
hacker
expired,
superpower
revival
manufacturers
elephant
override
whore


In [40]:
for i in np.argsort(kmeans_model.clusterCenters()[1])[-40:]:
    print(preprocessing_model.stages[2].vocabulary[i])

user
must
need
new
people
fack
also
wanker
sexual.
bad
"
think
notrhbysouthbanof
wikipedia
cocks
all!!
nigger
talk
see
please
•
|
anal
nikko
smells
rape
one
fggt!
you?
sucks
fuck
homo
page
article
like
know
tacos
-
die



In [41]:
for i in np.argsort(kmeans_model.clusterCenters()[0])[:40]:
    print(preprocessing_model.stages[2].vocabulary[i])

enyclopedia!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|style=""background:
mangina!
cingular
nikko
faggot!!!!jéské
shannon!
biznitch
misterwiki.
11/27
die!!!!!!!!!!!!
(speedy
edie
notrhbysouthbanof
hellor
farted,
i7)
shikoku
km²
71.15.159.191
oi!!
2+2=5
dickhead!
j.delanoy
couriano
1.5%
lovers.68.79.118.61
brother!
18:42
sexual.
(city)
battalion,
b:
tacos
d:
cline
all!!
keller
frm
paterson,


In [42]:
for i in np.argsort(kmeans_model.clusterCenters()[0])[-40:]:
    print(preprocessing_model.stages[2].vocabulary[i])

much
say
really
page.
name
find
made
many
first
information
go
thank
new
need
it.
time
want
good
articles
make
even
get
use
people
edit
may
know
also
see
think
talk
wikipedia
-
one
like
please
page
article
"



## The curse of dimensionality
![curse](pics/dimensionality_vs_performance.png)

## Why is that?
![curse](pics/curseofdimensionality.png)

## LDA

![curse](pics/lda.png)

In [43]:
from pyspark.ml.clustering import LDA

In [44]:
lda = LDA(featuresCol="word_vector", seed=5757, k=7)

In [45]:
lda_model = lda.fit(preprocessed_dataset)

In [46]:
topics = lda_model.transform(preprocessed_dataset)

In [47]:
topics.show(5, vertical=True, truncate=False)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [61]:
lda_model.vocabSize()

20000

In [49]:
lda_model.describeTopics(maxTermsPerTopic=10).collect()

[Row(topic=0, termIndices=[214, 1356, 131, 463, 1285, 1070, 1476, 0, 2901, 3454], termWeights=[0.0508251610687159, 0.01850433821111892, 0.017287992732937456, 0.016301571888461, 0.014503750565141742, 0.01191668426584085, 0.010405416295169192, 0.010312859613410144, 0.00940903370088961, 0.009111726905350457]),
 Row(topic=1, termIndices=[40, 853, 348, 0, 726, 1339, 29, 447, 583, 976], termWeights=[0.08148619003251031, 0.026381605209881746, 0.026145737505179133, 0.022880793419447723, 0.02027144474630342, 0.019844540175485578, 0.018845770805441513, 0.016615335962049387, 0.014482666351556348, 0.013866661857661841]),
 Row(topic=2, termIndices=[0, 1, 9, 3, 13, 5, 8, 18, 16, 4], termWeights=[0.08967067217524097, 0.014723534742256223, 0.008623661978159343, 0.006540167732032256, 0.0064814311944065795, 0.005984722889253739, 0.005877172932981316, 0.005415533956191886, 0.004861307665162586, 0.004396428656982614]),
 Row(topic=3, termIndices=[0, 4, 2, 59, 3, 14, 1, 121, 69, 67], termWeights=[0.05922961

In [50]:
for i in [200, 175, 87, 0, 1701, 835, 2510, 240, 1368, 2249]:
    print(preprocessing_model.stages[-1].vocabulary[i])

hate
u
block

buttsecks
bark
so-called
old
freezer
club,


## Clustering is a good dimensionality reduction technique

In [64]:
topics

DataFrame[id: string, comment_text: string, toxic: int, severe_toxic: int, obscene: int, threat: int, insult: int, identity_hate: int, words: array<string>, words_filtered: array<string>, word_vector: vector, topicDistribution: vector]

In [65]:
from pyspark.sql import functions as f

In [66]:
target = f.when(
    (topics.toxic == 0) &
    (topics.severe_toxic == 0) &
    (topics.obscene == 0) &
    (topics.threat == 0) &
    (topics.insult == 0) &
    (topics.identity_hate == 0),
    0
).otherwise(1)

In [67]:
new_dataset = topics.withColumn("target", target)[["id", "target", "topicDistribution"]].cache()

In [68]:
new_dataset.take(5)

[Row(id='28ad6dfd6b406694', target=1, topicDistribution=DenseVector([0.038, 0.0382, 0.0474, 0.0409, 0.0402, 0.0626, 0.7328])),
 Row(id='26e1b63617df36b1', target=0, topicDistribution=DenseVector([0.0044, 0.0044, 0.0055, 0.2042, 0.0047, 0.7724, 0.0045])),
 Row(id='39b742437bd11ec9', target=0, topicDistribution=DenseVector([0.0019, 0.0019, 0.0024, 0.0021, 0.002, 0.9877, 0.002])),
 Row(id='6574f9f065d3c8fd', target=0, topicDistribution=DenseVector([0.0008, 0.0008, 0.001, 0.0008, 0.0008, 0.9951, 0.0008])),
 Row(id='bff59f526e2fe3ee', target=0, topicDistribution=DenseVector([0.0111, 0.0111, 0.0138, 0.0119, 0.252, 0.6887, 0.0113]))]

In [69]:
from pyspark.ml.classification import LogisticRegression

In [70]:
lr = LogisticRegression(featuresCol="topicDistribution", labelCol="target")

In [71]:
train = new_dataset.sampleBy("target", fractions={0: 0.8, 1: 0.8}, seed=5757).cache()

In [72]:
test = new_dataset.join(train, on="id", how="leftanti").cache()

In [73]:
lr_model = lr.fit(train)

In [74]:
predictions = lr_model.transform(test)

In [75]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [76]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol="probability", labelCol="target", metricName='areaUnderROC')

In [77]:
evaluator.evaluate(predictions)

0.8578107851527246

In [78]:
spark.stop()