In [None]:
import glob
import os
import sys
# Spark launch settings; they are placed in the environment before pyspark is
# imported in the following cell.
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 5 --executor-memory 4g --driver-memory 3g pyspark-shell'
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Make the pyspark package that ships with this Spark installation importable.
sys.path.insert(0, os.path.join(spark_home, 'python'))

# Locate the bundled py4j archive instead of pinning its version, so the cell
# keeps working when the Spark distribution ships a different py4j release.
# If nothing matches, fall back to the previously hard-coded archive path.
py4j_archives = sorted(glob.glob(os.path.join(spark_home, 'python', 'lib', 'py4j-*-src.zip')))
if py4j_archives:
    py4j_path = py4j_archives[-1]
else:
    py4j_path = os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip')
sys.path.insert(0, py4j_path)


In [None]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Name the application via the dedicated setter (it stores spark.app.name),
# then build a session from that configuration or reuse a running one.
conf = SparkConf().setAppName("Sergey Grishaev clustering app")

spark = (
    SparkSession.builder
    .config(conf=conf)
    .getOrCreate()
)

In [None]:
# Bare expression: the notebook renders the session object as the cell output.
spark

![kmeans_algo](pics/kmeans_algo.png)

In [None]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [None]:
# Explicit CSV schema: two string columns followed by six integer label columns.
label_fields = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
schema = StructType(
    [StructField("id", StringType()), StructField("comment_text", StringType())]
    + [StructField(label, IntegerType()) for label in label_fields]
)

In [None]:
# Read the training CSV with the explicit schema. The file has a header row,
# and records may span several lines with '"' as the escape character.
csv_reader = spark.read.schema(schema).options(header=True, multiLine=True, escape='"')
dataset = csv_reader.csv("/lectures/lecture03/data/train.csv")

In [None]:
# Redistribute the rows over ten partitions, then mark the result for caching.
dataset = dataset.repartition(10)
dataset = dataset.cache()

In [None]:
# Bare expression: the notebook shows the DataFrame's representation.
dataset

In [None]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer

In [None]:
# Tokenizer stage: reads comment_text and writes its output to "words".
tokenizer = Tokenizer(
    inputCol="comment_text",
    outputCol="words",
)

In [None]:
# Display-only: the Russian default list is shown but not stored; the pipeline
# below is configured with the English list instead.
StopWordsRemover.loadDefaultStopWords("russian")

In [None]:
# Default English stop-word list, passed to the StopWordsRemover stage below.
stop_words = StopWordsRemover.loadDefaultStopWords(language="english")

In [None]:
# Bare expression: show the loaded stop-word list.
stop_words

In [None]:
# Stop-word filter stage: consumes the tokenizer's output column and writes
# the remaining tokens to "words_filtered".
swr = StopWordsRemover(
    inputCol=tokenizer.getOutputCol(),
    outputCol="words_filtered",
    stopWords=stop_words,
)

In [None]:
# Count-vector stage over the filtered tokens, with the vocabulary capped at
# 20000 entries; the vectors land in "word_vector".
count_vectorizer = CountVectorizer(
    inputCol=swr.getOutputCol(),
    outputCol="word_vector",
    vocabSize=20000,
)

In [None]:
from pyspark.ml import Pipeline

In [None]:
# Chain the three text stages in order: tokenize, drop stop words, vectorize.
text_stages = [tokenizer, swr, count_vectorizer]
preprocessing = Pipeline(stages=text_stages)

In [None]:
# Fit the pipeline on the dataset; the fitted model's last stage exposes the
# learned vocabulary that later cells look terms up in.
preprocessing_model = preprocessing.fit(dataset)

In [None]:
# Run the fitted stages over the dataset; the result carries the
# "word_vector" column used as features by the clustering cells.
preprocessed_dataset = preprocessing_model.transform(dataset)

In [None]:
# Print the first five count vectors without truncating them.
word_vectors = preprocessed_dataset.select("word_vector")
word_vectors.show(n=5, truncate=False)

In [None]:
# Re-display the input DataFrame; the transformed result lives in
# preprocessed_dataset, not in this variable.
dataset

In [None]:
from pyspark.ml.clustering import KMeans

In [None]:
# K-means over the count vectors with seven clusters and a fixed seed.
kmeans = KMeans(
    k=7,
    seed=5757,
    featuresCol="word_vector",
)

In [None]:
# Train the k-means model on the vectorized comments.
kmeans_model = kmeans.fit(preprocessed_dataset)

In [None]:
# Assign every row to a cluster; later cells read the assignment from the
# "prediction" column.
clustering = kmeans_model.transform(preprocessed_dataset)

In [None]:
# Positions 2..7 hold the six label columns (given the schema declared above);
# fetch them next to the assigned cluster for the first 20 rows.
label_and_cluster_columns = clustering.columns[2:8] + ["prediction"]
clustering.select(label_and_cluster_columns).take(20)

### Silhouette score

<https://en.wikipedia.org/wiki/Silhouette_(clustering)>

In [None]:
from pyspark.ml.evaluation import ClusteringEvaluator

In [None]:
# Clustering evaluator over the count vectors. No metric is specified, so the
# evaluator's default applies (the section heading indicates silhouette).
evaluator = ClusteringEvaluator(featuresCol="word_vector")

In [None]:
# Score the k=7 clustering produced above.
evaluator.evaluate(clustering)

In [None]:
# Print five comments assigned to cluster 1, one field per line, untruncated.
cluster_one_comments = clustering.where(clustering["prediction"] == 1).select("comment_text")
cluster_one_comments.show(n=5, truncate=False, vertical=True)

In [None]:
# Second k-means configuration: two clusters, different seed.
kmeans = KMeans(
    k=2,
    seed=1234,
    featuresCol="word_vector",
)

In [None]:
# Refit with the two-cluster configuration; this replaces the earlier model.
kmeans_model = kmeans.fit(preprocessed_dataset)

In [None]:
# Recompute cluster assignments with the new model; this replaces the earlier
# clustering result.
clustering = kmeans_model.transform(preprocessed_dataset)

In [None]:
# Score the k=2 clustering with the same evaluator for comparison.
evaluator.evaluate(clustering)

In [None]:
# Show the learned cluster centers; later cells index this result with 0 and 1.
kmeans_model.clusterCenters()

In [None]:
import numpy as np

In [None]:
# Center at index 1.
kmeans_model.clusterCenters()[1]

In [None]:
# Coordinate indices of that center ordered by ascending value
# (np.argsort sorts in ascending order).
np.argsort(kmeans_model.clusterCenters()[1])

In [None]:
# Preview the vocabulary learned by the CountVectorizer stage. Only the first
# entries are shown: the vocabulary can hold up to 20000 terms, and displaying
# the full list would flood the cell output.
preprocessing_model.stages[2].vocabulary[:50]

In [None]:
# Print the terms behind the 40 smallest coordinates of the center at index 1.
# np.argsort sorts ascending, so the leading slice holds the lowest values.
# The vocabulary is fetched once up front rather than on every iteration.
vocabulary = preprocessing_model.stages[2].vocabulary
lowest_term_indices = np.argsort(kmeans_model.clusterCenters()[1])[:40]
for term_index in lowest_term_indices:
    print(vocabulary[term_index])

In [None]:
# Print the terms behind the 40 largest coordinates of the center at index 1,
# in ascending order of value (np.argsort sorts ascending, so the trailing
# slice holds the highest values).
# The vocabulary is fetched once up front rather than on every iteration.
vocabulary = preprocessing_model.stages[2].vocabulary
highest_term_indices = np.argsort(kmeans_model.clusterCenters()[1])[-40:]
for term_index in highest_term_indices:
    print(vocabulary[term_index])

In [None]:
# Print the terms behind the 40 smallest coordinates of the center at index 0.
# np.argsort sorts ascending, so the leading slice holds the lowest values.
# The vocabulary is fetched once up front rather than on every iteration.
vocabulary = preprocessing_model.stages[2].vocabulary
lowest_term_indices = np.argsort(kmeans_model.clusterCenters()[0])[:40]
for term_index in lowest_term_indices:
    print(vocabulary[term_index])

In [None]:
# Print the terms behind the 40 largest coordinates of the center at index 0,
# in ascending order of value (np.argsort sorts ascending, so the trailing
# slice holds the highest values).
# The vocabulary is fetched once up front rather than on every iteration.
vocabulary = preprocessing_model.stages[2].vocabulary
highest_term_indices = np.argsort(kmeans_model.clusterCenters()[0])[-40:]
for term_index in highest_term_indices:
    print(vocabulary[term_index])

## The curse of dimensionality
![curse](pics/dimensionality_vs_performance.png)

## Why is that?
![curse](pics/curseofdimensionality.png)

## LDA

![lda](pics/lda.png)

In [None]:
from pyspark.ml.clustering import LDA

In [None]:
# LDA over the count vectors with seven topics and a fixed seed.
lda = LDA(
    k=7,
    seed=5757,
    featuresCol="word_vector",
)

In [None]:
# Fit the topic model on the vectorized comments.
lda_model = lda.fit(preprocessed_dataset)

In [None]:
# Score every comment with the topic model; later cells read the result from
# the "topicDistribution" column.
topics = lda_model.transform(preprocessed_dataset)

In [None]:
# Print five scored rows, one field per line, without truncating values.
topics.show(n=5, truncate=False, vertical=True)

In [None]:
# Vocabulary size reported by the fitted LDA model.
lda_model.vocabSize()

In [None]:
# Collect up to 10 top terms per topic to the driver; terms are reported as
# indices, which the next cell maps to vocabulary words.
lda_model.describeTopics(maxTermsPerTopic=10).collect()

In [None]:
# Resolve each topic's top terms straight from the fitted model instead of
# relying on term indices pasted in from an earlier run's output, which go
# stale whenever the data, seed or vocabulary changes.
# The vocabulary is fetched once and reused for every topic.
vocabulary = preprocessing_model.stages[-1].vocabulary
for topic_row in lda_model.describeTopics(maxTermsPerTopic=10).collect():
    print(topic_row.topic, [vocabulary[term_index] for term_index in topic_row.termIndices])

## Clustering is a good dimensionality reduction technique

In [None]:
# Bare expression: show the scored DataFrame's representation.
topics

In [None]:
from pyspark.sql import functions as f

In [None]:
# Binary target: 0 when all six label columns equal 0, otherwise 1.
label_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
all_labels_zero = topics[label_columns[0]] == 0
for label in label_columns[1:]:
    all_labels_zero = all_labels_zero & (topics[label] == 0)
target = f.when(all_labels_zero, 0).otherwise(1)

In [None]:
# Keep only the id, the binary target and the topic mixture, and mark the
# reduced dataset for caching.
new_dataset = (
    topics
    .withColumn("target", target)
    .select("id", "target", "topicDistribution")
    .cache()
)

In [None]:
# Fetch five rows of the reduced dataset for inspection.
new_dataset.take(5)

In [None]:
from pyspark.ml.classification import LogisticRegression

In [None]:
# Logistic regression that uses the topic mixture as features and the binary
# target as the label.
lr = LogisticRegression(
    labelCol="target",
    featuresCol="topicDistribution",
)

In [None]:
# Stratified sample on the target column: the same 0.8 fraction is requested
# for both classes, with a fixed seed.
train = new_dataset.sampleBy(
    "target",
    fractions={0: 0.8, 1: 0.8},
    seed=5757,
)
train = train.cache()

In [None]:
# Held-out rows: everything whose id does not appear in train (left anti
# join), coalesced to ten partitions and marked for caching.
test = (
    new_dataset
    .join(train, on="id", how="leftanti")
    .coalesce(10)
    .cache()
)

In [None]:
# Fit the classifier on the training sample.
lr_model = lr.fit(train)

In [None]:
# Score the held-out rows; the evaluator below reads the "probability" column.
predictions = lr_model.transform(test)

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [None]:
# ROC-AUC evaluator fed with the predicted probabilities. Note that this
# rebinds the name previously used for the clustering evaluator.
evaluator = BinaryClassificationEvaluator(
    labelCol="target",
    rawPredictionCol="probability",
    metricName='areaUnderROC',
)

In [None]:
# Area under the ROC curve on the held-out predictions.
evaluator.evaluate(predictions)

In [None]:
# Stop the Spark session now that the notebook is finished.
spark.stop()