In [80]:
# Shut down a SparkSession left over from an earlier run so the cells below
# start from a clean session. On a fresh kernel `spark` does not exist yet,
# so the resulting NameError is deliberately ignored.
try:
    spark.stop()
except NameError:
    pass

In [81]:
import os
import sys

# Environment variables set before PySpark is imported.
os.environ.update({
    "PYSPARK_SUBMIT_ARGS": '--num-executors 3 pyspark-shell',
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
})

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Prepend the PySpark sources and the bundled py4j archive to the import
# path; the py4j archive ends up first, followed by the `python` directory.
sys.path[:0] = [
    os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'),
    os.path.join(spark_home, 'python'),
]

In [82]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Session configuration: only the application name is customised.
conf = SparkConf().setAppName("PM app")

# Build the session from that configuration via the builder's getOrCreate().
spark = (
    SparkSession.builder
    .config(conf=conf)
    .getOrCreate()
)

In [83]:
# Display the SparkSession object as this cell's output.
spark

In [84]:
from pyspark.mllib.linalg.distributed import IndexedRowMatrix
from pyspark.sql import functions as F
from pyspark.ml.linalg import Vectors
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml import Pipeline
from pyspark.sql.functions import pandas_udf
from pyspark.sql.types import FloatType, ArrayType, StringType
import numpy as np
import re
from pyspark.sql.functions import udf

In [85]:
# Courses to process. Each entry appears to be [course id, language code,
# course name]; the ids are the ones filtered on in the SQL query further down.
task_list = [[23126, u'en', u'Compass - powerful SASS library that makes your life easier'], [21617, u'en', u'Preparing for the AP* Computer Science A Exam \u2014 Part 2'], [16627, u'es', u'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'], [11556, u'es', u'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'], [16704, u'ru', u'\u041f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u0435 \u043d\u0430 Lazarus'], [13702, u'ru', u'\u041c\u0430\u0442\u0435\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u0430']]

In [86]:
# Print every task entry on its own line.
for task_entry in task_list:
    print(task_entry)

[23126, 'en', 'Compass - powerful SASS library that makes your life easier']
[21617, 'en', 'Preparing for the AP* Computer Science A Exam — Part 2']
[16627, 'es', 'Aprende Excel: Nivel Intermedio by Alfonso Rinsche']
[11556, 'es', 'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo']
[16704, 'ru', 'Программирование на Lazarus']
[13702, 'ru', 'Математическая экономика']


In [87]:
# Print only the first field (the id) of every task entry.
for course_id, _lang, _title in task_list:
    print(course_id)

23126
21617
16627
11556
16704
13702


In [89]:
# Shell escape: print the current working directory of the notebook process.
!pwd

/data/home/petr.manannikov


In [92]:
# Load the input file into a DataFrame with Spark's JSON reader.
df = spark.read.json("/labs/slaba02/DO_record_per_line.json")

In [93]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [94]:
from pyspark.ml.feature import Tokenizer, HashingTF, RegexTokenizer
from pyspark.ml.feature import HashingTF

In [95]:
# Peek at the first row to check the columns of the loaded data.
df.show(1)

+--------------------+--------------------+---+----+--------------------+--------------+
|                 cat|                desc| id|lang|                name|      provider|
+--------------------+--------------------+---+----+--------------------+--------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|
+--------------------+--------------------+---+----+--------------------+--------------+
only showing top 1 row



In [96]:
# Preview one row restricted to the id, name, lang and desc columns.
df.select('id', 'name', 'lang', 'desc').show(n=1)

+---+--------------------+----+--------------------+
| id|                name|lang|                desc|
+---+--------------------+----+--------------------+
|  4|Accounting Cycle:...|  en|This course intro...|
+---+--------------------+----+--------------------+
only showing top 1 row



In [97]:
# Tokenise the `desc` column into `words`, using commas and whitespace
# characters as the separator pattern.
regexTokenizer = RegexTokenizer(
    pattern=r"[,\s]",
    inputCol="desc",
    outputCol="words",
)
df2 = regexTokenizer.transform(df)

# Show the first row untruncated to inspect the produced tokens.
df2.select(['id', 'name', 'desc', 'words']).show(n=1, truncate=False)

+---+----------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [98]:
# Default stop-word lists shipped with Spark, one per language.
rus_stopwords, eng_stopwords, esp_stopwords = (
    StopWordsRemover.loadDefaultStopWords(language)
    for language in ("russian", "english", "spanish")
)

In [99]:
# Disabled experiment: add punctuation symbols to the Russian stop-word list.
# for symbol in '\.,|/-`~<>!@#$%^&*()"№%'':,.;"':
#     rus_stopwords.append(symbol)
# rus_stopwords

In [100]:
# Inspect the English stop-word list.
eng_stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 'her',
 'hers',
 'herself',
 'it',
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each',
 'few',
 'more',
 'most',
 'other',
 'some',
 'such',
 'no',
 'nor',
 '

In [101]:

# One combined stop-word list: Russian, then English, then Spanish.
stop_words_3_lang = rus_stopwords + eng_stopwords + esp_stopwords

# Remove those words from `words`, writing the result to `w_filtered`.
remover = StopWordsRemover(
    stopWords=stop_words_3_lang,
    inputCol="words",
    outputCol="w_filtered",
)
df3 = remover.transform(df2)

# Compare tokens before and after filtering for the first row.
df3.select("words", "w_filtered").show(n=1, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [102]:
# Hash the filtered tokens into a 10000-dimensional term-frequency vector.
ht = HashingTF(
    numFeatures=10000,
    inputCol="w_filtered",
    outputCol="vector",
)
df4 = ht.transform(df3)

# Show the default number of rows without truncating cell contents.
df4.show(truncate=False)

+--------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [103]:
# Display the DataFrame repr (column names and types) after vectorisation.
df4

DataFrame[cat: string, desc: string, id: bigint, lang: string, name: string, provider: string, words: array<string>, w_filtered: array<string>, vector: vector]

In [104]:
# Spot-check the tokens and vector produced for course 23126.
(
    df4
    .select('id', 'lang', 'w_filtered', 'vector')
    .where(F.col("id") == 23126)
    .show(n=6, truncate=False, vertical=True)
)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [105]:
# Register df4 as the temporary view `df4_table` so it can be queried with SQL.
df4.createOrReplaceTempView("df4_table")

In [106]:
# Self-join of the vectorised courses: every target course (alias `t`) is
# paired with every course that has the same language (alias `t2`).
# The target ids come from `task_list` instead of being repeated by hand, and
# are emitted as integer literals to match the bigint `id` column.
if not task_list:
    raise ValueError('task_list is empty: no target course ids to query')
target_ids_sql = ", ".join(str(task[0]) for task in task_list)

query = """
SELECT
  t.id
, t.lang
, t.w_filtered
, t.vector
, t2.id as id2
, t2.lang as lang2
, t2.w_filtered as w_filtered2
, t2.vector as vector2
FROM df4_table t
JOIN df4_table t2 ON t.lang = t2.lang
WHERE t.id IN ({target_ids})
""".format(target_ids=target_ids_sql)

In [107]:
# Run the SQL text held in `query` and keep the resulting DataFrame.
df5 = spark.sql(query)

In [113]:
# Preview the first 10 joined (course, candidate) pairs.
df5.show(10)

+-----+----+--------------------+--------------------+---+-----+--------------------+--------------------+
|   id|lang|          w_filtered|              vector|id2|lang2|         w_filtered2|             vector2|
+-----+----+--------------------+--------------------+---+-----+--------------------+--------------------+
|21617|  en|[introduction, co...|(10000,[161,213,3...|  4|   en|[course, introduc...|(10000,[36,63,138...|
|21617|  en|[introduction, co...|(10000,[161,213,3...|  5|   en|[online, course, ...|(10000,[32,222,36...|
|21617|  en|[introduction, co...|(10000,[161,213,3...|  7|   en|[live, digitally,...|(10000,[493,721,8...|
|21617|  en|[introduction, co...|(10000,[161,213,3...|  8|   en|[self-paced, cour...|(10000,[32,65,115...|
|21617|  en|[introduction, co...|(10000,[161,213,3...|  9|   en|[game-based, cour...|(10000,[56,268,30...|
|21617|  en|[introduction, co...|(10000,[161,213,3...| 10|   en|[what’s, digital,...|(10000,[1045,2044...|
|21617|  en|[introduction, co...|(100

In [114]:
@udf("double")
def sim_cos(v1, v2):
    """Cosine similarity between two vectors, exposed as a Spark UDF.

    The return type is declared explicitly as ``double``: a bare ``@udf``
    defaults to a string return type, which turns the score into text and
    makes any later ordering lexicographic instead of numeric.

    Args:
        v1: first vector (an object exposing ``dot`` and ``norm``), or None.
        v2: second vector of the same kind as ``v1``, or None.

    Returns:
        ``dot(v1, v2) / (norm2(v1) * norm2(v2))`` as a float, or ``0.0`` when
        either vector is missing or has zero length (the similarity is
        undefined in those cases).
    """
    if v1 is None or v2 is None:
        return 0.0
    # Product of the two Euclidean (L2) norms.
    denominator = float(v1.norm(2)) * float(v2.norm(2))
    if denominator == 0.0:
        # Zero-length vector: avoid dividing by zero.
        return 0.0
    return float(v1.dot(v2)) / denominator

In [115]:
from pyspark.sql.functions import row_number,lit
from pyspark.sql.window import Window

# Ids of the courses to produce recommendations for, taken from `task_list`.
target_ids = [task[0] for task in task_list]
TOP_N = 10  # number of most similar courses kept per target course

# Similarity of every (target course, candidate course) pair, excluding a
# course paired with itself. The cast keeps the score numeric even when the
# UDF output arrives as a string column.
df6 = (
    df5
    .select(
        F.col("id"),
        F.col("id2"),
        sim_cos("vector", "vector2").cast("double").alias("cos_dim"),
    )
    .where(F.col("id").isin(target_ids) & (F.col("id2") != F.col("id")))
)

# Rank candidates inside each target course by descending similarity.
# The ordering has to be part of the window spec itself: ordering by
# lit('cos_dim') sorts by a constant string, leaving row_number() arbitrary,
# and a preceding DataFrame.orderBy is not guaranteed to survive the
# repartitioning done for the window. `id2` breaks ties deterministically.
wind_func = Window.partitionBy("id").orderBy(
    F.col("cos_dim").desc(),
    F.col("id2").asc(),
)
df7 = df6.withColumn("row_num", row_number().over(wind_func))

# Keep the TOP_N best-ranked candidates per target course.
df8 = df7.where(F.col("row_num") <= TOP_N)
df8.show(25)

+-----+-----+-------------------+-------+
|   id|  id2|            cos_dim|row_num|
+-----+-----+-------------------+-------+
|23126|13665|0.46182450709912204|      1|
|23126|25782| 0.3950339293497708|      2|
|23126|13782| 0.3450327796711772|      3|
|23126| 2724| 0.3255893170258271|      4|
|23126|23718|0.31825488783904093|      5|
|23126| 7153|0.30631472535868753|      6|
|23126|13348| 0.3010001199617141|      7|
|23126|23756|0.29217364807353613|      8|
|23126| 4342| 0.2831920305186628|      9|
|23126|10921|0.28175885158143726|     10|
|16627|11431| 0.5979763080562118|      1|
|16627|17964| 0.4217357203567792|      2|
|16627|12660| 0.4090909206150068|      3|
|16627|12247| 0.4011248379647904|      4|
|16627| 5687| 0.3824737488730695|      5|
|16627| 9598|0.36794059063579965|      6|
|16627|16694| 0.3632612873491998|      7|
|16627|12598|0.35088280888029433|      8|
|16627|17961|0.33378437475919104|      9|
|16627|11575| 0.3337637953685698|     10|
|13702|21033|0.12993504870941053| 

In [116]:
# Collapse the per-pair rows into one row per target course: `arrg_list`
# gathers that course's `id2` values, each wrapped in a single-field struct.
# NOTE(review): collect_list does not promise any element order - confirm
# whether the consumer of this output needs the ranking order preserved.
df9 = (
    df8
    .select("id", "id2")
    .groupBy("id")
    .agg(F.collect_list(F.struct("id2")).alias("arrg_list"))
)

In [117]:
# Preview the aggregated result (one row per target course id).
df9.show(10)

+-----+--------------------+
|   id|           arrg_list|
+-----+--------------------+
|23126|[[13665], [25782]...|
|16627|[[11431], [17964]...|
|13702|[[21026], [21421]...|
|16704|[[1236], [1365], ...|
|11556|[[16488], [10447]...|
|21617|[[21609], [21608]...|
+-----+--------------------+



In [118]:
from pyspark.sql.functions import to_json

In [119]:
# Write the result in JSON format to the path `lab02.json`, coalesced to one
# partition first. `overwrite` makes the cell re-runnable: the default save
# mode raises an error once the target path already exists.
df9.coalesce(1).write.mode('overwrite').format('json').save('lab02.json')

In [392]:
# Second way of saving: collect the rows to the driver and write them with
# the standard json module.
import json

# Row is a tuple subclass, so dumping Row objects directly yields positional
# arrays with no field names. Converting each Row (and nested Rows) to a dict
# first keeps the `id` / `arrg_list` / `id2` keys in the output file.
collected_df = [row.asDict(recursive=True) for row in df9.collect()]
with open('test2lab02.json', 'w') as outfile:
    json.dump(collected_df, outfile)

In [75]:
# Stop the SparkSession now that the notebook's work is finished.
spark.stop()