In [534]:
# Stop a session left over from a previous run of this notebook. On a fresh
# kernel `spark` is not defined yet, so guard the call instead of raising
# NameError and breaking "Restart & Run All".
if "spark" in globals():
    spark.stop()

In [535]:
# Stop a SparkContext left over from a previous run, if there is one. On a
# fresh kernel `sc` is not defined yet, so guard the call instead of raising
# NameError.
if "sc" in globals():
    sc.stop()

In [536]:
import os
import sys

# Python interpreter for PySpark, the Spark installation to bootstrap from,
# and the arguments handed to spark-submit when the shell starts.
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 3 pyspark-shell'

# Index directly: the variable was set just above, so a None fallback would
# only hide a mistake.
spark_home = os.environ["SPARK_HOME"]

# Make the bundled pyspark and py4j packages importable.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Run PySpark's interactive start-up script in this namespace. Read it inside
# a `with` block so the file handle is closed, and compile it with its real
# path so tracebacks point at shell.py instead of "<string>".
shell_path = os.path.join(spark_home, 'python/pyspark/shell.py')
with open(shell_path) as shell_file:
    shell_source = shell_file.read()
exec(compile(shell_source, shell_path, 'exec'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [537]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession  # imported explicitly instead of relying on names leaked by shell.py

conf = SparkConf().setAll([('spark.executor.memory', '4g'),
                           ('spark.executor.cores', '4'),
                           ('spark.cores.max', '4'),
                           # NOTE(review): per the Spark configuration docs, driver
                           # memory cannot be changed from SparkConf in client mode
                           # once the driver JVM is running; pass --driver-memory in
                           # PYSPARK_SUBMIT_ARGS if 8g is really required.
                           ('spark.driver.memory','8g')])
conf.set("spark.app.name", "Barinov lab2")

# A session may already be running at this point. getOrCreate() would return
# it as is, and the executor sizing and application name above would be
# silently ignored, because they are fixed when the SparkContext starts.
# Stop the running session so the builder creates a new context from `conf`.
if "spark" in globals():
    spark.stop()

spark = SparkSession.builder.config(conf=conf).appName("Barinov lab2").getOrCreate()
# Keep `sc` bound to the live context so that a later sc.stop() shuts down
# this session rather than the one that was just replaced.
sc = spark.sparkContext

In [538]:
# Display the active SparkSession object as the cell output.
spark

In [539]:
from pyspark.sql.types import FloatType, DoubleType, StringType, ArrayType
# Import only the function that is used unqualified. A star import from
# pyspark.sql.functions shadows Python builtins (sum, min, max, round, ...)
# and hides where names come from; everything else is reached through `f`.
from pyspark.sql.functions import lit
import pyspark.sql.functions as f
# Allow joins that Spark plans as Cartesian products.
# NOTE(review): presumably needed because the similarity joins compare a
# literal id with a column - confirm against the query plan.
spark.conf.set("spark.sql.crossJoin.enabled", True)

In [541]:
# Load the course catalogue; the file name indicates one JSON record per line.
docs = spark.read.json("/labs/slaba02/DO_record_per_line.json")

In [542]:
# Print the schema Spark inferred from the JSON records.
docs.printSchema()

root
 |-- cat: string (nullable = true)
 |-- desc: string (nullable = true)
 |-- id: long (nullable = true)
 |-- lang: string (nullable = true)
 |-- name: string (nullable = true)
 |-- provider: string (nullable = true)



In [543]:
# Preview the first five records.
docs.show(5)

+--------------------+--------------------+---+----+--------------------+--------------+
|                 cat|                desc| id|lang|                name|      provider|
+--------------------+--------------------+---+----+--------------------+--------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|
|              11/law|This online cours...|  5|  en|American Counter ...|Canvas Network|
|5/computer_scienc...|This course is ta...|  6|  fr|Arithmétique: en ...|Canvas Network|
|  14/social_sciences|We live in a digi...|  7|  en|Becoming a Dynami...|Canvas Network|
|2/biology_life_sc...|This self-paced c...|  8|  en|           Bioethics|Canvas Network|
+--------------------+--------------------+---+----+--------------------+--------------+
only showing top 5 rows



In [544]:
import re

# A token is a run of two or more Unicode word characters. The pattern is
# written as a raw string so "\w" and "\d" are not treated as (invalid)
# string escapes; the compiled pattern is unchanged.
regex = re.compile(r'[\w\d]{2,}', re.U)


def tokenize(text):
    """Return the lower-cased tokens of ``text``.

    ``desc`` is a nullable column, so a missing description yields an empty
    list instead of raising AttributeError inside the Spark task.
    """
    if text is None:
        return []
    return regex.findall(text.lower())


regex_udf = f.udf(tokenize, ArrayType(StringType()))

# Add the token list of each description as the "words" column.
docs = docs.withColumn("words", regex_udf(docs.desc))
docs.show()

+--------------------+--------------------+---+----+--------------------+--------------+--------------------+
|                 cat|                desc| id|lang|                name|      provider|               words|
+--------------------+--------------------+---+----+--------------------+--------------+--------------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|[this, course, in...|
|              11/law|This online cours...|  5|  en|American Counter ...|Canvas Network|[this, online, co...|
|5/computer_scienc...|This course is ta...|  6|  fr|Arithmétique: en ...|Canvas Network|[this, course, is...|
|  14/social_sciences|We live in a digi...|  7|  en|Becoming a Dynami...|Canvas Network|[we, live, in, di...|
|2/biology_life_sc...|This self-paced c...|  8|  en|           Bioethics|Canvas Network|[this, self, pace...|
|9/humanities|15/m...|This game-based c...|  9|  en|College Foundatio...|Canvas Network|[this, game, base...|
|  14/soci

In [545]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
# Disabled alternative: build the "words" column with Spark ML's Tokenizer.
# The notebook creates "words" with regex_udf instead, so this stays commented out.
# tokenizer = Tokenizer(inputCol="desc", outputCol="words")
# wordsData = tokenizer.transform(docs)

In [509]:
# wordsData.take(5)

In [546]:
# Term-frequency step: hash each token of "words" into a 10000-dimensional
# vector stored in "rawFeatures".
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=10000)
featurizedData = hashingTF.transform(docs)

In [548]:
# Preview the hashed term-frequency vectors of the first three rows.
featurizedData.select("rawFeatures").show(3)

+--------------------+
|         rawFeatures|
+--------------------+
|(10000,[36,63,138...|
|(10000,[32,222,36...|
|(10000,[30,118,12...|
+--------------------+
only showing top 3 rows



In [549]:
# Fit IDF weights on the term-frequency vectors of the whole corpus, then
# apply them to get the TF-IDF vectors in "features".
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

# Show five rows without truncating the vector column.
rescaledData.select("id", "lang", "features").show(5,False)

+---+----+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [550]:
from pyspark.ml.feature import Normalizer
# Scale every TF-IDF vector to unit length and store it in "norm". No `p` is
# passed, so Normalizer's documented default p-norm (2.0) applies.
normalizer = Normalizer(inputCol="features", outputCol="norm")
data = normalizer.transform(rescaledData)

In [552]:
# Print the first row vertically, without truncation, to inspect its normalised vector.
data.select("id", "norm").show(1,vertical=True,truncate=False)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [594]:
# UDF returning the dot product of two vectors as a double. It is applied to
# the "norm" columns in the similarity queries; for unit-length vectors the
# dot product equals the cosine similarity.
dot_udf = f.udf(lambda x,y: float(x.dot(y)), DoubleType())

In [563]:
# Look up the language of each of the six target courses.
data.where(data.id.isin (23126, 21617, 16627, 11556, 16704, 13702)).select('id', 'lang').show()

+-----+----+
|   id|lang|
+-----+----+
|11556|  es|
|13702|  ru|
|16627|  es|
|16704|  ru|
|21617|  en|
|23126|  en|
+-----+----+



In [597]:
%%time
data_23126 = data[(data.lang == 'en')].withColumn("id_recom", lit(23126)).alias("i")\
    .join(data.alias("j"), f.col("i.id_recom") == f.col("j.id"), how='inner')\
    .select(
        f.col("i.lang"), 
        f.col("i.id"),
        f.col("i.id_recom"),
        f.col("i.norm"),
        f.col("j.norm"),
        dot_udf("i.norm", "j.norm").alias("dot"))\
    .orderBy("dot", ascending=0)


data_21617 = data[(data.lang == 'en')].withColumn("id_recom", lit(21617)).alias("i")\
    .join(data.alias("j"), f.col("i.id_recom") == f.col("j.id"), how='inner')\
    .select(
        f.col("i.lang"), 
        f.col("i.id"),
        f.col("i.id_recom"),
        f.col("i.norm"),
        f.col("j.norm"),
        dot_udf("i.norm", "j.norm").alias("dot"))\
    .orderBy("dot", ascending=0)


data_16704 = data[(data.lang == 'ru')].withColumn("id_recom", lit(16704)).alias("i")\
    .join(data.alias("j"), f.col("i.id_recom") == f.col("j.id"), how='inner')\
    .select(
        f.col("i.lang"), 
        f.col("i.id"),
        f.col("i.id_recom"),
        f.col("i.norm"),
        f.col("j.norm"),
        dot_udf("i.norm", "j.norm").alias("dot"))\
    .orderBy("dot", ascending=0)


data_13702 = data[(data.lang == 'ru')].withColumn("id_recom", lit(13702)).alias("i")\
    .join(data.alias("j"), f.col("i.id_recom") == f.col("j.id"), how='inner')\
    .select(
        f.col("i.lang"), 
        f.col("i.id"),
        f.col("i.id_recom"),
        f.col("i.norm"),
        f.col("j.norm"),
        dot_udf("i.norm", "j.norm").alias("dot"))\
    .orderBy("dot", ascending=0)


data_11556 = data[(data.lang == 'es')].withColumn("id_recom", lit(11556)).alias("i")\
    .join(data.alias("j"), f.col("i.id_recom") == f.col("j.id"), how='inner')\
    .select(
        f.col("i.lang"), 
        f.col("i.id"),
        f.col("i.id_recom"),
        f.col("i.norm"),
        f.col("j.norm"),
        dot_udf("i.norm", "j.norm").alias("dot"))\
    .orderBy("dot", ascending=0)


data_16627 = data[(data.lang == 'es')].withColumn("id_recom", lit(16627)).alias("i")\
    .join(data.alias("j"), f.col("i.id_recom") == f.col("j.id"), how='inner')\
    .select(
        f.col("i.lang"), 
        f.col("i.id"),
        f.col("i.id_recom"),
        f.col("i.norm"),
        f.col("j.norm"),
        dot_udf("i.norm", "j.norm").alias("dot"))\
    .orderBy("dot", ascending=0)

CPU times: user 50.1 ms, sys: 6.61 ms, total: 56.7 ms
Wall time: 529 ms


In [560]:
# Raise the broadcast-join wait timeout to 36000 (the Spark docs give this
# setting in seconds).
spark.conf.set("spark.sql.broadcastTimeout",  36000)

In [618]:
%%time
# Print the first 11 ids of the ranking for target course 23126.
data_23126.select('id').show(11)

+-----+
|   id|
+-----+
|23126|
|14760|
|13665|
|13782|
|20638|
|24419|
|15909|
| 2724|
|25782|
|17499|
|13348|
+-----+
only showing top 11 rows

CPU times: user 6.82 ms, sys: 0 ns, total: 6.82 ms
Wall time: 4.33 s


In [614]:
%%time
# Print the first 12 ids of the ranking for target course 21617.
data_21617.select('id').show(12)

+-----+
|   id|
+-----+
|21609|
|21617|
|21616|
|21608|
|22298|
|21630|
|21628|
|21623|
|21508|
|21081|
|19417|
|21857|
+-----+
only showing top 12 rows

CPU times: user 4.61 ms, sys: 1.17 ms, total: 5.78 ms
Wall time: 4.58 s


In [613]:
%%time
# Print the first 12 ids of the ranking for target course 16704.
data_16704.select('id').show(12)

+-----+
|   id|
+-----+
|16704|
| 1236|
| 1247|
| 1365|
|20288|
| 1273|
| 1164|
| 8186|
| 1233|
| 8203|
| 8207|
|  875|
+-----+
only showing top 12 rows

CPU times: user 1.42 ms, sys: 4.68 ms, total: 6.1 ms
Wall time: 1.08 s


In [612]:
%%time
# Print the first 12 ids of the ranking for target course 13702.
data_13702.select('id').show(12)

+-----+
|   id|
+-----+
|13702|
|  864|
|21079|
| 8313|
| 1041|
|28074|
| 8300|
| 1033|
|13057|
|21025|
| 1111|
| 1110|
+-----+
only showing top 12 rows

CPU times: user 5.45 ms, sys: 882 µs, total: 6.33 ms
Wall time: 2.24 s


In [611]:
%%time
# Print the first 12 ids of the ranking for target course 11556.
data_11556.select('id').show(12)

+-----+
|   id|
+-----+
|11556|
|16488|
|  468|
|13461|
|23357|
|19330|
| 7833|
| 9289|
|10447|
|22710|
|11340|
|  387|
+-----+
only showing top 12 rows

CPU times: user 5.42 ms, sys: 6.9 ms, total: 12.3 ms
Wall time: 7.82 s


In [610]:
%%time
# Print the first 12 ids of the ranking for target course 16627.
data_16627.select('id').show(12)

+-----+
|   id|
+-----+
|16627|
|11431|
|11575|
|12247|
|17964|
| 5687|
|17961|
|16694|
|12660|
|25010|
| 5558|
|13551|
+-----+
only showing top 12 rows

CPU times: user 9.73 ms, sys: 2.47 ms, total: 12.2 ms
Wall time: 5.38 s


In [640]:
import json

# Ten recommended course ids per target course, keyed by the target id as a
# string. The values are hard-coded lists; the target course itself is not
# included in its own list.
answer = {
    "23126": [14760, 13665, 13782, 20638, 24419, 15909, 2724, 25782, 17499, 13348],
    "16627": [11431, 11575, 12247, 17964, 5687, 17961, 16694, 12660, 25010, 5558],
    "11556": [16488, 468, 13461, 23357, 19330, 7833, 9289, 10447, 22710, 11340],
    "13702": [864, 21079, 8313, 1041, 28074, 8300, 1033, 13057, 21025, 1111],
    "16704": [1236, 1247, 1365, 20288, 1273, 1164, 8186, 1233, 8203, 8207],
    "21617": [21609, 21616, 21608, 22298, 21630, 21628, 21623, 21508, 21081, 19417],
}
answer

{'23126': [14760,
  13665,
  13782,
  20638,
  24419,
  15909,
  2724,
  25782,
  17499,
  13348],
 '16627': [11431, 11575, 12247, 17964, 5687, 17961, 16694, 12660, 25010, 5558],
 '11556': [16488, 468, 13461, 23357, 19330, 7833, 9289, 10447, 22710, 11340],
 '13702': [864, 21079, 8313, 1041, 28074, 8300, 1033, 13057, 21025, 1111],
 '16704': [1236, 1247, 1365, 20288, 1273, 1164, 8186, 1233, 8203, 8207],
 '21617': [21609,
  21616,
  21608,
  22298,
  21630,
  21628,
  21623,
  21508,
  21081,
  19417]}

In [641]:
# Serialise the recommendations and write them to lab02.json in the working directory.
with open('lab02.json', 'w') as result_file:
    result_file.write(json.dumps(answer))

In [642]:
# Shut down the SparkContext now that the results have been written.
sc.stop()