In [1]:
import glob
import os
import sys

# Environment for the PySpark shell that is bootstrapped at the end of this
# cell: worker Python interpreter, Spark installation, and the resources
# passed to spark-submit.
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 3 --executor-memory 4g --executor-cores 1 --driver-memory 3g pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Make the PySpark sources importable from this kernel.
sys.path.insert(0, os.path.join(spark_home, 'python'))
# Use whichever py4j bundle ships with this Spark build instead of pinning
# one exact version in the path.
for py4j_zip in glob.glob(os.path.join(spark_home, 'python/lib/py4j-*-src.zip')):
    sys.path.insert(0, py4j_zip)

# Run Spark's own shell bootstrap in this namespace. The file is read inside
# a `with` block so the handle is closed, and compiled with its real path so
# tracebacks point at shell.py.
shell_path = os.path.join(spark_home, 'python/pyspark/shell.py')
with open(shell_path) as shell_file:
    shell_source = shell_file.read()
exec(compile(shell_source, shell_path, 'exec'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark import Row
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, Normalizer, StopWordsRemover
from pyspark.sql.functions import col, udf, lit, desc
from pyspark.ml.linalg import SparseVector, VectorUDT, DenseVector
from pyspark.ml.feature import Normalizer
import json
import re

# NOTE(review): a SparkSession named `spark` already exists at this point
# (created by shell.py in the first cell), so getOrCreate() presumably hands
# back that session and the driver/executor settings below may have no
# effect. Confirm, and prefer PYSPARK_SUBMIT_ARGS for launch-time resources.
conf = (
    SparkConf()
    .set("spark.driver.memory", "8g")
    .set('spark.executor.cores', '3')
    .set('spark.yarn.executor.memoryOverhead', '2G')
)

spark = SparkSession.builder.config(conf=conf).appName("test").getOrCreate()

In [3]:
# Load the course catalogue; the file name suggests one JSON record per line.
data = spark.read.json('/labs/slaba02/DO_record_per_line.json')

In [4]:
# Preview the first five raw records and the inferred columns.
data.show(5)

+--------------------+--------------------+---+----+--------------------+--------------+
|                 cat|                desc| id|lang|                name|      provider|
+--------------------+--------------------+---+----+--------------------+--------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|
|              11/law|This online cours...|  5|  en|American Counter ...|Canvas Network|
|5/computer_scienc...|This course is ta...|  6|  fr|Arithmétique: en ...|Canvas Network|
|  14/social_sciences|We live in a digi...|  7|  en|Becoming a Dynami...|Canvas Network|
|2/biology_life_sc...|This self-paced c...|  8|  en|           Bioethics|Canvas Network|
+--------------------+--------------------+---+----+--------------------+--------------+
only showing top 5 rows



In [27]:
# Six target courses as [id, lang, text] triples. The ids are the same ones
# used as keys of the `end` dict that is written to lab02.json.
collection = [[23126, u'en', u'Compass - powerful SASS library that makes your life easier'], 
              [21617, u'en', u'Preparing for the AP* Computer Science A Exam \u2014 Part 2'], 
              [16627, u'es', u'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'], 
              [11556, u'es', u'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'], 
              [16704, u'ru', u'\u041f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u0435 \u043d\u0430 Lazarus'], 
              [13702, u'ru', u'\u041c\u0430\u0442\u0435\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u0430']]

In [28]:
# Distribute the in-memory list of target courses as an RDD.
rdd_data = spark.sparkContext.parallelize(collection)

In [29]:
# Display the RDD's repr to confirm it was created.
rdd_data

ParallelCollectionRDD[8] at parallelize at PythonRDD.scala:195

In [30]:
# Column layout for the target-course triples: integer id, then two strings.
schema = StructType([
    StructField(field_name, field_type)
    for field_name, field_type in (
        ('id', IntegerType()),
        ('lang', StringType()),
        ('desc', StringType()),
    )
])

In [31]:
# Turn the RDD of target courses into a DataFrame with the explicit schema
# and preview it.
target = spark.createDataFrame(rdd_data, schema)
target.show(n=5)

+-----+----+--------------------+
|   id|lang|                desc|
+-----+----+--------------------+
|23126|  en|Compass - powerfu...|
|21617|  en|Preparing for the...|
|16627|  es|Aprende Excel: Ni...|
|11556|  es|Aprendizaje Colab...|
|16704|  ru|Программирование ...|
+-----+----+--------------------+
only showing top 5 rows



In [5]:
def clear_string(series):
    """Lower-case each text in ``series`` and split it into word tokens.

    Parameters
    ----------
    series : pandas.Series
        Series of strings (one text per element).

    Returns
    -------
    pandas.Series
        One list of tokens per element. A token is a run of at least two
        Unicode word characters (letters, digits or underscore), so
        single-character words and punctuation are dropped.
    """
    # Raw string: the previous non-raw literal only worked because Python
    # passes the unknown escapes \w and \d through unchanged, which raises a
    # DeprecationWarning. The pattern text itself is unchanged.
    token_pattern = r'[\w\d]{2,}'
    # Pass the pattern as a string plus flags, which is the documented input
    # of Series.str.findall, rather than a pre-compiled regex object.
    return series.str.lower().str.findall(token_pattern, flags=re.U)

# Wrap clear_string as a pandas UDF that returns array<string>.
# The resulting column is named after the function ('clear_string(desc)'),
# and a later cell refers to that name, so do not rename clear_string.
tokenizer_udf = F.pandas_udf(clear_string, ArrayType(StringType()))

In [6]:
# Keep the key columns and tokenise the description; the UDF output column
# is left unaliased and therefore named 'clear_string(desc)'.
prev = data.select(col('id'), col('lang'), tokenizer_udf(col('desc')))

In [7]:
# Preview the tokenised descriptions.
prev.show(5)

+---+----+--------------------+
| id|lang|  clear_string(desc)|
+---+----+--------------------+
|  4|  en|[this, course, in...|
|  5|  en|[this, online, co...|
|  6|  fr|[this, course, is...|
|  7|  en|[we, live, in, di...|
|  8|  en|[this, self, pace...|
+---+----+--------------------+
only showing top 5 rows



In [8]:
# English stop-word list bundled with Spark ML.
# NOTE(review): the `lang` column also contains non-English rows (e.g. 'fr');
# only English stop words are removed here - confirm this is intended.
stop_words = StopWordsRemover.loadDefaultStopWords("english")
swr = StopWordsRemover(
    inputCol='clear_string(desc)',
    outputCol="words_filtered",
    stopWords=stop_words,
)
# `prev` is reassigned in place, so on a second run of this cell it already
# carries the output column and transform() would fail. Dropping it first
# (a no-op on the first run) makes the cell safe to re-run.
prev = swr.transform(prev.drop(swr.getOutputCol()))
prev.show(5)

+---+----+--------------------+--------------------+
| id|lang|  clear_string(desc)|      words_filtered|
+---+----+--------------------+--------------------+
|  4|  en|[this, course, in...|[course, introduc...|
|  5|  en|[this, online, co...|[online, course, ...|
|  6|  fr|[this, course, is...|[course, taught, ...|
|  7|  en|[we, live, in, di...|[live, digitally,...|
|  8|  en|[this, self, pace...|[self, paced, cou...|
+---+----+--------------------+--------------------+
only showing top 5 rows



In [9]:
# Hash the filtered tokens into a fixed-size term-frequency vector
# (10000 buckets).
ht = HashingTF(
    numFeatures=10000,
    inputCol="words_filtered",
    outputCol="tf",
)
result = ht.transform(prev)
result.show(5)

+---+----+--------------------+--------------------+--------------------+
| id|lang|  clear_string(desc)|      words_filtered|                  tf|
+---+----+--------------------+--------------------+--------------------+
|  4|  en|[this, course, in...|[course, introduc...|(10000,[36,63,138...|
|  5|  en|[this, online, co...|[online, course, ...|(10000,[32,222,36...|
|  6|  fr|[this, course, is...|[course, taught, ...|(10000,[30,118,12...|
|  7|  en|[we, live, in, di...|[live, digitally,...|(10000,[493,721,8...|
|  8|  en|[this, self, pace...|[self, paced, cou...|(10000,[32,115,13...|
+---+----+--------------------+--------------------+--------------------+
only showing top 5 rows



In [10]:
# Fit inverse-document-frequency weights on the TF vectors; `idf` holds the
# fitted model, which is then applied to the same frame.
idf = IDF(
    outputCol="idf",
    inputCol='tf',
).fit(result)
res = idf.transform(result)

In [11]:
# Preview the frame with the added 'idf' column.
res.show(5)

+---+----+--------------------+--------------------+--------------------+--------------------+
| id|lang|  clear_string(desc)|      words_filtered|                  tf|                 idf|
+---+----+--------------------+--------------------+--------------------+--------------------+
|  4|  en|[this, course, in...|[course, introduc...|(10000,[36,63,138...|(10000,[36,63,138...|
|  5|  en|[this, online, co...|[online, course, ...|(10000,[32,222,36...|(10000,[32,222,36...|
|  6|  fr|[this, course, is...|[course, taught, ...|(10000,[30,118,12...|(10000,[30,118,12...|
|  7|  en|[we, live, in, di...|[live, digitally,...|(10000,[493,721,8...|(10000,[493,721,8...|
|  8|  en|[this, self, pace...|[self, paced, cou...|(10000,[32,115,13...|(10000,[32,115,13...|
+---+----+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [12]:
# Scale every TF-IDF vector with Normalizer (default settings) into 'norm'.
normalizer = Normalizer(
    outputCol="norm",
    inputCol="idf",
)
res_end = normalizer.transform(res)

In [13]:
# Preview the final feature frame including the normalised vectors.
res_end.show(5)

+---+----+--------------------+--------------------+--------------------+--------------------+--------------------+
| id|lang|  clear_string(desc)|      words_filtered|                  tf|                 idf|                norm|
+---+----+--------------------+--------------------+--------------------+--------------------+--------------------+
|  4|  en|[this, course, in...|[course, introduc...|(10000,[36,63,138...|(10000,[36,63,138...|(10000,[36,63,138...|
|  5|  en|[this, online, co...|[online, course, ...|(10000,[32,222,36...|(10000,[32,222,36...|(10000,[32,222,36...|
|  6|  fr|[this, course, is...|[course, taught, ...|(10000,[30,118,12...|(10000,[30,118,12...|(10000,[30,118,12...|
|  7|  en|[we, live, in, di...|[live, digitally,...|(10000,[493,721,8...|(10000,[493,721,8...|(10000,[493,721,8...|
|  8|  en|[this, self, pace...|[self, paced, cou...|(10000,[32,115,13...|(10000,[32,115,13...|(10000,[32,115,13...|
+---+----+--------------------+--------------------+--------------------

In [300]:
# Kept so that any other key-less join in the session keeps working; the
# explicit crossJoin below does not depend on this flag.
spark.conf.set( "spark.sql.crossJoin.enabled" , "true" )

# Dot product of two ML vectors, returned as a plain float column.
dot_udf = udf(lambda x,y: float(x.dot(y)), DoubleType())

# Pair every course ("i") with the single target course ("j") and score the
# pair by the dot product of their normalised vectors, best match first.
# An explicit crossJoin replaces the former key-less `how='left'` join: the
# cartesian product is now stated directly, and a missing target id yields
# an empty result instead of passing null vectors into the UDF.
# NOTE(review): 'norm' comes from Normalizer, so this is presumably cosine
# similarity - confirm the normaliser uses the L2 norm.
itog = (
    res_end.alias("i")
    .crossJoin(res_end.where(col("id") == 13702).alias("j"))
    .select(
        col("i.id").alias("i"),
        col("j.id").alias("j"),
        dot_udf("i.norm", "j.norm").alias("dot"),
    )
    .sort(desc('dot'))
    .select('i', 'dot')
)

In [301]:
# Drop the target course itself and keep the ids of the first ten rows of
# the ranked frame.
itog1 = itog.where(col('i') != 13702).select(col('i')).limit(10)
itog1.show()

+-----+
|    i|
+-----+
|  864|
|25502|
|23769|
|21079|
|26206|
|20069|
| 8313|
| 5399|
| 1041|
|  467|
+-----+



In [303]:
def top_similar_ids(course_id, n=10):
    """Return the ids of the ``n`` courses most similar to ``course_id``.

    Similarity is the dot product of the 'norm' vectors in ``res_end``; the
    target course itself is excluded. Returns a list of plain Python ints,
    best match first (empty if ``course_id`` is not present in ``res_end``).
    """
    ranked = (
        res_end.alias("i")
        .crossJoin(res_end.where(col("id") == course_id).alias("j"))
        .select(
            col("i.id").alias("i"),
            dot_udf("i.norm", "j.norm").alias("dot"),
        )
        .where(col("i") != course_id)
        .sort(desc("dot"))
        .limit(n)
    )
    return [row["i"] for row in ranked.collect()]


# Previously this cell referenced end_21617 ... end_16704, which were only
# defined by editing and re-running the cells above once per course id; on a
# fresh kernel that raised NameError. Every target is now computed here.
target_ids = [21617, 23126, 16627, 11556, 16704, 13702]
end = {str(course_id): top_similar_ids(course_id) for course_id in target_ids}

In [304]:
# Display the mapping of target course id -> recommended course ids.
end

{'21617': [21609,
  21616,
  22298,
  21608,
  21628,
  21630,
  21081,
  21623,
  19417,
  21508],
 '23126': [13665,
  14760,
  13782,
  20638,
  24419,
  15909,
  2724,
  25782,
  17499,
  13348],
 '16627': [11431, 12247, 17964, 5687, 11575, 17961, 12660, 25010, 5558, 16694],
 '11556': [16488, 468, 13461, 23357, 19330, 7833, 9289, 10447, 10384, 16929],
 '16704': [3864, 23407, 25724, 25726, 23864, 1236, 18023, 1247, 25627, 11212],
 '13702': [864, 25502, 23769, 21079, 26206, 20069, 8313, 5399, 1041, 467]}

In [305]:
import json

# Persist the recommendations mapping as indented JSON in the working
# directory. `collected_df` is just another name for the `end` dict.
collected_df = end
with open('lab02.json', 'w') as fh:
    json.dump(obj=collected_df, fp=fh, indent=1)

In [21]:
# Shut down the Spark session once the results have been written.
spark.stop()