In [1]:
import os
import sys

# Point PySpark at the cluster's Python interpreter and Spark installation,
# and ask for two executors, before any pyspark module is loaded.
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 2 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Make the pyspark package and its bundled py4j importable.
# The py4j archive name is version-pinned, so it has to match the file that
# actually exists under SPARK_HOME/python/lib.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Run PySpark's interactive bootstrap script in this namespace (it is what
# makes the `spark` session available). The file is read inside a `with`
# block so the handle is closed deterministically, and compiled with its real
# path so any traceback points at shell.py instead of "<string>".
shell_script = os.path.join(spark_home, 'python/pyspark/shell.py')
with open(shell_script) as shell_file:
    shell_source = shell_file.read()
exec(compile(shell_source, shell_script, 'exec'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark import Row
import json
from pyspark.ml.feature import HashingTF, IDF
import re

# Obtain the session through the builder, using a default SparkConf and the
# application name "Lab2". getOrCreate() returns an already-running session
# when there is one instead of starting a second one.
conf = SparkConf()
spark = SparkSession.builder.config(conf=conf).appName("Lab2").getOrCreate()

In [3]:
# Echo the session object as the cell result to confirm it is available.
spark

In [4]:
# Load the course catalogue; spark.read.json with default options expects one
# JSON record per line. Then preview the first 10 rows.
data = spark.read.json(path="/labs/slaba02/DO_record_per_line.json")
data.show(n=10)

+--------------------+--------------------+---+----+--------------------+--------------+
|                 cat|                desc| id|lang|                name|      provider|
+--------------------+--------------------+---+----+--------------------+--------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|
|              11/law|This online cours...|  5|  en|American Counter ...|Canvas Network|
|5/computer_scienc...|This course is ta...|  6|  fr|Arithmétique: en ...|Canvas Network|
|  14/social_sciences|We live in a digi...|  7|  en|Becoming a Dynami...|Canvas Network|
|2/biology_life_sc...|This self-paced c...|  8|  en|           Bioethics|Canvas Network|
|9/humanities|15/m...|This game-based c...|  9|  en|College Foundatio...|Canvas Network|
|  14/social_sciences|What’s in your di...| 10|  en|Digital Literacies I|Canvas Network|
|  14/social_sciences|The goal of the D...| 11|  en|Digital Literacie...|Canvas Network|
|  14/social_sciences

In [5]:
# Target courses for which similar courses are looked up.
# Each entry is [course id, language code, course name].
courses = [
    [23126, u'en', u'Compass - powerful SASS library that makes your life easier'],
    [21617, u'en', u'Preparing for the AP* Computer Science A Exam \u2014 Part 2'],
    [16627, u'es', u'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'],
    [11556, u'es', u'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'],
    [16704, u'ru', u'\u041f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u0435 \u043d\u0430 Lazarus'],
    [13702, u'ru', u'\u041c\u0430\u0442\u0435\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u0430'],
]

# Ids of the target courses, in declaration order.
[course_id for course_id, _lang, _title in courses]

[23126, 21617, 16627, 11556, 16704, 13702]

In [6]:
# Language code of each target course, in declaration order.
[lang for _course_id, lang, _title in courses]

['en', 'en', 'es', 'es', 'ru', 'ru']

In [7]:
def tokenize(series):
    """Split every string of a pandas Series into word tokens.

    A token is a run of two or more word characters (Unicode-aware letters,
    digits or underscore). Shorter runs and all other characters are dropped.

    Args:
        series: pandas Series of strings (accessed through ``.str``).

    Returns:
        pandas Series whose elements are the lists of tokens found in the
        corresponding input strings.
    """
    # Raw string: '\w' and '\d' are not valid Python string escapes, so the
    # non-raw spelling raises an invalid-escape DeprecationWarning on
    # Python 3.6+. The compiled pattern itself is unchanged.
    regex = re.compile(r'[\w\d]{2,}', re.U)
    return series.str.findall(regex)

In [8]:
# Expose tokenize to Spark as a pandas UDF that yields an array of strings.
tokenize_udf = F.pandas_udf(
    f=tokenize,
    returnType=ArrayType(StringType()),
)

In [12]:
# Tokenize the lower-cased description into a new `desc_words` column and
# preview the first 10 rows.
data = data.withColumn(
    'desc_words',
    tokenize_udf(F.lower(F.col('desc'))),
)
data.show(10)

+--------------------+--------------------+---+----+--------------------+--------------+--------------------+
|                 cat|                desc| id|lang|                name|      provider|          desc_words|
+--------------------+--------------------+---+----+--------------------+--------------+--------------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|[this, course, in...|
|              11/law|This online cours...|  5|  en|American Counter ...|Canvas Network|[this, online, co...|
|5/computer_scienc...|This course is ta...|  6|  fr|Arithmétique: en ...|Canvas Network|[this, course, is...|
|  14/social_sciences|We live in a digi...|  7|  en|Becoming a Dynami...|Canvas Network|[we, live, in, di...|
|2/biology_life_sc...|This self-paced c...|  8|  en|           Bioethics|Canvas Network|[this, self, pace...|
|9/humanities|15/m...|This game-based c...|  9|  en|College Foundatio...|Canvas Network|[this, game, base...|
|  14/soci

In [13]:
# Hash the token lists into 10000-dimensional term-frequency vectors (`tf`).
hashingTF = HashingTF(
    numFeatures=10000,
    inputCol="desc_words",
    outputCol="tf",
)
tf_data = hashingTF.transform(data)
tf_data.show()

+--------------------+--------------------+---+----+--------------------+--------------+--------------------+--------------------+
|                 cat|                desc| id|lang|                name|      provider|          desc_words|                  tf|
+--------------------+--------------------+---+----+--------------------+--------------+--------------------+--------------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|[this, course, in...|(10000,[36,63,138...|
|              11/law|This online cours...|  5|  en|American Counter ...|Canvas Network|[this, online, co...|(10000,[32,222,36...|
|5/computer_scienc...|This course is ta...|  6|  fr|Arithmétique: en ...|Canvas Network|[this, course, is...|(10000,[30,118,12...|
|  14/social_sciences|We live in a digi...|  7|  en|Becoming a Dynami...|Canvas Network|[we, live, in, di...|(10000,[493,572,7...|
|2/biology_life_sc...|This self-paced c...|  8|  en|           Bioethics|Canvas Net

In [14]:
# Fit inverse document frequencies on the term-frequency vectors, then apply
# the fitted model. Note the output column is named "idf" even though it
# holds the IDF-rescaled `tf` vectors.
idf = (
    IDF(minDocFreq=1, inputCol="tf", outputCol="idf")
    .fit(tf_data)
)
tfidf_data = idf.transform(tf_data)
tfidf_data.show()

+--------------------+--------------------+---+----+--------------------+--------------+--------------------+--------------------+--------------------+
|                 cat|                desc| id|lang|                name|      provider|          desc_words|                  tf|                 idf|
+--------------------+--------------------+---+----+--------------------+--------------+--------------------+--------------------+--------------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|[this, course, in...|(10000,[36,63,138...|(10000,[36,63,138...|
|              11/law|This online cours...|  5|  en|American Counter ...|Canvas Network|[this, online, co...|(10000,[32,222,36...|(10000,[32,222,36...|
|5/computer_scienc...|This course is ta...|  6|  fr|Arithmétique: en ...|Canvas Network|[this, course, is...|(10000,[30,118,12...|(10000,[30,118,12...|
|  14/social_sciences|We live in a digi...|  7|  en|Becoming a Dynami...|Canvas Network|

In [15]:
# Keep only rows whose language is one of the target courses' languages.
need_lang_data = tfidf_data.filter(
    F.col('lang').isin([lang for _course_id, lang, _title in courses])
)
need_lang_data.show()

+--------------------+--------------------+---+----+--------------------+--------------+--------------------+--------------------+--------------------+
|                 cat|                desc| id|lang|                name|      provider|          desc_words|                  tf|                 idf|
+--------------------+--------------------+---+----+--------------------+--------------+--------------------+--------------------+--------------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|[this, course, in...|(10000,[36,63,138...|(10000,[36,63,138...|
|              11/law|This online cours...|  5|  en|American Counter ...|Canvas Network|[this, online, co...|(10000,[32,222,36...|(10000,[32,222,36...|
|  14/social_sciences|We live in a digi...|  7|  en|Becoming a Dynami...|Canvas Network|[we, live, in, di...|(10000,[493,572,7...|(10000,[493,572,7...|
|2/biology_life_sc...|This self-paced c...|  8|  en|           Bioethics|Canvas Network|

In [19]:
# Rows of the target courses themselves, selected by id.
test_data = need_lang_data.filter(
    F.col('id').isin([course_id for course_id, _lang, _title in courses])
)
test_data.show()

+--------------------+--------------------+-----+----+--------------------+--------+--------------------+--------------------+--------------------+
|                 cat|                desc|   id|lang|                name|provider|          desc_words|                  tf|                 idf|
+--------------------+--------------------+-----+----+--------------------+--------+--------------------+--------------------+--------------------+
|                    | La transformació...|11556|  es|Aprendizaje Colab...|   Udemy|[la, transformaci...|(10000,[249,522,5...|(10000,[249,522,5...|
|6/economics_finan...|Математическая эк...|13702|  ru|Математическая эк...|  Intuit|[математическая, ...|(10000,[310,942,2...|(10000,[310,942,2...|
|                    | Hazte más emplea...|16627|  es|Aprende Excel: Ni...|   Udemy|[hazte, más, empl...|(10000,[55,76,192...|(10000,[55,76,192...|
|5/computer_scienc...|В курсе рассматри...|16704|  ru|Программирование ...|  Intuit|[курсе, рассматри...|(10000,

In [17]:
@F.udf(returnType="double")
def sim_cos(v1,v2):
    """Cosine similarity of two vectors, as a Spark UDF returning a double.

    Args:
        v1: vector object exposing ``dot`` and ``norm`` (the call site in this
            notebook passes the ``idf`` vector columns).
        v2: second vector of the same kind.

    Returns:
        ``dot(v1, v2) / (||v1||_2 * ||v2||_2)`` as a Python float, or 0.0 when
        either input is null or either vector has zero length, so the
        similarity is undefined.

    Any other failure now propagates instead of being silently turned into 0.
    """
    if v1 is None or v2 is None:
        return 0.0
    # Plain Python floats: the declared double return type needs a float.
    norm_product = float(v1.norm(2) * v2.norm(2))
    if norm_product == 0.0:
        # Undefined for a zero vector; keep the previous "0" fallback, but as
        # a float so the result matches the declared type.
        return 0.0
    return float(v1.dot(v2)) / norm_product

In [20]:
# Pair every target course ("test") with every other course in the same
# language ("langs") and score each pair by cosine similarity of their
# vectors from the `idf` column. The result is ordered by target id, then
# candidate id.
final_data = (
    test_data.alias("test")
    .join(
        need_lang_data.alias("langs"),
        (F.col("test.id") != F.col("langs.id"))
        & (F.col('test.lang') == F.col('langs.lang')),
    )
    .select(
        F.col("test.id").alias("test_id"),
        F.col("langs.id").alias("id"),
        F.col("test.lang").alias("lang"),
        sim_cos("test.idf", "langs.idf").cast('double').alias("sim_cosine"),
    )
    .sort("test_id", "id")
)

final_data.show()

+-------+----+----+-------------------+
|test_id|  id|lang|         sim_cosine|
+-------+----+----+-------------------+
|  11556|  59|  es|0.32007148181296435|
|  11556| 124|  es|0.22274961789062608|
|  11556| 160|  es| 0.2466710896939478|
|  11556| 166|  es|0.28279398152797525|
|  11556| 196|  es|0.29704364895916324|
|  11556| 198|  es|0.27053950813287586|
|  11556| 252|  es| 0.1855338581590521|
|  11556| 272|  es|0.29186082339474834|
|  11556| 273|  es|0.19610893769580712|
|  11556| 386|  es|0.23931124652001923|
|  11556| 387|  es|  0.356753371222401|
|  11556| 468|  es|0.44855573811207733|
|  11556|3660|  es|  0.330469409825434|
|  11556|3810|  es| 0.2569888045406846|
|  11556|3870|  es|0.17403609739501189|
|  11556|3872|  es|0.13080267858915573|
|  11556|3873|  es|0.13752122617558823|
|  11556|3874|  es|0.12939753648484184|
|  11556|3875|  es|0.14027261187376897|
|  11556|3876|  es|0.17165679005070722|
+-------+----+----+-------------------+
only showing top 20 rows



In [21]:
# For each target course, collect the ids of the 10 candidates with the
# highest cosine similarity (most similar first).
final_dict = {}

for course_id, _lang, _title in courses:
    top_matches = (
        final_data
        .filter(F.col('test_id') == course_id)
        .sort(F.col('sim_cosine').desc())
        .limit(10)
    )
    final_dict[course_id] = top_matches.select('id').toPandas()['id'].tolist()

In [22]:
# Display the mapping: target course id -> list of the 10 recommended ids.
final_dict

{23126: [14760, 13665, 13782, 20638, 24419, 15909, 2724, 25782, 17499, 13348],
 21617: [21609, 21616, 21608, 22298, 21630, 21628, 21623, 21508, 21081, 19417],
 16627: [11431, 11575, 12247, 17964, 5687, 17961, 16694, 12660, 25010, 5558],
 11556: [16488, 468, 13461, 23357, 19330, 7833, 9289, 10447, 22710, 11340],
 16704: [1236, 1247, 1365, 1273, 20288, 1164, 8186, 1233, 8203, 8207],
 13702: [864, 21079, 8313, 1041, 28074, 8300, 1033, 13057, 21025, 1111]}

In [36]:
# Write the recommendations to lab02.json in the current working directory.
# json.dump serialises the integer course-id keys as JSON strings.
with open("lab02.json", mode="w") as result_file:
    json.dump(final_dict, fp=result_file)

In [None]:
# Shut the Spark session down once the results have been written.
spark.stop()