In [67]:
import os
import sys
# Environment variables that PySpark reads at start-up: the Python
# interpreter to use, the Spark installation, and the submit arguments
# (three executors).
os.environ.update({
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
    "PYSPARK_SUBMIT_ARGS": '--num-executors 3 pyspark-shell',
})

spark_home = os.environ.get('SPARK_HOME', None)

# Each entry is pushed to the front of sys.path, so the py4j archive
# (inserted last) ends up ahead of Spark's own python directory.
for rel_path in ('python', 'python/lib/py4j-0.10.7-src.zip'):
    sys.path.insert(0, os.path.join(spark_home, rel_path))

In [68]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Application name is carried in an explicit SparkConf; set() returns the
# conf itself, so construction and configuration are chained.
conf = SparkConf().set("spark.app.name", "Yelena Pavlyuk lab02")

# getOrCreate() reuses a live session if one exists, otherwise starts one.
session_builder = SparkSession.builder.config(conf=conf)
spark = session_builder.getOrCreate()

In [165]:
# Display the SparkSession object bound to `spark`.
spark

In [166]:
from pyspark.sql.types import ArrayType,StringType,FloatType
from pyspark.ml.feature import HashingTF, IDF
from pyspark.ml.linalg import DenseVector, SparseVector, Vectors
import pyspark.sql.functions as f
from pyspark.sql.functions import col
import re

In [167]:
!hdfs dfs -head /labs/slaba02/DO_record_per_line.json

{"lang": "en", "name": "Accounting Cycle: The Foundation of Business Measurement and Reporting", "cat": "3/business_management|6/economics_finance", "provider": "Canvas Network", "id": 4, "desc": "This course introduces the basic financial statements used by most businesses, as well as the essential tools used to prepare them. This course will serve as a resource to help business students succeed in their upcoming university-level accounting classes, and as a refresher for upper division accounting students who are struggling to recall elementary concepts essential to more advanced accounting topics. Business owners will also benefit from this class by gaining essential skills necessary to organize and manage information pertinent to operating their business. At the conclusion of the class, students will understand the balance sheet, income statement, and cash flow statement. They will be able to differentiate between cash basis and accrual basis techniques, and know when each is appro

In [168]:
# Read the course records from the JSON file into a DataFrame. No schema is
# passed, so column names and types come from the reader.
dataset = spark.read.json("/labs/slaba02/DO_record_per_line.json")

In [169]:
# Display the DataFrame repr (column names and types).
dataset

DataFrame[cat: string, desc: string, id: bigint, lang: string, name: string, provider: string]

In [170]:
# Preview the first 10 rows.
dataset.show(10)

+--------------------+--------------------+---+----+--------------------+--------------+
|                 cat|                desc| id|lang|                name|      provider|
+--------------------+--------------------+---+----+--------------------+--------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|
|              11/law|This online cours...|  5|  en|American Counter ...|Canvas Network|
|5/computer_scienc...|This course is ta...|  6|  fr|Arithmétique: en ...|Canvas Network|
|  14/social_sciences|We live in a digi...|  7|  en|Becoming a Dynami...|Canvas Network|
|2/biology_life_sc...|This self-paced c...|  8|  en|           Bioethics|Canvas Network|
|9/humanities|15/m...|This game-based c...|  9|  en|College Foundatio...|Canvas Network|
|  14/social_sciences|What’s in your di...| 10|  en|Digital Literacies I|Canvas Network|
|  14/social_sciences|The goal of the D...| 11|  en|Digital Literacie...|Canvas Network|
|  14/social_sciences

In [171]:
# Number of partitions of the DataFrame's underlying RDD.
dataset.rdd.getNumPartitions()

3

In [172]:
# Number of rows in the dataset.
dataset.count()

28153

In [173]:
# Show the record with id 23126 untruncated, one field per line.
# NOTE(review): `id` is a bigint column but is compared with a string
# literal; this presumably relies on Spark's implicit cast - confirm.
dataset.filter(f.col("id") == "23126").show(1, vertical=True, truncate=False)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [174]:
# Tokeniser pattern: runs of two or more Unicode word characters (letters,
# digits, underscore). Written as a raw string so that `\w` and `\d` reach
# the regex engine unchanged instead of being treated as (invalid) string
# escape sequences, which newer Python versions warn about.
regex = re.compile(r'[\w\d]{2,}', re.U)

In [175]:
def regexFunc(str):
    """Split text into lower-cased word tokens.

    Args:
        str: Text to tokenise; may be None or empty. The parameter name
            shadows the builtin but is kept so existing callers are
            unaffected.

    Returns:
        list: Every match of the module-level ``regex`` pattern in the
        lower-cased text, in order of appearance. An empty list when there
        is no text.
    """
    # A missing value (None) has no .lower(); treat it like empty text so a
    # single record without a description does not raise.
    if not str:
        return []
    return regex.findall(str.lower())

In [176]:
# Spark UDF around regexFunc; the declared return type is array<string>.
regexUDF = f.udf(regexFunc, returnType=ArrayType(StringType()))

In [177]:
# Add a "words" column: the tokens regexUDF extracts from each "desc" value.
dataset2 = dataset.withColumn("words", regexUDF(dataset["desc"]))

In [178]:
# Display the DataFrame repr, now including the "words" column.
dataset2

DataFrame[cat: string, desc: string, id: bigint, lang: string, name: string, provider: string, words: array<string>]

In [180]:
# Show the record with id 23126 again, untruncated and one field per line,
# this time from the frame that carries the "words" column.
dataset2.filter(f.col("id") == "23126").show(1, vertical=True, truncate=False)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [181]:
# Term-frequency stage: reads "words" and writes a 10000-dimensional
# "word_vector"; binary=False keeps term counts rather than 0/1 flags.
hasher = HashingTF(numFeatures=10000, binary=False, inputCol="words", outputCol="word_vector")

In [182]:
# Append the "word_vector" column produced by the hasher.
# NOTE(review): dataset2 is rebound to its own transform, so re-running this
# cell alone presumably fails because the output column already exists -
# confirm, and consider a new stage-specific name.
dataset2 = hasher.transform(dataset2)

In [183]:
# Fit the IDF model on the "word_vector" column of the whole dataset; the
# fitted model will write "word_vector_idf".
idf = IDF(inputCol="word_vector", outputCol="word_vector_idf").fit(dataset2)

In [184]:
# Append the "word_vector_idf" column produced by the fitted IDF model.
# NOTE(review): same self-rebinding as the hashing step; re-running this
# cell alone presumably fails on the existing output column - confirm.
dataset2 = idf.transform(dataset2)

In [185]:
# Display the DataFrame repr with both vector columns present.
dataset2

DataFrame[cat: string, desc: string, id: bigint, lang: string, name: string, provider: string, words: array<string>, word_vector: vector, word_vector_idf: vector]

In [186]:
# Show one row's raw and IDF-weighted vectors. The positional arguments to
# show() are n=1, truncate=False, vertical=True.
dataset2.select("id", "word_vector", "word_vector_idf").show(1, False, True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [187]:
def cosine_udf(v, u):
    """Cosine similarity between two vectors.

    Args:
        v: Vector-like object exposing ``dot(other)`` and ``norm(p)``.
        u: Second vector of the same kind.

    Returns:
        float: ``v.dot(u) / (v.norm(2) * u.norm(2))``. NaN when either
        vector has zero length, because the similarity is undefined there.
    """
    denominator = v.norm(2) * u.norm(2)
    # A zero-length vector makes the ratio 0/0. Return NaN explicitly
    # instead of relying on how the underlying number type handles division
    # by zero (warning vs. ZeroDivisionError).
    if denominator == 0:
        return float("nan")
    return float(v.dot(u) / denominator)

In [188]:
# Spark UDF around cosine_udf; takes two vector columns, declared to
# return a float column.
cosine_wrapper = f.udf(cosine_udf, returnType=FloatType())

In [189]:
# Courses to find recommendations for, as [id, language code, title] lists.
courses = [
    [23126, u'en', u'Compass - powerful SASS library that makes your life easier'],
    [21617, u'en', u'Preparing for the AP* Computer Science A Exam \u2014 Part 2'],
    [16627, u'es', u'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'],
    [11556, u'es', u'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'],
    [16704, u'ru', u'\u041f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u0435 \u043d\u0430 Lazarus'],
    [13702, u'ru', u'\u041c\u0430\u0442\u0435\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u0430'],
]
courses

[[23126, 'en', 'Compass - powerful SASS library that makes your life easier'],
 [21617, 'en', 'Preparing for the AP* Computer Science A Exam — Part 2'],
 [16627, 'es', 'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'],
 [11556,
  'es',
  'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'],
 [16704, 'ru', 'Программирование на Lazarus'],
 [13702, 'ru', 'Математическая экономика']]

In [190]:
# For every target course, rank the other courses in the same language by
# cosine similarity of their TF-IDF vectors and keep the ids of the top 10.
# `answer` maps the target id (as a string) to that list of ids.
answer = {}
for id_c, ln, _title in courses:
    # Candidates: same language, excluding the target course itself.
    candidates = (
        dataset2
        .filter((col("lang") == ln) & (col("id") != id_c))
        .select("id", "name", "word_vector_idf")
    )
    # The target's own vector, renamed so it can sit next to each candidate.
    target_vector = (
        dataset2
        .filter(col("id") == id_c)
        .select(col("word_vector_idf").alias("course_vector"))
    )
    # The cross join pairs every candidate row with the target vector.
    scored = candidates.crossJoin(target_vector).withColumn(
        "vector_cosine", cosine_wrapper("word_vector_idf", "course_vector")
    )
    # Drop undefined similarities, then order by similarity with name and id
    # as tie-breakers so the top 10 is deterministic.
    top10 = (
        scored
        .filter("vector_cosine <> 'NaN'")
        .sort(col("vector_cosine").desc(), col("name"), col("id"))
        .select("id")
        .limit(10)
    )
    answer[str(id_c)] = [row["id"] for row in top10.collect()]
print(answer)

{'23126': [14760, 13665, 13782, 20638, 24419, 15909, 2724, 25782, 17499, 13348], '21617': [21609, 21616, 21608, 22298, 21630, 21628, 21623, 21508, 21081, 19417], '16627': [11431, 11575, 12247, 17964, 5687, 17961, 16694, 12660, 25010, 5558], '11556': [16488, 468, 13461, 23357, 19330, 7833, 9289, 10447, 22710, 11340], '16704': [1236, 1247, 1365, 1273, 20288, 1164, 8186, 1233, 8203, 875], '13702': [864, 21079, 8313, 1041, 28074, 8300, 1033, 13057, 21025, 1111]}


In [191]:
import json
# Write the recommendations to lab02.json. The handle is deliberately not
# called `f`: that name is the notebook's alias for pyspark.sql.functions,
# and rebinding it would break any earlier cell that is re-run afterwards.
with open('lab02.json', 'w') as out_file:
    json.dump(answer, out_file, indent=2)

In [192]:
# Stop the Spark session now that the results have been written.
spark.stop()