In [1]:
# Peek at the beginning of the raw input file on HDFS to see the record layout.
!hdfs dfs -head /labs/slaba02/DO_record_per_line.json

{"lang": "en", "name": "Accounting Cycle: The Foundation of Business Measurement and Reporting", "cat": "3/business_management|6/economics_finance", "provider": "Canvas Network", "id": 4, "desc": "This course introduces the basic financial statements used by most businesses, as well as the essential tools used to prepare them. This course will serve as a resource to help business students succeed in their upcoming university-level accounting classes, and as a refresher for upper division accounting students who are struggling to recall elementary concepts essential to more advanced accounting topics. Business owners will also benefit from this class by gaining essential skills necessary to organize and manage information pertinent to operating their business. At the conclusion of the class, students will understand the balance sheet, income statement, and cash flow statement. They will be able to differentiate between cash basis and accrual basis techniques, and know when each is appro

In [2]:
import os
import sys
# Tell PySpark which interpreter, Spark installation and submit arguments to use.
os.environ.update({
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
    "PYSPARK_SUBMIT_ARGS": '--num-executors 3 pyspark-shell',
})

spark_home = os.environ.get('SPARK_HOME', None)

# Prepend Spark's Python sources and then the bundled py4j archive, so that the
# py4j archive ends up first on sys.path.
for rel_path in ('python', 'python/lib/py4j-0.10.7-src.zip'):
    sys.path.insert(0, os.path.join(spark_home, rel_path))

In [3]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Create (or reuse) the SparkSession for this notebook under a fixed application name.
APP_NAME = "ZK Spark Dataframe app"

conf = SparkConf().set("spark.app.name", APP_NAME)

session_builder = SparkSession.builder.config(conf=conf).appName(APP_NAME)
spark = session_builder.getOrCreate()

In [4]:
# Display the active SparkSession.
spark

In [5]:
# Keep a handle to the underlying SparkContext (used below for sc.parallelize and sc.stop).
sc = spark.sparkContext

In [6]:
# Display the SparkContext.
sc

In [7]:
# NOTE(review): %pylab populates the interactive namespace with numpy and matplotlib
# names; the bare `NaN` used further down appears to come from here — confirm before
# replacing this magic with explicit imports.
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [8]:
# Inspect the DataFrameReader entry point used to load the data.
spark.read

<pyspark.sql.readwriter.DataFrameReader at 0x7fd178685c88>

In [9]:
# Load the course catalogue from HDFS; the file holds one JSON record per line.
# The former `.option("sep", "|")` was dropped: "sep" is a CSV-reader setting and
# has no effect on the JSON source, so it only misled the reader.
df = (
    spark.read
    .format("json")
    .load("/labs/slaba02/DO_record_per_line.json")
)

In [10]:
# Preview the first five parsed records.
df.show(5)

+--------------------+--------------------+---+----+--------------------+--------------+
|                 cat|                desc| id|lang|                name|      provider|
+--------------------+--------------------+---+----+--------------------+--------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|
|              11/law|This online cours...|  5|  en|American Counter ...|Canvas Network|
|5/computer_scienc...|This course is ta...|  6|  fr|Arithmétique: en ...|Canvas Network|
|  14/social_sciences|We live in a digi...|  7|  en|Becoming a Dynami...|Canvas Network|
|2/biology_life_sc...|This self-paced c...|  8|  en|           Bioethics|Canvas Network|
+--------------------+--------------------+---+----+--------------------+--------------+
only showing top 5 rows



Переводим описания курсов в массивы слов (токенизация).

In [11]:
# Show two course descriptions in full (no truncation, vertical layout).
df.select('desc').show(2, False, True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [12]:
from pyspark.ml.feature import Tokenizer, HashingTF, IDF

In [14]:
from pyspark.sql.functions import col, udf, lower
from pyspark.sql.types import IntegerType, StringType, ArrayType, DoubleType

In [15]:
import pyspark.sql.functions as F
from pyspark.sql.functions import regexp_replace

In [16]:
# Add a lower-cased copy of the description so that tokens compare case-insensitively.
desc_lower_expr = lower(col('desc'))
df_lower = df.withColumn('desc_lower', desc_lower_expr)
df_lower.show(n=1, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [17]:
# Regex character class listing the punctuation symbols stripped from descriptions.
# Written as a raw triple-quoted literal so that the regex escapes `\[` and `\]`
# are not treated as (invalid) Python string escapes; the resulting pattern text
# is unchanged.
pattern_punct = r'''[.!@"“’«»#$%&'()*+,—/:;<=>?^_`{|}~\[\]]'''
# Remove every character matched by pattern_punct from the lower-cased description.
cleaned_desc_expr = regexp_replace('desc_lower', pattern_punct, '')
df3 = df_lower.withColumn('cleaned_desc', cleaned_desc_expr)

In [18]:
# Inspect two records after punctuation removal.
df3.show(2, False, True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [19]:
# Tokenizer that turns the cleaned description into a "words" array column.
tokenizer = Tokenizer(
    inputCol="cleaned_desc",
    outputCol="words",
)

In [20]:
# Apply the tokenizer: adds the "words" column to every record.
df_token = tokenizer.transform(df3)

In [21]:
# Inspect two tokenized records.
df_token.show(2, False, True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [22]:
# Hash each word list into a 10000-dimensional term-frequency vector.
ht = HashingTF(
    numFeatures=10000,
    inputCol="words",
    outputCol="tf_features",
)
df_tf = ht.transform(df_token)
df_tf.show(n=1, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [23]:
# Drop the columns that are not needed after the term-frequency vectors are built.
tf_unused_cols = ["cat", "desc", "provider", "desc_lower", "cleaned_desc"]
df_tf_cleaned = df_tf.drop(*tf_unused_cols)
df_tf_cleaned.show(n=1, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [24]:
# Fit IDF weights on the whole catalogue and rescale the term-frequency vectors
# into TF-IDF vectors stored in the "features" column.
idf = IDF(
    inputCol="tf_features",
    outputCol="features",
)
idfModel = idf.fit(df_tf_cleaned)
df_tfidf = idfModel.transform(df_tf_cleaned)
df_tfidf.show(n=1, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

DataFrame со своими курсами (курсы, для которых строятся рекомендации).

In [25]:
# Target courses for which recommendations are built, as [course id, language, title].
list_course_id = [
    [23126, 'en', 'Compass - powerful SASS library that makes your life easier'],
    [21617, 'en', 'Preparing for the AP* Computer Science A Exam \u2014 Part 2'],
    [16627, 'es', 'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'],
    [11556, 'es', 'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'],
    [16704, 'ru', '\u041f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u0435 \u043d\u0430 Lazarus'],
    [13702, 'ru', '\u041c\u0430\u0442\u0435\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u0430'],
]

In [26]:
# Echo the list to check how the escaped titles decode.
list_course_id

[[23126, 'en', 'Compass - powerful SASS library that makes your life easier'],
 [21617, 'en', 'Preparing for the AP* Computer Science A Exam — Part 2'],
 [16627, 'es', 'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'],
 [11556,
  'es',
  'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'],
 [16704, 'ru', 'Программирование на Lazarus'],
 [13702, 'ru', 'Математическая экономика']]

In [27]:
from pyspark.sql import Row
# Distribute the target-course records and wrap each record in a single-field Row.
rdd1 = sc.parallelize(list_course_id)
row_rdd = rdd1.map(Row)

In [28]:
# Build the DataFrame of target courses straight from the Python list.
# Wrapping each record in a Row and then unwrapping it again with flatMap was a
# no-op round trip; createDataFrame infers the same positional columns
# (_1: id, _2: language, _3: title).
df_test = spark.createDataFrame(list_course_id)
df_test.show(6, False, True)

-RECORD 0-----------------------------------------------------------------------------
 _1  | 23126                                                                          
 _2  | en                                                                             
 _3  | Compass - powerful SASS library that makes your life easier                    
-RECORD 1-----------------------------------------------------------------------------
 _1  | 21617                                                                          
 _2  | en                                                                             
 _3  | Preparing for the AP* Computer Science A Exam — Part 2                         
-RECORD 2-----------------------------------------------------------------------------
 _1  | 16627                                                                          
 _2  | es                                                                             
 _3  | Aprende Excel: Nivel Intermedio by A

In [29]:
# Show the inferred schema of the target-course DataFrame.
df_test

DataFrame[_1: bigint, _2: string, _3: string]

In [30]:
# Attach the TF-IDF vector to every target course by matching on the course id.
inner_join = df_test.join(df_tfidf, df_test._1 == df_tfidf.id)

# Keep only the target-course columns plus its vector, and give them *_test names
# so they stay distinguishable after the next join against the catalogue.
catalogue_cols = ["id", "lang", "name", "words", "tf_features"]
test_cols = [
    col(old_name).alias(new_name)
    for old_name, new_name in [
        ("_1", "id_test"),
        ("_2", "lang_test"),
        ("_3", "name_test"),
        ("features", "features_test"),
    ]
]
df_test_v = inner_join.drop(*catalogue_cols).cache().select(*test_cols)
df_test_v.show(n=1, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [31]:
# Pair every target course with all catalogue courses written in the same language.
same_language = df_test_v.lang_test == df_tfidf.lang
inner_join2 = df_test_v.join(df_tfidf, on=same_language)
inner_join2.show(n=1, truncate=False, vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [32]:
# Show the schema of the joined (target course, candidate course) pairs.
inner_join2

DataFrame[id_test: bigint, lang_test: string, name_test: string, features_test: vector, id: bigint, lang: string, name: string, words: array<string>, tf_features: vector, features: vector]

In [33]:
# Number of rows in the TF-IDF DataFrame.
df_tfidf.count()

28153

In [34]:
# Number of same-language (target course, candidate course) pairs.
inner_join2.count()

54316

In [35]:
def _cosine_similarity(x, y):
    """Return the cosine similarity of two vectors as a Python float.

    Both arguments must expose ``dot()`` and ``norm()``.
    When either vector has zero L2 norm the similarity is undefined and NaN is
    returned explicitly, rather than relying on a floating-point 0/0 division
    (which yields the same NaN but raises a numpy RuntimeWarning).
    """
    x_norm = x.norm(2)
    y_norm = y.norm(2)
    if x_norm == 0 or y_norm == 0:
        return float('nan')
    # Same division order as before so the floating-point result is unchanged.
    return float(x.dot(y) / x_norm / y_norm)


# Spark UDF wrapper: takes two vector columns, returns a double column.
cosine = F.udf(_cosine_similarity, DoubleType())

In [49]:
from pyspark.sql.functions import col

In [36]:
# A course must not be recommended to itself: drop the self-pairs.
is_other_course = inner_join2.id_test != inner_join2.id
df_full = inner_join2.where(is_other_course)
df_full.count()

54310

In [37]:
# Show the schema of the candidate pairs after removing self-pairs.
df_full

DataFrame[id_test: bigint, lang_test: string, name_test: string, features_test: vector, id: bigint, lang: string, name: string, words: array<string>, tf_features: vector, features: vector]

In [38]:
# Score every (target, candidate) pair with the cosine similarity of their TF-IDF vectors.
similarity_expr = cosine(F.col('features_test'), F.col('features'))
df_full_d = df_full.withColumn('cos', similarity_expr)

In [39]:
# Inspect one scored pair.
df_full_d.show(1,False,True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [41]:
# The vectors and word lists are no longer needed once the score is computed.
scored_unused_cols = ["name_test", "features_test", "words", "tf_features", "features"]
df_full_d_cleaned = df_full_d.drop(*scored_unused_cols)
df_full_d_cleaned.show(n=10, truncate=False, vertical=True)

-RECORD 0---------------------------------------------------------------------------
 id_test   | 21617                                                                  
 lang_test | en                                                                     
 id        | 4                                                                      
 lang      | en                                                                     
 name      | Accounting Cycle: The Foundation of Business Measurement and Reporting 
 cos       | 0.07767828390839164                                                    
-RECORD 1---------------------------------------------------------------------------
 id_test   | 23126                                                                  
 lang_test | en                                                                     
 id        | 4                                                                      
 lang      | en                                                  

In [42]:
# Drop pairs whose cosine similarity is undefined (NaN).
# F.isnan is used instead of comparing against the bare `NaN` name: that name is
# only present because %pylab injected it into the namespace, and the comparison
# relied on Spark treating NaN as equal to NaN. Null scores are excluded as well,
# matching what the previous `!=` comparison did.
cos_col = F.col('cos')
df_full_d_cleaned_nan = df_full_d_cleaned.filter(cos_col.isNotNull() & ~F.isnan(cos_col))
df_full_d_cleaned_nan.count()

54256

In [43]:
from pyspark.sql.functions import desc,row_number,asc
from pyspark.sql import Window

# Rank the candidates of every target course: highest similarity first, ties broken
# by course name in ascending order.
# NOTE(review): candidates tying on both cos and name get an arbitrary relative
# order — consider adding a further unique sort key if that matters.
w = (
    Window
    .partitionBy('id_test')
    .orderBy(F.col('cos').desc(), F.col('name').asc())
)

df_full_rn = df_full_d_cleaned_nan.withColumn("row_number", F.row_number().over(w))

# Keep the ten best-ranked candidates per target course.
df_full_10 = df_full_rn.where(F.col("row_number") <= 10)
df_full_10.count()

60

In [44]:
# Show all top-ranked candidates (up to 60 rows) in full.
df_full_10.show(60,False,True)

-RECORD 0-------------------------------------------------------------------------------------------
 id_test    | 23126                                                                                 
 lang_test  | en                                                                                    
 id         | 14760                                                                                 
 lang       | en                                                                                    
 name       | Foundation 4: Incorporating Sass and Compass                                          
 cos        | 0.6256944403676344                                                                    
 row_number | 1                                                                                     
-RECORD 1-------------------------------------------------------------------------------------------
 id_test    | 23126                                                                        

In [45]:
# Collapse the ranked candidates into one ordered list of ids per target course.
# collect_list() on its own does not guarantee element order after the groupBy
# shuffle, so each id is collected together with its rank, the array is sorted
# (structs compare by their first field, row_number), and only then the ids are
# extracted from the sorted structs.
df_collect = (
    df_full_10
    .groupBy('id_test')
    .agg(
        F.sort_array(
            F.collect_list(F.struct('row_number', 'id'))
        ).alias('ranked')
    )
    .select('id_test', F.col('ranked.id').alias('recom_courses'))
)
df_collect.show(truncate=False)

+-------+----------------------------------------------------------------------+
|id_test|recom_courses                                                         |
+-------+----------------------------------------------------------------------+
|23126  |[14760, 13665, 13782, 20638, 24419, 15909, 2724, 25782, 17499, 13348] |
|16627  |[11431, 5687, 12247, 17961, 11575, 17964, 25010, 12660, 13021, 16694] |
|13702  |[864, 1052, 8082, 8313, 1216, 19613, 915, 17017, 7173, 21017]         |
|16704  |[1247, 20288, 1273, 1236, 8203, 1365, 1233, 20645, 1164, 20105]       |
|11556  |[16488, 13461, 23357, 468, 7833, 19330, 9289, 10447, 22710, 387]      |
|21617  |[21609, 21608, 21616, 21492, 21624, 21508, 21623, 21700, 21676, 21703]|
+-------+----------------------------------------------------------------------+



In [46]:
# Order the result by target-course id and expose it as an RDD of Rows.
df_sort = df_collect.orderBy('id_test')
rdd_fin = df_sort.rdd

# Preview the six result rows.
rdd_fin.take(6)

[Row(id_test=11556, recom_courses=[16488, 13461, 23357, 468, 7833, 19330, 9289, 10447, 22710, 387]),
 Row(id_test=13702, recom_courses=[864, 1052, 8082, 8313, 1216, 19613, 915, 17017, 7173, 21017]),
 Row(id_test=16627, recom_courses=[11431, 5687, 12247, 17961, 11575, 17964, 25010, 12660, 13021, 16694]),
 Row(id_test=16704, recom_courses=[1247, 20288, 1273, 1236, 8203, 1365, 1233, 20645, 1164, 20105]),
 Row(id_test=21617, recom_courses=[21609, 21608, 21616, 21492, 21624, 21508, 21623, 21700, 21676, 21703]),
 Row(id_test=23126, recom_courses=[14760, 13665, 13782, 20638, 24419, 15909, 2724, 25782, 17499, 13348])]

In [47]:
# Bring the result to the driver as {target course id: [recommended course ids]}.
# Each collected Row unpacks positionally into (id_test, recom_courses).
result_dict = {
    course_id: recommended_ids
    for course_id, recommended_ids in rdd_fin.collect()
}
result_dict

{11556: [16488, 13461, 23357, 468, 7833, 19330, 9289, 10447, 22710, 387],
 13702: [864, 1052, 8082, 8313, 1216, 19613, 915, 17017, 7173, 21017],
 16627: [11431, 5687, 12247, 17961, 11575, 17964, 25010, 12660, 13021, 16694],
 16704: [1247, 20288, 1273, 1236, 8203, 1365, 1233, 20645, 1164, 20105],
 21617: [21609, 21608, 21616, 21492, 21624, 21508, 21623, 21700, 21676, 21703],
 23126: [14760, 13665, 13782, 20638, 24419, 15909, 2724, 25782, 17499, 13348]}

In [49]:
import json
# Persist the recommendations as JSON in the working directory.
serialized_result = json.dumps(result_dict)
with open('lab02.json', 'w') as fp:
    fp.write(serialized_result)

In [50]:
# Stop the SparkContext now that the results have been written.
sc.stop()