In [1]:
import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 2 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))
exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
!hdfs dfs -ls /labs/slaba02/

Found 1 items
-rw-r--r--   3 hdfs hdfs   69519728 2021-02-27 21:58 /labs/slaba02/DO_record_per_line.json


**Load Data**

In [49]:
data = spark.read.json('/labs/slaba02/DO_record_per_line.json')
print('Data size: {}'. format(data.count()))

Data size: 28153


In [50]:
data.show(5)

+--------------------+--------------------+---+----+--------------------+--------------+
|                 cat|                desc| id|lang|                name|      provider|
+--------------------+--------------------+---+----+--------------------+--------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|
|              11/law|This online cours...|  5|  en|American Counter ...|Canvas Network|
|5/computer_scienc...|This course is ta...|  6|  fr|Arithmétique: en ...|Canvas Network|
|  14/social_sciences|We live in a digi...|  7|  en|Becoming a Dynami...|Canvas Network|
|2/biology_life_sc...|This self-paced c...|  8|  en|           Bioethics|Canvas Network|
+--------------------+--------------------+---+----+--------------------+--------------+
only showing top 5 rows



**TF-IDF**

In [51]:
from pyspark.ml.feature import Tokenizer, HashingTF, IDF, StopWordsRemover
tokenizer = Tokenizer(inputCol="desc", outputCol="words")
data = tokenizer.transform(data)

In [52]:
data.show(5)

+--------------------+--------------------+---+----+--------------------+--------------+--------------------+
|                 cat|                desc| id|lang|                name|      provider|               words|
+--------------------+--------------------+---+----+--------------------+--------------+--------------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|[this, course, in...|
|              11/law|This online cours...|  5|  en|American Counter ...|Canvas Network|[this, online, co...|
|5/computer_scienc...|This course is ta...|  6|  fr|Arithmétique: en ...|Canvas Network|[this, course, is...|
|  14/social_sciences|We live in a digi...|  7|  en|Becoming a Dynami...|Canvas Network|[we, live, in, a,...|
|2/biology_life_sc...|This self-paced c...|  8|  en|           Bioethics|Canvas Network|[this, self-paced...|
+--------------------+--------------------+---+----+--------------------+--------------+--------------------+
only showi

Remove Stopwords

In [53]:
remover = StopWordsRemover(inputCol='words', outputCol='words_clean')
data_removed = remover.transform(data)

In [54]:
ht = HashingTF(inputCol="words_clean", outputCol="features", numFeatures=10000)
data = ht.transform(data_removed)

IDF transformation

In [57]:
data.cache()
idf = IDF(inputCol='features', outputCol='idf_features').fit(data)
tfidf = idf.transform(data)

Normalize Vectors

In [59]:
from pyspark.ml.feature import Normalizer
normalizer = Normalizer(inputCol="idf_features", outputCol="normFeatures", p=1.0)
normalized_data = normalizer.transform(tfidf)

In [60]:
normalized_data.show(5)

+--------------------+--------------------+---+----+--------------------+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                 cat|                desc| id|lang|                name|      provider|               words|         words_clean|            features|        idf_features|        normFeatures|
+--------------------+--------------------+---+----+--------------------+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|[this, course, in...|[course, introduc...|(10000,[36,42,63,...|(10000,[36,42,63,...|(10000,[36,42,63,...|
|              11/law|This online cours...|  5|  en|American Counter ...|Canvas Network|[this, online, co...|[online, course, ...|(10000,[32,222,29...|(10000,[32,222,29...|(10000,[32,222,29...|
|5/computer_scienc...|This cou

In [61]:
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

**Filter by courses to make recommendations**

In [64]:
print('Data size: {}'. format(normalized_data.count()))
selected_courses = normalized_data.where("id in (23126, 21617, 16627, 11556, 16704, 13702)")
print('Data size: {}'. format(selected_courses.count()))
other_courses = normalized_data.where("id not in (23126, 21617, 16627, 11556, 16704, 13702)")
print('Data size: {}'. format(other_courses.count()))

Data size: 28153
Data size: 6
Data size: 28147


Языки курсов

- es
- ru
- en

In [66]:
other_courses_es = other_courses.filter(other_courses.lang == 'es')
other_courses_ru = other_courses.filter(other_courses.lang == 'ru')
other_courses_en = other_courses.filter(other_courses.lang == 'en')
print('es: {}'.format(other_courses_es.count()))
print('ru: {}'.format(other_courses_ru.count()))
print('en: {}'.format(other_courses_en.count()))

es: 1372
ru: 1229
en: 24551


- 23126 - en
- 21617 - en
- 16627 - es
- 11556 - es
- 16704 - ru
- 13702 - ru

Add sparse vectors to list

In [67]:
sel_id_lst = selected_courses.select('id').rdd.map(lambda x: x[0]).collect()
sel_id_features = selected_courses.select('normFeatures').rdd.map(lambda x: x[0]).collect()

In [68]:
en_other_id_lst = other_courses_en.select('id').rdd.map(lambda x: x[0]).collect()
en_other_id_features = other_courses_en.select('normFeatures').rdd.map(lambda x: x[0]).collect()

es_other_id_lst = other_courses_es.select('id').rdd.map(lambda x: x[0]).collect()
es_other_id_features = other_courses_es.select('normFeatures').rdd.map(lambda x: x[0]).collect()

ru_other_id_lst = other_courses_ru.select('id').rdd.map(lambda x: x[0]).collect()
ru_other_id_features = other_courses_ru.select('normFeatures').rdd.map(lambda x: x[0]).collect()

In [156]:
def sim_cos(v1,v2):
    try:
        p = 2
        return float(v1.dot(v2))/float(v1.norm(p)*v2.norm(p))
    except:
        return 0

In [159]:
from scipy.spatial import distance
def related_courses(features_lang, id_lang, selected_id_ftr, selected_id):
    
    distances = []
    for i in features_lang:
        distances.append(sim_cos(selected_id_ftr, i))

    dict_cosine = dict(zip(id_lang, distances))
    val_list = list(dict_cosine.values())
    key_list = list(dict_cosine.keys())
    lst = sorted(dict_cosine.values(), reverse=True)[:10]
    
    course_recs = []
    for i in lst:
        position = val_list.index(i)
        course_recs.append(key_list[position])
    return selected_id, course_recs

**English courses**

In [69]:
sel_id_lst[-1]

23126

In [70]:
sel_id_lst[-2]

21617

In [160]:
id_23126, recs_23126 = related_courses(en_other_id_features, en_other_id_lst, sel_id_features[-1], sel_id_lst[-1])
print(id_23126, recs_23126)

23126 [13782, 13665, 24419, 20638, 2724, 25782, 2633, 2723, 13348, 15909]


In [161]:
id_21617, recs_21617 = related_courses(en_other_id_features, en_other_id_lst, sel_id_features[-2], sel_id_lst[-2])

**Spanish courses**

In [163]:
sel_id_lst[0]

11556

In [164]:
sel_id_lst[2]

16627

In [165]:
id_11556, recs_11556 = related_courses(es_other_id_features, es_other_id_lst, sel_id_features[0], sel_id_lst[0])

In [166]:
id_16627, recs_16627 = related_courses(es_other_id_features, es_other_id_lst, sel_id_features[2], sel_id_lst[2])

**Russian courses**

In [167]:
sel_id_lst[3]

16704

In [168]:
sel_id_lst[1]

13702

In [169]:
id_16704, recs_16704 = related_courses(ru_other_id_features, ru_other_id_lst, sel_id_features[3], sel_id_lst[3])

In [170]:
id_13702, recs_13702 = related_courses(ru_other_id_features, ru_other_id_lst, sel_id_features[1], sel_id_lst[1])

**Make json**

In [171]:
json_file = {str(id_23126): recs_23126,
            str(id_21617): recs_21617,
            str(id_16627): recs_16627,
            str(id_11556): recs_11556,
            str(id_16704): recs_16704, 
            str(id_13702): recs_13702
}

In [172]:
json_file

{'23126': [13782, 13665, 24419, 20638, 2724, 25782, 2633, 2723, 13348, 15909],
 '21617': [21609,
  21608,
  21616,
  21492,
  21624,
  21623,
  21630,
  21628,
  21508,
  21703],
 '16627': [11431, 12247, 13021, 25010, 11575, 5687, 5372, 12863, 9598, 22680],
 '11556': [16488, 13461, 468, 23357, 19330, 16929, 387, 10447, 11554, 9289],
 '16704': [1365, 20645, 1426, 1426, 8217, 1236, 1164, 1219, 8123, 875],
 '13702': [864, 1216, 7173, 1052, 8313, 17017, 19613, 21017, 17015, 8082]}

In [173]:
import json

with open('lab02.json', 'w') as fp:
    json.dump(json_file, fp)

**Put into server**

In [174]:
! pwd

/data/home/dmitry.ulogov


In [175]:
! hdfs dfs -rm lab02.json

21/03/05 15:31:57 INFO fs.TrashPolicyDefault: Moved: 'hdfs://spark-de-master-1.newprolab.com:8020/user/dmitry.ulogov/lab02.json' to trash at: hdfs://spark-de-master-1.newprolab.com:8020/user/dmitry.ulogov/.Trash/Current/user/dmitry.ulogov/lab02.json


In [176]:
! hdfs dfs -ls /user/dmitry.ulogov

Found 3 items
drwx------   - dmitry.ulogov dmitry.ulogov          0 2021-03-05 15:31 /user/dmitry.ulogov/.Trash
drwxr-xr-x   - dmitry.ulogov dmitry.ulogov          0 2021-03-05 13:36 /user/dmitry.ulogov/.sparkStaging
-rw-r--r--   3 dmitry.ulogov dmitry.ulogov         84 2021-02-26 21:44 /user/dmitry.ulogov/lab01.json


In [177]:
! hdfs dfs -put lab02.json /user/dmitry.ulogov

In [178]:
! hdfs dfs -ls /user/dmitry.ulogov

Found 4 items
drwx------   - dmitry.ulogov dmitry.ulogov          0 2021-03-05 15:31 /user/dmitry.ulogov/.Trash
drwxr-xr-x   - dmitry.ulogov dmitry.ulogov          0 2021-03-05 13:36 /user/dmitry.ulogov/.sparkStaging
-rw-r--r--   3 dmitry.ulogov dmitry.ulogov         84 2021-02-26 21:44 /user/dmitry.ulogov/lab01.json
-rw-r--r--   3 dmitry.ulogov dmitry.ulogov        458 2021-03-05 15:32 /user/dmitry.ulogov/lab02.json


In [53]:
sc.stop()

In [80]:
! hdfs dfs -ls #/share/submission-files/slaba02/

Found 3 items
drwxr-xr-x   - dmitry.ulogov dmitry.ulogov          0 2021-03-04 22:22 .sparkStaging
-rw-r--r--   3 dmitry.ulogov dmitry.ulogov         84 2021-02-26 21:44 lab01.json
-rw-r--r--   3 dmitry.ulogov dmitry.ulogov        419 2021-03-04 22:52 lab02.json
