In [1]:
import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 3 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))
exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())

from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark import Row
import json

conf = SparkConf()

spark = (SparkSession
         .builder
         .config(conf=conf)
         .appName("korneev")
         .getOrCreate())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [1]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer, HashingTF, IDF
from pyspark.ml.linalg import DenseVector, Vectors, SparseVector, VectorUDT
from pyspark.sql.types import DoubleType,StringType

def text_regexp_filter(string):
    regex = re.compile(u'[\w\d]{2,}', re.U)
    return " ".join(regex.findall(string.lower()))
udf_text_regexp_filter = f.udf(lambda x: text_regexp_filter(x), StringType())

In [3]:
dataset = spark.read.json("/labs/slaba02/DO_record_per_line.json")

In [4]:
dataset.show(5)

+--------------------+--------------------+---+----+--------------------+--------------+
|                 cat|                desc| id|lang|                name|      provider|
+--------------------+--------------------+---+----+--------------------+--------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|
|              11/law|This online cours...|  5|  en|American Counter ...|Canvas Network|
|5/computer_scienc...|This course is ta...|  6|  fr|Arithmétique: en ...|Canvas Network|
|  14/social_sciences|We live in a digi...|  7|  en|Becoming a Dynami...|Canvas Network|
|2/biology_life_sc...|This self-paced c...|  8|  en|           Bioethics|Canvas Network|
+--------------------+--------------------+---+----+--------------------+--------------+
only showing top 5 rows



In [8]:
dataset.count()

28153

In [6]:
dataset.rdd.getNumPartitions()

2

In [13]:
dataset.show(2, vertical=True, truncate=False)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [14]:
import pyspark.sql.functions as f

In [15]:
dataset.groupBy("lang").count().orderBy(f.desc("count")).collect()

[Row(lang='en', count=24553),
 Row(lang='es', count=1374),
 Row(lang='ru', count=1231),
 Row(lang='pt', count=187),
 Row(lang='zh', count=169),
 Row(lang='de', count=166),
 Row(lang='tr', count=120),
 Row(lang='fr', count=104),
 Row(lang='ja', count=77),
 Row(lang='it', count=62),
 Row(lang='ar', count=34),
 Row(lang='ur', count=11),
 Row(lang='he', count=8),
 Row(lang='nl', count=6),
 Row(lang='hi', count=6),
 Row(lang='ca', count=6),
 Row(lang='sk', count=5),
 Row(lang='el', count=5),
 Row(lang='hr', count=4),
 Row(lang='da', count=3),
 Row(lang='ko', count=3),
 Row(lang='nb', count=2),
 Row(lang='af', count=2),
 Row(lang='hu', count=2),
 Row(lang='bg', count=2),
 Row(lang='fi', count=2),
 Row(lang='vi', count=1),
 Row(lang='pl', count=1),
 Row(lang='fa', count=1),
 Row(lang='id', count=1),
 Row(lang='uz', count=1),
 Row(lang='sw', count=1),
 Row(lang='ms', count=1),
 Row(lang='sv', count=1),
 Row(lang='et', count=1)]

In [20]:
dataset.groupBy('provider').count().orderBy(f.desc('count')).collect()

[Row(provider='Udemy', count=16421),
 Row(provider='Lynda', count=3590),
 Row(provider='MIT OpenCourseWare', count=2354),
 Row(provider='Coursera', count=1048),
 Row(provider='Intuit', count=946),
 Row(provider='edX', count=788),
 Row(provider="O'Reilly", count=459),
 Row(provider='ed2go', count=399),
 Row(provider='Craftsy', count=395),
 Row(provider='Canvas Network', count=258),
 Row(provider='Harvard Extension School', count=242),
 Row(provider='OpenLearning', count=237),
 Row(provider='FutureLearn', count=151),
 Row(provider='NovoED (Venture-lab)', count=124),
 Row(provider='Iversity', count=102),
 Row(provider='Udacity', count=89),
 Row(provider='Universarium', count=69),
 Row(provider='Stanford', count=51),
 Row(provider='Open2Study', count=49),
 Row(provider='', count=44),
 Row(provider='Open Yale courses', count=42),
 Row(provider='Code School', count=42),
 Row(provider='Postnauka', count=26),
 Row(provider='МФТИ', count=26),
 Row(provider='xuetangX', count=23),
 Row(provider='

In [25]:
import pandas as pd 
pd.set_option('max_rows', 40)
pd.DataFrame(dataset.groupBy("cat").count().orderBy(f.desc("count")).collect())

Unnamed: 0,0,1
0,,11460
1,3/business_management,4472
2,1/arts_music_film,3437
3,5/computer_science,2401
4,14/social_sciences,1057
5,9/humanities,1050
6,8/engineering_technology,627
7,15/mathematics_statistics_and_data_analysis,458
8,17/diy,395
9,2/biology_life_sciences,249


In [28]:
#мои задания
my_ids = [[23126, u'en', u'Compass - powerful SASS library that makes your life easier'], 
[21617, u'en', u'Preparing for the AP* Computer Science A Exam \u2014 Part 2'], 
[16627, u'es', u'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'], 
[11556, u'es', u'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'], 
[16704, u'ru', u'\u041f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u0435 \u043d\u0430 Lazarus'], 
[13702, u'ru', u'\u041c\u0430\u0442\u0435\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u0430']
]
data_list_jsons = []
for ids in my_ids:
    print(ids)
    data_json = {}
    data_json['id'] = ids[0]
    data_json['lang'] = ids[1]
    data_json['name'] = ids[2]
    data_list_jsons.append(data_json)
data_list_jsons    

[23126, 'en', 'Compass - powerful SASS library that makes your life easier']
[21617, 'en', 'Preparing for the AP* Computer Science A Exam — Part 2']
[16627, 'es', 'Aprende Excel: Nivel Intermedio by Alfonso Rinsche']
[11556, 'es', 'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo']
[16704, 'ru', 'Программирование на Lazarus']
[13702, 'ru', 'Математическая экономика']


[{'id': 23126,
  'lang': 'en',
  'name': 'Compass - powerful SASS library that makes your life easier'},
 {'id': 21617,
  'lang': 'en',
  'name': 'Preparing for the AP* Computer Science A Exam — Part 2'},
 {'id': 16627,
  'lang': 'es',
  'name': 'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'},
 {'id': 11556,
  'lang': 'es',
  'name': 'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'},
 {'id': 16704, 'lang': 'ru', 'name': 'Программирование на Lazarus'},
 {'id': 13702, 'lang': 'ru', 'name': 'Математическая экономика'}]

In [29]:
data_list_jsons

[{'id': 23126,
  'lang': 'en',
  'name': 'Compass - powerful SASS library that makes your life easier'},
 {'id': 21617,
  'lang': 'en',
  'name': 'Preparing for the AP* Computer Science A Exam — Part 2'},
 {'id': 16627,
  'lang': 'es',
  'name': 'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'},
 {'id': 11556,
  'lang': 'es',
  'name': 'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'},
 {'id': 16704, 'lang': 'ru', 'name': 'Программирование на Lazarus'},
 {'id': 13702, 'lang': 'ru', 'name': 'Математическая экономика'}]

In [30]:
dataset_my = spark.createDataFrame(data_list_jsons)
dataset_my.show(5)



+-----+----+--------------------+
|   id|lang|                name|
+-----+----+--------------------+
|23126|  en|Compass - powerfu...|
|21617|  en|Preparing for the...|
|16627|  es|Aprende Excel: Ni...|
|11556|  es|Aprendizaje Colab...|
|16704|  ru|Программирование ...|
+-----+----+--------------------+
only showing top 5 rows



In [34]:
tokenizer = Tokenizer(inputCol="desc", outputCol="words")

In [56]:
# строим токенизатор
dataset_lang = dataset
tokenizer = Tokenizer(inputCol="desc", outputCol="words")
dataset_lang = tokenizer.transform(dataset_lang)

#считаем tf
ht = HashingTF(inputCol="words", outputCol="features", numFeatures=10000)
dataset_lang = ht.transform(dataset_lang)

#считаем tfidf
idf = IDF(inputCol="features", outputCol="features_tfidf").fit(dataset_lang)
dataset_lang = idf.transform(dataset_lang)

In [62]:
dataset_lang.show(1,vertical=True, truncate=False)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [46]:
cosine_similarity = f.udf(lambda x,y: float(x.dot(y)/(x.norm(2)*y.norm(2))), DoubleType())

In [52]:
my_collect_dicts_res

[{'id': 23126,
  'lang': 'en',
  'name': 'Compass - powerful SASS library that makes your life easier',
  'result': [Row(id=13782, cos_sim=0.4615945735430712),
   Row(id=13665, cos_sim=0.4472198354497587),
   Row(id=24419, cos_sim=0.42547827554646006),
   Row(id=20638, cos_sim=0.4137012231575676),
   Row(id=2724, cos_sim=0.3740908853197787),
   Row(id=25782, cos_sim=0.29155071682291966),
   Row(id=2633, cos_sim=0.264277464308723),
   Row(id=2723, cos_sim=0.2623679510012487),
   Row(id=13348, cos_sim=0.25064274782442897),
   Row(id=15909, cos_sim=0.24258522561941123)]},
 {'id': 21617,
  'lang': 'en',
  'name': 'Preparing for the AP* Computer Science A Exam — Part 2',
  'result': [Row(id=21609, cos_sim=0.9905678275271237),
   Row(id=21608, cos_sim=0.4735811480927774),
   Row(id=21616, cos_sim=0.46984441601583776),
   Row(id=21492, cos_sim=0.3759522864918765),
   Row(id=21624, cos_sim=0.3281136723386965),
   Row(id=21623, cos_sim=0.32329240114955754),
   Row(id=21630, cos_sim=0.3230590459

In [63]:
# берем нужный курс 
my_id= 23126
my_lang = 'en'

In [70]:
need_row = dataset_lang.filter(f.col("id") == my_id).rdd.takeSample(False, 1)
need_row

[Row(cat='', desc=" Improve your SASS skill by learning benefits coming from Compass framework, which has many saving time useful utilities Compass is a framework (library) for SASS which means that you can find inside it lots of useful utilities that are gonna be shown to you inside this course. These utilities can save your time while you are developing your websites. By using Compass you can use created by others functions/mixins which you would need to most time write on your own. As always time = money. Save your time by learning Compass. Make your website load faster using Sprites within Compass. With Compass you gonna create Sprites almost instantly. If you do not know what are Sprites watch a free lesson about Sprites and you will see that this course is worth taking. After this course you will know how to: install compass integrate compass with sass use compass use and know what are sprites use Compass inside a great IDE create vendor prefixes using mixins and much more I give

In [72]:
row_dict = [r.asDict() for r in need_row]
row_dict

[{'cat': '',
  'desc': " Improve your SASS skill by learning benefits coming from Compass framework, which has many saving time useful utilities Compass is a framework (library) for SASS which means that you can find inside it lots of useful utilities that are gonna be shown to you inside this course. These utilities can save your time while you are developing your websites. By using Compass you can use created by others functions/mixins which you would need to most time write on your own. As always time = money. Save your time by learning Compass. Make your website load faster using Sprites within Compass. With Compass you gonna create Sprites almost instantly. If you do not know what are Sprites watch a free lesson about Sprites and you will see that this course is worth taking. After this course you will know how to: install compass integrate compass with sass use compass use and know what are sprites use Compass inside a great IDE create vendor prefixes using mixins and much more I

In [73]:
features_tfidf = row_dict[0]['features_tfidf']
features_tfidf

SparseVector(10000, {87: 3.0156, 91: 1.6515, 96: 5.4013, 113: 3.9897, 128: 1.3578, 246: 3.952, 258: 3.6401, 263: 15.7245, 341: 6.0067, 388: 3.4979, 419: 2.7345, 427: 3.2965, 461: 1.6504, 492: 6.0558, 524: 2.173, 572: 0.91, 631: 0.869, 721: 1.0208, 727: 2.3192, 814: 5.0469, 870: 2.8892, 937: 2.7274, 1022: 2.2394, 1036: 3.5834, 1072: 3.2641, 1073: 6.2608, 1169: 2.7516, 1173: 1.5834, 1187: 0.9071, 1197: 3.6134, 1216: 4.1701, 1218: 4.9322, 1259: 3.5684, 1263: 4.8332, 1272: 7.9896, 1299: 0.977, 1368: 1.9587, 1420: 2.0026, 1425: 11.3347, 1443: 3.2014, 1445: 1.5852, 1463: 17.1162, 1470: 35.7265, 1499: 3.5759, 1612: 2.4837, 1645: 3.795, 1652: 3.1841, 1770: 4.575, 1777: 2.2907, 1882: 2.7043, 1939: 4.647, 1959: 0.6698, 2077: 6.2685, 2080: 0.8248, 2082: 0.6966, 2150: 0.8063, 2159: 2.3305, 2213: 1.8763, 2217: 1.206, 2370: 3.227, 2412: 2.5774, 2460: 4.5028, 2495: 0.9839, 2609: 4.1273, 2656: 2.5189, 2689: 5.3038, 2691: 4.3346, 2884: 0.9863, 2971: 5.1517, 3038: 4.249, 3102: 2.9302, 3115: 2.7701, 3138

In [77]:
features_tfidf_course = f.udf(lambda: features_tfidf, VectorUDT())
tfidf_cos_sim = dataset_lang.withColumn("features_tfidf_course_{}".format(my_id), features_tfidf_course())
tfidf_cos_sim.show(1,truncate=False,vertical=True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [78]:
dataset_lang = tfidf_cos_sim.withColumn("cos_sim",
                                    cosine_similarity(tfidf_cos_sim["features_tfidf"],
                                                      tfidf_cos_sim["features_tfidf_course_{}".format(my_id)]))

In [84]:
dataset_lang.show(15)

+--------------------+--------------------+---+----+--------------------+--------------+--------------------+--------------------+--------------------+---------------------------+--------------------+
|                 cat|                desc| id|lang|                name|      provider|               words|            features|      features_tfidf|features_tfidf_course_23126|             cos_sim|
+--------------------+--------------------+---+----+--------------------+--------------+--------------------+--------------------+--------------------+---------------------------+--------------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|[this, course, in...|(10000,[36,42,63,...|(10000,[36,42,63,...|       (10000,[87,91,96,...| 0.03514739014479155|
|              11/law|This online cours...|  5|  en|American Counter ...|Canvas Network|[this, online, co...|(10000,[32,222,29...|(10000,[32,222,29...|       (10000,[87,91,96,...|0.021611102495411

In [85]:
dataset_lang.filter(f.col("id") != my_id)\
                          .filter(f.col("cos_sim") != "NaN")\
                          .sort(f.desc("cos_sim"), f.asc("name"),f.asc("id"))\
                          .select('id', 'cos_sim')\
                          .head(10)

[Row(id=13782, cos_sim=0.4615945735430712),
 Row(id=13665, cos_sim=0.4472198354497587),
 Row(id=24419, cos_sim=0.42547827554646006),
 Row(id=20638, cos_sim=0.4137012231575676),
 Row(id=2724, cos_sim=0.3740908853197787),
 Row(id=25782, cos_sim=0.29155071682291966),
 Row(id=2633, cos_sim=0.264277464308723),
 Row(id=2723, cos_sim=0.2623679510012487),
 Row(id=13348, cos_sim=0.25064274782442897),
 Row(id=15909, cos_sim=0.24258522561941123)]

In [79]:
result_list = dataset_lang.filter(f.col("id") != my_id)\
                          .filter(f.col("cos_sim") != "NaN")\
                          .sort(f.desc("cos_sim"), f.asc("name"),f.asc("id"))\
                          .select('id', 'cos_sim')\
                          .head(10)
result_list

[Row(id=13782, cos_sim=0.4615945735430712),
 Row(id=13665, cos_sim=0.4472198354497587),
 Row(id=24419, cos_sim=0.42547827554646006),
 Row(id=20638, cos_sim=0.4137012231575676),
 Row(id=2724, cos_sim=0.3740908853197787),
 Row(id=25782, cos_sim=0.29155071682291966),
 Row(id=2633, cos_sim=0.264277464308723),
 Row(id=2723, cos_sim=0.2623679510012487),
 Row(id=13348, cos_sim=0.25064274782442897),
 Row(id=15909, cos_sim=0.24258522561941123)]

In [48]:
# считаем для всех курсов
my_collect_dicts_res = list() 
for case in data_list_jsons:
    #отбираем курс
    my_id = case['id']
    my_lang = case['lang']
    #отбираем курс и размножаем его tfidf на каждую строку spark-df
    need_row = dataset_lang.filter(f.col("id") == my_id)\
                       .rdd.takeSample(False, 1)
    row_dict = [r.asDict() for r in need_row]
    features_tfidf = row_dict[0]['features_tfidf']
    features_tfidf_course = f.udf(lambda: features_tfidf, VectorUDT())
    tfidf_cos_sim = dataset_lang.withColumn("features_tfidf_course_{}".format(my_id), features_tfidf_course())

    #считаем cos_sim
    dataset_lang = tfidf_cos_sim.withColumn("cos_sim",
                                        cosine_similarity(tfidf_cos_sim["features_tfidf"],
                                                          tfidf_cos_sim["features_tfidf_course_{}".format(my_id)]))
    #зачищаем результирующий df и сортируемся по указанным полям
    #по метрике (убывание) => по названию (лексикографически по возрастанию) => по возрастанию id.
    result_list = dataset_lang.filter(f.col("id") != my_id)\
                              .filter(f.col("cos_sim") != "NaN")\
                              .sort(f.desc("cos_sim"), f.asc("name"),f.asc("id"))\
                              .select('id', 'cos_sim')\
                              .head(10)
    case["result"] = result_list
    my_collect_dicts_res.append(case)

In [49]:
my_collect_dicts_res

[{'id': 23126,
  'lang': 'en',
  'name': 'Compass - powerful SASS library that makes your life easier',
  'result': [Row(id=13782, cos_sim=0.4615945735430712),
   Row(id=13665, cos_sim=0.4472198354497587),
   Row(id=24419, cos_sim=0.42547827554646006),
   Row(id=20638, cos_sim=0.4137012231575676),
   Row(id=2724, cos_sim=0.3740908853197787),
   Row(id=25782, cos_sim=0.29155071682291966),
   Row(id=2633, cos_sim=0.264277464308723),
   Row(id=2723, cos_sim=0.2623679510012487),
   Row(id=13348, cos_sim=0.25064274782442897),
   Row(id=15909, cos_sim=0.24258522561941123)]},
 {'id': 21617,
  'lang': 'en',
  'name': 'Preparing for the AP* Computer Science A Exam — Part 2',
  'result': [Row(id=21609, cos_sim=0.9905678275271237),
   Row(id=21608, cos_sim=0.4735811480927774),
   Row(id=21616, cos_sim=0.46984441601583776),
   Row(id=21492, cos_sim=0.3759522864918765),
   Row(id=21624, cos_sim=0.3281136723386965),
   Row(id=21623, cos_sim=0.32329240114955754),
   Row(id=21630, cos_sim=0.3230590459

In [51]:
# export 
data = {str(sample["id"]):[x.asDict()['id'] for x in sample["result"]] for sample in my_collect_dicts_res}
with open("lab02.json",'w') as file:
    json.dump(data, file)