In [121]:
import os
import sys
# Environment that PySpark reads at start-up; must be set before `pyspark`
# is imported in the next cell.
os.environ.update({
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
    "PYSPARK_SUBMIT_ARGS": '--num-executors 3 pyspark-shell',
})

spark_home = os.environ.get('SPARK_HOME', None)

# Put Spark's bundled Python sources and the py4j bridge at the front of the
# import path. The py4j archive is inserted last, so it ends up first.
for _relative_path in ('python', 'python/lib/py4j-0.10.7-src.zip'):
    sys.path.insert(0, os.path.join(spark_home, _relative_path))

In [122]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, Normalizer
from pyspark.sql.window import Window
import pyspark.sql.functions as F
from pyspark.sql.types import DoubleType
import json

In [123]:
# Build the Spark configuration; the application name is set both on the
# conf object and on the builder, exactly as before.
conf = SparkConf()
conf.setAll([("spark.app.name", "evgeniy osipchuk lab02")])

# Reuse an already running session if there is one, otherwise start a new one.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .appName("evgeniy osipchuk lab02")
    .getOrCreate()
)

### Считывание данных

Для более красивого вывода переопределим метод `show` у DataFrame так, чтобы он возвращал первые `n` строк (по умолчанию 5) в виде pandas DataFrame. Pandas используется только для вывода на экран.

In [124]:
import pyspark

def show(self, n=5):
    """Return the first ``n`` rows of this DataFrame converted to pandas.

    Intended as a notebook-friendly replacement for ``DataFrame.show``:
    the result is returned (so the notebook renders it) rather than printed.

    Args:
        self: the DataFrame being displayed.
        n: maximum number of rows to fetch; defaults to 5.

    Returns:
        Whatever ``toPandas()`` yields for the truncated frame.
    """
    preview = self.limit(n)
    return preview.toPandas()

# Monkey-patch: replace DataFrame.show with the helper defined above, so every
# `.show(...)` call later in this notebook returns a pandas frame.
pyspark.sql.dataframe.DataFrame.show = show

In [125]:
# Ids of the courses for which similar courses are looked up below.
courses_id = [
    23126,
    21617,
    16627,
    11556,
    16704,
    13702,
]

In [126]:
# Load the course catalogue: one JSON record per line.
# NOTE(review): "sep" is normally a CSV reader option and presumably has no
# effect on the JSON source -- confirm before removing it.
df = (
    spark.read
    .format("json")
    .option("sep", ",")
    .load("/labs/slaba02/DO_record_per_line.json")
)

In [127]:
# Bare expression: the notebook renders the DataFrame repr (column names and types).
df

DataFrame[cat: string, desc: string, id: bigint, lang: string, name: string, provider: string]

In [128]:
# Preview the first rows through the patched `show` (returns a pandas frame).
df.show(5)

Unnamed: 0,cat,desc,id,lang,name,provider
0,3/business_management|6/economics_finance,This course introduces the basic financial sta...,4,en,Accounting Cycle: The Foundation of Business M...,Canvas Network
1,11/law,This online course will introduce you to Ameri...,5,en,American Counter Terrorism Law,Canvas Network
2,5/computer_science|15/mathematics_statistics_a...,This course is taught in French Vous voulez co...,6,fr,Arithmétique: en route pour la cryptographie,Canvas Network
3,14/social_sciences,We live in a digitally connected world. The wa...,7,en,Becoming a Dynamic Educator,Canvas Network
4,2/biology_life_sciences,This self-paced course is designed to show tha...,8,en,Bioethics,Canvas Network


In [129]:
# Sanity check: display the catalogue rows for the target courses.
df.where(F.col("id").isin(courses_id)).show(10)

Unnamed: 0,cat,desc,id,lang,name,provider
0,5/computer_science,An introduction to how computing can be used t...,21617,en,Preparing for the AP* Computer Science A Exam ...,edX
1,,Improve your SASS skill by learning benefits ...,23126,en,Compass - powerful SASS library that makes you...,Udemy
2,,La transformación del aula con el Aprendizaje...,11556,es,Aprendizaje Colaborativo by UNID Universidad I...,Udemy
3,6/economics_finance|15/mathematics_statistics_...,Математическая экономика – это набор моделей в...,13702,ru,Математическая экономика,Intuit
4,,"Hazte más empleable, obtén una nueva competen...",16627,es,Aprende Excel: Nivel Intermedio by Alfonso Rin...,Udemy
5,5/computer_science|14/social_sciences,В курсе рассматривается среда программирования...,16704,ru,Программирование на Lazarus,Intuit


### Токенизация описаний

In [130]:
# Tokenizer that reads the "desc" column and writes the token list to "desc_tokenized".
# NOTE(review): the preview output below shows an empty first token and
# punctuation attached to words ("follow,"); a regex-based tokenizer might give
# cleaner features, but it would change the results -- confirm before switching.
tokenizer = Tokenizer(inputCol="desc", outputCol="desc_tokenized")

In [131]:
# Add the "desc_tokenized" column to the full catalogue.
tokenized_df = tokenizer.transform(df)

In [132]:
# Preview five rows. The rows shown here differ from the earlier df preview,
# so which rows `limit` returns should not be relied on.
tokenized_df.show()

Unnamed: 0,cat,desc,id,lang,name,provider,desc_tokenized
0,3/business_management,It's not luck and it's not by accident. Makin...,9852,en,How to Make Easy Money Online: Get Started Now,Udemy,"[, it's, not, luck, and, it's, not, by, accide..."
1,1/arts_music_film,"An Easy To Follow, Step-By-Step Guide to Mast...",9853,en,How To Get Started with WordPress by Ten Ton O...,Udemy,"[, an, easy, to, follow,, step-by-step, guide,..."
2,,You have been doing exceptional in everything...,9854,en,How to Get That Raise by Laura Rose,Udemy,"[, you, have, been, doing, exceptional, in, ev..."
3,,Learn the skills you need to get control of y...,9855,en,Escape Email Overwhelm! In Under 1 Hour by Ros...,Udemy,"[, learn, the, skills, you, need, to, get, con..."
4,3/business_management,A Proven Blueprint To Help You Get Paid For Y...,9856,en,How to Get New Customers: Get Your First Client,Udemy,"[, a, proven, blueprint, to, help, you, get, p..."


### TF-IDF

In [133]:
# Hash every token list into a fixed-size term-frequency vector (10000 buckets).
hashingTF = HashingTF(
    numFeatures=10000,
    inputCol="desc_tokenized",
    outputCol="tf",
)
tf_idf = hashingTF.transform(tokenized_df)

In [134]:
# Fit inverse-document-frequency weights on the term-frequency vectors.
# `idf` ends up bound to the fitted model, as in the original cell.
idf = IDF(inputCol="tf", outputCol="features").fit(tf_idf)

In [135]:
# Apply the IDF weights, then rescale each vector with Normalizer's default
# settings (presumably unit L2 norm -- see the Spark ML docs) so that a plain
# dot product of two "norm_features" vectors can serve as their cosine similarity.
normalizer = Normalizer(inputCol="features", outputCol="norm_features")
df_idf = normalizer.transform(idf.transform(tf_idf))

In [136]:
# Preview the vector columns. The output below shows an Arrow fallback warning
# because vector columns cannot be converted through Arrow.
df_idf.show()

  Unsupported type in conversion to Arrow: VectorUDT
Attempting non-optimization as 'spark.sql.execution.arrow.fallback.enabled' is set to true.


Unnamed: 0,cat,desc,id,lang,name,provider,desc_tokenized,tf,features,norm_features
0,3/business_management|6/economics_finance,This course introduces the basic financial sta...,4,en,Accounting Cycle: The Foundation of Business M...,Canvas Network,"[this, course, introduces, the, basic, financi...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,11/law,This online course will introduce you to Ameri...,5,en,American Counter Terrorism Law,Canvas Network,"[this, online, course, will, introduce, you, t...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,5/computer_science|15/mathematics_statistics_a...,This course is taught in French Vous voulez co...,6,fr,Arithmétique: en route pour la cryptographie,Canvas Network,"[this, course, is, taught, in, french, vous, v...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,14/social_sciences,We live in a digitally connected world. The wa...,7,en,Becoming a Dynamic Educator,Canvas Network,"[we, live, in, a, digitally, connected, world....","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,2/biology_life_sciences,This self-paced course is designed to show tha...,8,en,Bioethics,Canvas Network,"[this, self-paced, course, is, designed, to, s...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


### Нахождение косинусной близости (скалярное произведение нормированных TF-IDF-векторов)

In [137]:
def _dot_product(a, b):
    """Return the dot product of two vectors as a plain Python float."""
    return float(a.dot(b))


# Column-level wrapper so the dot product can be used inside `select`.
scalar_product_udf = F.udf(_dot_product, DoubleType())

In [138]:
# Maps each target course id to the list of ids of its most similar courses.
answer = dict()

In [162]:
# For every target course, rank all other courses by the dot product of their
# normalised TF-IDF vectors and keep the ids of the 10 best matches.
for course_id in courses_id:
    # Single-column frame holding the target course's normalised vector.
    course_features = (
        df_idf
        .filter(F.col('id') == course_id)
        .select(F.col('norm_features').alias('course_features'))
    )

    top_rows = (
        df_idf
        .filter(F.col('id') != course_id)  # a course must not recommend itself
        .select('name', 'id', 'norm_features')
        .crossJoin(course_features)
        .select(
            scalar_product_udf('norm_features', 'course_features').alias('scalar_product'),
            'id',
            'name',
        )
        # Highest similarity first; name and id break ties deterministically.
        .sort(F.desc('scalar_product'), F.asc('name'), F.asc('id'))
        # Limit inside Spark so only 10 rows reach the driver, instead of
        # collecting every sorted id and slicing in Python.
        .limit(10)
        .select('id')
        .collect()
    )

    result = [row['id'] for row in top_rows]

    answer[course_id] = result

In [163]:
answer

{23126: [13782, 13665, 24419, 20638, 2724, 25782, 2633, 2723, 13348, 15909],
 21617: [21609, 21608, 21616, 21492, 21624, 21623, 21630, 21628, 21508, 21857],
 16627: [11431, 12247, 13021, 25010, 11575, 5687, 9598, 5372, 12863, 16769],
 11556: [16488, 13461, 468, 10447, 387, 22710, 9289, 5936, 23357, 7833],
 16704: [3864, 23407, 1365, 20645, 1426, 20105, 8217, 1236, 1164, 23864],
 13702: [864, 1216, 7173, 8313, 1052, 17017, 19613, 21017, 17015, 8082]}

### Сохранение значений

In [164]:
# Write the recommendations to disk as JSON. The integer course ids used as
# dict keys are written as strings, as the json module does for non-string keys.
with open("/data/home/evgeniy.osipchuk/lab02.json", mode='w') as file:
    file.write(json.dumps(answer))

In [166]:
# Stop the Spark session once the results have been saved.
spark.stop()