In [1]:
import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 5 --executor-memory 4g --executor-cores 1 --driver-memory 2g pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

conf = SparkConf()
conf.set("spark.app.name", "ML app") 

spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [200]:
spark.stop()

In [3]:
spark

In [4]:
dataset_test = spark.read.json("/labs/slaba02/DO_record_per_line.json")
dataset_test

DataFrame[cat: string, desc: string, id: bigint, lang: string, name: string, provider: string]

In [32]:
dataset = spark.read.json("/labs/slaba02/DO_record_per_line.json")[["id", "desc", "name", "lang"]]
#dataset[["id", "lang", "desc"]].take(1)
dataset.take(1)

[Row(id=4, desc='This course introduces the basic financial statements used by most businesses, as well as the essential tools used to prepare them. This course will serve as a resource to help business students succeed in their upcoming university-level accounting classes, and as a refresher for upper division accounting students who are struggling to recall elementary concepts essential to more advanced accounting topics. Business owners will also benefit from this class by gaining essential skills necessary to organize and manage information pertinent to operating their business. At the conclusion of the class, students will understand the balance sheet, income statement, and cash flow statement. They will be able to differentiate between cash basis and accrual basis techniques, and know when each is appropriate. They’ll also understand the accounting equation, how to journalize and post transactions, how to adjust and close accounts, and how to prepare key financial reports. All ma

In [6]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

In [33]:
tokenizer = Tokenizer(inputCol="desc", outputCol="words")
wordsData = tokenizer.transform(dataset)

In [186]:
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=100000)
featurizedData = hashingTF.transform(wordsData)

In [187]:
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

In [188]:
rescaledData.select("id", "features").where("id = 23126").show(1)

+-----+--------------------+
|   id|            features|
+-----+--------------------+
|23126|(100000,[1197,125...|
+-----+--------------------+



-цикл по нескольким ИД:
-из дата фрейма получить вектор конкретного ИД
-расчитать меру со всеми другими ИД
-отсортировать по убыванию меры и взять первые 10
-записать результат в список

In [41]:
from pyspark.ml.linalg import Vectors

In [45]:
import pyspark.sql.functions as f

In [46]:
from pyspark.sql.types import FloatType

In [48]:
from pyspark.ml.linalg import DenseVector, SparseVector

In [49]:
from pyspark.sql.functions import udf,col

In [189]:
rescaledData_aim = rescaledData.filter(rescaledData.id.isin([23126, 21617, 16627, 11556, 16704, 13702]))\
                               .select(rescaledData.id.alias("id_aim"), rescaledData.name.alias("name_aim")\
                                       ,rescaledData.lang.alias("lang_aim"), rescaledData.features.alias("features_aim"))

In [190]:
rescaledData.count()

28153

In [77]:
rescaledData_aim.show()

+------+--------------------+--------+--------------------+
|id_aim|            name_aim|lang_aim|        features_aim|
+------+--------------------+--------+--------------------+
| 11556|Aprendizaje Colab...|      es|(1000,[3,19,31,34...|
| 13702|Математическая эк...|      ru|(1000,[172,203,31...|
| 16627|Aprende Excel: Ni...|      es|(1000,[3,7,29,30,...|
| 16704|Программирование ...|      ru|(1000,[32,103,144...|
| 21617|Preparing for the...|      en|(1000,[0,3,6,18,2...|
| 23126|Compass - powerfu...|      en|(1000,[6,13,18,22...|
+------+--------------------+--------+--------------------+



In [67]:
28153 * 6

168918

In [191]:
df = rescaledData.crossJoin(rescaledData_aim)

In [88]:
rescaledData.crossJoin(rescaledData_aim).show(2)

+---+--------------------+--------------------+----+--------------------+--------------------+--------------------+------+--------------------+--------+--------------------+
| id|                desc|                name|lang|               words|         rawFeatures|            features|id_aim|            name_aim|lang_aim|        features_aim|
+---+--------------------+--------------------+----+--------------------+--------------------+--------------------+------+--------------------+--------+--------------------+
|  4|This course intro...|Accounting Cycle:...|  en|[this, course, in...|(1000,[8,15,17,23...|(1000,[8,15,17,23...| 11556|Aprendizaje Colab...|      es|(1000,[3,19,31,34...|
|  4|This course intro...|Accounting Cycle:...|  en|[this, course, in...|(1000,[8,15,17,23...|(1000,[8,15,17,23...| 13702|Математическая эк...|      ru|(1000,[172,203,31...|
+---+--------------------+--------------------+----+--------------------+--------------------+--------------------+------+--------

In [192]:
function = udf(lambda x, y: float(x.dot(y) / (x.norm(2) * y.norm(2))), FloatType())

In [193]:
df_cosine = df.withColumn("cosine_dist", function("features", "features_aim"))

In [194]:
df_cosine.filter(df_cosine.id == 8571).show(10)

+----+----+--------------------+----+-----+--------------+--------------+------+--------------------+--------+--------------------+-----------+
|  id|desc|                name|lang|words|   rawFeatures|      features|id_aim|            name_aim|lang_aim|        features_aim|cosine_dist|
+----+----+--------------------+----+-----+--------------+--------------+------+--------------------+--------+--------------------+-----------+
|8571|    |Adjust the Bust: ...|  en|   []|(100000,[],[])|(100000,[],[])| 11556|Aprendizaje Colab...|      es|(100000,[3487,379...|        NaN|
|8571|    |Adjust the Bust: ...|  en|   []|(100000,[],[])|(100000,[],[])| 13702|Математическая эк...|      ru|(100000,[2172,694...|        NaN|
|8571|    |Adjust the Bust: ...|  en|   []|(100000,[],[])|(100000,[],[])| 16627|Aprende Excel: Ni...|      es|(100000,[30,939,1...|        NaN|
|8571|    |Adjust the Bust: ...|  en|   []|(100000,[],[])|(100000,[],[])| 16704|Программирование ...|      ru|(100000,[381,3610...|     

In [98]:
my_list = [23126, 21617, 16627, 11556, 16704, 13702]

In [97]:
import json

In [179]:
import pyspark.sql.functions as sf
#df.filter(sf.size('column_with_lists') > 0)

In [181]:
df_cosine.select("id", "cosine_dist","name").filter(df_cosine.id_aim == 23126)\
         .where("id <> id_aim and lang = lang_aim").filter(sf.size('words') > 0)\
         .sort(col("cosine_dist").desc(), col("name").asc(), col("id").asc()).take(10)
         #.take(10)

[Row(id=10764, cosine_dist=0.4965316653251648, name='Aptitude Interview Questions For Job Interviews & Examinations'),
 Row(id=24419, cosine_dist=0.475035697221756, name='Learn the 7 Mindsets to live your Ultimate Life'),
 Row(id=9949, cosine_dist=0.4676150977611542, name='How to Use YouTube Live Events for Your Business by Video Editor, YouTube Marketer Marc Bullard'),
 Row(id=5114, cosine_dist=0.46372050046920776, name='Learn Delicious Vegan Recipes : Online Cooking Classes'),
 Row(id=13782, cosine_dist=0.46329018473625183, name='Assembling Sass Part 2'),
 Row(id=13665, cosine_dist=0.45630595088005066, name='The Next Step with Sass and Compass by Lisa Catalano'),
 Row(id=13727, cosine_dist=0.4556930661201477, name='Effective Events Management by EduCBA Academy for Business Studies'),
 Row(id=23478, cosine_dist=0.4395541250705719, name='Entrepreneurs: How to Use Live Events to Grow Your Business!'),
 Row(id=20638, cosine_dist=0.4180535674095154, name='Introduction to Photo Compositing

In [None]:
[Row(id=6527, cosine_dist=0.07960835844278336),
 Row(id=6528, cosine_dist=0.08900269120931625),
 Row(id=6529, cosine_dist=0.11000870913267136),
 Row(id=6530, cosine_dist=0.09787169843912125),
 Row(id=6531, cosine_dist=0.09348708391189575),
 Row(id=6532, cosine_dist=0.1195501908659935),
 Row(id=6533, cosine_dist=0.123387411236763),
 Row(id=6534, cosine_dist=0.1351056843996048),
 Row(id=6535, cosine_dist=0.11342219263315201),
 Row(id=6536, cosine_dist=0.11445394903421402)]

In [195]:
df_cosine.coalesce(6).cache()

DataFrame[id: bigint, desc: string, name: string, lang: string, words: array<string>, rawFeatures: vector, features: vector, id_aim: bigint, name_aim: string, lang_aim: string, features_aim: vector, cosine_dist: float]

In [196]:
ans = []
for l in my_list:
    temp = df_cosine.select("id").filter(df_cosine.id_aim == l)\
                   .where("id <> id_aim and lang = lang_aim").filter(sf.size('words') > 0)\
                   .sort(col("cosine_dist").desc(), col("name").asc(), col("id").asc())\
                   .take(10)
    ans_final = list(map(lambda x: int(str(x)[str(x).find("=")+1:str(x).find(")")]) , temp))
    ans.append([l,ans_final])
ans

[[23126,
  [13782, 13665, 25782, 14760, 15909, 13348, 17499, 25071, 17329, 7153]],
 [21617,
  [21609, 21608, 21616, 21492, 21624, 21623, 21630, 21628, 21508, 21506]],
 [16627, [11431, 12247, 25010, 13021, 5687, 11575, 12660, 12863, 5372, 22680]],
 [11556, [16488, 13461, 468, 23357, 16929, 22710, 387, 19330, 9289, 10447]],
 [16704, [1164, 1236, 1365, 20645, 1426, 20105, 18331, 875, 8207, 8217]],
 [13702, [864, 1052, 13057, 8313, 1216, 21079, 8082, 1426, 20105, 28074]]]

In [197]:
with open('lab02.json', 'w') as f:
    json.dump(dict(ans), f)

In [198]:
dict(ans)

{23126: [13782, 13665, 25782, 14760, 15909, 13348, 17499, 25071, 17329, 7153],
 21617: [21609, 21608, 21616, 21492, 21624, 21623, 21630, 21628, 21508, 21506],
 16627: [11431, 12247, 25010, 13021, 5687, 11575, 12660, 12863, 5372, 22680],
 11556: [16488, 13461, 468, 23357, 16929, 22710, 387, 19330, 9289, 10447],
 16704: [1164, 1236, 1365, 20645, 1426, 20105, 18331, 875, 8207, 8217],
 13702: [864, 1052, 13057, 8313, 1216, 21079, 8082, 1426, 20105, 28074]}

In [199]:
f = open('lab02.json')
data = json.load(f)
print(data)

{'23126': [13782, 13665, 25782, 14760, 15909, 13348, 17499, 25071, 17329, 7153], '21617': [21609, 21608, 21616, 21492, 21624, 21623, 21630, 21628, 21508, 21506], '16627': [11431, 12247, 25010, 13021, 5687, 11575, 12660, 12863, 5372, 22680], '11556': [16488, 13461, 468, 23357, 16929, 22710, 387, 19330, 9289, 10447], '16704': [1164, 1236, 1365, 20645, 1426, 20105, 18331, 875, 8207, 8217], '13702': [864, 1052, 13057, 8313, 1216, 21079, 8082, 1426, 20105, 28074]}
