In [1]:
import os
import sys
# Point PySpark at the Python interpreter and Spark installation to use.
# These must be set before the shell bootstrap script below is executed.
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 2 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)

# Make the pyspark and py4j sources shipped with Spark importable from this kernel.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Execute the PySpark shell bootstrap in this namespace (per the cell output it
# makes a SparkSession available as `spark`). The script is read inside a `with`
# block so the file handle is closed deterministically instead of being leaked.
with open(os.path.join(spark_home, 'python/pyspark/shell.py')) as shell_script:
    shell_source = shell_script.read()
exec(shell_source)

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer,StopWordsRemover
from pyspark.sql.functions import udf, col, lower, regexp_replace

In [3]:
# Location of the lab dataset; `spark.read.json` reads line-delimited JSON by default.
path = "/labs/slaba02/DO_record_per_line.json"

# Load the records into a DataFrame and show the schema Spark inferred for them.
df = spark.read.json(path)
df.printSchema()

root
 |-- cat: string (nullable = true)
 |-- desc: string (nullable = true)
 |-- id: long (nullable = true)
 |-- lang: string (nullable = true)
 |-- name: string (nullable = true)
 |-- provider: string (nullable = true)



In [4]:
# Tokenizer stage: turns the `desc` text column into a list of tokens stored in
# the `vectorized_words` column.
tokenizer = Tokenizer(
    inputCol="desc",
    outputCol="vectorized_words",
)

In [5]:
# Spark's built-in Russian stop-word list.
# NOTE(review): the sample output further down shows rows with `lang` values such
# as `en` and `fr`; their stop words are not covered by this list — confirm that
# filtering only Russian stop words is intended.
stop_words = StopWordsRemover.loadDefaultStopWords(
    "russian"
)

In [6]:
# Stop-word filtering stage: reads the tokenizer's output column and writes the
# remaining tokens to `filtered_words`.
stop_words_remover = StopWordsRemover(
    inputCol=tokenizer.getOutputCol(),
    outputCol="filtered_words",
    stopWords=stop_words,
)

In [7]:
# Apply the tokenizer stage to the raw records (adds the token-list column).
tokenizer_df = tokenizer.transform(
    df
)

In [8]:
# Apply the stop-word remover to the tokenized records (adds `filtered_words`).
tokenizer_filtered_df = stop_words_remover.transform(
    tokenizer_df
)

In [9]:
# Term-frequency stage: hashes the tokens in `filtered_words` into a
# 10000-dimensional vector stored in `raw_features`.
# Despite the `_df` suffix, this name holds the HashingTF transformer itself,
# not a DataFrame.
hashing_df = HashingTF(
    inputCol='filtered_words',
    outputCol="raw_features",
    numFeatures=10000,
)

In [14]:
# Compute the hashed term-frequency vectors for the filtered tokens.
# Alternative: CountVectorizer can also be used to obtain term frequency vectors.
featurized_df = hashing_df.transform(
    tokenizer_filtered_df
)

In [15]:
# IDF stage: reads the raw term-frequency vectors and writes the rescaled
# vectors to `features`.
idf = IDF(
    minDocFreq=1,
    inputCol="raw_features",
    outputCol="features",
)

# Fit the IDF statistics on the whole dataset, then apply them to the same data.
idf_model = idf.fit(featurized_df)
rescaled_df = idf_model.transform(featurized_df)

In [18]:
# Preview the pipeline output. The tokenizer above writes its tokens to
# `vectorized_words`; there is no `words` column, so selecting it fails on a
# fresh kernel (the stored output below comes from an earlier notebook state).
rescaled_df.select("id", "features", "vectorized_words", "filtered_words", "lang").show()

+---+--------------------+--------------------+--------------------+----+
| id|            features|               words|      filtered_words|lang|
+---+--------------------+--------------------+--------------------+----+
|  4|(10000,[36,42,63,...|[this, course, in...|[this, course, in...|  en|
|  5|(10000,[32,222,29...|[this, online, co...|[this, online, co...|  en|
|  6|(10000,[30,41,246...|[this, course, is...|[this, course, is...|  fr|
|  7|(10000,[493,572,7...|[we, live, in, a,...|[we, live, in, a,...|  en|
|  8|(10000,[32,65,115...|[this, self-paced...|[this, self-paced...|  en|
|  9|(10000,[56,91,268...|[this, game-based...|[this, game-based...|  en|
| 10|(10000,[1045,1263...|[what’s, in, your...|[what’s, in, your...|  en|
| 11|(10000,[87,157,57...|[the, goal, of, t...|[the, goal, of, t...|  en|
| 12|(10000,[161,164,4...|[ready, to, explo...|[ready, to, explo...|  en|
| 13|(10000,[26,1072,1...|[this, self-paced...|[this, self-paced...|  en|
| 14|(10000,[63,145,23...|[what, is, “

In [22]:
# Target record ids (kept as strings) for which similar records are looked up below.
id_s = [
    '23126',
    '21617',
    '16627',
    '11556',
    '16704',
    '13702',
]

In [20]:
import numpy as np
# Pull the id / features / lang columns of every record onto the driver as a
# list of Row objects, so similarities can be computed in plain Python.
rescaled_array_df = (
    rescaled_df
    .select(['id', "features", 'lang'])
    .collect()
)

In [23]:
def cosine_sim(a,vector):
    """Return the cosine similarity between two vectors.

    Parameters
    ----------
    a, vector
        Vector objects exposing ``dot(other)`` and ``norm(p)`` (the notebook
        passes the values of the ``features`` column).

    Returns
    -------
    The dot product divided by the product of the L2 norms, or ``0.0`` when
    either vector has zero norm. A zero vector has no direction, and dividing
    by zero would otherwise yield NaN (or raise), which breaks later sorting.
    """
    denominator = a.norm(2) * vector.norm(2)
    if denominator == 0:
        return 0.0
    return a.dot(vector) / denominator

# For every target id, build a mapping {record id -> cosine similarity to the
# target}, restricted to records whose `lang` equals the target's.
# The target record itself is part of the collected rows, so it appears in its
# own mapping as well.
dict_ids = {}
for i in id_s:
    list_distance = {}
    # Fetch the target row (id, features, lang) for this id.
    a = rescaled_df.select(["id",'features','lang']).where("id == "+i).take(1)[0]
    # Iterate the rows collected in `rescaled_array_df`; the previous code
    # indexed an undefined name `array`, which raised NameError.
    for row in rescaled_array_df:
        if a.lang == row.lang:
            list_distance[row.id] = cosine_sim(a.features, row.features)
    dict_ids[i] = list_distance

  after removing the cwd from sys.path.


In [24]:
# For every target id, keep the ids of (up to) the 10 most similar records.
result = {}
for i in id_s:
    # Drop NaN scores before sorting: NaN compares False against everything, so
    # it would make the sort order unreliable (`score == score` is False only for NaN).
    scored = [
        (other_id, score)
        for other_id, score in dict_ids[i].items()
        if score == score
    ]
    # Sort once per target. Ascending sort followed by reversal keeps the same
    # ordering among tied scores as the previous implementation.
    ranked = sorted(scored, key=lambda x: x[1])[::-1]
    # Exclude the target explicitly instead of assuming it sits at rank 0, and
    # slice so that fewer than 10 candidates no longer raises IndexError.
    result[i] = [other_id for other_id, _ in ranked if str(other_id) != i][:10]

In [26]:
import json
# Serialise the {target id -> list of similar ids} mapping and store it on disk.
with open('lab02.json', 'w') as output_file:
    output_file.write(json.dumps(result))