In [None]:
import os
import sys
# Bootstrap PySpark inside the notebook kernel: configure the environment,
# put Spark's Python sources on sys.path, then run PySpark's shell start-up script.
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"] = '--num-executors 2 pyspark-shell'
spark_home = os.environ.get('SPARK_HOME', None)
# The py4j archive name is version-specific; it must match the file shipped
# under $SPARK_HOME/python/lib on this cluster.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))
# Later cells use a global named `spark` that is not defined anywhere in this
# notebook, so it is presumably created by this script. The context manager
# closes the file handle, which a bare open(...).read() would leave open.
with open(os.path.join(spark_home, 'python/pyspark/shell.py')) as shell_script:
    exec(shell_script.read())

In [None]:
from pyspark.sql import functions as sf
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType
from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, Normalizer

In [None]:
# Ids of the courses for which similar courses are looked up below.
spec_courses = [
    23126,
    21617,
    16627,
    11556,
    16704,
    13702,
]

In [None]:
# Load the course catalogue (one JSON record per line) into a DataFrame.
df = spark.read.format("json").load('/labs/slaba02/DO_record_per_line.json')
# While exploring, inspect it with df.printSchema() and df.show().

In [None]:
# Stage 1: turn the "desc" text column into a "tokens" column.
tokenizer = Tokenizer(
    inputCol="desc",
    outputCol="tokens",
)

In [None]:
# Stage 2: hash the tokenizer output into a term-frequency vector with 10000 buckets.
hashing_tf = HashingTF(
    inputCol=tokenizer.getOutputCol(),
    outputCol="hashed_tf",
    numFeatures=10000,
)

In [None]:
# Stage 3: apply IDF to the hashed term frequencies, writing "tf_idf".
idf = IDF(
    inputCol=hashing_tf.getOutputCol(),
    outputCol="tf_idf",
)

In [None]:
# Stage 4: normalise the TF-IDF vectors into the final "features" column
# (Normalizer's norm order is left at its default).
normalizer = Normalizer(
    inputCol=idf.getOutputCol(),
    outputCol="features",
)

In [None]:
# Chain the four stages in execution order: tokenize -> hash TF -> IDF -> normalise.
pipeline = Pipeline(stages=[tokenizer, hashing_tf, idf, normalizer])

In [None]:
# Fit the pipeline on the full catalogue, transform that same data, and keep
# only the columns the similarity search needs.
features = (
    pipeline
    .fit(df)
    .transform(df)
    .select('id', 'lang', 'features')
)

In [None]:
@sf.udf(DoubleType())
def cosine_distance(x, y):
    """Return the dot product of two vectors as a Python float (Spark double).

    NOTE(review): despite the name, this is a similarity, not a distance.
    Larger values mean more alike, which is why the notebook sorts by it in
    descending order. It equals the cosine similarity only if both inputs are
    unit-length, presumably guaranteed by the Normalizer stage (confirm its
    default norm is L2).
    """
    dot_product = x.dot(y)
    return float(dot_product)

In [None]:
# Feature rows of the courses of interest, with every column prefixed by
# "target_" so they do not clash with `features` columns in the later join.
target = (
    features
    .filter(sf.col('id').isin(spec_courses))
    .select(
        sf.col("id").alias("target_id"),
        sf.col("lang").alias("target_lang"),
        sf.col("features").alias("target_features"),
    )
)

In [None]:
# Preview the target rows (first 20, long cell values truncated).
target.show(n=20, truncate=True)

In [None]:
# Pair each target course with every *other* course in the same language,
# score the pair, keep only the two ids and the score, highest score first.
result = (
    target
    .join(features, on=sf.col("target_id") != sf.col("id"), how="inner")
    .filter(sf.col("target_lang") == sf.col("lang"))
    .select(
        "target_id",
        "id",
        cosine_distance(
            sf.col('target_features'),
            sf.col('features'),
        ).alias("cosine_distance"),
    )
    .orderBy(sf.desc('cosine_distance'))
)

In [None]:
# Preview the scored pairs (first 20, long cell values truncated).
result.show(n=20, truncate=True)

In [None]:
# Build {target course id: [ids of its 10 highest-scoring courses]}.
# The lookup runs against `result` (the scored pairs), because the raw `df`
# has no `target_id` column, and iterates over `spec_courses`, the list of
# course ids defined near the top of the notebook.
output = {
    target_id: [
        row["id"]
        for row in (
            result
            .where(sf.col("target_id") == target_id)
            # Sort explicitly so the top 10 do not depend on an earlier global
            # sort surviving the filter; `id` breaks ties deterministically.
            .orderBy(sf.col("cosine_distance").desc(), sf.col("id").asc())
            .limit(10)
            .select("id")
            .collect()
        )
    ]
    for target_id in spec_courses
}

In [None]:
import json

In [None]:
# Persist the recommendations as a single JSON document.
with open("/data/home/andrey.blednykh/lab02.json", "w") as out_file:
    json.dump(output, out_file)

In [None]:
# Shut down the Spark session now that the results have been written.
spark.stop()