In [1]:
import os
import sys
import json
import re
# Tell PySpark which interpreter and Spark installation to use, and request
# three executors, before the pyspark imports in the next cell run.
os.environ.update({
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
    "PYSPARK_SUBMIT_ARGS": '--num-executors 3 pyspark-shell',
})

spark_home = os.environ.get('SPARK_HOME')

# Prepend Spark's bundled Python sources to sys.path: the py4j archive ends up
# first, immediately followed by the `python` directory.
sys.path[:0] = [
    os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'),
    os.path.join(spark_home, 'python'),
]

In [2]:
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.ml.feature import HashingTF, RegexTokenizer, IDF, Normalizer
from pyspark.ml.linalg import Vectors

# Session configuration: the application name is set on the conf object and
# again on the builder.
conf = SparkConf().set("spark.app.name", "Nabat_lab2")

spark = (
    SparkSession.builder
    .config(conf=conf)
    .appName("Nabat_lab2")
    .getOrCreate()
)

In [3]:
# SparkContext underlying the session; used at the end of the notebook to stop Spark.
sc = spark.sparkContext

In [4]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType
import pyspark.sql.functions as f

In [126]:
# Explicit schema for the input records: every field is read as a string.
schema = StructType(fields=[
    StructField(field_name, StringType())
    for field_name in ("lang", "name", "cat", "provider", "id", "desc")
])

In [127]:
# Load the one-record-per-line JSON file using the explicit schema.
df = (
    spark.read
    .schema(schema)
    .json("/labs/slaba02/DO_record_per_line.json")
)

In [128]:
# Tokenise `desc` by splitting on every character that is not a lowercase
# Latin (a-z) or Cyrillic (а-я) letter.
# Only the leading '^' negates a character class; a second '^' inside the
# brackets is a literal caret and would keep '^' characters inside tokens, so
# the class must contain just the two letter ranges.
# NOTE(review): 'ё' lies outside the а-я range and therefore acts as a
# separator — confirm whether that is intended.
regexTokenizer = RegexTokenizer(inputCol="desc", outputCol="words", pattern="[^a-zа-я]")

In [129]:
# Apply the tokenizer: adds the `words` column derived from `desc`.
df_reg_t = regexTokenizer.transform(df)

In [130]:
# TF-IDF stage. Define both stages first, then apply them in order:
# words -> hashed term frequencies (rawFeatures) -> IDF-weighted features.
hashingTF = HashingTF(
    numFeatures=1000000,
    inputCol="words",
    outputCol="rawFeatures",
)
idf = IDF(outputCol="features", inputCol="rawFeatures")

featurizedData = hashingTF.transform(df_reg_t)
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

In [131]:
# Scale every TF-IDF vector to unit length so that the dot product of two
# rows can be used as their cosine similarity.
# Note: p=2.0 means the L2 norm is used, despite the `l1NormData` variable name.
normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=2.0)
l1NormData = normalizer.transform(rescaledData)

In [132]:
# Keep only the columns needed for the similarity search and cache the frame,
# since it is reused for every reference course below.
collect_data = l1NormData.select('lang', 'name','id','normFeatures').cache()

In [133]:
def my_dot(v1, v2):
    """Return the dot product of two vectors as a plain Python float.

    Both arguments only need to expose a ``dot`` method; the result is
    converted with ``float`` so it fits a DoubleType UDF return value.
    """
    product = v1.dot(v2)
    return float(product)
# Spark UDF wrapper around my_dot, returning a double column.
dotUdf = f.udf(my_dot, DoubleType())

In [134]:
def _reference_course(course_id):
    """Return the cached row(s) of ``collect_data`` for one reference course.

    The course's ``id`` and ``normFeatures`` columns are renamed to
    ``id_join`` and ``features_point`` so they do not clash with the
    candidate columns after a join; ``lang`` is kept as is.
    """
    return (
        collect_data
        .filter(f.col('id') == course_id)
        .select('lang', 'id', 'normFeatures')
        .withColumnRenamed('id', 'id_join')
        .withColumnRenamed('normFeatures', 'features_point')
        .cache()
    )


# One frame per reference course for which recommendations are built.
df_my_cources1 = _reference_course("23126")
df_my_cources2 = _reference_course("21617")
df_my_cources3 = _reference_course("16627")
df_my_cources4 = _reference_course("11556")
df_my_cources5 = _reference_course("16704")
df_my_cources6 = _reference_course("13702")

In [135]:
def _pair_with_reference(reference_df):
    """Pair every course with the reference course of the same language.

    ``reference_df`` is one of the ``df_my_cources*`` frames. The result
    holds each candidate's name, id and vector next to the reference
    course's id (``id_join``) and vector (``features_point``), and is cached.
    """
    # Join on the column *name* so the condition always applies to the frame
    # that is actually being joined, rather than to a column object taken
    # from one particular reference frame.
    return (
        collect_data
        .join(reference_df, on="lang")
        .select("name", "id", "normFeatures", "id_join", "features_point")
        .cache()
    )


collect_data1 = _pair_with_reference(df_my_cources1)
collect_data2 = _pair_with_reference(df_my_cources2)
collect_data3 = _pair_with_reference(df_my_cources3)
collect_data4 = _pair_with_reference(df_my_cources4)
collect_data5 = _pair_with_reference(df_my_cources5)
collect_data6 = _pair_with_reference(df_my_cources6)

In [136]:
# Number of recommendations kept per reference course.
_TOP_N = 10


def _top_similar(pairs_df):
    """Return (id, id_join) for the courses most similar to a reference course.

    ``pairs_df`` is one of the ``collect_data1..6`` frames. The reference
    course itself is removed *before* the limit is applied, so exactly the
    top ``_TOP_N`` other courses are returned even if the reference course
    would not have ranked first (e.g. ties with identical descriptions).
    Similarity is the dot product of the unit-length vectors; ties are broken
    by name and then id.
    """
    return (
        pairs_df
        .where(f.col("id") != f.col("id_join"))
        .withColumn('cosine_sim', dotUdf('normFeatures', 'features_point'))
        .orderBy(f.desc("cosine_sim"), "name", "id")
        .limit(_TOP_N)
        .select("id", "id_join")
    )


# Stack the per-course results in the same order as the reference courses.
df_itog = _top_similar(collect_data1)
for _pairs_df in (collect_data2, collect_data3, collect_data4,
                  collect_data5, collect_data6):
    df_itog = df_itog.union(_top_similar(_pairs_df))
df_itog = df_itog.cache()

In [137]:
# Sort by reference course and drop any row that pairs a course with itself.
df_finally = (
    df_itog
    .orderBy("id_join")
    .where(f.col("id") != f.col("id_join"))
    .cache()
)

In [139]:
# Result mapping: reference course id -> list of recommended course ids.
d = {}

In [146]:
# Bring the (id, id_join) rows to the driver for post-processing in Python.
res = df_finally.collect()

In [158]:
# Group the recommended ids by the reference course they belong to.
# Grouping by the `id_join` key (instead of cutting the row list into chunks
# of ten) keeps every id attached to the right course even if some course
# ends up with fewer or more than ten rows.
grouped = {}
for id_, id_join in res:
    grouped.setdefault(id_join, []).append(int(id_))

# Overwrite (not extend) the entries so re-running the cell is idempotent.
d.update(grouped)

In [166]:
# Persist the recommendations dictionary as a JSON document.
with open('/data/home/ilya.nabatchikov/lab02.json', 'w') as out_file:
    out_file.write(json.dumps(d))

In [None]:
# Shut down the SparkContext once the results are written.
sc.stop()