# 1. Библиотеки и настройки сессии

In [1]:
DATA_FILE = "/labs/slaba02/DO_record_per_line.json"

In [2]:
import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 3 pyspark-shell'
spark_home = os.environ.get('SPARK_HOME', None)
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

In [3]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

conf = SparkConf()
conf.set("spark.app.name", "ivan.strazov - lab02")

spark = SparkSession.builder.config(conf=conf).appName("ivan.strazov - lab02").getOrCreate()

In [4]:
import pyspark.sql.functions as f
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, LongType, DoubleType
from pyspark.ml.feature import HashingTF, IDF, Normalizer
from pyspark.sql.window import Window
import re

# 2. Загрузка датасета

In [5]:
df = spark.read \
        .format("json") \
        .load(DATA_FILE)
df.printSchema()

root
 |-- cat: string (nullable = true)
 |-- desc: string (nullable = true)
 |-- id: long (nullable = true)
 |-- lang: string (nullable = true)
 |-- name: string (nullable = true)
 |-- provider: string (nullable = true)



In [6]:
df.show(2)

+--------------------+--------------------+---+----+--------------------+--------------+
|                 cat|                desc| id|lang|                name|      provider|
+--------------------+--------------------+---+----+--------------------+--------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|
|              11/law|This online cours...|  5|  en|American Counter ...|Canvas Network|
+--------------------+--------------------+---+----+--------------------+--------------+
only showing top 2 rows



In [12]:
base_id = (23126, 21617, 16627, 11556, 16704, 13702)

@f.udf
def rec(course_id):
    if course_id in base_id:
        return 1
    return 0

df_full = df.select("id", "lang", "desc", "name").withColumn("rec", rec("id"))
df_full.show(2)

+---+----+--------------------+--------------------+---+
| id|lang|                desc|                name|rec|
+---+----+--------------------+--------------------+---+
|  4|  en|This course intro...|Accounting Cycle:...|  0|
|  5|  en|This online cours...|American Counter ...|  0|
+---+----+--------------------+--------------------+---+
only showing top 2 rows



# 3. Расчёт TF-IDF

In [13]:
@f.udf("string")
def extract_text(text):
    """
    Extract clear text.
    
    Parameters:
        text (str) - text.
    
    Returns:
        text (str) - clear text.
    """
    
    try:
        text = re.sub(r"[^А-яA-z\d]", " ", text).strip().lower()
        text = re.sub(r"\s+", " ", text)
        return text
    except TypeError:
        return None

In [14]:
courses = df_full.withColumn("docs", f.split(extract_text("desc"), " "))
courses.show(5)

+---+----+--------------------+--------------------+---+--------------------+
| id|lang|                desc|                name|rec|                docs|
+---+----+--------------------+--------------------+---+--------------------+
|  4|  en|This course intro...|Accounting Cycle:...|  0|[this, course, in...|
|  5|  en|This online cours...|American Counter ...|  0|[this, online, co...|
|  6|  fr|This course is ta...|Arithmétique: en ...|  0|[this, course, is...|
|  7|  en|We live in a digi...|Becoming a Dynami...|  0|[we, live, in, a,...|
|  8|  en|This self-paced c...|           Bioethics|  0|[this, self, pace...|
+---+----+--------------------+--------------------+---+--------------------+
only showing top 5 rows



In [15]:
hashingTF = HashingTF(inputCol="docs",
                      outputCol="tf",
                      numFeatures=10000)
tf = hashingTF.transform(courses)

idf = IDF(inputCol="tf", outputCol="tfidf").fit(tf)
tfidf = idf.transform(tf)

normalizer = Normalizer(inputCol="tfidf",
                        outputCol="normVec",
                        p=2)
vectors = normalizer \
                .transform(tfidf) \
                .select("id", "lang", "name", "normVec")

vectors.show(5)

+---+----+--------------------+--------------------+
| id|lang|                name|             normVec|
+---+----+--------------------+--------------------+
|  4|  en|Accounting Cycle:...|(10000,[36,63,138...|
|  5|  en|American Counter ...|(10000,[32,222,36...|
|  6|  fr|Arithmétique: en ...|(10000,[30,118,24...|
|  7|  en|Becoming a Dynami...|(10000,[493,572,7...|
|  8|  en|           Bioethics|(10000,[32,115,13...|
+---+----+--------------------+--------------------+
only showing top 5 rows



# 4. Поиск рекомендаций

In [25]:
base_set = vectors \
                .filter(courses.rec == 1) \
                .select(f.col("id").alias("base_id"),
                        f.col("normVec").alias("base_normVec"),
                        "lang")
rec_set = vectors \
                .filter(courses.rec == 0) \
                .select(f.col("id").alias("rec_id"),
                        f.col("normVec").alias("rec_normVec"),
                        f.col("name").alias("name"),
                        "lang")

data = base_set.join(rec_set, on="lang", how="left")
data.show(5)

+----+-------+--------------------+------+--------------------+--------------------+
|lang|base_id|        base_normVec|rec_id|         rec_normVec|                name|
+----+-------+--------------------+------+--------------------+--------------------+
|  en|  21617|(10000,[17,128,16...|  9852|(10000,[18,69,291...|How to Make Easy ...|
|  en|  21617|(10000,[17,128,16...|  9853|(10000,[15,62,128...|How To Get Starte...|
|  en|  21617|(10000,[17,128,16...|  9854|(10000,[32,157,28...|How to Get That R...|
|  en|  21617|(10000,[17,128,16...|  9855|(10000,[87,128,15...|Escape Email Over...|
|  en|  21617|(10000,[17,128,16...|  9856|(10000,[169,173,2...|How to Get New Cu...|
+----+-------+--------------------+------+--------------------+--------------------+
only showing top 5 rows



In [26]:
@f.udf(DoubleType())
def tfidf_cos(x, y):
    return float(x.dot(y))

In [27]:
data = data.withColumn("cos", tfidf_cos("base_normVec", "rec_normVec"))
data.show(5)

+----+-------+--------------------+------+--------------------+--------------------+--------------------+
|lang|base_id|        base_normVec|rec_id|         rec_normVec|                name|                 cos|
+----+-------+--------------------+------+--------------------+--------------------+--------------------+
|  en|  21617|(10000,[17,128,16...|  9852|(10000,[18,69,291...|How to Make Easy ...|  0.0687399943676534|
|  en|  21617|(10000,[17,128,16...|  9853|(10000,[15,62,128...|How To Get Starte...|0.030242484408690068|
|  en|  21617|(10000,[17,128,16...|  9854|(10000,[32,157,28...|How to Get That R...|  0.0316790665944085|
|  en|  21617|(10000,[17,128,16...|  9855|(10000,[87,128,15...|Escape Email Over...| 0.03280009908657444|
|  en|  21617|(10000,[17,128,16...|  9856|(10000,[169,173,2...|How to Get New Cu...| 0.04256926626683566|
+----+-------+--------------------+------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [28]:
data = data.withColumn("rank",
                       f.dense_rank().over(Window.partitionBy("base_id").orderBy(f.desc("cos"), f.col("name"), f.col("rec_id"))))
data.show(5)

+----+-------+--------------------+------+--------------------+--------------------+------------------+----+
|lang|base_id|        base_normVec|rec_id|         rec_normVec|                name|               cos|rank|
+----+-------+--------------------+------+--------------------+--------------------+------------------+----+
|  en|  23126|(10000,[87,91,128...| 14760|(10000,[957,1263,...|Foundation 4: Inc...|0.6253172468854246|   1|
|  en|  23126|(10000,[87,91,128...| 13665|(10000,[51,93,128...|The Next Step wit...|0.6220896374764014|   2|
|  en|  23126|(10000,[87,91,128...| 13782|(10000,[1263,1470...|Assembling Sass P...|0.5561861139671257|   3|
|  en|  23126|(10000,[87,91,128...| 20638|(10000,[3775,4343...|Introduction to P...|0.4532950084720502|   4|
|  en|  23126|(10000,[87,91,128...| 24419|(10000,[1,50,77,8...|Learn the 7 Minds...|0.4386267864400899|   5|
+----+-------+--------------------+------+--------------------+--------------------+------------------+----+
only showing top 5 

In [29]:
data.orderBy(f.desc("cos")).show(5)

+----+-------+--------------------+------+--------------------+--------------------+------------------+----+
|lang|base_id|        base_normVec|rec_id|         rec_normVec|                name|               cos|rank|
+----+-------+--------------------+------+--------------------+--------------------+------------------+----+
|  ru|  13702|(10000,[310,942,2...|   864|(10000,[310,942,2...|Математическая эк...|               1.0|   1|
|  en|  21617|(10000,[17,128,16...| 21609|(10000,[17,128,16...|Preparing for the...|0.9998692556930249|   1|
|  es|  16627|(10000,[55,76,86,...| 11431|(10000,[48,55,70,...|Excel Básico by A...|0.7158166824422696|   1|
|  en|  23126|(10000,[87,91,128...| 14760|(10000,[957,1263,...|Foundation 4: Inc...|0.6253172468854247|   1|
|  en|  23126|(10000,[87,91,128...| 13665|(10000,[51,93,128...|The Next Step wit...|0.6220896374764014|   2|
+----+-------+--------------------+------+--------------------+--------------------+------------------+----+
only showing top 5 

In [30]:
array2json = data \
                .filter(f.col("rank") <= 10) \
                .select("base_id", "rec_id", "rank") \
                .collect()
array2json[0]

Row(base_id=23126, rec_id=14760, rank=1)

# 5. Запись ответа

In [31]:
dct = {}

for (base, rec, _) in sorted(array2json, key=lambda x: x[2], reverse=False):
    base, rec = str(base), rec
    if base in dct:
        dct[base].append(rec)
    else:
        dct[base] = [rec]

In [32]:
import json

with open("lab02.json", "w") as file:
    json.dump(dct, file)

In [33]:
spark.stop()