In [1]:
import json
import os
import sys
# Point PySpark at the Python interpreter and Spark installation to use,
# and request two executors when the PySpark shell is started.
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"] = '--num-executors 2 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Make the pyspark and py4j packages that ship with Spark importable.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Run PySpark's shell bootstrap script in this namespace (it provides `spark`).
# The script is read through a context manager so the file handle is closed
# instead of being left open for the lifetime of the kernel.
with open(os.path.join(spark_home, 'python/pyspark/shell.py')) as shell_script:
    exec(shell_script.read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark import Row
import json

# Obtain the Spark session for this lab. The builder options are applied in
# the order config(conf) -> appName; getOrCreate() returns an already running
# session if there is one.
conf = SparkConf()
spark = SparkSession.builder.config(conf=conf).appName("Lab2_by_sand").getOrCreate()

# Display the session summary as the cell output.
spark

In [3]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer, IDF
from pyspark.ml.linalg import Vectors
import pyspark.sql.functions as f
from pyspark.sql import Window
from pyspark.sql.functions import struct
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, ArrayType

In [4]:
# Path of the JSON file holding the source course records.
COURCES_FILE = "/labs/slaba02/DO_record_per_line.json"

In [5]:
# Source dataset: read the course records from the JSON file and preview the first rows.
cources = spark.read.json(COURCES_FILE)
cources.show(5)

+--------------------+--------------------+---+----+--------------------+--------------+
|                 cat|                desc| id|lang|                name|      provider|
+--------------------+--------------------+---+----+--------------------+--------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|
|              11/law|This online cours...|  5|  en|American Counter ...|Canvas Network|
|5/computer_scienc...|This course is ta...|  6|  fr|Arithmétique: en ...|Canvas Network|
|  14/social_sciences|We live in a digi...|  7|  en|Becoming a Dynami...|Canvas Network|
|2/biology_life_sc...|This self-paced c...|  8|  en|           Bioethics|Canvas Network|
+--------------------+--------------------+---+----+--------------------+--------------+
only showing top 5 rows



In [6]:
# Keep only the columns used downstream: course id, language and description text.
cources = cources.select('id', 'lang', 'desc')
cources.show(5)

+---+----+--------------------+
| id|lang|                desc|
+---+----+--------------------+
|  4|  en|This course intro...|
|  5|  en|This online cours...|
|  6|  fr|This course is ta...|
|  7|  en|We live in a digi...|
|  8|  en|This self-paced c...|
+---+----+--------------------+
only showing top 5 rows



In [7]:
# Rows of the test set, in the column order (id, lang, desc).
# Non-ASCII text is kept as \u escapes so the cell stays pure ASCII.
TEST_DATA = [
    [23126, 'en', 'Compass - powerful SASS library that makes your life easier'],
    [21617, 'en', 'Preparing for the AP* Computer Science A Exam \u2014 Part 2'],
    [16627, 'es', 'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'],
    [11556, 'es', 'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'],
    [16704, 'ru', '\u041f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u0435 \u043d\u0430 Lazarus'],
    [13702, 'ru', '\u041c\u0430\u0442\u0435\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u0430'],
]

In [8]:
# Explicit schema for the test rows: an integer id followed by two string
# columns (language code and description). All fields are nullable.
schema = (
    StructType()
    .add("id", IntegerType())
    .add("lang", StringType())
    .add("desc", StringType())
)

In [9]:
# Test dataset: the courses for which recommendations are computed.
# Built from the in-memory TEST_DATA rows using the explicit schema.
test = spark.createDataFrame(data=TEST_DATA, schema=schema)
test.printSchema()
test.show()

root
 |-- id: integer (nullable = true)
 |-- lang: string (nullable = true)
 |-- desc: string (nullable = true)

+-----+----+--------------------+
|   id|lang|                desc|
+-----+----+--------------------+
|23126|  en|Compass - powerfu...|
|21617|  en|Preparing for the...|
|16627|  es|Aprende Excel: Ni...|
|11556|  es|Aprendizaje Colab...|
|16704|  ru|Программирование ...|
|13702|  ru|Математическая эк...|
+-----+----+--------------------+



In [10]:
# Show how many partitions the source DataFrame is split into.
cources.rdd.getNumPartitions()

2

In [11]:
# Text-featurisation pipeline with three stages:
#   desc -> words (Tokenizer) -> tf (hashed term frequencies, 10000 buckets) -> idf
tokenizer = Tokenizer(inputCol="desc", outputCol="words")
tf = HashingTF(inputCol="words", outputCol="tf", numFeatures=10000)
idf = IDF(inputCol="tf", outputCol="idf")
pipeline = Pipeline(stages=[tokenizer, tf, idf])
pipeline

Pipeline_9df0fb73adba

In [12]:
# Fit the pipeline on the source courses; the resulting model is reused to
# transform both the source and the test DataFrames.
preproc = pipeline.fit(cources)
preproc

PipelineModel_db876fa9046b

In [13]:
#Transform source
# Apply the fitted pipeline to the source courses and tag the resulting
# columns with a _src suffix so they stay distinguishable after the join.
c1 = preproc.transform(cources).select(
    f.col('id').alias('id_src'),
    'lang',
    f.col('desc').alias('desc_src'),
    f.col('idf').alias('idf_src'),
)

In [14]:
#Transform tests
# Apply the same fitted pipeline to the test courses and tag the resulting
# columns with a _dst suffix so they stay distinguishable after the join.
c2 = preproc.transform(test).select(
    f.col('id').alias('id_dst'),
    'lang',
    f.col('desc').alias('desc_dst'),
    f.col('idf').alias('idf_dst'),
)

In [15]:
# Pair every source course with every test course of the same language,
# so that their TF-IDF vectors can be compared row by row.
c = c1.join(c2, how='inner', on='lang')

In [16]:
# Read the configured executor count from the Spark configuration
# (presumably set by the --num-executors flag in PYSPARK_SUBMIT_ARGS).
# The value is kept as returned by the config; int() here only displays it as a number.
num_executors = spark.sparkContext.getConf().get("spark.executor.instances")
int(num_executors)

2

In [17]:
# Reduce the joined DataFrame from 200 partitions to three per executor,
# and cache it because it is reused by the following cells.
target_partitions = 3 * int(num_executors)
c = c.coalesce(target_partitions).cache()

In [18]:
# Confirm the partition count of the joined DataFrame after coalescing.
c.rdd.getNumPartitions()

6

In [19]:
#Udf function for calculate cosine distance between vectors
@f.udf(FloatType())
def cosine_distance(v1, v2):
    """Return the cosine of the angle between two vectors.

    Despite the name, the value is dot(v1, v2) / (|v1| * |v2|), so a larger
    result means the vectors are more alike.

    Returns None when the product of the two L2 norms is zero, because the
    ratio is undefined in that case.
    """
    denominator = v1.norm(p=2) * v2.norm(p=2)
    if denominator == 0:
        return None
    return float(v1.dot(v2) / denominator)

In [20]:
#Calc distances
# Score every (test, source) pair, keep only the columns needed for ranking,
# and order by test course, then best score first; ties are broken by the
# source description and finally by the source id.
c = (
    c.withColumn('cos_dist', cosine_distance(f.col('idf_src'), f.col('idf_dst')))
     .select('id_dst', 'id_src', 'desc_src', 'cos_dist')
     .orderBy(f.asc('id_dst'), f.desc('cos_dist'), f.asc('desc_src'), f.asc('id_src'))
)
c

DataFrame[id_dst: int, id_src: bigint, desc_src: string, cos_dist: float]

In [21]:
# Inspect the column types of the scored DataFrame.
c.printSchema()

root
 |-- id_dst: integer (nullable = true)
 |-- id_src: long (nullable = true)
 |-- desc_src: string (nullable = true)
 |-- cos_dist: float (nullable = true)



In [22]:
#Make result dataframe
# collect_list() gathers rows in whatever order they arrive after the groupBy
# shuffle, so the ordering produced by an earlier sort() is not guaranteed to
# survive. Rank the rows explicitly inside each test course, collect
# (rank, id_src) pairs, and sort the collected array by rank so that `recs`
# is always ordered by: cos_dist descending, desc_src ascending, id_src ascending.
rec_order = Window.partitionBy('id_dst').orderBy(
    f.col('cos_dist').desc(), f.col('desc_src').asc(), f.col('id_src').asc())
res = (
    c.withColumn('rank', f.row_number().over(rec_order))
     .groupBy('id_dst')
     # Structs sort field by field, so the array is ordered by rank first.
     .agg(f.sort_array(f.collect_list(f.struct('rank', 'id_src'))).alias('ranked'))
     # Selecting a struct field from an array of structs yields an array of that field.
     .select(f.col('id_dst').cast('string').alias('id'),
             f.col('ranked.id_src').alias('recs'))
)
res.cache()
res.show(5)

+-----+--------------------+
|   id|                recs|
+-----+--------------------+
|21617|[21703, 21700, 21...|
|13702|[864, 13702, 1266...|
|23126|[13665, 23126, 13...|
|16704|[1252, 1427, 2010...|
|16627|[9471, 19161, 388...|
+-----+--------------------+
only showing top 5 rows



In [23]:
#Write to local file
# Build the output path with os.path.join: $HOME is already absolute, so
# prefixing it with another "/" would produce a path starting with "//".
home_dir = os.environ['HOME']
out_path = os.path.join(home_dir, "lab02.json")
with open(out_path, 'wt') as file:
    # One JSON object mapping each test course id to its list of recommended ids.
    json.dump({row['id']: row['recs'] for row in res.collect()}, fp=file, indent=3)

In [24]:
# Stop the Spark session now that the result has been written to the local file.
spark.stop()

In [25]:
# Check the result file by printing it in full with a shell command.
!cat ~/lab02.json

{
   "21617": [
      21703,
      21700,
      21675,
      21506,
      21676,
      21706,
      21508,
      21854,
      21679,
      11092,
      21840,
      21666,
      21523,
      21857,
      210,
      22298,
      8289,
      28299,
      12062,
      26887,
      21081,
      8288,
      26578,
      27188,
      9333,
      1583,
      21861,
      21530,
      25702,
      6843,
      25726,
      331,
      3728,
      6637,
      19417,
      21531,
      21627,
      21621,
      21632,
      28295,
      27473,
      5758,
      21696,
      18778,
      21492,
      21626,
      19190,
      26926,
      21624,
      15838,
      8919,
      525,
      27504,
      416,
      18130,
      28296,
      22708,
      21498,
      24940,
      25329,
      21623,
      21630,
      21628,
      524,
      5759,
      18649,
      19089,
      25353,
      17817,
      22726,
      8924,
      813

      15563,
      14794,
      21781,
      8510,
      15591,
      5069,
      14281,
      11926,
      28067,
      22293,
      2661,
      8052,
      6628,
      4216,
      26284,
      27182,
      25387,
      21980,
      27558,
      16679,
      26751,
      27255,
      28002,
      12592,
      11596,
      16668,
      20708,
      9947,
      16389,
      11911,
      18426,
      13431,
      28295,
      10349,
      26894,
      15600,
      26652,
      24073,
      11820,
      18084,
      27007,
      11955,
      4303,
      15836,
      26659,
      26549,
      24993,
      19105,
      15759,
      4998,
      5417,
      26619,
      25579,
      17398,
      17796,
      11712,
      13585,
      19067,
      7412,
      7743,
      6981,
      26970,
      12590,
      15936,
      21232,
      11861,
      16333,
      25458,
      8888,
      3230,
      7112,
      6717,
      211