In [1]:
import os
import sys
# Point PySpark at the interpreter and Spark installation to use, and ask for
# two executors, before any pyspark module is imported.
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 2 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Make Spark's bundled pyspark and py4j importable from this kernel.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Run Spark's interactive-shell bootstrap in this namespace (the banner it
# prints reports that a SparkSession is available as `spark`). The script is
# read through a context manager so the file handle is closed promptly, and
# compiled with its real path so tracebacks point at shell.py, not "<string>".
shell_path = os.path.join(spark_home, 'python/pyspark/shell.py')
with open(shell_path) as shell_script:
    exec(compile(shell_script.read(), shell_path, 'exec'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark import Row
import json

# Default Spark configuration; nothing is overridden here.
conf = SparkConf()

# Obtain the session under the application name "Groo-IA".
# NOTE(review): the bootstrap cell above already reported a session bound to
# `spark`; presumably getOrCreate() hands that one back — confirm.
session_builder = SparkSession.builder.config(conf=conf).appName("Groo-IA")
spark = session_builder.getOrCreate()

In [3]:
# Read the one-record-per-line JSON file, letting Spark infer the schema.
# NOTE(review): `data` is reassigned by the explicit-schema read of the same
# path in a later cell before this result is used anywhere, so this cell looks
# redundant — confirm before removing.
data = spark.read.json("/labs/slaba02/DO_record_per_line.json")

In [5]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, Normalizer, StopWordsRemover
from pyspark.ml import Pipeline
from pyspark.sql.functions import udf, col, isnan, isnull, broadcast, desc, lower
from pyspark.sql.types import FloatType, ArrayType, StringType
import json
import re

In [6]:
import pyspark.sql.functions as f
from  pyspark.sql.functions import pandas_udf
def clear_string(series):
    """Split every string of a pandas Series into word tokens.

    Parameters
    ----------
    series : pandas.Series
        Series of strings (the batch handed over when this function is
        wrapped as a pandas UDF).

    Returns
    -------
    pandas.Series
        One list per input element, holding the maximal runs of Unicode word
        characters (letters, digits, underscore) in their original order and
        case. Punctuation and whitespace act as separators.
    """
    # Raw string literal: a backslash-w inside a normal literal is an invalid
    # escape sequence and triggers a DeprecationWarning. The word-character
    # class already includes digits and `+` means "one or more", so this
    # pattern matches exactly what the former '[\w\d]{1,}' matched.
    regex = re.compile(r'\w+', re.U)
    return series.str.findall(regex)

# Expose clear_string to Spark as a pandas UDF whose declared return type is
# array<string>, so it can be applied to a DataFrame string column.
tokenizer_udf = pandas_udf(clear_string, ArrayType(StringType()))


In [7]:
# Explicit schema for the JSON records: five string fields and an integer id.
# Built field by field; every field keeps the default nullability.
schema = (
    StructType()
    .add("lang", StringType())
    .add("name", StringType())
    .add("cat", StringType())
    .add("provider", StringType())
    .add("id", IntegerType())
    .add("desc", StringType())
)

In [8]:
# Load the one-record-per-line JSON file using the explicit `schema` defined
# above instead of schema inference.
data = spark.read.schema(schema).json("/labs/slaba02/DO_record_per_line.json")

In [14]:
# Preview the first six rows of the loaded DataFrame.
data.show(6)

+----+--------------------+--------------------+--------------+---+--------------------+
|lang|                name|                 cat|      provider| id|                desc|
+----+--------------------+--------------------+--------------+---+--------------------+
|  en|Accounting Cycle:...|3/business_manage...|Canvas Network|  4|This course intro...|
|  en|American Counter ...|              11/law|Canvas Network|  5|This online cours...|
|  fr|Arithmétique: en ...|5/computer_scienc...|Canvas Network|  6|This course is ta...|
|  en|Becoming a Dynami...|  14/social_sciences|Canvas Network|  7|We live in a digi...|
|  en|           Bioethics|2/biology_life_sc...|Canvas Network|  8|This self-paced c...|
|  en|College Foundatio...|9/humanities|15/m...|Canvas Network|  9|This game-based c...|
+----+--------------------+--------------------+--------------+---+--------------------+
only showing top 6 rows



In [10]:
# Six hand-written rows, each laid out as [numeric id, language code, title].
# A later cell loads them into a DataFrame with columns t_id / lang / t_name.
mas = [
    [
        23126,
        'en',
        'Compass - powerful SASS library that makes your life easier',
    ],
    [
        21617,
        'en',
        'Preparing for the AP* Computer Science A Exam \u2014 Part 2',
    ],
    [
        16627,
        'es',
        'Aprende Excel: Nivel Intermedio by Alfonso Rinsche',
    ],
    [
        11556,
        'es',
        'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo',
    ],
    [
        16704,
        'ru',
        '\u041f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u0435 \u043d\u0430 Lazarus',
    ],
    [
        13702,
        'ru',
        '\u041c\u0430\u0442\u0435\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u0430',
    ],
]

In [11]:
# Schema for the hand-written `mas` rows: integer id, language code, title.
target_fields = [
    StructField("t_id", IntegerType()),
    StructField("lang", StringType()),
    StructField("t_name", StringType()),
]
schema2 = StructType(fields=target_fields)

In [12]:
# Turn the `mas` rows into a DataFrame with the t_id / lang / t_name schema.
df = spark.createDataFrame(mas, schema=schema2)


In [13]:
# Display all six target rows.
df.show(6)

+-----+----+--------------------+
| t_id|lang|              t_name|
+-----+----+--------------------+
|23126|  en|Compass - powerfu...|
|21617|  en|Preparing for the...|
|16627|  es|Aprende Excel: Ni...|
|11556|  es|Aprendizaje Colab...|
|16704|  ru|Программирование ...|
|13702|  ru|Математическая эк...|
+-----+----+--------------------+



In [15]:
# Keep only the id, language and description columns, restricted to the three
# language codes that occur in `mas` (en, es, ru); cache for repeated use.
data = data.select("id", "lang", "desc").filter("lang in ('en','es','ru')").cache()

In [11]:
# Tokenise each description with the pandas UDF; the word lists go into a new
# `token` column (original letter case is preserved at this stage).
data1 = data.withColumn("token", tokenizer_udf("desc")).cache()
data1.show(5)

+---+----+--------------------+--------------------+
| id|lang|                desc|               token|
+---+----+--------------------+--------------------+
|  4|  en|This course intro...|[This, course, in...|
|  5|  en|This online cours...|[This, online, co...|
|  7|  en|We live in a digi...|[We, live, in, a,...|
|  8|  en|This self-paced c...|[This, self, pace...|
|  9|  en|This game-based c...|[This, game, base...|
+---+----+--------------------+--------------------+
only showing top 5 rows



In [12]:
# Lower-case every token with Spark SQL's higher-order `transform`, and drop
# the raw `desc` column by selecting only id, lang and the new token array.
data2 =data1.select("id","lang", F.expr("transform(token, x -> lower(x))").alias("token")).cache()
data2.show(5)

+---+----+--------------------+
| id|lang|               token|
+---+----+--------------------+
|  4|  en|[this, course, in...|
|  5|  en|[this, online, co...|
|  7|  en|[we, live, in, a,...|
|  8|  en|[this, self, pace...|
|  9|  en|[this, game, base...|
+---+----+--------------------+
only showing top 5 rows



In [145]:
# Show the DataFrame's column names and types.
data

DataFrame[id: int, lang: string, desc: string]

In [96]:
# HashingTF configured with 10000 hash buckets.
# NOTE(review): this object is never used — `tf` is rebound to a DataFrame two
# cells later, and the vectors shown further down have size 262144, not 10000.
# Looks like a leftover experiment; confirm before removing.
tf = HashingTF(inputCol="token", outputCol="tf", numFeatures= 10000)

In [97]:
# Term-frequency hasher reading `token` and writing `tf`; numFeatures is left
# at the library default.
htf = HashingTF(inputCol="token", outputCol="tf")

In [98]:
# Add hashed term-frequency vectors to the lower-cased token DataFrame.
# From here on `tf` names a DataFrame, not the HashingTF object bound earlier.
tf = htf.transform(data2)

In [15]:
# Inspect tokens next to their TF vectors: 5 rows, no truncation (False),
# vertical one-record-per-block layout (True).
tf.select("token", "tf").show(5, False, True)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [99]:
# Fit inverse-document-frequency weights on the TF vectors, then apply them.
# The output column is called `idf`, but after transform() it holds the
# IDF-rescaled term-frequency vectors (i.e. TF-IDF).
idf = IDF(inputCol="tf", outputCol="idf")
tfidf = idf.fit(tf).transform(tf)
tfidf.show(5,truncate=False)

+---+----+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [100]:
# Attach to each target row of `df` the TF-IDF row with the same id.
# Both inputs carry a `lang` column, so the joined result has two columns
# named `lang` (visible in the output below).
cond = [tfidf.id == df.t_id]
data_t = df.join(tfidf, cond, how ="inner" ).cache()

data_t.show(16)

+-----+----+--------------------+-----+----+--------------------+--------------------+--------------------+
| t_id|lang|              t_name|   id|lang|               token|                  tf|                 idf|
+-----+----+--------------------+-----+----+--------------------+--------------------+--------------------+
|13702|  ru|Математическая эк...|13702|  ru|[математическая, ...|(262144,[53548,56...|(262144,[53548,56...|
|16627|  es|Aprende Excel: Ni...|16627|  es|[hazte, más, empl...|(262144,[7975,874...|(262144,[7975,874...|
|16704|  ru|Программирование ...|16704|  ru|[в, курсе, рассма...|(262144,[33343,60...|(262144,[33343,60...|
|23126|  en|Compass - powerfu...|23126|  en|[improve, your, s...|(262144,[1712,174...|(262144,[1712,174...|
|21617|  en|Preparing for the...|21617|  en|[an, introduction...|(262144,[170,1079...|(262144,[170,1079...|
|11556|  es|Aprendizaje Colab...|11556|  es|[la, transformaci...|(262144,[2584,825...|(262144,[2584,825...|
+-----+----+----------------

In [101]:
# Reduce each target row to its id, its language and its TF-IDF vector
# (renamed t_idf). `df.lang` is referenced through the source DataFrame to
# pick one of the two `lang` columns present after the join.
data_k = data_t.select("t_id", df.lang, data_t.idf.alias("t_idf")).cache()
data_k.show(6)

+-----+----+--------------------+
| t_id|lang|               t_idf|
+-----+----+--------------------+
|13702|  ru|(262144,[53548,56...|
|16627|  es|(262144,[7975,874...|
|16704|  ru|(262144,[33343,60...|
|23126|  en|(262144,[1712,174...|
|21617|  en|(262144,[170,1079...|
|11556|  es|(262144,[2584,825...|
+-----+----+--------------------+



In [102]:
# Pair every target row with every TF-IDF row of the same language: the join
# key is `lang` only, so each target is matched against all rows sharing its
# language code (including its own row).
data_j = data_k.join(tfidf, on="lang" , how ="inner" ).cache()
data_j.show(5)

+----+-----+--------------------+---+--------------------+--------------------+--------------------+
|lang| t_id|               t_idf| id|               token|                  tf|                 idf|
+----+-----+--------------------+---+--------------------+--------------------+--------------------+
|  en|21617|(262144,[170,1079...|  4|[this, course, in...|(262144,[4211,753...|(262144,[4211,753...|
|  en|23126|(262144,[1712,174...|  4|[this, course, in...|(262144,[4211,753...|(262144,[4211,753...|
|  en|21617|(262144,[170,1079...|  5|[this, online, co...|(262144,[1598,172...|(262144,[1598,172...|
|  en|23126|(262144,[1712,174...|  5|[this, online, co...|(262144,[1598,172...|(262144,[1598,172...|
|  en|21617|(262144,[170,1079...|  7|[we, live, in, a,...|(262144,[836,4525...|(262144,[836,4525...|
+----+-----+--------------------+---+--------------------+--------------------+--------------------+
only showing top 5 rows



In [None]:
# NOTE(review): never-executed scratch cell (In [None]); it only builds a
# two-column projection and discards it. Candidate for deletion.
data_j.select("t_id", "id", )

In [None]:
# NOTE(review): never-executed scratch cell (In [None]). withColumn() is
# called with a column name but no column expression, so it cannot run as
# written. Candidate for deletion.
data_j.withColumn("task_idf", ).show(5)

In [None]:
# NOTE(review): never-executed scratch cell (In [None]). It cannot run as
# written: `psf` is not imported anywhere in the visible notebook, and `data`
# (columns id, lang, desc) has no `norm` column. It reads like a template for
# an all-pairs dot product via a self-join on i.ID < j.ID; the working version
# is the `dot_udf` cell that follows. Candidate for deletion.
dot_udf = psf.udf(lambda x,y: float(x.dot(y)), DoubleType())
data.alias("i").join(data.alias("j"), psf.col("i.ID") < psf.col("j.ID"))\
    .select(
        psf.col("i.ID").alias("i"), 
        psf.col("j.ID").alias("j"), 
        dot_udf("i.norm", "j.norm").alias("dot"))\
    .sort("i", "j")\
    .show()

In [106]:
def _vector_dot(x, y):
    """Return the dot product of two vectors as a plain Python float."""
    return float(x.dot(y))


# Spark UDF wrapper around the dot product, declared to return a double.
dot_udf = udf(_vector_dot, DoubleType())

In [107]:
# Dot product between each target's normalised vector (t_norm) and each
# candidate's normalised vector (norm).
# NOTE(review): `data_m` is created by the Normalizer cells numbered
# In [104]/In [105], which appear further down in this file; in top-to-bottom
# order this cell fails with NameError. Those cells should be moved above it.
data_m.select(col("t_id"), col("id"), dot_udf("t_norm", "norm").alias("dot")).show()

+-----+---+--------------------+
| t_id| id|                 dot|
+-----+---+--------------------+
|21617|  4| 0.05711331788652602|
|23126|  4|0.013423609131844004|
|21617|  5| 0.03394689353939763|
|23126|  5|0.008618305961278352|
|21617|  7|  0.0336162342874345|
|23126|  7|0.030740527572219404|
|21617|  8| 0.04164244832707799|
|23126|  8|  0.0137559483425217|
|21617|  9| 0.09756470597585692|
|23126|  9|0.014769156642553261|
|21617| 10|0.026266075068980946|
|23126| 10| 0.03075852504622817|
|21617| 11|0.017735046777514845|
|23126| 11|0.007204233886337355|
|21617| 12|0.018837213159980867|
|23126| 12|0.011790359841865505|
|21617| 13|0.036018393530069706|
|23126| 13|  0.0080119634752116|
|21617| 14| 0.04548713724646372|
|23126| 14|0.017386377261927953|
+-----+---+--------------------+
only showing top 20 rows



In [108]:
# Materialise (t_id, id, dot) for every target/candidate pair and cache it.
# NOTE(review): depends on `data_m`, which is only defined in the Normalizer
# cells that appear below this one in the file (out-of-order execution).
data_r = data_m.select(col("t_id"), col("id"), dot_udf("t_norm", "norm").alias("dot")).cache()

In [103]:
from pyspark.ml.feature import Normalizer

In [104]:
# Normalise each candidate's TF-IDF vector into a new `norm` column. No `p`
# is passed, so the library default norm applies (p=2.0 per the PySpark docs).
normalizer = Normalizer(inputCol="idf", outputCol="norm")
data_n = normalizer.transform(data_j)

In [105]:
# Same normalisation for each target's vector (t_idf -> t_norm). The name
# `normalizer` is rebound to a second Normalizer with different columns.
# With both vectors normalised, their dot product is a cosine similarity
# (assuming the default p=2 norm — see the PySpark Normalizer docs).
normalizer = Normalizer(inputCol="t_idf", outputCol="t_norm")
data_m = normalizer.transform(data_n)

In [112]:
# Drop the rows where a target is compared with itself (t_id == id), then
# order by t_id and dot, both descending.
data_r = data_r.filter("t_id != id").orderBy("t_id","dot", ascending=False).cache()

In [152]:
# Spot check: the ten highest-scoring candidates for target 16627.
data_r.filter("t_id = 16627").orderBy("dot", ascending=False).show(10)

+-----+-----+------------------+
| t_id|   id|               dot|
+-----+-----+------------------+
|16627|11431|0.6897973063326052|
|16627|12247|0.5739864017317108|
|16627|11575|0.5421395089166967|
|16627| 5687| 0.540906617741916|
|16627|12660|0.5386708966102562|
|16627|17964| 0.533579016704733|
|16627|25010|0.5277382319817439|
|16627| 5558|0.5250870721549883|
|16627|17961|0.5228496895304764|
|16627|13021|0.5190978788835491|
+-----+-----+------------------+
only showing top 10 rows



In [147]:
# Per target: collect (dot, id) structs, sort them in descending order (the
# struct ordering puts `dot` first), then pull the ids out of the sorted
# structs into a `sort` column.
ranked_pairs = f.sort_array(f.collect_list(f.struct("dot", "id")), asc=False)
(
    data_r
    .groupBy("t_id")
    .agg(ranked_pairs.alias("collected_list"))
    .withColumn("sort", col("collected_list.id"))
    .show()
)

+-----+--------------------+--------------------+
| t_id|      collected_list|                sort|
+-----+--------------------+--------------------+
|13702|[[1.0, 864], [0.1...|[864, 1052, 21079...|
|16627|[[0.6897973063326...|[11431, 12247, 11...|
|16704|[[0.2114592080837...|[1236, 1247, 1164...|
|23126|[[0.6686228013891...|[14760, 13782, 13...|
|21617|[[0.9998976932251...|[21609, 21616, 21...|
|11556|[[0.4998614042239...|[16488, 13461, 46...|
+-----+--------------------+--------------------+



In [148]:
# Same per-target ranking as the preview cell, but keeping only t_id and the
# ordered id array, and bringing the rows back to the driver.
# NOTE(review): the result is bound to `list`, which shadows the Python
# builtin for the rest of the session; the next cell reads this name, so a
# rename has to change both cells together.
ranked_ids_df = (
    data_r
    .groupBy("t_id")
    .agg(
        f.sort_array(
            f.collect_list(f.struct("dot", "id")),
            asc=False,
        ).alias("collected_list")
    )
    .withColumn("sort", col("collected_list.id"))
    .drop("collected_list")
)
list = ranked_ids_df.collect()


In [150]:
# Map each target id to its first ten ranked candidate ids. Every collected
# row has exactly two fields: (t_id, ordered id array).
res = {target_id: ranked_ids[:10] for target_id, ranked_ids in list}

    


In [153]:
# Display the final mapping: target id -> ten candidate ids.
res

{13702: [864, 1052, 21079, 8082, 13057, 8123, 8313, 1216, 20105, 1426],
 16627: [11431, 12247, 11575, 5687, 12660, 17964, 25010, 5558, 17961, 13021],
 16704: [1236, 1247, 1164, 8203, 20288, 1273, 1228, 1233, 1365, 20645],
 23126: [14760, 13782, 13665, 15909, 25782, 17499, 13348, 19270, 25071, 7153],
 21617: [21609, 21616, 21608, 22298, 21630, 21628, 21623, 21081, 19417, 21624],
 11556: [16488, 13461, 468, 23357, 19330, 7833, 9289, 16929, 22710, 11523]}

In [154]:
# Write the target-id -> candidate-ids mapping to lab02.json (json.dump turns
# the integer keys into strings).
# The file handle is deliberately NOT called `f`: that name is the notebook's
# alias for pyspark.sql.functions, and rebinding it to a file object would
# break every later `f.sort_array(...)` / `f.collect_list(...)` call.
with open('lab02.json', 'w') as out_file:
    json.dump(res, out_file)

In [56]:
# Inspect the fully joined DataFrame, including both normalised vector
# columns (norm and t_norm).
data_m.show(5)

+----+-----+--------------------+---+--------------------+--------------------+--------------------+--------------------+--------------------+
|lang| t_id|               t_idf| id|               token|                  tf|                 idf|                norm|              t_norm|
+----+-----+--------------------+---+--------------------+--------------------+--------------------+--------------------+--------------------+
|  en|21617|(262144,[170,1079...|  4|[this, course, in...|(262144,[4211,753...|(262144,[4211,753...|(262144,[4211,753...|(262144,[170,1079...|
|  en|23126|(262144,[1712,174...|  4|[this, course, in...|(262144,[4211,753...|(262144,[4211,753...|(262144,[4211,753...|(262144,[1712,174...|
|  en|21617|(262144,[170,1079...|  5|[this, online, co...|(262144,[1598,172...|(262144,[1598,172...|(262144,[1598,172...|(262144,[170,1079...|
|  en|23126|(262144,[1712,174...|  5|[this, online, co...|(262144,[1598,172...|(262144,[1598,172...|(262144,[1598,172...|(262144,[1712,174...|

In [16]:
# Shut down the Spark session once the notebook is finished.
spark.stop()