In [1]:
import os
import sys
# Point PySpark at the interpreter and Spark installation to use, and pass the
# submit arguments that the PySpark shell picks up when it starts.
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"] = '--num-executors 2 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Make the PySpark sources and the bundled py4j importable from this kernel.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Run the stock PySpark shell start-up script in this namespace (it announces
# "SparkSession available as 'spark'"). The file is opened in a `with` block so
# the handle is closed, and compiled with its real path so tracebacks point at it.
shell_path = os.path.join(spark_home, 'python/pyspark/shell.py')
with open(shell_path) as shell_file:
    exec(compile(shell_file.read(), shell_path, 'exec'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
# Display the SparkSession created by the PySpark shell bootstrap above.
spark

In [3]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark import Row
import json

# An empty SparkConf: no settings are overridden in this notebook.
conf = SparkConf()

# Build (or reuse, if one is already running) a session named "test".
session_builder = SparkSession.builder.config(conf=conf).appName("test")
spark = session_builder.getOrCreate()

In [169]:
import re
from pyspark.ml.feature import Tokenizer, HashingTF, IDF, Normalizer
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix
from pyspark.sql.window import Window
from pyspark.sql.functions import col, row_number

In [49]:
# !hdfs dfs -ls /labs/slaba02/DO_record_per_line.json

In [11]:
# Load the course catalogue from JSON into a DataFrame.
data = spark.read.format("json").load("/labs/slaba02/DO_record_per_line.json")

In [18]:
# Peek at the first two catalogue records.
data.show(n=2)

+--------------------+--------------------+---+----+--------------------+--------------+
|                 cat|                desc| id|lang|                name|      provider|
+--------------------+--------------------+---+----+--------------------+--------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|
|              11/law|This online cours...|  5|  en|American Counter ...|Canvas Network|
+--------------------+--------------------+---+----+--------------------+--------------+
only showing top 2 rows



In [39]:
# Total rows vs. number of distinct ids: equal values mean `id` is unique.
(
    data.count(),
    data.select("id").distinct().count(),
)

(28153, 28153)

In [19]:
# Number of partitions of the RDD underlying the catalogue DataFrame.
data.rdd.getNumPartitions()

2

In [117]:
# Courses to produce recommendations for, as [course id, language, course name].
test = [
    [23126, u'en', u'Compass - powerful SASS library that makes your life easier'],
    [21617, u'en', u'Preparing for the AP* Computer Science A Exam \u2014 Part 2'],
    [16627, u'es', u'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'],
    [11556, u'es', u'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'],
    [16704, u'ru', u'\u041f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u0435 \u043d\u0430 Lazarus'],
    [13702, u'ru', u'\u041c\u0430\u0442\u0435\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u0430'],
]
# Just the ids, in the same order as the list above.
test_ids = [course[0] for course in test]
print(len(test))

6


In [160]:
# Turn the hand-written test list into a DataFrame with named columns and attach
# each course's description from the catalogue. Without a schema the columns
# would be called _1/_2/_3 and there would be no "desc" column, yet the steps
# below read "desc", "id" and "lang" from `test`.
# NOTE: this rebinds `test` from a list to a DataFrame, so the cell that defines
# the list must be re-run before this cell is executed again.
test = (
    spark.createDataFrame(test, schema=["id", "lang", "name"])
    .join(data.select("id", "desc"), on="id", how="inner")
)

In [29]:
# !hdfs dfs -ls /share/submission-files/slaba02/lab02.json/

In [81]:
# Fetch course 13702 and keep its description as a sample text for the
# tokenisation experiment in the next cell.
r = data.where(F.col('id') == 13702).collect()
string = r[0].desc
string

'Математическая экономика – это набор моделей в той или иной степени правильно описывающих процессы в экономике.'

In [82]:
# Words are runs of at least two word characters (letters, digits, underscore).
# A raw string keeps the backslashes literal; in a normal string literal `\w`
# and `\d` are invalid escape sequences and trigger a DeprecationWarning.
regex = re.compile(r'[\w\d]{2,}', re.U)
regex.findall(string.lower())

['математическая',
 'экономика',
 'это',
 'набор',
 'моделей',
 'той',
 'или',
 'иной',
 'степени',
 'правильно',
 'описывающих',
 'процессы',
 'экономике']

In [91]:
# Spark ML tokenizer that reads "desc" and writes its tokens to "words".
tokenizer = Tokenizer().setInputCol("desc").setOutputCol("words")

# Catalogue with the extra "words" column.
data2 = tokenizer.transform(data)

In [161]:
# Apply the same tokenizer to the test courses (adds the "words" column).
test = tokenizer.transform(test)

In [121]:
@F.udf(returnType=ArrayType(StringType()))
def find_words(string):
    """Split a text into lower-cased word tokens.

    A token is a run of two or more word characters (letters, digits or
    underscore); shorter runs and punctuation are dropped.

    Args:
        string: The text to tokenise, or None for a missing value.

    Returns:
        A list of tokens in order of appearance; an empty list when the
        input is None or contains no tokens.
    """
    # Imported in the function body so the UDF does not depend on a
    # notebook-level import having been executed first.
    import re

    # A null description would otherwise fail on `.lower()` and abort the job.
    if string is None:
        return []

    # Raw string: `\w` / `\d` are invalid escapes in a normal string literal.
    # `re` caches compiled patterns, so this does not recompile for every row.
    return re.findall(r'[\w\d]{2,}', string.lower(), re.U)

In [122]:
# Add "desc_words": the word tokens of each description. The text is lower-cased
# by Spark here and again inside find_words.
data = data.withColumn('desc_words', find_words(F.lower(F.col('desc'))))

In [162]:
# Same tokenisation for the test courses, so both frames share "desc_words".
test = test.withColumn('desc_words', find_words(F.lower(F.col('desc'))))

In [123]:
# Inspect the tokenised sample course, one field per line and untruncated.
data.where(F.col('id') == 13702).show(truncate=False, vertical=True)

-RECORD 0----------------------------------------------------------------------------------------------------------------------------
 cat        | 6/economics_finance|15/mathematics_statistics_and_data_analysis                                                        
 desc       | Математическая экономика – это набор моделей в той или иной степени правильно описывающих процессы в экономике.        
 id         | 13702                                                                                                                  
 lang       | ru                                                                                                                     
 name       | Математическая экономика                                                                                               
 provider   | Intuit                                                                                                                 
 desc_words | [математическая, экономика, это, набор, моделей,

In [124]:
# Check the schema after adding "desc_words".
data.printSchema()

root
 |-- cat: string (nullable = true)
 |-- desc: string (nullable = true)
 |-- id: long (nullable = true)
 |-- lang: string (nullable = true)
 |-- name: string (nullable = true)
 |-- provider: string (nullable = true)
 |-- desc_words: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [131]:
# Term-frequency vectors over a 10000-bucket hashed vocabulary.
hashingTF = HashingTF(
    inputCol='desc_words',
    outputCol='desc_tf',
    numFeatures=10000,
)

# Cached because both the IDF fit and the IDF transform read this frame.
tf = hashingTF.transform(data)
tf = tf.cache()

In [163]:
# Hash the test courses with the same HashingTF instance used for the catalogue
# (adds "desc_tf"), and cache the result.
test = hashingTF.transform(test).cache()

In [132]:
# Inspect the term-frequency vector of the sample course.
tf.where(F.col('id') == 13702).show(truncate=False, vertical=True)

-RECORD 0------------------------------------------------------------------------------------------------------------------------------------
 cat        | 6/economics_finance|15/mathematics_statistics_and_data_analysis                                                                
 desc       | Математическая экономика – это набор моделей в той или иной степени правильно описывающих процессы в экономике.                
 id         | 13702                                                                                                                          
 lang       | ru                                                                                                                             
 name       | Математическая экономика                                                                                                       
 provider   | Intuit                                                                                                                         
 desc_

In [133]:
# Row count of the hashed catalogue; should match the raw catalogue row count.
tf.count()

28153

In [136]:
# Fit inverse-document-frequency weights on the catalogue's term frequencies...
idf_estimator = IDF(inputCol='desc_tf', outputCol='desc_tfidf')
idf = idf_estimator.fit(tf)

# ...and apply them, adding the "desc_tfidf" column.
tfidf = idf.transform(tf)

In [164]:
# Weight the test vectors with the IDF model fitted on the catalogue
# (adds "desc_tfidf").
test = idf.transform(test)

In [137]:
# Inspect the TF-IDF vector of the sample course.
tfidf.where(F.col('id') == 13702).show(truncate=False, vertical=True)

-RECORD 0----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 cat        | 6/economics_finance|15/mathematics_statistics_and_data_analysis                                                                                                                                                                                                                                                        
 desc       | Математическая экономика – это набор моделей в той или иной степени правильно описывающих процессы в экономике.                                                                                                                                                                                                        
 id         | 13702   

In [165]:
# normalizer = Normalizer(inputCol="desc_tfidf", outputCol="desc_norm")
# norm = normalizer.transform(tfidf)

In [168]:
@F.udf(returnType=DoubleType())
def sim_cos(a, b):
    """Cosine similarity between two Spark ML vectors.

    Args:
        a: First vector (must provide `dot` and `norm`), or None.
        b: Second vector, or None.

    Returns:
        dot(a, b) / (||a||_2 * ||b||_2) as a float. Returns 0.0 when either
        vector is missing or has zero length, where the ratio is undefined.
    """
    # The return type is declared as double; a bare @F.udf yields strings.
    if a is None or b is None:
        return 0.0

    denominator = float(a.norm(2)) * float(b.norm(2))
    # An all-zero vector (e.g. an empty description) would divide by zero.
    if denominator == 0.0:
        return 0.0

    # Only the two expected degenerate cases are handled above; any other
    # failure is a real bug and is allowed to surface instead of becoming 0.
    return float(a.dot(b)) / denominator

In [170]:
# Pair every test course ("i") with every *other* catalogue course ("j") and
# score each pair by the cosine similarity of their TF-IDF vectors.
# The vectors live in "desc_tfidf", the output column of the IDF stage; no
# column named "idf" is created anywhere in this notebook.
result = (
    test.alias("i")
    # `!=` is equivalent to the original `<` OR `>` pair of comparisons.
    .join(tfidf.alias("j"), F.col("i.id") != F.col("j.id"))
    .select(
        F.col("i.id").alias("i"),
        F.col("j.id").alias("j"),
        F.col("i.lang").alias("i_lang"),
        F.col("j.lang").alias("j_lang"),
        sim_cos("i.desc_tfidf", "j.desc_tfidf").alias("sim_cosine"),
    )
    .sort("i", "j")
    # Numeric copy of the score for ordering; the cast matters when the UDF
    # result is not already a double.
    .withColumn('sim_cosine_float', F.col('sim_cosine').cast('double'))
)

In [171]:
# For every test course keep the 10 most similar courses in the same language.
# Rank candidates per test course by similarity, highest first.
win = Window.partitionBy("i").orderBy(col("sim_cosine_float").desc())

# One Spark job for all test courses instead of one job per id. The language
# filter compares the two language columns already present in `result`.
# NOTE(review): assumes the previously used `languages` list held each test
# course's own language - confirm.
top_rows = (
    result
    .filter(F.col("j_lang") == F.col("i_lang"))
    .withColumn("row", row_number().over(win))
    .filter(col("row") <= 10)
    .select("i", "j", "row")
    .collect()
)

# Start from the known test ids so every course gets a key, even if empty.
output = {course_id: [] for course_id in test_ids}

# Order explicitly by rank so each list runs from most to least similar
# regardless of the order in which Spark returned the rows.
for rec in sorted(top_rows, key=lambda rec: (rec["i"], rec["row"])):
    output.setdefault(rec["i"], []).append(rec["j"])

In [172]:
# Save the recommendations as JSON (integer keys are written as strings).
with open("lab02.json", "w") as result_file:
    result_file.write(json.dumps(output))

In [173]:
# Stop the SparkSession now that the results have been written.
spark.stop()