In [1]:
import os
import sys
# Environment for the PySpark driver started below: the Python interpreter
# PySpark should use, the Spark install location, and the spark-submit
# arguments (two executors).
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"] = '--num-executors 2 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Make the PySpark sources and the bundled py4j archive importable from this kernel.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Run PySpark's interactive bootstrap script in this namespace. Reading it
# inside a `with` block closes the file handle deterministically instead of
# leaving it open until garbage collection.
shell_script = os.path.join(spark_home, 'python/pyspark/shell.py')
with open(shell_script) as shell_file:
    exec(shell_file.read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark import Row
import json

# Default Spark configuration; the session is fetched (or created) under the app name "test".
conf = SparkConf()

spark = SparkSession.builder.config(conf=conf).appName("test").getOrCreate()

In [3]:
spark

In [4]:
sc = spark.sparkContext

In [5]:
sc

In [6]:
# NOTE(review): %pylab populates the interactive namespace with numpy and
# matplotlib names (see the message printed below). No cell visible in this
# notebook plots anything — confirm whether this magic is still needed.
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [7]:
# Courses that need recommendations, one entry per course:
# [course id, language code, course name].
need_to_predict_for = [
    [23126, 'en', 'Compass - powerful SASS library that makes your life easier'],
    [21617, 'en', 'Preparing for the AP* Computer Science A Exam \u2014 Part 2'],
    [16627, 'es', 'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'],
    [11556, 'es', 'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'],
    [16704, 'ru', '\u041f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u0435 \u043d\u0430 Lazarus'],
    [13702, 'ru', '\u041c\u0430\u0442\u0435\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u0430'],
]

In [8]:
# Re-bind the target list ordered by ascending course id (the first element
# of each entry), then display it.
need_to_predict_for = sorted(
    need_to_predict_for,
    key=lambda entry: entry[0],
)
need_to_predict_for

[[11556,
  'es',
  'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'],
 [13702, 'ru', 'Математическая экономика'],
 [16627, 'es', 'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'],
 [16704, 'ru', 'Программирование на Lazarus'],
 [21617, 'en', 'Preparing for the AP* Computer Science A Exam — Part 2'],
 [23126, 'en', 'Compass - powerful SASS library that makes your life easier']]

In [9]:
!hdfs dfs -head '/labs/slaba02/DO_record_per_line.json'

{"lang": "en", "name": "Accounting Cycle: The Foundation of Business Measurement and Reporting", "cat": "3/business_management|6/economics_finance", "provider": "Canvas Network", "id": 4, "desc": "This course introduces the basic financial statements used by most businesses, as well as the essential tools used to prepare them. This course will serve as a resource to help business students succeed in their upcoming university-level accounting classes, and as a refresher for upper division accounting students who are struggling to recall elementary concepts essential to more advanced accounting topics. Business owners will also benefit from this class by gaining essential skills necessary to organize and manage information pertinent to operating their business. At the conclusion of the class, students will understand the balance sheet, income statement, and cash flow statement. They will be able to differentiate between cash basis and accrual basis techniques, and know when each is appro

In [10]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType

In [11]:
# Explicit schema for the course records: an integer id plus five string columns.
schema = StructType(fields=[
    StructField(column_name, column_type)
    for column_name, column_type in (
        ("id", IntegerType()),
        ("name", StringType()),
        ("cat", StringType()),
        ("lang", StringType()),
        ("provider", StringType()),
        ("desc", StringType()),
    )
])

In [12]:
# Load the one-record-per-line JSON file with the explicit schema and mark
# the resulting frame for caching.
courses = (
    spark.read
    .format("json")
    .schema(schema)
    .load("/labs/slaba02/DO_record_per_line.json")
    .cache()
)

In [13]:
courses

DataFrame[id: int, name: string, cat: string, lang: string, provider: string, desc: string]

In [14]:
courses.rdd.getNumPartitions()

2

In [15]:
# The read above produced 2 partitions; spread the rows over 6 instead.
# NOTE(review): .cache() was applied to the frame before this repartition,
# so the name `courses` now refers to a new, repartitioned frame built on top
# of the cached one — confirm that is the intended caching point.
courses = courses.repartition(6)

In [16]:
courses.schema.fieldNames()

['id', 'name', 'cat', 'lang', 'provider', 'desc']

In [None]:
# Keep only the two columns the similarity pipeline uses: the id and the description.
courses = courses.select(courses.id, courses.desc)

In [None]:
import pyspark.sql.functions as f

In [None]:
from pyspark.ml.feature import HashingTF, IDF, RegexTokenizer

### Clean `desc`: tokenize the course descriptions

In [63]:
# Extract tokens from `desc`: with gaps=False the pattern matches the tokens
# themselves (runs of two or more letter/word/digit characters).
tokenizer = RegexTokenizer(
    inputCol="desc",
    outputCol="words",
    pattern="[\\p{L}\\w\\d]{2,}",
    gaps=False,
)
# Drop rows whose description produced no tokens at all.
wordsData = tokenizer.transform(courses).filter(f.size(f.col("words")) > 0)
wordsData.show(5)

+-----+--------------------+--------------------+
|   id|                desc|               words|
+-----+--------------------+--------------------+
|  469|Tinkering activit...|[tinkering, activ...|
| 2465|This course is th...|[this, course, is...|
|  202|Environmental law...|[environmental, l...|
|12505| Accounting skill...|[accounting, skil...|
|12440|
Aprendemos, sobr...|[aprendemos, sobr...|
+-----+--------------------+--------------------+
only showing top 5 rows



In [64]:
# Hash each token list into a 10000-dimensional term-frequency vector.
hashingTF = HashingTF(
    numFeatures=10000,
    inputCol="words",
    outputCol="rawFeatures",
)
featurizedData = hashingTF.transform(wordsData)
# Cached because both the IDF fit and the IDF transform read this frame.
featurizedData = featurizedData.cache()
featurizedData.show(5)

+-----+--------------------+--------------------+--------------------+
|   id|                desc|               words|         rawFeatures|
+-----+--------------------+--------------------+--------------------+
|  469|Tinkering activit...|[tinkering, activ...|(10000,[32,70,128...|
| 2465|This course is th...|[this, course, is...|(10000,[157,201,3...|
|  202|Environmental law...|[environmental, l...|(10000,[32,70,91,...|
|12505| Accounting skill...|[accounting, skil...|(10000,[1,8,11,18...|
|12440|
Aprendemos, sobr...|[aprendemos, sobr...|(10000,[21,147,24...|
+-----+--------------------+--------------------+--------------------+
only showing top 5 rows



In [65]:
# Fit inverse document frequencies on the raw term counts and rescale them
# into the `features` column.
idf = IDF(
    minDocFreq=1,
    inputCol="rawFeatures",
    outputCol="features",
)
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
rescaledData = rescaledData.cache()

# Preview the raw and rescaled vectors side by side.
rescaledData.select(
    rescaledData.id,
    rescaledData.rawFeatures,
    rescaledData.features,
).show(5)

+-----+--------------------+--------------------+
|   id|         rawFeatures|            features|
+-----+--------------------+--------------------+
|  469|(10000,[32,70,128...|(10000,[32,70,128...|
| 2465|(10000,[157,201,3...|(10000,[157,201,3...|
|  202|(10000,[32,70,91,...|(10000,[32,70,91,...|
|12505|(10000,[1,8,11,18...|(10000,[1,8,11,18...|
|12440|(10000,[21,147,24...|(10000,[21,147,24...|
+-----+--------------------+--------------------+
only showing top 5 rows



In [45]:
rescaledData.schema.fieldNames()

['id', 'desc', 'words', 'rawFeatures', 'features']

In [27]:
def cos_for_specific_vector(a):
    """Build a Spark UDF that computes cosine similarity against a fixed vector.

    Args:
        a: Reference vector. It must provide ``dot(other)`` and ``norm(p)``,
            as the vectors stored in the ``features`` column do.

    Returns:
        A UDF with return type ``FloatType`` that maps a vector ``b`` to
        ``a.dot(b) / (|a| * |b|)``. If either vector has zero L2 norm, the
        UDF returns ``0.0`` rather than dividing by zero.
    """
    # The reference vector is the same for every row, so take its norm once
    # here instead of once per UDF call.
    norm_a = float(a.norm(2))

    def _cosine(b):
        denominator = norm_a * float(b.norm(2))
        if denominator == 0.0:
            # Without this guard the division yields NaN, and Spark orders NaN
            # above every other number, so a descending sort would put such
            # rows at the very top of the ranking.
            return 0.0
        return float(a.dot(b) / denominator)

    return f.udf(_cosine, FloatType())

In [23]:
# Map each target course id to its TF-IDF feature vector.
# All target rows are fetched in a single Spark job rather than one job per id.
target_ids = [entry[0] for entry in need_to_predict_for]

vectors_map = {}
for row in (rescaledData
            .filter(rescaledData.id.isin(target_ids))
            .select("id", "features")
            .collect()):
    # Keep the first vector seen for an id, should the data contain duplicates.
    vectors_map.setdefault(row.id, row.features)

# A target can be absent from rescaledData (for example when its description
# produced no tokens and the row was filtered out); report that explicitly
# instead of failing with an opaque IndexError.
missing_ids = sorted(set(target_ids) - set(vectors_map))
if missing_ids:
    raise ValueError("No feature vector found for course ids: {}".format(missing_ids))

In [70]:
# For every target course, rank all other courses by cosine similarity to it
# and keep the ids of the ten most similar ones, keyed by the target id as a string.
result = {}
for entry in need_to_predict_for:
    course_id = entry[0]
    score_col = str(course_id)
    similarity = cos_for_specific_vector(vectors_map[course_id])
    top_rows = (
        rescaledData
        .withColumn(score_col, similarity("features"))
        .filter(rescaledData.id != course_id)  # never recommend the course itself
        .orderBy(score_col, ascending=False)
        .select(rescaledData.id, f.col(score_col))
        .take(10)
    )
    result[score_col] = [row.id for row in top_rows]

In [71]:
result

{'11556': [16488, 468, 13461, 23357, 7833, 19330, 9289, 10447, 22710, 387],
 '13702': [864, 6516, 25502, 8313, 8979, 15649, 17834, 16357, 15678, 13890],
 '16627': [11431, 11575, 25010, 12247, 5687, 13021, 17964, 18661, 9408, 12863],
 '16704': [3864, 25724, 23407, 25726, 23864, 1247, 1236, 18023, 25627, 25991],
 '21617': [21609,
  21608,
  21616,
  21492,
  21624,
  21623,
  21506,
  21703,
  21675,
  21700],
 '23126': [14760,
  13665,
  13782,
  20638,
  24419,
  15909,
  2724,
  25782,
  23756,
  17499]}

In [72]:
import json
# Serialize the recommendations and write them to lab02.json.
with open("lab02.json", "w") as outfile:
    outfile.write(json.dumps(result))

In [73]:
! cat lab02.json

{"11556": [16488, 468, 13461, 23357, 7833, 19330, 9289, 10447, 22710, 387], "13702": [864, 6516, 25502, 8313, 8979, 15649, 17834, 16357, 15678, 13890], "16627": [11431, 11575, 25010, 12247, 5687, 13021, 17964, 18661, 9408, 12863], "16704": [3864, 25724, 23407, 25726, 23864, 1247, 1236, 18023, 25627, 25991], "21617": [21609, 21608, 21616, 21492, 21624, 21623, 21506, 21703, 21675, 21700], "23126": [14760, 13665, 13782, 20638, 24419, 15909, 2724, 25782, 23756, 17499]}

In [74]:
spark.stop()