In [1]:
import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 3 --executor-memory 3g --driver-memory 2g pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))
exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

conf = SparkConf()
conf.set("spark.app.name", "Bulatov Nikolai ML") 

spark = SparkSession.builder.config(conf=conf).appName("Bulatov Nikolai ML app").getOrCreate()

In [3]:
spark

In [4]:
! hdfs dfs -head /labs/slaba02/DO_record_per_line.json

{"lang": "en", "name": "Accounting Cycle: The Foundation of Business Measurement and Reporting", "cat": "3/business_management|6/economics_finance", "provider": "Canvas Network", "id": 4, "desc": "This course introduces the basic financial statements used by most businesses, as well as the essential tools used to prepare them. This course will serve as a resource to help business students succeed in their upcoming university-level accounting classes, and as a refresher for upper division accounting students who are struggling to recall elementary concepts essential to more advanced accounting topics. Business owners will also benefit from this class by gaining essential skills necessary to organize and manage information pertinent to operating their business. At the conclusion of the class, students will understand the balance sheet, income statement, and cash flow statement. They will be able to differentiate between cash basis and accrual basis techniques, and know when each is appro

In [5]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

schema = StructType([
    StructField("lang", StringType()),
    StructField("name", StringType()),
    StructField("cat", StringType()),
    StructField("provider", StringType()),
    StructField("id", IntegerType()),
    StructField("desc", StringType())
])

In [6]:
raw_dataset = spark.read.json("/labs/slaba02/DO_record_per_line.json", schema=schema)

In [7]:
raw_dataset.printSchema()

root
 |-- lang: string (nullable = true)
 |-- name: string (nullable = true)
 |-- cat: string (nullable = true)
 |-- provider: string (nullable = true)
 |-- id: integer (nullable = true)
 |-- desc: string (nullable = true)



In [8]:
raw_dataset.rdd.getNumPartitions()

3

In [9]:
raw_dataset = raw_dataset.repartition(9)

In [10]:
raw_dataset.groupby("lang").count().collect()

[Row(lang='en', count=24553),
 Row(lang='vi', count=1),
 Row(lang='nb', count=2),
 Row(lang='uz', count=1),
 Row(lang='ur', count=11),
 Row(lang='pl', count=1),
 Row(lang='sk', count=5),
 Row(lang='pt', count=187),
 Row(lang='sw', count=1),
 Row(lang='ko', count=3),
 Row(lang='ms', count=1),
 Row(lang='tr', count=120),
 Row(lang='de', count=166),
 Row(lang='es', count=1374),
 Row(lang='hr', count=4),
 Row(lang='el', count=5),
 Row(lang='it', count=62),
 Row(lang='af', count=2),
 Row(lang='ar', count=34),
 Row(lang='sv', count=1),
 Row(lang='nl', count=6),
 Row(lang='hu', count=2),
 Row(lang='ca', count=6),
 Row(lang='ru', count=1231),
 Row(lang='fa', count=1),
 Row(lang='bg', count=2),
 Row(lang='hi', count=6),
 Row(lang='et', count=1),
 Row(lang='zh', count=169),
 Row(lang='fr', count=104),
 Row(lang='ja', count=77),
 Row(lang='id', count=1),
 Row(lang='da', count=3),
 Row(lang='fi', count=2),
 Row(lang='he', count=8)]

In [11]:
from pyspark.sql import functions as  f

dataset = raw_dataset.select("id", "lang", "name", f.regexp_replace(f.lower(f.regexp_replace('desc',r'[^\pL{0-9}\p{Space}]','')), r'\p{Space}+', ' ').alias("desc_filtered"))
dataset.show(10)

+----+----+--------------------+--------------------+
|  id|lang|                name|       desc_filtered|
+----+----+--------------------+--------------------+
|7169|  es|Técnicas para gan...| mi libro técnica...|
|8265|  en|Leadership: Ident...|this course exami...|
|6807|  en|  Lean Canvas Course| we live in an ag...|
|3994|  en|CCNP ROUTE 2014 V...| in september you...|
|4784|  en|Cultural Diversit...| how to make chan...|
|5765|  en|Kickboxing Workou...| lose weight and ...|
|3510|  en|Ambient Intellige...|this course focus...|
| 761|  en|       Wit and Humor| emphasizing wit ...|
|8715|  en|AIESEC Indonesia ...|this is a course ...|
|4890|  en|How to Draw Caric...| learn how to dra...|
+----+----+--------------------+--------------------+
only showing top 10 rows



In [12]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer

stop_words = \
    StopWordsRemover.loadDefaultStopWords("russian") + \
    StopWordsRemover.loadDefaultStopWords("english") + \
    StopWordsRemover.loadDefaultStopWords("spanish")


In [13]:
from pyspark.ml.feature import Tokenizer, HashingTF, IDF

tokenizer = Tokenizer(inputCol="desc_filtered", outputCol="words")
swr = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="words_filtered", stopWords=stop_words)
tf = HashingTF(inputCol=swr.getOutputCol(), outputCol="rawFeatures", numFeatures=10000)
tf_idf = IDF(inputCol=tf.getOutputCol(), outputCol="features")

In [14]:
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[
    tokenizer,
    swr,
    tf,
    tf_idf
])

In [15]:
tfidf_model = pipeline.fit(dataset)

In [16]:
test_schema = StructType([
    StructField("id", IntegerType()),
    StructField("lang", StringType()),
    StructField("desc", StringType())
])

In [17]:
import re

test_array = [[23126, u'en', u'Compass - powerful SASS library that makes your life easier'], [21617, u'en', u'Preparing for the AP* Computer Science A Exam \u2014 Part 2'], [16627, u'es', u'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'], [11556, u'es', u'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'], [16704, u'ru', u'\u041f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u0435 \u043d\u0430 Lazarus'], [13702, u'ru', u'\u041c\u0430\u0442\u0435\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u0430']]
test_tuple = [(x[0],x[1], x[2]) for x in test_array]

test_df = spark.createDataFrame(test_tuple, schema=test_schema)
test_df = test_df.select("id","lang",f.regexp_replace(f.lower(f.regexp_replace('desc',r'[^\pL{0-9}\p{Space}]','')), r'\p{Space}+', ' ').alias("desc_filtered"))
test_df.show(10)

+-----+----+--------------------+
|   id|lang|       desc_filtered|
+-----+----+--------------------+
|23126|  en|compass powerful ...|
|21617|  en|preparing for the...|
|16627|  es|aprende excel niv...|
|11556|  es|aprendizaje colab...|
|16704|  ru|программирование ...|
|13702|  ru|математическая эк...|
+-----+----+--------------------+



In [18]:
base_tfidf = tfidf_model.transform(dataset)

In [19]:
base_tfidf.filter("lang = 'en'").take(5)

[Row(id=9645, lang='en', name='How to Get Your First Job after College - Udemy', desc_filtered=' learn how to job search with this stepbystep guide to getting your first job after college learn the skills you need to find a job the skills they didnt teach you in school most of us have sat through a career counseling session while we were getting ready to graduate college but the advice they gave us was generic and vague tweak your resume network apply to jobs in your industry what are we supposed to do with that this job hunt course is about more than just tweaking your resume this indepth job search course takes you stepbystep through the process of finding your first real job after college youll learn how to choose an industrycareer path narrow it down with a personal brand message build a network that actually helps you find a job and master the interview process well also cover how to refine your resume and align it with your brand message crafting an amazing job application and ne

In [20]:
#test_tfidf = tfidf_model.transform(test_df)
test_tfidf = test_df.select("id").join(base_tfidf, "id", "inner")

In [21]:
test_tfidf.collect()

[Row(id=13702, lang='ru', name='Математическая экономика', desc_filtered='математическая экономика это набор моделей в той или иной степени правильно описывающих процессы в экономике', words=['математическая', 'экономика', 'это', 'набор', 'моделей', 'в', 'той', 'или', 'иной', 'степени', 'правильно', 'описывающих', 'процессы', 'в', 'экономике'], words_filtered=['математическая', 'экономика', 'это', 'набор', 'моделей', 'той', 'иной', 'степени', 'правильно', 'описывающих', 'процессы', 'экономике'], rawFeatures=SparseVector(10000, {310: 1.0, 942: 1.0, 2172: 1.0, 2788: 1.0, 2855: 1.0, 4800: 1.0, 5647: 1.0, 6943: 1.0, 7822: 1.0, 8203: 1.0, 9329: 1.0, 9678: 1.0}), features=SparseVector(10000, {310: 4.9984, 942: 5.2896, 2172: 5.6403, 2788: 4.4464, 2855: 5.5359, 4800: 4.8205, 5647: 5.4748, 6943: 5.4497, 7822: 3.9283, 8203: 4.5219, 9329: 6.1679, 9678: 5.2415})),
 Row(id=16627, lang='es', name='Aprende Excel: Nivel Intermedio by Alfonso Rinsche', desc_filtered=' hazte más empleable obtén una nuev

In [22]:
#joined_df = test_tfidf.select("id", "features", "lang").crossJoin(base_tfidf.select("id", "features", "lang").withColumnRenamed("id","rec_id").withColumnRenamed("features","rec_features").withColumnRenamed("lang","rec_lang")).cache()

joined_df = base_tfidf.select("id", "features", "lang", "name").withColumnRenamed("id","rec_id").withColumnRenamed("features","rec_features").join(test_tfidf.select("id", "features", "lang"), on="lang", how="inner").filter("id != rec_id").cache()


joined_df.take(1)

[Row(lang='en', rec_id=8265, rec_features=SparseVector(10000, {77: 2.5334, 855: 1.3861, 1714: 4.3266, 2460: 0.4147, 4314: 4.0634, 4377: 2.4432, 5293: 3.9157, 6209: 1.8842, 6842: 3.3932, 9432: 2.5379, 9615: 4.0696}), name='Leadership: Identity, Influence and Power', id=23126, features=SparseVector(10000, {87: 2.6937, 246: 3.9557, 258: 3.5002, 263: 16.2998, 341: 5.8105, 419: 2.6024, 524: 2.1652, 721: 1.0062, 727: 2.1538, 814: 4.8293, 870: 2.6848, 937: 2.5488, 948: 5.9016, 966: 0.8726, 1022: 2.0601, 1072: 3.1401, 1073: 5.8753, 1169: 2.6361, 1197: 3.6108, 1218: 5.1275, 1222: 5.6403, 1272: 7.7787, 1312: 0.8143, 1368: 1.9569, 1443: 3.0631, 1463: 16.5894, 1470: 41.5647, 1645: 3.6958, 1652: 3.2261, 1682: 3.5037, 1770: 3.3681, 1851: 3.0971, 1882: 2.7187, 1959: 0.6697, 2060: 5.6503, 2080: 0.8229, 2412: 2.5713, 2460: 4.5622, 2682: 2.9758, 2691: 4.3212, 2801: 2.3006, 2865: 5.0864, 2971: 5.0922, 3102: 2.9302, 3115: 2.7645, 3145: 2.6617, 3154: 8.5281, 3162: 3.0639, 3202: 2.8065, 3330: 1.6927, 3372: 

In [23]:
joined_df.count()

54310

In [24]:
from pyspark.ml.linalg import Vectors
from pyspark.sql.types import FloatType
from pyspark.sql import functions as  f

@f.udf(FloatType())
def cos_sim(vec1, vec2):
    #vec1 = Vectors.dense(str1)
    #vec2 = Vectors.dense(str2)
    #if vec1.norm(2) * vec2.norm(2) != 0:
        dot_value = vec1.dot(vec2) / (vec1.norm(2) * vec2.norm(2))
        return float(dot_value)

#print(joined_df.take(1)[0][1])
#print(joined_df.take(6)[0][3])


In [25]:
from pyspark.sql.window import Window

cos_df = joined_df.select("id", cos_sim("features", "rec_features").alias("cos_sim"), "rec_id", "name").filter("cos_sim != 'NaN'")

windowSpec  = Window.partitionBy("id").orderBy(f.col("cos_sim").desc(), f.col("name"), f.col("rec_id"))
cos_df = cos_df.select(f.row_number().over(windowSpec).alias("rn"), "id", "rec_id", "cos_sim").filter("rn <= 10").groupBy("id").agg(f.collect_list("rec_id").alias("res"), f.collect_list("cos_sim").alias("cos_sim"))
cos_df.explain(True)

== Parsed Logical Plan ==
'Aggregate ['id], [unresolvedalias('id, None), collect_list('rec_id, 0, 0) AS res#298, collect_list('cos_sim, 0, 0) AS cos_sim#300]
+- Filter (rn#288 <= 10)
   +- Project [rn#288, id#77, rec_id#148, cos_sim#282]
      +- Project [id#77, rec_id#148, cos_sim#282, name#1, rn#288, rn#288]
         +- Window [row_number() windowspecdefinition(id#77, cos_sim#282 DESC NULLS LAST, name#1 ASC NULLS FIRST, rec_id#148 ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS rn#288], [id#77], [cos_sim#282 DESC NULLS LAST, name#1 ASC NULLS FIRST, rec_id#148 ASC NULLS FIRST]
            +- Project [id#77, rec_id#148, cos_sim#282, name#1]
               +- Filter NOT (cos_sim#282 = cast(NaN as float))
                  +- Project [id#77, cos_sim(features#118, rec_features#153) AS cos_sim#282, rec_id#148, name#1]
                     +- Filter NOT (id#77 = rec_id#148)
                        +- Project [lang#0, rec_id#148, rec_features#153, na

In [26]:
res_list = cos_df.collect()

In [27]:
cos_df.take(6)

[Row(id=13702, res=[864, 28074, 1041, 21079, 8300, 13057, 8313, 1111, 1033, 21025], cos_sim=[1.0, 0.1483365148305893, 0.13178977370262146, 0.1265052706003189, 0.1234009861946106, 0.11379192769527435, 0.10992132127285004, 0.10780233889818192, 0.10676135867834091, 0.10662287473678589]),
 Row(id=16627, res=[11431, 5687, 17964, 12660, 12247, 17961, 16694, 11575, 13551, 13550], cos_sim=[0.5855640172958374, 0.4148993492126465, 0.4060797691345215, 0.3966037631034851, 0.38649189472198486, 0.36245444416999817, 0.35170799493789673, 0.3333212733268738, 0.332559734582901, 0.31563282012939453]),
 Row(id=16704, res=[1247, 1236, 1365, 8186, 1164, 1273, 20288, 1233, 1229, 8203], cos_sim=[0.29510167241096497, 0.2798807621002197, 0.2513659596443176, 0.23989179730415344, 0.23735979199409485, 0.23207278549671173, 0.23207278549671173, 0.22857524454593658, 0.22281649708747864, 0.20797212421894073]),
 Row(id=23126, res=[14760, 13665, 13782, 20638, 24419, 15909, 2724, 25782, 17499, 13348], cos_sim=[0.60111457

In [28]:
res = {str(r["id"]): r["res"] for r in res_list}
res

{'13702': [864, 28074, 1041, 21079, 8300, 13057, 8313, 1111, 1033, 21025],
 '16627': [11431,
  5687,
  17964,
  12660,
  12247,
  17961,
  16694,
  11575,
  13551,
  13550],
 '16704': [1247, 1236, 1365, 8186, 1164, 1273, 20288, 1233, 1229, 8203],
 '23126': [14760,
  13665,
  13782,
  20638,
  24419,
  15909,
  2724,
  25782,
  17499,
  13348],
 '21617': [21609,
  21608,
  21616,
  21492,
  21624,
  21703,
  21700,
  21623,
  21508,
  21506],
 '11556': [16488, 468, 10447, 23357, 19330, 22710, 13461, 10384, 21707, 13776]}

In [29]:
import json

with open('lab02.json', 'w') as fp:
    fp.write(json.dumps(res))

In [30]:
#spark.stop()