In [89]:
import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 3 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

In [90]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

conf = SparkConf()
conf.set("spark.app.name", "KuznetsovA ContentBasedRecsys app") 

spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [91]:
sc = spark.sparkContext

In [92]:
datapath = '/labs/slaba02/DO_record_per_line.json'

In [93]:
df = spark.read.format('json').load(datapath)

In [94]:
df.rdd.getNumPartitions()

3

In [95]:
df.show(5)

+--------------------+--------------------+---+----+--------------------+--------------+
|                 cat|                desc| id|lang|                name|      provider|
+--------------------+--------------------+---+----+--------------------+--------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|
|              11/law|This online cours...|  5|  en|American Counter ...|Canvas Network|
|5/computer_scienc...|This course is ta...|  6|  fr|Arithmétique: en ...|Canvas Network|
|  14/social_sciences|We live in a digi...|  7|  en|Becoming a Dynami...|Canvas Network|
|2/biology_life_sc...|This self-paced c...|  8|  en|           Bioethics|Canvas Network|
+--------------------+--------------------+---+----+--------------------+--------------+
only showing top 5 rows



In [96]:
courses_to_make_rec = [[23126, u'en', u'Compass - powerful SASS library that makes your life easier'], 
                       [21617, u'en', u'Preparing for the AP* Computer Science A Exam \u2014 Part 2'], 
                       [16627, u'es', u'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'], 
                       [11556, u'es', u'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'], 
                       [16704, u'ru', u'\u041f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u0435 \u043d\u0430 Lazarus'], 
                       [13702, u'ru', u'\u041c\u0430\u0442\u0435\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u0430']]

In [97]:
lang_ids = {}
for course in courses_to_make_rec:
    if course[1] in lang_ids:
        lang_ids[course[1]].append(course[0])
    else:
        lang_ids[course[1]] = [course[0]]

In [98]:
lang_ids

{'en': [23126, 21617], 'es': [16627, 11556], 'ru': [16704, 13702]}

In [99]:
import pyspark.sql.functions as f
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType, FloatType
import re

In [100]:
def extract_words(string):
    regex = re.compile(u'[\w\d]{2,}', re.U)
    return regex.findall(string.lower())

extract_words_udf = udf(extract_words, ArrayType(StringType()))

In [158]:
(df
 .withColumn("words", extract_words_udf("desc"))
 .withColumn("words_lenght", f.size(f.col("words")))
 .orderBy("words_lenght")
 .show(5, truncate=False, vertical=True)
)

-RECORD 0---------------------------------------------------------------
 cat          | 15/mathematics_statistics_and_data_analysis             
 desc         | 
	  	     
	  	                                         
 id           | 651                                                     
 lang         | en                                                      
 name         | International Political Economy                         
 provider     | Harvard Extension School                                
 words        | []                                                      
 words_lenght | 0                                                       
-RECORD 1---------------------------------------------------------------
 cat          | 17/diy                                                  
 desc         |                                                         
 id           | 8371                                                    
 lang         | en                                 

In [159]:
dataset = (
    df
   .withColumn("words", extract_words_udf("desc"))
   .withColumn("words_lenght", f.size(f.col("words")))
   .filter("words_lenght != 0")
   .drop('cat', 'desc', 'provader')
   .filter(df.lang.isin(['en', 'es', 'ru']))
   .cache()
)

In [160]:
def dot_prod(u, v):
    return float(v.dot(u) / (v.norm(2) * u.norm(2)))

dot_prod_udf = udf(dot_prod, FloatType())

In [161]:
from pyspark.ml.feature import HashingTF, IDF

In [162]:
courses_recs = {}
for lang in lang_ids.keys():
    dataset_lang = dataset.filter(f"lang == '{lang}'")
    hashingTF = HashingTF(numFeatures=10000, binary=False, inputCol='words', outputCol='tf')
    dataset_lang = hashingTF.transform(dataset_lang)
    dataset_lang_tfidf = IDF(inputCol='tf', outputCol='tfidf').fit(dataset_lang).transform(dataset_lang)
    for idx in lang_ids[lang]:
        id_df = dataset_lang_tfidf.filter(f"id == {idx}").select("tfidf")
        dataset_lang_tfidf = (
            dataset_lang_tfidf
             .filter(f"id != {idx}")
             .select("id", "lang" ,"name", "tfidf")
             .crossJoin(id_df.select("tfidf").withColumnRenamed("tfidf", "tfidf_id").hint("broadcast"))
        )
        courses_recs[idx] = (
            dataset_lang_tfidf
            .withColumn("cosine_dist", dot_prod_udf("tfidf", "tfidf_id"))
            .orderBy(["cosine_dist", "name", "id"], ascending=[False, True, True])
            .limit(10)
            .select("id")
            .toPandas().values.squeeze().tolist()
        )

In [163]:
courses_recs

{23126: [14760, 13665, 13782, 20638, 24419, 15909, 2724, 25782, 17499, 13348],
 21617: [21609, 21616, 22298, 21608, 21630, 21628, 21508, 21623, 21081, 19417],
 16627: [11431, 17961, 17964, 5687, 12247, 16694, 5558, 12660, 11575, 9563],
 11556: [10384, 16488, 468, 22710, 13461, 21707, 19330, 23357, 10447, 9465],
 16704: [1219, 1327, 20362, 1228, 26980, 55, 1236, 1247, 1365, 913],
 13702: [864, 21079, 1111, 792, 1410, 8123, 1041, 1033, 8313, 1396]}

In [173]:
import json

In [174]:
with open('./lab02.json', 'w') as f:
    json.dump(courses_recs, f)

In [175]:
!ls

custom_regex_transformer_skeleton.ipynb  lab01.json   lab02.json
lab01.ipynb				 lab02.ipynb


### Experiment with 1 id

In [164]:
lang = 'en'
dataset_lang = dataset.filter(f"lang == '{lang}'")

In [165]:
hashingTF = HashingTF(numFeatures=10000, binary=False, inputCol='words', outputCol='tf')
dataset_lang = hashingTF.transform(dataset_lang)
dataset_lang_tfidf = IDF(inputCol='tf', outputCol='tfidf').fit(dataset_lang).transform(dataset_lang)

In [166]:
dataset_lang_tfidf.show(5, vertical=True, truncate=False)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [167]:
id_df = dataset_lang_tfidf.filter(f"id == {ids_courses[0]}").select("tfidf").cache()

In [168]:
dataset_lang_tfidf = (
    dataset_lang_tfidf
     .filter(f"id != {lang_ids[lang][0]}")
     .select("id", "lang" ,"name", "tfidf")
     .crossJoin(id_df.select("tfidf").withColumnRenamed("tfidf", "tfidf_id").hint("broadcast"))
)

In [169]:
dataset_lang_tfidf.show(5)

+---+----+--------------------+--------------------+--------------------+
| id|lang|                name|               tfidf|            tfidf_id|
+---+----+--------------------+--------------------+--------------------+
|  4|  en|Accounting Cycle:...|(10000,[36,63,138...|(10000,[87,91,128...|
|  5|  en|American Counter ...|(10000,[32,222,36...|(10000,[87,91,128...|
|  7|  en|Becoming a Dynami...|(10000,[493,572,7...|(10000,[87,91,128...|
|  8|  en|           Bioethics|(10000,[32,115,13...|(10000,[87,91,128...|
|  9|  en|College Foundatio...|(10000,[56,91,300...|(10000,[87,91,128...|
+---+----+--------------------+--------------------+--------------------+
only showing top 5 rows



In [171]:
(dataset_lang_tfidf
.withColumn("cosine_dist", dot_prod_udf("tfidf", "tfidf_id"))
.orderBy(["cosine_dist", "name", "id"], ascending=[False, True, True])
.limit(10)
.select("id", "cosine_dist")
.toPandas().values.squeeze().tolist()
)

[[14760.0, 0.6317840814590454],
 [13665.0, 0.620358943939209],
 [13782.0, 0.5623417496681213],
 [20638.0, 0.45999443531036377],
 [24419.0, 0.43609362840652466],
 [15909.0, 0.3982919752597809],
 [2724.0, 0.37013301253318787],
 [25782.0, 0.3348028063774109],
 [17499.0, 0.3044833540916443],
 [13348.0, 0.29755929112434387]]

In [172]:
df.filter(df.id.isin([23126, 14760])).show(vertical=True, truncate=False)

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [176]:
spark.stop()