In [1]:
import pandas as pd

In [2]:
import os
import sys
# Point PySpark at the cluster's Python interpreter and Spark installation,
# and request two executors for the shell-style session started below.
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 2 pyspark-shell'

# Defensive check; SPARK_HOME was just set above, so this only fires if the
# assignment is removed or emptied.
spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Make the PySpark sources and the bundled py4j archive importable.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Run PySpark's shell bootstrap in this namespace (it announces the session as
# `spark`). The file is opened in a `with` block so the handle is closed
# instead of being left to the garbage collector.
with open(os.path.join(spark_home, 'python/pyspark/shell.py')) as shell_script:
    exec(shell_script.read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [3]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark import Row
import json

# Build the session handle used by the rest of the notebook.
# NOTE(review): the bootstrap cell already announced a session as `spark`, so
# getOrCreate() presumably returns that existing one - confirm.
conf = SparkConf()

# config(conf=...) is applied before appName(...), as in the original chain.
session_builder = SparkSession.builder.config(conf=conf)
spark = session_builder.appName("second_lab").getOrCreate()

In [4]:
# Display the session object created above.
spark

In [19]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, RegexTokenizer
from pyspark.sql import Window
import pyspark.sql.functions as fun

In [6]:
# Explicit schema for the course records: every column is a string except the
# integer `id`. Fields use StructField's default nullability.
schema = StructType([
    StructField(field_name, field_type)
    for field_name, field_type in (
        ("lang", StringType()),
        ("name", StringType()),
        ("cat", StringType()),
        ("provider", StringType()),
        ("id", IntegerType()),
        ("desc", StringType()),
    )
])

In [7]:
# Load the course catalogue (JSON input) using the explicit schema
# instead of letting Spark infer one.
df = spark.read.schema(schema).json("/labs/slaba02/DO_record_per_line.json")

In [8]:
# Preview the first five loaded rows.
df.show(5)

+----+--------------------+--------------------+--------------+---+--------------------+
|lang|                name|                 cat|      provider| id|                desc|
+----+--------------------+--------------------+--------------+---+--------------------+
|  en|Accounting Cycle:...|3/business_manage...|Canvas Network|  4|This course intro...|
|  en|American Counter ...|              11/law|Canvas Network|  5|This online cours...|
|  fr|Arithmétique: en ...|5/computer_scienc...|Canvas Network|  6|This course is ta...|
|  en|Becoming a Dynami...|  14/social_sciences|Canvas Network|  7|We live in a digi...|
|  en|           Bioethics|2/biology_life_sc...|Canvas Network|  8|This self-paced c...|
+----+--------------------+--------------------+--------------+---+--------------------+
only showing top 5 rows



In [9]:
# Split each course description into a token array stored in `split_txt`
# (RegexTokenizer is used with its default settings).
tokenizer = RegexTokenizer(
    inputCol="desc",
    outputCol="split_txt",
)
split_df = tokenizer.transform(df)
split_df.show(n=5)

+----+--------------------+--------------------+--------------+---+--------------------+--------------------+
|lang|                name|                 cat|      provider| id|                desc|           split_txt|
+----+--------------------+--------------------+--------------+---+--------------------+--------------------+
|  en|Accounting Cycle:...|3/business_manage...|Canvas Network|  4|This course intro...|[this, course, in...|
|  en|American Counter ...|              11/law|Canvas Network|  5|This online cours...|[this, online, co...|
|  fr|Arithmétique: en ...|5/computer_scienc...|Canvas Network|  6|This course is ta...|[this, course, is...|
|  en|Becoming a Dynami...|  14/social_sciences|Canvas Network|  7|We live in a digi...|[we, live, in, a,...|
|  en|           Bioethics|2/biology_life_sc...|Canvas Network|  8|This self-paced c...|[this, self-paced...|
+----+--------------------+--------------------+--------------+---+--------------------+--------------------+
only showing top 5 rows

In [10]:
# Hash the token arrays into fixed-width (10000-dimensional) term-frequency
# vectors stored in `rawFeatures`.
hashingTF = HashingTF(
    numFeatures=10000,
    inputCol="split_txt",
    outputCol="rawFeatures",
)
hash_data = hashingTF.transform(split_df)

In [11]:
# Preview the hashed term-frequency vectors only.
hash_data.select('rawFeatures').show(5)

+--------------------+
|         rawFeatures|
+--------------------+
|(10000,[36,42,63,...|
|(10000,[32,222,29...|
|(10000,[30,41,246...|
|(10000,[493,572,7...|
|(10000,[32,65,115...|
+--------------------+
only showing top 5 rows



In [12]:
# Fit IDF on the hashed term counts, then apply the fitted model to the same
# data to produce the `features` column.
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features",
)
idf_ = idf.fit(hash_data)
transform_data = idf_.transform(hash_data)

In [13]:
# Preview the frame with the token, rawFeatures and features columns added.
transform_data.show(5)

+----+--------------------+--------------------+--------------+---+--------------------+--------------------+--------------------+--------------------+
|lang|                name|                 cat|      provider| id|                desc|           split_txt|         rawFeatures|            features|
+----+--------------------+--------------------+--------------+---+--------------------+--------------------+--------------------+--------------------+
|  en|Accounting Cycle:...|3/business_manage...|Canvas Network|  4|This course intro...|[this, course, in...|(10000,[36,42,63,...|(10000,[36,42,63,...|
|  en|American Counter ...|              11/law|Canvas Network|  5|This online cours...|[this, online, co...|(10000,[32,222,29...|(10000,[32,222,29...|
|  fr|Arithmétique: en ...|5/computer_scienc...|Canvas Network|  6|This course is ta...|[this, course, is...|(10000,[30,41,246...|(10000,[30,41,246...|
|  en|Becoming a Dynami...|  14/social_sciences|Canvas Network|  7|We live in a digi...|

In [15]:
# Keep only the columns needed for the similarity search.
trans_df = transform_data.select(['id', 'lang', 'features'])

In [16]:
# Number of rows in the projected frame (runs a Spark job).
trans_df.count()

28153

In [17]:
# Seed courses to find neighbours for, one [id, language code, text] entry
# each (the text is presumably the course name - confirm against the dataset).
my_courses = [
    [23126, 'en', 'Compass - powerful SASS library that makes your life easier'],
    [21617, 'en', 'Preparing for the AP* Computer Science A Exam \u2014 Part 2'],
    [16627, 'es', 'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'],
    [11556, 'es', 'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'],
    [16704, 'ru', '\u041f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u0435 \u043d\u0430 Lazarus'],
    [13702, 'ru', '\u041c\u0430\u0442\u0435\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u0430'],
]

# Only the ids (first element of each entry) are used for filtering later.
num_courses = [course[0] for course in my_courses]


[23126, 21617, 16627, 11556, 16704, 13702]

In [20]:
# Rows of the feature table that belong to the seed courses.
is_seed_course = fun.col('id').isin(num_courses)
my_courses_pred = trans_df.where(is_seed_course)

In [23]:
# Same filter as the previous cell (seed-course rows of trans_df); running it
# again simply rebinds my_courses_pred to an equivalent frame.
my_courses_pred = trans_df.filter(
    fun.col('id').isin(num_courses))

In [24]:
# Rename the seed-course columns so they do not clash with the candidate
# columns after the join.
seed_courses = my_courses_pred.select(
    fun.col('id').alias('course_pred'),
    fun.col('features').alias('course_pred_upd'),
    'lang',
)

# Pair every seed course with every course written in the same language.
# NOTE(review): the seed course itself is also in trans_df, so each seed is
# paired with itself as well - confirm that is intended.
pred_df = seed_courses.join(trans_df, on='lang', how='left')

In [25]:
# Number of (seed course, candidate course) pairs (runs a Spark job).
pred_df.count()

54316

In [26]:
# Preview the first 20 seed/candidate pairs.
pred_df.show(20)

+----+-----------+--------------------+---+--------------------+
|lang|course_pred|     course_pred_upd| id|            features|
+----+-----------+--------------------+---+--------------------+
|  en|      21617|(10000,[213,360,4...|  4|(10000,[36,42,63,...|
|  en|      21617|(10000,[213,360,4...|  5|(10000,[32,222,29...|
|  en|      21617|(10000,[213,360,4...|  7|(10000,[493,572,7...|
|  en|      21617|(10000,[213,360,4...|  8|(10000,[32,65,115...|
|  en|      21617|(10000,[213,360,4...|  9|(10000,[56,91,268...|
|  en|      21617|(10000,[213,360,4...| 10|(10000,[1045,1263...|
|  en|      21617|(10000,[213,360,4...| 11|(10000,[87,157,57...|
|  en|      21617|(10000,[213,360,4...| 12|(10000,[161,164,4...|
|  en|      21617|(10000,[213,360,4...| 13|(10000,[26,1072,1...|
|  en|      21617|(10000,[213,360,4...| 14|(10000,[63,145,23...|
|  en|      21617|(10000,[213,360,4...| 15|(10000,[32,65,77,...|
|  en|      21617|(10000,[213,360,4...| 16|(10000,[32,273,30...|
|  en|      21617|(10000,

In [27]:
@fun.udf(returnType=DoubleType())
def cos_distance(v1, v2):
    """Cosine similarity of two ML vectors (despite the name, larger means closer).

    Parameters
    ----------
    v1, v2
        Vector objects exposing ``dot`` and ``norm`` (here, the values of the
        TF-IDF feature columns). Either may be ``None``.

    Returns
    -------
    float
        ``dot(v1, v2) / (||v1||_2 * ||v2||_2)``, or ``0.0`` when either vector
        is missing or has zero length.
    """
    # Missing vectors have no similarity to anything.
    if v1 is None or v2 is None:
        return 0.0

    denominator = float(v1.norm(2)) * float(v2.norm(2))
    # An all-zero vector would divide by zero; treat it as "no similarity".
    # The fallback must be a Python float: a UDF declared DoubleType that
    # returns an int produces NULL in the resulting column.
    if denominator == 0.0:
        return 0.0

    return float(v1.dot(v2)) / denominator

In [28]:
# Score every seed/candidate pair with the cos_distance UDF.
pred_df_cos = pred_df.withColumn(
    'cos_distance',
    cos_distance(fun.col('course_pred_upd'), fun.col('features')),
)

In [29]:
# Literal list of the six seed course ids, displayed for reference only
# (nothing is assigned here).
[23126, 21617, 16627, 11556, 16704, 13702]

[23126, 21617, 16627, 11556, 16704, 13702]

In [30]:
# Preview the scored pairs.
pred_df_cos.show(5)

+----+-----------+--------------------+---+--------------------+--------------------+
|lang|course_pred|     course_pred_upd| id|            features|        cos_distance|
+----+-----------+--------------------+---+--------------------+--------------------+
|  en|      21617|(10000,[213,360,4...|  4|(10000,[36,42,63,...| 0.07806882371748854|
|  en|      21617|(10000,[213,360,4...|  5|(10000,[32,222,29...| 0.04676028409292413|
|  en|      21617|(10000,[213,360,4...|  7|(10000,[493,572,7...|0.036594215419073484|
|  en|      21617|(10000,[213,360,4...|  8|(10000,[32,65,115...| 0.06916295420320383|
|  en|      21617|(10000,[213,360,4...|  9|(10000,[56,91,268...| 0.11014821826251854|
+----+-----------+--------------------+---+--------------------+--------------------+
only showing top 5 rows



In [31]:
# Order the candidates of each seed course by similarity, best first.
# row_number() is used instead of rank(): rank() gives tied scores the same
# rank, so more than 10 rows per course could pass the filter. The secondary
# sort on `id` makes the order of tied rows deterministic.
window = (Window
          .partitionBy(pred_df_cos['course_pred'])
          .orderBy(pred_df_cos['cos_distance'].desc(), pred_df_cos['id'].asc()))

# Keep exactly the 10 best-scoring candidates per seed course; the position is
# still exposed under the column name `rank`.
filt_values = (pred_df_cos
               .select('*', fun.row_number().over(window).alias('rank'))
               .filter(fun.col('rank') <= 10))

In [32]:
# Drop the vector and rank columns; only identifiers and the score are needed.
filt_values = filt_values.select(['lang', 'course_pred', 'id', 'cos_distance'])

In [33]:
# Shape of the pair table as (row count, column count). The original had
# `print` alone on its own line, which never called it.
print((pred_df.count(), len(pred_df.columns)))

(54316, 5)

In [34]:
# Shape of the top-10 table as (row count, column count). The original had
# `print` alone on its own line, which never called it.
print((filt_values.count(), len(filt_values.columns)))

(60, 4)

In [35]:
# Collect the filtered rows to the driver via toPandas().
# NOTE(review): this rebinds filt_values to the collected result, so running
# this cell a second time calls toPandas() on that result - likely to fail.
filt_values = filt_values.toPandas()

In [36]:
# Row count of the collected frame.
len(filt_values)

60

In [37]:
# One row per seed course, with the ids of its selected candidates gathered
# into a list (in the order the rows appear in filt_values).
top_values = filt_values.groupby('course_pred')['id'].apply(list).reset_index()

In [38]:
# Display the per-course candidate lists.
top_values

Unnamed: 0,course_pred,id
0,11556,"[11556, 16488, 13461, 468, 10447, 387, 22710, ..."
1,13702,"[864, 13702, 1216, 7173, 8313, 1052, 17017, 19..."
2,16627,"[16627, 11431, 12247, 13021, 25010, 11575, 568..."
3,16704,"[16704, 1365, 20645, 20105, 1426, 8217, 1236, ..."
4,21617,"[21617, 21609, 21608, 21616, 21492, 21624, 216..."
5,23126,"[23126, 13782, 13665, 24419, 20638, 2724, 2578..."


In [39]:
# Map each seed course id to its list of recommended course ids.
top_values_list_dict = dict(zip(top_values['course_pred'], top_values['id']))

In [40]:
# Display the final mapping before it is written to disk.
top_values_list_dict

{11556: [11556, 16488, 13461, 468, 10447, 387, 22710, 9289, 5936, 23357],
 13702: [864, 13702, 1216, 7173, 8313, 1052, 17017, 19613, 21017, 17015],
 16627: [16627, 11431, 12247, 13021, 25010, 11575, 5687, 9598, 5372, 12863],
 16704: [16704, 1365, 20645, 20105, 1426, 8217, 1236, 1164, 1219, 8123],
 21617: [21617, 21609, 21608, 21616, 21492, 21624, 21623, 21630, 21628, 21508],
 23126: [23126, 13782, 13665, 24419, 20638, 2724, 25782, 2633, 2723, 13348]}

In [41]:
# Write the recommendations mapping to lab02.json in the working directory.
with open("lab02.json", "w") as outfile:
    outfile.write(json.dumps(top_values_list_dict))