In [1]:
# Best-effort cleanup: stop a SparkContext left over from an earlier run of
# this notebook before a new session is started below.
try:
    sc.stop()
except Exception:
    # On a fresh kernel `sc` does not exist (NameError), and stopping a dead
    # context may fail; neither should block the notebook. Unlike a bare
    # `except:`, this still lets KeyboardInterrupt / SystemExit propagate.
    pass

In [2]:
import json
import os
import sys

# Environment for the PySpark launcher: Python interpreter, Spark installation
# and spark-submit arguments (two executors).
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"] = '--num-executors 2 pyspark-shell'

# SPARK_HOME was set just above, so a direct lookup is sufficient.
spark_home = os.environ["SPARK_HOME"]

# Make the PySpark sources and the bundled py4j importable from this kernel.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Run PySpark's interactive shell bootstrap in the notebook namespace; its
# banner reports that the session is available as `spark`.
# The file is opened with a context manager so the handle is closed afterwards.
with open(os.path.join(spark_home, 'python/pyspark/shell.py')) as shell_script:
    exec(shell_script.read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [3]:
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql import functions as sf
from pyspark.ml.feature import Tokenizer, HashingTF, IDF
from pyspark.sql.types import IntegerType, StringType, ArrayType, MapType, DoubleType

import re
import numpy as np

# Explicitly set `spark.sql.execution.arrow.enabled` to 'false' for this session
# (presumably so that toPandas() does not use Arrow — confirm against the Spark docs).
spark.conf.set('spark.sql.execution.arrow.enabled', 'false')

In [4]:
# Display the SparkSession object as the cell output.
spark

## Courses

In [5]:
# Load the course catalogue from the JSON file into a Spark DataFrame.
courses = spark.read.json('/labs/slaba02/DO_record_per_line.json')

In [6]:
# Preview the first five rows as a pandas DataFrame.
courses.limit(5).toPandas()

Unnamed: 0,cat,desc,id,lang,name,provider
0,3/business_management|6/economics_finance,This course introduces the basic financial sta...,4,en,Accounting Cycle: The Foundation of Business M...,Canvas Network
1,11/law,This online course will introduce you to Ameri...,5,en,American Counter Terrorism Law,Canvas Network
2,5/computer_science|15/mathematics_statistics_a...,This course is taught in French Vous voulez co...,6,fr,Arithmétique: en route pour la cryptographie,Canvas Network
3,14/social_sciences,We live in a digitally connected world. The wa...,7,en,Becoming a Dynamic Educator,Canvas Network
4,2/biology_life_sciences,This self-paced course is designed to show tha...,8,en,Bioethics,Canvas Network


In [7]:
# Keep only the columns used in the rest of the notebook: id, language, description.
courses = courses.select('id', 'lang','desc')

In [8]:
# Tokenize the `desc` column into a new `words` column.
tokinizer = Tokenizer(inputCol='desc', outputCol='words')
courses = tokinizer.transform(courses)

In [9]:
# Compiled once at definition time instead of on every call (the function runs
# per row inside a UDF). A raw string avoids invalid-escape warnings for \w / \d.
_WORD_RE = re.compile(r'[\w\d]{2,}', re.U)


def clear_list(words_list):
    """Filter a list of tokens down to the ones that look like words.

    A token is kept when it *starts* with at least two word characters
    (letters, digits or underscore); `match` only anchors at the beginning,
    so trailing punctuation does not disqualify a token.

    Args:
        words_list: list of string tokens, or None.

    Returns:
        A new list with the kept tokens in their original order. None or an
        empty input yields an empty list.
    """
    if not words_list:
        return []
    return [word for word in words_list if _WORD_RE.match(word)]

# Wrap clear_list as a Spark UDF whose return type is an array of strings.
clear_list_udf = sf.udf(clear_list, ArrayType(StringType()))

In [10]:
# Add the cleaned token list, then keep only the columns needed for TF-IDF.
courses = (
    courses
    .withColumn("clear_words", clear_list_udf(sf.col("words")))
    .select('id', 'lang', 'clear_words')
)

## TF-IDF

In [11]:
# Term frequency (TF): HashingTF maps `clear_words` into a 10000-feature
# vector stored in `word_vector_freq`.
ht = HashingTF(inputCol="clear_words", outputCol="word_vector_freq", numFeatures=10000) 
tf = ht.transform(courses)

# Inverse document frequency (IDF): fit on the TF vectors of all courses, then
# transform the same data to obtain the `tfidf_feature` column.
idf=IDF(inputCol="word_vector_freq", outputCol="tfidf_feature")
idfModel = idf.fit(tf)
courses_result = idfModel.transform(tf)

# Mark the result for caching and trigger an action; the displayed value is the row count.
courses_result.cache().count()

28153

In [12]:
# Drop the intermediate columns; only id, language and the TF-IDF vector are used below.
courses_result = courses_result.select('id', 'lang', 'tfidf_feature')

In [13]:
# Print the first rows (show() defaults to 20) to inspect the TF-IDF vectors.
courses_result.show()

+---+----+--------------------+
| id|lang|       tfidf_feature|
+---+----+--------------------+
|  4|  en|(10000,[36,42,63,...|
|  5|  en|(10000,[32,222,29...|
|  6|  fr|(10000,[30,41,246...|
|  7|  en|(10000,[493,572,7...|
|  8|  en|(10000,[32,65,115...|
|  9|  en|(10000,[56,91,268...|
| 10|  en|(10000,[1045,1263...|
| 11|  en|(10000,[87,157,57...|
| 12|  en|(10000,[161,164,4...|
| 13|  en|(10000,[26,1072,1...|
| 14|  en|(10000,[145,234,3...|
| 15|  en|(10000,[32,65,77,...|
| 16|  en|(10000,[32,273,30...|
| 17|  en|(10000,[695,1420,...|
| 18|  en|(10000,[307,316,3...|
| 19|  en|(10000,[572,768,8...|
| 20|  en|(10000,[91,273,31...|
| 21|  en|(10000,[148,157,1...|
| 22|  en|(10000,[128,177,4...|
| 23|  en|(10000,[91,332,52...|
+---+----+--------------------+
only showing top 20 rows



In [14]:
# Ids of the courses for which recommendations are to be produced.
target_ids = [23126, 21617, 16627, 11556, 16704, 13702]
courses2predict = courses_result.filter(sf.col('id').isin(target_ids))

## Join the two datasets

In [15]:
# Rename the target-course columns so they do not clash with the candidate
# columns, then pair every target with all courses in the same language.
targets = courses2predict.select(
    sf.col('id').alias('id2predict'),
    sf.col('tfidf_feature').alias('tfidf_feature2predict'),
    'lang',
)
df = targets.join(courses_result, on='lang', how='left')

In [16]:
# Remove the pairs in which a target course is matched with itself.
df = df.filter(df.id2predict != df.id)

In [17]:
# Inspect the first rows of the target/candidate pairs.
df.show()

+----+----------+---------------------+-----+--------------------+
|lang|id2predict|tfidf_feature2predict|   id|       tfidf_feature|
+----+----------+---------------------+-----+--------------------+
|  en|     21617| (10000,[213,360,4...|16308|(10000,[505,1387,...|
|  en|     21617| (10000,[213,360,4...|16309|(10000,[128,996,1...|
|  en|     21617| (10000,[213,360,4...|16310|(10000,[505,706,1...|
|  en|     21617| (10000,[213,360,4...|16311|(10000,[240,281,1...|
|  en|     21617| (10000,[213,360,4...|16312|(10000,[1036,1239...|
|  en|     21617| (10000,[213,360,4...|16313|(10000,[304,1387,...|
|  en|     21617| (10000,[213,360,4...|16314|(10000,[505,1186,...|
|  en|     21617| (10000,[213,360,4...|16315|(10000,[71,2286,2...|
|  en|     21617| (10000,[213,360,4...|16316|(10000,[68,71,221...|
|  en|     21617| (10000,[213,360,4...|16317|(10000,[542,1387,...|
|  en|     21617| (10000,[213,360,4...|16318|(10000,[524,1187,...|
|  en|     21617| (10000,[213,360,4...|16319|(10000,[163,1239,

## Compute cosine similarity

In [18]:
def sim_cos(v1, v2):
    """Return the cosine similarity of two vectors as a Python float.

    Computes dot(v1, v2) / (||v1|| * ||v2||). The denominator is parenthesised
    on purpose: without the parentheses the expression evaluates as
    (dot / ||v1||) * ||v2||, which is not a cosine.

    Args:
        v1, v2: equal-length numeric vectors. Objects exposing `toArray()`
            (as Spark ML vectors do) are converted with it; anything else is
            passed to `np.asarray`.

    Returns:
        The cosine similarity, or 0.0 when either vector has zero norm
        (instead of the NaN/inf a division by zero would produce).
    """
    a = np.asarray(v1.toArray() if hasattr(v1, 'toArray') else v1, dtype=float)
    b = np.asarray(v2.toArray() if hasattr(v2, 'toArray') else v2, dtype=float)

    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return float(np.dot(a, b) / denominator)

# Wrap sim_cos as a Spark UDF that returns a FloatType column.
sim_cos_udf = sf.udf(sim_cos, FloatType())

In [19]:
# Add a `cos` column: similarity between each target course and each candidate.
final_df_cos = df.withColumn('cos', sim_cos_udf('tfidf_feature2predict', 'tfidf_feature'))                           

In [20]:
#final_df_cos.show()

In [21]:
from pyspark.sql import Window

# Order the candidates of each target course by descending similarity;
# `id` is a secondary key so that ties are ordered deterministically.
window = Window.partitionBy(final_df_cos['id2predict']).orderBy(
    final_df_cos['cos'].desc(), final_df_cos['id'].asc())

# row_number() gives every row in a partition a distinct position, so the
# filter keeps at most ten rows per course. rank() assigns the same value to
# tied rows and could let more than ten through.
top_ten = (
    final_df_cos
    .select('*', sf.row_number().over(window).alias('rank'))
    .filter(sf.col('rank') <= 10)
)

In [22]:
#top_ten.show()

In [23]:
# Drop the vector and rank columns before the result is collected.
top_ten = top_ten.select('lang', 'id2predict', 'id', 'cos')

In [None]:
# Collect the result to the driver as a pandas DataFrame.
# NOTE(review): this rebinds `top_ten` from a Spark DataFrame to a pandas one,
# so re-running the preceding Spark cells after this one will not work as-is.
top_ten = top_ten.toPandas()

In [None]:
# Preview the first ten rows of the collected result.
top_ten.head(10)

In [None]:
# Build one list of recommended ids per target course, most similar first.
# The explicit sort avoids depending on the row order returned by toPandas();
# mergesort is stable, so rows with equal `cos` keep their incoming order.
top_ten_list = (
    top_ten
    .sort_values('cos', ascending=False, kind='mergesort')
    .groupby('id2predict')['id']
    .apply(list)
    .reset_index(name='id_lists')
)

In [None]:
# Preview the per-course recommendation lists.
top_ten_list.head(10)

## Save to JSON

In [None]:
# Map each target course id to its list of recommended course ids.
top_ten_list_dict = {
    course_id: similar_ids
    for course_id, similar_ids in zip(top_ten_list.id2predict, top_ten_list.id_lists)
}

In [None]:
# Pretty-printed JSON string, used only for the on-screen preview below;
# the file is written separately from `top_ten_list_dict`.
json_object = json.dumps(top_ten_list_dict, indent = 4) 

In [None]:
# Show the indented JSON text.
print(json_object)

In [None]:
# Serialize the recommendations (compact form, no indentation) and write them
# to lab02.json in the current working directory.
serialized = json.dumps(top_ten_list_dict)
with open("lab02.json", "w") as outfile:
    outfile.write(serialized)

In [None]:
# Stop the SparkContext now that the results have been written.
sc.stop()