In [1]:
# Session bootstrap. Statement order matters in this cell: the environment and
# sys.path are prepared first, and only then is pyspark imported and the
# session created.
import os
import sys

os.environ.update({
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
    "PYSPARK_SUBMIT_ARGS": '--num-executors 8 pyspark-shell',
})

spark_home = os.environ.get('SPARK_HOME', None)

# Prepend Spark's Python sources and its bundled py4j archive to the import
# path; the py4j zip ends up at index 0, followed by the 'python' directory.
sys.path[:0] = [
    os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'),
    os.path.join(spark_home, 'python'),
]

from pyspark.sql import SparkSession

# Reuse an already running session if there is one, otherwise start a new one.
sparkSession = (
    SparkSession.builder
    .appName("nikita.mospan lab2")
    .getOrCreate()
)

In [2]:
# Lab 2 parameters

# Path of the courses dataset that is loaded as JSON.
inputFilePath = "/labs/slaba02/DO_record_per_line.json"

# Ids of the courses for which recommendations are computed.
idsOfCoursesForRecommendations = [
    23126,
    21617,
    16627,
    11556,
    16704,
    13702,
]

# Number of hashed term features (passed to HashingTF as numFeatures).
numberOfFeatures = 10000

In [3]:
# Column-name constants shared by the cells below.

# Fields read from the input JSON.
ID = 'id'
NAME = 'name'
LANG = 'lang'
DESC = 'desc'

# Columns derived while preparing the text features.
CLEAN_DESC = 'cleanDesc'
WORDS = 'words'
CLEAN_WORDS = 'cleanWords'
RAW_FEATURES = 'rawFeatures'
FEATURES = 'features'
NORM_FEATURES = 'normFeatures'

# Columns used when scoring candidates against the selected course.
CURR_NORM_FEATURES = 'currNormFeatures'
COSINE = 'cosine'

In [5]:
# Explicit schema applied when reading the courses JSON.

from pyspark.sql.types import StructField, StructType, StringType, LongType

# Built field by field; each add() call appends one nullable field and
# returns the same StructType, so the calls can be chained.
inputSchema = (
    StructType()
    .add(LANG, StringType())
    .add(NAME, StringType())
    .add("cat", StringType())
    .add("provider", StringType())
    .add(ID, LongType())
    .add(DESC, StringType())
)

In [7]:
#udf to preprocess course description

import string
from pyspark.sql.functions import udf

# Translation table that deletes ASCII punctuation and ASCII digits. Built once
# here rather than twice on every call.
_CLEAN_STRING_DELETIONS = str.maketrans('', '', string.punctuation + string.digits)

def cleanString(in_str):
    """Normalize a course description for tokenization.

    The text is lower-cased, ASCII punctuation and digits are removed, and
    every run of whitespace is collapsed to a single space (leading and
    trailing whitespace is dropped).

    Args:
        in_str: the raw description, or None when the value is missing.

    Returns:
        The cleaned string. A missing (None) description yields '' instead of
        raising AttributeError, so a single null value does not fail the
        whole Spark job when this function runs as a UDF.
    """
    if in_str is None:
        return ''
    return ' '.join(in_str.lower().translate(_CLEAN_STRING_DELETIONS).split())

# Wrap cleanString as a Spark UDF that yields a string column.
cleanStringUdf = udf(f=cleanString, returnType=StringType())

In [8]:
# Load the courses with the explicit schema and add the cleaned description column.
# The dataframe is persisted because it is reused for every course id in
# idsOfCoursesForRecommendations.

from pyspark.sql.functions import col
from pyspark.storagelevel import StorageLevel

coursesDf = (
    sparkSession.read
    .format("json")
    .schema(inputSchema)
    .load(inputFilePath)
    .withColumn(CLEAN_DESC, cleanStringUdf(col(DESC)))
)
# Kept as the cell's last expression so the dataframe summary is displayed.
coursesDf.persist(StorageLevel.MEMORY_AND_DISK)

DataFrame[lang: string, name: string, cat: string, provider: string, id: bigint, desc: string, cleanDesc: string]

In [9]:
# UDF that computes the dot product of two vector columns as a double.

from pyspark.ml.linalg import SparseVector
from pyspark.sql.types import IntegerType, DoubleType

def _dotProduct(v1, v2):
    """Return v1.dot(v2) converted to a built-in float."""
    # float() presumably unwraps a numpy scalar so it matches DoubleType -- TODO confirm
    return float(v1.dot(v2))

dotProductUdf = udf(_dotProduct, DoubleType())

In [10]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, IDF, Normalizer, StopWordsRemover, Tokenizer
from pyspark.sql.functions import asc, desc

def getRecommendationsForSingleCourse(courseId, coursesDf, numberOfFeatures):
    """Return the ids of the 10 courses most similar to the given course.

    Only courses whose LANG equals the language of ``courseId`` are considered.
    Their cleaned descriptions are tokenized, stop words are removed, terms are
    hashed into ``numberOfFeatures`` buckets, weighted by IDF and normalized.
    Every candidate is then scored by the dot product of its normalized vector
    with the vector of the selected course.

    Args:
        courseId: id of the course to recommend for.
        coursesDf: dataframe with at least the ID, NAME, LANG and CLEAN_DESC columns.
        numberOfFeatures: number of hash buckets given to HashingTF.

    Returns:
        A list of at most 10 course ids, excluding ``courseId`` itself, ordered
        by score descending, then by name and id ascending.

    Raises:
        IndexError: if ``courseId`` is not present in ``coursesDf``.
    """
    # Look up the language of the requested course; fail with a clear message
    # instead of an anonymous "list index out of range" when the id is unknown.
    langRow = coursesDf.where(col(ID) == courseId).select(LANG).first()
    if langRow is None:
        raise IndexError("course id {} not found in coursesDf".format(courseId))
    courseLang = langRow[0]

    sameLangCoursesDf = coursesDf.where(col(LANG) == courseLang)

    # NOTE(review): StopWordsRemover is used with its default stop-word list
    # (English according to the Spark docs) whatever courseLang is -- confirm
    # this is intended for non-English courses.
    featurePipeline = Pipeline(stages=[
        Tokenizer(inputCol=CLEAN_DESC, outputCol=WORDS),
        StopWordsRemover(inputCol=WORDS, outputCol=CLEAN_WORDS),
        HashingTF(inputCol=CLEAN_WORDS, outputCol=RAW_FEATURES, numFeatures=numberOfFeatures),
        IDF(inputCol=RAW_FEATURES, outputCol=FEATURES),
        Normalizer(inputCol=FEATURES, outputCol=NORM_FEATURES),
    ])
    # The IDF stage is fitted on the same language-filtered courses that it
    # then transforms.
    coursesWithNormalizedFeaturesDf = featurePipeline\
        .fit(sameLangCoursesDf)\
        .transform(sameLangCoursesDf)

    # Vector of the selected course, renamed so that it can sit next to every
    # candidate's vector after the cross join.
    currentCourseDf = coursesWithNormalizedFeaturesDf.where(col(ID) == courseId)\
        .select(col(NORM_FEATURES).alias(CURR_NORM_FEATURES))

    topRows = currentCourseDf\
        .crossJoin(coursesWithNormalizedFeaturesDf)\
        .withColumn(COSINE, dotProductUdf(col(CURR_NORM_FEATURES), col(NORM_FEATURES)))\
        .where(col(ID) != courseId)\
        .orderBy(desc(COSINE), asc(NAME), asc(ID))\
        .select(ID)\
        .limit(10)\
        .collect()

    # Collect the ids directly rather than going through a pandas dataframe.
    return [row[0] for row in topRows]
    

In [11]:
# Map every requested course id to its list of recommended course ids.
resultAsDict = dict()
for courseId in idsOfCoursesForRecommendations:
    resultAsDict.update({
        courseId: getRecommendationsForSingleCourse(courseId, coursesDf, numberOfFeatures)
    })

In [12]:
import json

# Serialize the recommendations to lab02.json; json turns the integer course
# ids used as dictionary keys into strings.
with open('lab02.json', 'w') as lab02_out:
    json.dump(resultAsDict, lab02_out, indent=3)

In [13]:
# Stop the Spark session once the results have been written.
sparkSession.stop()