### Spark

In [1]:
import os
import sys
# Point PySpark at the cluster's Python interpreter and Spark client, and pass
# the submit arguments (three executors) used when the shell is launched.
os.environ.update({
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
    "PYSPARK_SUBMIT_ARGS": '--num-executors 3 pyspark-shell',
})

spark_home = os.environ.get('SPARK_HOME', None)

# Prepend Spark's Python sources and its bundled py4j archive to the import
# path; the py4j archive ends up first, directly followed by the sources.
sys.path[:0] = [
    os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'),
    os.path.join(spark_home, 'python'),
]

In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Default Spark configuration; nothing is overridden here.
conf = SparkConf()

# Create the session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .appName("andrey.frolov")
    .getOrCreate()
)

In [3]:
# Display the session object as the cell output to confirm it was created.
spark

### Load data

In [4]:
from pyspark.sql.types import *

In [5]:
# Explicit schema for the course records: every column is a string except
# the integer course id.
data_schema = StructType([
    StructField(column, dtype)
    for column, dtype in [
        ("lang", StringType()),
        ("name", StringType()),
        ("cat", StringType()),
        ("provider", StringType()),
        ("id", IntegerType()),
        ("desc", StringType()),
    ]
])

# Read the JSON file with that schema and cache it, since later cells reuse it.
data = (
    spark.read
    .schema(data_schema)
    .json("/labs/slaba02/DO_record_per_line.json")
    .cache()
)

In [6]:
# Preview the first five loaded records to sanity-check the schema.
data.show(5)

+----+--------------------+--------------------+--------------+---+--------------------+
|lang|                name|                 cat|      provider| id|                desc|
+----+--------------------+--------------------+--------------+---+--------------------+
|  en|Accounting Cycle:...|3/business_manage...|Canvas Network|  4|This course intro...|
|  en|American Counter ...|              11/law|Canvas Network|  5|This online cours...|
|  fr|Arithmétique: en ...|5/computer_scienc...|Canvas Network|  6|This course is ta...|
|  en|Becoming a Dynami...|  14/social_sciences|Canvas Network|  7|We live in a digi...|
|  en|           Bioethics|2/biology_life_sc...|Canvas Network|  8|This self-paced c...|
+----+--------------------+--------------------+--------------+---+--------------------+
only showing top 5 rows



### Tokenization

In [7]:
import re

In [8]:
import pyspark.sql.functions as f

In [9]:
from pyspark.sql.types import *

In [10]:
@f.pandas_udf(ArrayType(StringType()))
def get_words(desc):
    """Split each description into lowercase word tokens.

    Parameters
    ----------
    desc : pandas.Series
        A batch of description strings. Missing entries (None/NaN) are
        allowed and are treated as empty text.

    Returns
    -------
    pandas.Series
        For every input row, the list of runs of two or more word characters
        found in the lowercased text; an empty list for missing or empty text.
    """
    # Raw string: '\w' and '\d' are regex classes, not string escapes.
    regex = re.compile(r'[\w\d]{2,}', re.U)
    # Replace missing descriptions with '' so that .lower() never sees None.
    return desc.fillna(u'').apply(lambda text: regex.findall(text.lower()))

In [11]:
# Add a "tokens" column holding the word list extracted from "desc".
data_token = data.withColumn(
    "tokens",
    get_words(f.col("desc")),
)

In [12]:
# Preview the first five rows, now including the "tokens" column.
data_token.show(5)

+----+--------------------+--------------------+--------------+---+--------------------+--------------------+
|lang|                name|                 cat|      provider| id|                desc|              tokens|
+----+--------------------+--------------------+--------------+---+--------------------+--------------------+
|  en|Accounting Cycle:...|3/business_manage...|Canvas Network|  4|This course intro...|[this, course, in...|
|  en|American Counter ...|              11/law|Canvas Network|  5|This online cours...|[this, online, co...|
|  fr|Arithmétique: en ...|5/computer_scienc...|Canvas Network|  6|This course is ta...|[this, course, is...|
|  en|Becoming a Dynami...|  14/social_sciences|Canvas Network|  7|We live in a digi...|[we, live, in, di...|
|  en|           Bioethics|2/biology_life_sc...|Canvas Network|  8|This self-paced c...|[this, self, pace...|
+----+--------------------+--------------------+--------------+---+--------------------+--------------------+
only showing top 5 rows

### TF-IDF

In [13]:
from pyspark.ml.feature import HashingTF, IDF

In [14]:
# Hash each token list into a term-frequency vector with 10000 features.
hashtf = HashingTF(
    inputCol="tokens",
    outputCol='tf',
    numFeatures=10000,
)
tf = hashtf.transform(data_token)

In [15]:
# Fit the IDF estimator on the term-frequency vectors. `idf` is the fitted
# model; the column it will produce is also named "idf".
idf = (
    IDF(outputCol="idf", inputCol="tf")
    .fit(tf)
)

In [16]:
# Apply the fitted IDF model to the term-frequency frame; the result gains
# an "idf" column with the rescaled vectors.
tfidf = idf.transform(tf)

### Cosine similarity

In [17]:
# Pair every course with every course of the same language. The right-hand
# side carries only the language, the other course's id ("id2") and its
# vector ("idf2").
df_joined = tfidf.join(
    other=tfidf.select(
        "lang",
        f.col("id").alias("id2"),
        f.col("idf").alias("idf2"),
    ),
    on=["lang"],
    how="inner",
)

In [42]:
# Keep only the columns needed downstream (the "tokens" and "tf" columns are
# dropped by this projection).
df = df_joined.select(
    'lang', 'name', 'cat', 'provider', 'desc',
    'id', 'id2',
    'idf', 'idf2',
)

In [43]:
# my courses: keep only the pairs anchored on the six target course ids
df = df.where(
    f.col('id').isin(23126, 21617, 16627, 11556, 16704, 13702)
)

In [44]:
from pyspark.ml.feature import Normalizer

In [45]:
# L2-normalise both vectors of every pair so that their dot product equals
# the cosine of the angle between them.
df = Normalizer(p=2, inputCol="idf", outputCol="idf_norm").transform(df)
df = Normalizer(p=2, inputCol="idf2", outputCol="idf2_norm").transform(df)

In [46]:
@f.udf(returnType=DoubleType())
def dotProductUdf(v1, v2):
    """Return the dot product of two vectors as a Python float."""
    return float(v1.dot(v2))


# Both inputs are already L2-normalised, so the dot product is the cosine.
df = df.withColumn(
    'cosine',
    dotProductUdf(f.col('idf_norm'), f.col('idf2_norm')),
)

### Nearest courses

In [47]:
from pyspark.sql.window import Window

In [48]:
# Rank candidates per anchor course: highest cosine first, then by name and id2.
# NOTE(review): 'name' is the anchor course's own name (the join brings only
# 'id2' and 'idf2' from the other side), so it is constant within each 'id'
# partition and ties are effectively broken by 'id2' alone.
order_window = (
    Window
    .partitionBy('id')
    .orderBy(f.col('cosine').desc(), f.col('name'), f.col('id2'))
)

In [49]:
# Remove each course's pairing with itself BEFORE ranking. The self-pair has
# the maximal cosine and would otherwise occupy a rank, so a subsequent
# `n_row <= 10` cut-off would leave fewer than ten real neighbours.
df = (
    df
    .filter(f.col('id') != f.col('id2'))
    .withColumn('n_row', f.row_number().over(order_window))
)

In [50]:
# Keep only pairs of distinct courses whose rank is within the first ten.
df = df.filter(
    (f.col('id') != f.col('id2')) & (f.col('n_row') <= 10)
)

### Prepare solution

In [51]:
import json

# Anchor course ids, listed in the key order used in the output file.
my_course_ids = [11556, 13702, 16627, 16704, 21617, 23126]

# Fetch all ranked pairs with a single Spark job instead of one per course.
ranked_pairs = df.select('id', 'id2', 'n_row').collect()

# Map each anchor id (as a string) to its recommended course ids. Every anchor
# gets a key even if it has no rows, and sorting by `n_row` makes the order of
# the recommendations explicit rather than dependent on collect() order.
# A separate name is used so that `data` keeps referring to the cached source
# DataFrame and earlier cells stay re-runnable.
solution = {str(course_id): [] for course_id in my_course_ids}
for pair in sorted(ranked_pairs, key=lambda row: (row['id'], row['n_row'])):
    key = str(pair['id'])
    if key in solution:
        solution[key].append(pair['id2'])

with open('/data/home/andrey.frolov/lab02.json', 'w') as fl:
    json.dump(solution, fl)

In [52]:
# Stop the Spark session now that the result file has been written.
spark.stop()