In [1]:
import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 2 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

In [2]:
MY_COURSES = [[23126, u'en', u'Compass - powerful SASS library that makes your life easier'], [21617, u'en', u'Preparing for the AP* Computer Science A Exam \u2014 Part 2'], [16627, u'es', u'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'], [11556, u'es', u'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'], [16704, u'ru', u'\u041f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u0435 \u043d\u0430 Lazarus'], [13702, u'ru', u'\u041c\u0430\u0442\u0435\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u0430']]

In [3]:
MY_COURSES

[[23126, 'en', 'Compass - powerful SASS library that makes your life easier'],
 [21617, 'en', 'Preparing for the AP* Computer Science A Exam — Part 2'],
 [16627, 'es', 'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'],
 [11556,
  'es',
  'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'],
 [16704, 'ru', 'Программирование на Lazarus'],
 [13702, 'ru', 'Математическая экономика']]

In [4]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

conf = SparkConf()
conf.set("spark.app.name", "Temirlan Spark Dataframe app") 

spark = SparkSession.builder.config(conf=conf).appName("Recommend system app").getOrCreate()

In [6]:
!hdfs dfs -get /labs/slaba02/DO_record_per_line.json

In [5]:
spark

In [6]:
rdd = spark.read.json("/labs/slaba02/DO_record_per_line.json")

In [None]:
from pyspark.sql.functions import udf, col, lower, regexp_replace
from pyspark.ml.feature import Tokenizer, StopWordsRemover
import pyspark.sql.functions as f

In [None]:
log_with_regions.groupBy("region")\
                .agg(f.count("ip").alias("count"), f.count(f.lit("1")).alias("cnt"))\
                .orderBy("count", ascending=False)\
                .show(10)

In [None]:
dataset = dataset.withColumn('desc', lower(col('desc'))) \
                 .withColumn('desc', regexp_replace('desc', "[^a-zA-Z\\s]", "")) \
                 .select(['id', 'lang', 'desc']).cache()

In [None]:
# Clean text
dataset = dataset[(dataset.lang=='en') | (dataset.lang=='es') | (dataset.lang=='ru')]

In [None]:
import nltk
nltk.download("stopwords")

In [None]:
english_stop_words = nltk.corpus.stopwords.words('english')
spanish_stop_words = nltk.corpus.stopwords.words('spanish')
russian_stop_words = nltk.corpus.stopwords.words('russian')

In [None]:
english_remover = StopWordsRemover(inputCol='desc', outputCol='desc_clean', stopWords=english_stop_words)
spanish_remover = StopWordsRemover(inputCol='desc', outputCol='desc_clean', stopWords=spanish_stop_words)
russian_remover = StopWordsRemover(inputCol='desc', outputCol='desc_clean', stopWords=russian_stop_words)

In [None]:
df_english = dataset[dataset.lang == 'en']
df_spanigh = dataset[dataset.lang == 'es']
df_russian = dataset[dataset.lang == 'ru']

In [None]:
df_english = english_remover.transform(df_english.withColumn('desc', f.split('desc', ' ')))

In [None]:
df_spanigh = spanish_remover.transform(df_spanigh.withColumn('desc', f.split('desc', ' ')))
df_russian = russian_remover.transform(df_russian.withColumn('desc', f.split('desc', ' ')))

In [None]:
df_english.show(10)

In [None]:
stopwords = {'eng': english_stop_words,
             'esp': spanish_stop_words,
             'rus': russian_stop_words}

In [None]:
dataset.printSchema()

In [7]:
from pyspark.sql.functions import udf, max as max_, col, rank, lit
from pyspark.sql.types import IntegerType, DoubleType, ArrayType, StringType
from pyspark.sql.window import Window
from pyspark.ml.feature import HashingTF, IDF
from pyspark.ml.linalg import Vectors, VectorUDT
import re
import numpy as np 

def cos_(x, y):
    if(x == None or y == None):
        return np.nan
    else:
        if float(x.norm(2)*y.norm(2))!=0:
            return float(x.dot(y)/(x.norm(2)*y.norm(2)))
        else:
            return float(-1)
cosinus_ = udf(lambda x,y: cos_(x, y), DoubleType())

def get_tokens(s_):
    regex = re.compile(u'[\w\d]{2,}', re.U)
    return regex.findall(s_.lower())

get_tokens_ = udf(lambda x: get_tokens(x), ArrayType(StringType()))

result_dict = {}
len_ = len(MY_COURSES)
for i in range(0, len_):
    cur_course = MY_COURSES[i]
    df_ = rdd \
    .filter(col('lang')==cur_course[1])

    t_ = df_ \
    .withColumn('desc', get_tokens_(col('desc')))
    
    ht = HashingTF(inputCol='desc', outputCol='vector_tf', numFeatures=10000)
    tf_ = ht.transform(t_)

    idf_m = IDF(inputCol='vector_tf', outputCol='vector_')  
    idf_r = idf_m.fit(tf_) 
    idf_ = idf_r.transform(tf_)
    
    v_ = idf_\
    .filter(col('id')==cur_course[0])\
    .withColumnRenamed('id','id_base')\
    .withColumnRenamed('vector_','vector_base')\
    .select('id_base','vector_base')

    tv_ = idf_\
    .join(v_,col('id')!=col('id_base'))\
    .withColumn('cos', cosinus_(col('vector_base'), col('vector_')))\
    .select('id', 'id_base', 'name', 'cos')

    window = Window.orderBy(tv_['cos'].desc(), tv_['name'])
    t_ = tv_.select('id', 'cos', rank().over(window).alias('rank')).filter(col('rank') <= 10).collect()
    l_ = [int(row[0]) for row in t_]
    result_dict[str(cur_course[0])] = l_
    print(i)

0
1
2
3
4
5


In [8]:
print(result_dict)

{'23126': [14760, 13665, 13782, 20638, 24419, 15909, 2724, 25782, 17499, 13348], '21617': [21609, 21616, 22298, 21608, 21630, 21628, 21508, 21623, 21081, 19417], '16627': [11431, 17961, 17964, 5687, 12247, 16694, 5558, 12660, 11575, 9563], '11556': [10384, 16488, 468, 22710, 13461, 21707, 19330, 23357, 10447, 9465], '16704': [1219, 1327, 20362, 1228, 26980, 55, 1236, 1247, 1365, 913, 20095], '13702': [864, 21079, 1111, 792, 1410, 8123, 1041, 1033, 8313, 1396]}


In [9]:
import json
with open('lab02.json', 'w') as fp:
    json.dump(result_dict, fp)