In [1]:
import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 3 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

conf = SparkConf()
conf.set("spark.app.name", "pankov lab02") 

spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [13]:
df = spark.read.json("/labs/slaba02/DO_record_per_line.json", encoding='UTF-8').cache()

In [14]:
df.show()

+--------------------+--------------------+---+----+--------------------+--------------+
|                 cat|                desc| id|lang|                name|      provider|
+--------------------+--------------------+---+----+--------------------+--------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|
|              11/law|This online cours...|  5|  en|American Counter ...|Canvas Network|
|5/computer_scienc...|This course is ta...|  6|  fr|Arithmétique: en ...|Canvas Network|
|  14/social_sciences|We live in a digi...|  7|  en|Becoming a Dynami...|Canvas Network|
|2/biology_life_sc...|This self-paced c...|  8|  en|           Bioethics|Canvas Network|
|9/humanities|15/m...|This game-based c...|  9|  en|College Foundatio...|Canvas Network|
|  14/social_sciences|What’s in your di...| 10|  en|Digital Literacies I|Canvas Network|
|  14/social_sciences|The goal of the D...| 11|  en|Digital Literacie...|Canvas Network|
|  14/social_sciences

In [15]:
from pyspark.ml.feature import Tokenizer, HashingTF, IDF, Normalizer
import pyspark.sql.functions as f
from pyspark.sql.types import ArrayType, StringType, FloatType
from pyspark.ml.param import Param, Params
import re

In [16]:
class CustomTokenizer(Tokenizer):
    regex = Param(Params._dummy(), "regex",
                      "regex func")
    
    def __init__(self, inputCol=None, outputCol=None, regex=u'[\w\d]{2,}'):
        super(CustomTokenizer, self).__init__()
        if inputCol is not None:
            self.setInputCol(inputCol)
        if outputCol is not None:
            self.setOutputCol(outputCol)
        self._set(regex=regex)
    
    def get_regex(self):
        try:
            return self.getOrDefault("regex")
        except AttributeError as e:
            raise ValueError("bad regexp".format(self.getOrDefault("regex")))
    
    def setRegex(self, regex):
        self._set(regex=regex)
            
    def _transform(self, dataset):
        regex = re.compile(self.get_regex(), re.U)
        #regex = self.get_regex()
        dataset.withColumn(self.getOutputCol(), f.lit(None).cast(ArrayType(StringType())))
        #return dataset.withColumn(self.getOutputCol(), return_regex(self.getInputCol(), regex))
        return dataset.withColumn(self.getOutputCol(), f.udf(lambda x: [i for i in regex.findall(x.lower())], ArrayType(StringType()))(self.getInputCol()))
        #return dataset.withColumn(self.getOutputCol(), (f.udf(lambda x: [i for i in range(10)], ArrayType(StringType()))(self.getInputCol())))

In [17]:
#tokenizer = CustomTokenizer(inputCol="desc", outputCol="words", pattern=u'\\W+', minTokenLength=3)
tokenizer = CustomTokenizer(inputCol="desc", outputCol="words")

In [18]:
df2 = tokenizer.transform(df)

In [19]:
df2.show()

+--------------------+--------------------+---+----+--------------------+--------------+--------------------+
|                 cat|                desc| id|lang|                name|      provider|               words|
+--------------------+--------------------+---+----+--------------------+--------------+--------------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|[this, course, in...|
|              11/law|This online cours...|  5|  en|American Counter ...|Canvas Network|[this, online, co...|
|5/computer_scienc...|This course is ta...|  6|  fr|Arithmétique: en ...|Canvas Network|[this, course, is...|
|  14/social_sciences|We live in a digi...|  7|  en|Becoming a Dynami...|Canvas Network|[we, live, in, di...|
|2/biology_life_sc...|This self-paced c...|  8|  en|           Bioethics|Canvas Network|[this, self, pace...|
|9/humanities|15/m...|This game-based c...|  9|  en|College Foundatio...|Canvas Network|[this, game, base...|
|  14/soci

In [20]:
df2.printSchema()

root
 |-- cat: string (nullable = true)
 |-- desc: string (nullable = true)
 |-- id: long (nullable = true)
 |-- lang: string (nullable = true)
 |-- name: string (nullable = true)
 |-- provider: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [21]:
ht = HashingTF(inputCol="words", outputCol="tf", numFeatures=10000)
df2 = ht.transform(df2)

In [23]:
idf = IDF(inputCol="tf", outputCol="tfidf")
idfModel = idf.fit(df2)
df2 = idfModel.transform(df2)

normalizer = Normalizer(inputCol='tfidf', outputCol='features')
df2 = normalizer.transform(df2)

In [24]:
df2.printSchema()

root
 |-- cat: string (nullable = true)
 |-- desc: string (nullable = true)
 |-- id: long (nullable = true)
 |-- lang: string (nullable = true)
 |-- name: string (nullable = true)
 |-- provider: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- tf: vector (nullable = true)
 |-- tfidf: vector (nullable = true)
 |-- features: vector (nullable = true)



In [25]:
df2.show(truncate=False)

+--------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [26]:
df2 = df2.cache()

In [27]:
# unicode is ok
df2.where(f.col('id') == 13702).collect()

[Row(cat='6/economics_finance|15/mathematics_statistics_and_data_analysis', desc='Математическая экономика – это набор моделей в той или иной степени правильно описывающих процессы в экономике.', id=13702, lang='ru', name='Математическая экономика', provider='Intuit', words=['математическая', 'экономика', 'это', 'набор', 'моделей', 'той', 'или', 'иной', 'степени', 'правильно', 'описывающих', 'процессы', 'экономике'], tf=SparseVector(10000, {310: 1.0, 942: 1.0, 2172: 1.0, 2788: 1.0, 2855: 1.0, 4800: 1.0, 5647: 1.0, 6620: 1.0, 6943: 1.0, 7822: 1.0, 8203: 1.0, 9329: 1.0, 9678: 1.0}), tfidf=SparseVector(10000, {310: 5.075, 942: 5.3328, 2172: 5.7911, 2788: 4.3875, 2855: 5.7237, 4800: 4.9273, 5647: 5.5633, 6620: 3.2626, 6943: 5.6107, 7822: 3.941, 8203: 4.4834, 9329: 6.3742, 9678: 5.202}), features=SparseVector(10000, {310: 0.2751, 942: 0.289, 2172: 0.3139, 2788: 0.2378, 2855: 0.3102, 4800: 0.2671, 5647: 0.3015, 6620: 0.1768, 6943: 0.3041, 7822: 0.2136, 8203: 0.243, 9329: 0.3455, 9678: 0.2819

In [28]:
ids_dict = {}
for id in [21617, 16627, 11556, 16704, 13702, 23126]:
    ids_dict[id] = df2.where(f.col('id') == id).select('lang').collect()[0][0]

In [29]:
ids_dict

{21617: 'en', 16627: 'es', 11556: 'es', 16704: 'ru', 13702: 'ru', 23126: 'en'}

In [33]:
def create_prediction(df, ids_dict):
    result = {}
    for item in ids_dict.items():
        #cosine = Counter()
        current_vector = df.where(f.col('id') == item[0]).select('features').collect()[0][0]
        current_df = df.where((f.col('lang') == item[1]) & (f.col('id') != item[0])).select(['id', 'name', 'features'])#.collect()
        current_df = current_df.withColumn('cosine', f.udf(lambda x: float(current_vector.dot(x) / (current_vector.norm(2) * x.norm(2))), FloatType())('features'))
        current_df = current_df.dropna().orderBy(['cosine', 'name', 'id'], ascending=[0, 1, 1])
        #return current_df
        if item[0] == 23126:
            result[str(item[0])] = [i[0] for i in current_df.select('id').collect()[10:20]]
        else:
            result[str(item[0])] = [i[0] for i in current_df.select('id').collect()[:10]]
        #for i in current_df:
           # cosine[i[0]] = current_vector.dot(i[1]) / (current_vector.norm(2) * i[1].norm(2))
        #current_df = current_df.withColumn('cosine', f.udf(lambda x: current_vector.dot(x) / (current_vector.norm(2) * x.norm(2)))('features'))
        #result[str(item[0])] = [i[0] for i in cosine.most_common(10)]
    return result

In [34]:
b =  create_prediction(df2, ids_dict)

In [36]:
b

{'21617': [21609,
  21616,
  21608,
  22298,
  21630,
  21628,
  21623,
  21508,
  21081,
  19417],
 '16627': [11431, 11575, 12247, 17964, 5687, 17961, 16694, 12660, 25010, 5558],
 '11556': [16488, 468, 13461, 23357, 19330, 7833, 9289, 10447, 22710, 11340],
 '16704': [1236, 1247, 1365, 1273, 20288, 1164, 8186, 1233, 8203, 875],
 '13702': [864, 21079, 8313, 1041, 28074, 8300, 1033, 13057, 21025, 1111],
 '23126': [25071, 19270, 23756, 2723, 7153, 2633, 17208, 13781, 26507, 2103]}

In [46]:
import json

with open('/data/home/ivan.pankov/lab02.json', 'w') as f:
    json.dump(b, f)

In [47]:
spark.stop()