In [1]:
import os
import sys

# Point PySpark at the interpreter and Spark installation used by this notebook
# and request four executors for the shell that is started below.
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"] = '--num-executors 4 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Make the pyspark package and its bundled py4j importable from SPARK_HOME.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Run PySpark's shell bootstrap inside this namespace (its banner reports a
# SparkSession available as `spark`). The script is read through a context
# manager so the file handle is closed instead of being left to the GC.
shell_script = os.path.join(spark_home, 'python/pyspark/shell.py')
with open(shell_script) as shell_file:
    exec(shell_file.read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


# Создание Spark-сессии

In [75]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.types import *
from pyspark import Row
import json

# Default Spark configuration; no options are overridden here.
conf = SparkConf()

# NOTE(review): despite its name, `sc` holds a SparkSession, not a SparkContext.
# getOrCreate() hands back an already-active session when there is one, so the
# app name may not take effect if the bootstrap cell has created a session
# already - confirm in the Spark UI.
session_builder = SparkSession.builder.config(conf=conf).appName("lab02-alexander.yusov")
sc = session_builder.getOrCreate()

# Данные

In [76]:
# Load the JSON dataset and print the schema Spark inferred for it.
courses_path = "/labs/slaba02/DO_record_per_line.json"
data = sc.read.json(courses_path)
data.printSchema()

root
 |-- cat: string (nullable = true)
 |-- desc: string (nullable = true)
 |-- id: long (nullable = true)
 |-- lang: string (nullable = true)
 |-- name: string (nullable = true)
 |-- provider: string (nullable = true)



In [77]:
# Peek at the first three rows only; never dump the whole frame.
data.show(n=3)

+--------------------+--------------------+---+----+--------------------+--------------+
|                 cat|                desc| id|lang|                name|      provider|
+--------------------+--------------------+---+----+--------------------+--------------+
|3/business_manage...|This course intro...|  4|  en|Accounting Cycle:...|Canvas Network|
|              11/law|This online cours...|  5|  en|American Counter ...|Canvas Network|
|5/computer_scienc...|This course is ta...|  6|  fr|Arithmétique: en ...|Canvas Network|
+--------------------+--------------------+---+----+--------------------+--------------+
only showing top 3 rows



# Features

In [78]:
import re

from pyspark.ml.feature import Tokenizer, RegexTokenizer, HashingTF, IDF, VectorAssembler, Normalizer
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param, Params, TypeConverters
from pyspark.ml import Transformer
from pyspark import keyword_only
from pyspark.sql import DataFrame
from pyspark.sql.types import StringType, ArrayType

## Desc

### Tokenizer

In [79]:
class RegexTransformer(Transformer, HasInputCol, HasOutputCol, DefaultParamsReadable, DefaultParamsWritable):
    """Pipeline stage that lower-cases a string column and tokenizes it with a regex.

    The output column is an ``array<string>`` holding the ``re.findall`` matches
    of ``pattern`` (compiled with ``re.U``) on the lower-cased input text.

    Params:
        input_col: name of the string column to read.
        output_col: name of the array column to write.
        pattern: regular expression used to extract tokens.

    The regex and the UDF are built inside ``_transform`` from the current param
    values, so ``set_pattern`` and a transformer restored through
    ``DefaultParamsReadable.load`` use the pattern stored in the params rather
    than whatever was passed to ``__init__``.
    """

    input_col = Param(
        parent=Params._dummy(),
        name="input_col",
        doc="Input column",
        typeConverter=TypeConverters.toString
    )
    output_col = Param(
        parent=Params._dummy(),
        name="output_col",
        doc="Output column",
        typeConverter=TypeConverters.toString
    )
    pattern = Param(
        parent=Params._dummy(),
        name="pattern",
        doc="Regex pattern",
        typeConverter=TypeConverters.toString
    )

    def __init__(
        self,
        input_col: str = "input",
        output_col: str = "output",
        pattern: str = ""
    ):
        """Create the stage.

        Raises:
            re.error: if ``pattern`` is not a valid regular expression.
        """
        super().__init__()
        self._setDefault(input_col=None, output_col=None, pattern=None)

        # Fail fast on an invalid pattern, as the eager compile always did.
        re.compile(pattern, re.U)

        self.set_params(
            input_col=input_col,
            output_col=output_col,
            pattern=pattern,
        )

    @keyword_only
    def set_params(
        self,
        input_col: str = "input",
        output_col: str = "output",
        pattern: str = ""
    ):
        """Set the explicitly passed params (keyword arguments only); returns self."""
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def get_input_col(self):
        """Return the configured input column name."""
        return self.getOrDefault(self.input_col)

    def get_output_col(self):
        """Return the configured output column name."""
        return self.getOrDefault(self.output_col)

    def get_pattern(self):
        """Return the configured regex pattern string."""
        return self.getOrDefault(self.pattern)

    def set_pattern(self, pattern: str):
        """Replace the regex pattern; takes effect on the next transform."""
        return self._set(pattern=pattern)

    def _transform(self, df: DataFrame) -> DataFrame:
        """Add the token array column computed from the input column."""
        input_col = self.get_input_col()
        output_col = self.get_output_col()

        # Compile from the current param so later pattern changes are honoured.
        regex = re.compile(self.get_pattern(), re.U)

        def tokenize(text):
            # A null text produces no tokens instead of failing on `.lower()`.
            if text is None:
                return []
            return regex.findall(text.lower())

        # The closure captures only the compiled regex, not the transformer,
        # so the whole stage does not have to be serialized to the executors.
        tokenize_udf = f.udf(tokenize, ArrayType(StringType()))

        return df.withColumn(output_col, tokenize_udf(input_col))

# Извлечение признаков

In [80]:
from pyspark.ml.feature import StopWordsRemover

# Spark's built-in stop-word lists for the three languages, loaded in the
# order russian, english, spanish.
russian_stopwords, english_stopwords, spanish_stopwords = (
    StopWordsRemover.loadDefaultStopWords(language)
    for language in ("russian", "english", "spanish")
)

# One combined list, used for every document regardless of its language.
all_stopwords = [*russian_stopwords, *english_stopwords, *spanish_stopwords]

In [81]:
def create_fe_pipeline(
    numFeatures: int = 100
) -> "Pipeline":
    """Build the (unfitted) TF-IDF feature pipeline for the ``desc`` column.

    Stages, in order: regex tokenizer (``desc`` -> ``desc_tokens``), stop-word
    removal (-> ``desc_tokens_filtered``), hashing term frequencies
    (-> ``desc_tf``) and IDF weighting (-> ``desc_tfidf``).

    Args:
        numFeatures: number of hash buckets passed to ``HashingTF``.

    Returns:
        A ``Pipeline`` that still has to be fitted.
    """
    # Raw string: "\w" and "\d" are invalid escape sequences in a plain literal
    # (DeprecationWarning since Python 3.6). The runtime pattern is unchanged.
    desc_tokenizer = RegexTransformer(input_col="desc", output_col="desc_tokens", pattern=r"[\w\d]{2,}")
    stopwords = StopWordsRemover(inputCol="desc_tokens", outputCol="desc_tokens_filtered", stopWords=all_stopwords)
    desc_tf = HashingTF(numFeatures=numFeatures, inputCol="desc_tokens_filtered", outputCol="desc_tf")
    desc_idf = IDF(inputCol="desc_tf", outputCol="desc_tfidf")

    return Pipeline(stages=[
        desc_tokenizer,
        stopwords,
        desc_tf,
        desc_idf
    ])

## По модели на язык

## Одна модель на все языки

In [82]:
# The training input is the dataset as loaded. Two alternatives were tried and
# left disabled: using `name` as the text, and `name` + `desc` concatenated.
data_train = data

# Fit the feature pipeline with 10000 hash buckets, then apply it to the same
# data to obtain the TF-IDF vectors.
fe_pipeline = create_fe_pipeline(numFeatures=10000).fit(data_train)
features_data = fe_pipeline.transform(data_train)

# Load test data

In [83]:
# Courses to produce recommendations for, as (id, lang, text) rows.
# NOTE(review): the text values look like course titles although the column is
# called `desc`; the cells visible here only use `id` from this frame.
test_rows = [
    [23126, u'en', u'Compass - powerful SASS library that makes your life easier'],
    [21617, u'en', u'Preparing for the AP* Computer Science A Exam \u2014 Part 2'],
    [16627, u'es', u'Aprende Excel: Nivel Intermedio by Alfonso Rinsche'],
    [11556, u'es', u'Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo'],
    [16704, u'ru', u'\u041f\u0440\u043e\u0433\u0440\u0430\u043c\u043c\u0438\u0440\u043e\u0432\u0430\u043d\u0438\u0435 \u043d\u0430 Lazarus'],
    [13702, u'ru', u'\u041c\u0430\u0442\u0435\u043c\u0430\u0442\u0438\u0447\u0435\u0441\u043a\u0430\u044f \u044d\u043a\u043e\u043d\u043e\u043c\u0438\u043a\u0430'],
]

# All three columns are declared non-nullable.
test_schema = StructType([
    StructField("id", IntegerType(), False),
    StructField("lang", StringType(), False),
    StructField("desc", StringType(), False),
])

test_data = sc.createDataFrame(data=test_rows, schema=test_schema)
test_data.show(truncate=False)

+-----+----+------------------------------------------------------------------------------+
|id   |lang|desc                                                                          |
+-----+----+------------------------------------------------------------------------------+
|23126|en  |Compass - powerful SASS library that makes your life easier                   |
|21617|en  |Preparing for the AP* Computer Science A Exam — Part 2                        |
|16627|es  |Aprende Excel: Nivel Intermedio by Alfonso Rinsche                            |
|11556|es  |Aprendizaje Colaborativo by UNID Universidad Interamericana para el Desarrollo|
|16704|ru  |Программирование на Lazarus                                                   |
|13702|ru  |Математическая экономика                                                      |
+-----+----+------------------------------------------------------------------------------+



## Join

In [84]:
features_data_columns = features_data.columns

# Every feature column gets a "_test" suffix on the test side; `lang` keeps its
# name because it is the key of the second join.
preview_test_columns = [
    "lang" if name == "lang" else f.col(name).alias(name + "_test")
    for name in features_data_columns
]

# Feature rows of the test courses only (right join on the test ids).
preview_test_features = (
    features_data
    .join(other=test_data.select("id"), on="id", how="right")
    .select(*preview_test_columns)
)

# Pair every course with each test course of the same language and inspect
# the resulting schema before computing anything on it.
features_data.join(other=preview_test_features, on="lang", how="right").printSchema()

root
 |-- lang: string (nullable = true)
 |-- cat: string (nullable = true)
 |-- desc: string (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- provider: string (nullable = true)
 |-- desc_tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- desc_tokens_filtered: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- desc_tf: vector (nullable = true)
 |-- desc_tfidf: vector (nullable = true)
 |-- cat_test: string (nullable = true)
 |-- desc_test: string (nullable = true)
 |-- id_test: integer (nullable = false)
 |-- name_test: string (nullable = true)
 |-- provider_test: string (nullable = true)
 |-- desc_tokens_test: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- desc_tokens_filtered_test: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- desc_tf_test: vector (nullable = true)
 |-- desc_tfidf_test: vector (nullable = true)



In [85]:
from pyspark.sql.window import Window


@f.udf(FloatType())
def cos_sim(x, y):
    """Cosine similarity of two ML vectors, or 0.0 when it is undefined.

    Args:
        x, y: vectors exposing ``dot`` and ``norm`` (the TF-IDF columns here).

    Returns:
        ``x.y / (|x| * |y|)`` as a float; 0.0 if either vector is null or has
        zero L2 norm.
    """
    # The fallback must be a real float: the declared return type is FloatType
    # and, as far as known, PySpark yields NULL for a mismatched Python int
    # instead of converting it - TODO confirm on this Spark version.
    if x is None or y is None:
        return 0.0

    denominator = float(x.norm(2) * y.norm(2))
    if denominator == 0.0:
        # Empty documents hash to an all-zero vector; treat them as dissimilar.
        return 0.0

    return float(x.dot(y)) / denominator


# Rank the candidates of each test course: highest similarity first, ties
# broken by name and then by id.
ranking_order = [
    f.col("similarity").desc(),
    f.col("name"),
    f.col("id"),
]
window = Window.partitionBy("id_test").orderBy(ranking_order)

# Test-side columns carry a "_test" suffix; `lang` stays as the join key.
test_side_columns = [
    "lang" if name == "lang" else f.col(name).alias(name + "_test")
    for name in features_data_columns
]

# Feature rows of the test courses only.
test_features = (
    features_data
    .join(other=test_data.select("id"), on="id", how="right")
    .select(*test_side_columns)
)

# Every course paired with each test course of the same language.
same_language_pairs = features_data.join(other=test_features, on="lang", how="right")

result_df = (
    same_language_pairs
    # a course must not be recommended for itself
    .filter("id_test != id")
    .withColumn("similarity", cos_sim("desc_tfidf", "desc_tfidf_test"))
    .withColumn("row", f.row_number().over(window))
    # keep the ten best-ranked candidates per test course
    .filter(f.col("row") <= 10)
    .persist()
)

result_df.show(1, truncate=False, vertical=True)

-RECORD 0--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 lang                      | ru                                                                                                                                                                                                                                                                                              
 cat                       | 6/economics_finance|15/mathematics_statistics_and_data_analysis                                                                                                                                                                                                                                 
 desc                      | Математическая эк

23126
21617
16627
11556
16704
13702

In [86]:
# Spot-check the ranked recommendations for one test course (id 16704).
inspected_matches = (
    result_df
    .filter(f.col("id_test") == 16704)
    .select("id", "id_test", "name", "desc_test", "similarity")
)
inspected_matches.show(truncate=False, vertical=True)

-RECORD 0------------------------------------------------------------------------------
 id         | 1236                                                                     
 id_test    | 16704                                                                    
 name       | Программирование на языке C++                                            
 desc_test  | В курсе рассматривается среда программирования Lazarus и компилятор FPC. 
 similarity | 0.2936135                                                                
-RECORD 1------------------------------------------------------------------------------
 id         | 1247                                                                     
 id_test    | 16704                                                                    
 name       | Введение в программирование                                              
 desc_test  | В курсе рассматривается среда программирования Lazarus и компилятор FPC. 
 similarity | 0.27732363        

# Save result

In [87]:
# collect_list gives no guarantee about the order of the collected values, so
# the rank (`row`) is collected together with the id and the pairs are sorted
# explicitly; the ids are then extracted in rank order.
result_agg = (
    result_df
    .select("id_test", "row", "id")
    .groupBy("id_test")
    .agg(f.sort_array(f.collect_list(f.struct("row", "id"))).alias("ranked"))
    # selecting a field of an array of structs yields an array of that field
    .select("id_test", f.col("ranked.id").alias("id"))
    .collect()
)

# Mapping: test course id -> recommended course ids, best match first.
result_dict = {row["id_test"]: row["id"] for row in result_agg}

result_dict

{13702: [864, 28074, 1041, 21079, 8300, 13057, 8313, 21025, 1033, 1111],
 16627: [11431, 5687, 17964, 12660, 12247, 17961, 16694, 5558, 11575, 13551],
 16704: [1236, 1247, 1365, 1164, 1273, 20288, 8186, 1233, 8203, 18331],
 23126: [13665, 14760, 13782, 20638, 24419, 15909, 2724, 25782, 17499, 13348],
 21617: [21609, 21616, 22298, 21608, 21630, 21628, 21081, 21623, 19417, 21508],
 11556: [16488, 468, 19330, 10447, 23357, 21707, 22710, 13461, 10384, 13776]}

In [88]:
import json

# Write the recommendations as JSON (integer keys are serialized as strings).
# The handle is not called `f`, which is the pyspark.sql.functions alias.
output_path = "/data/home/alexander.yusov/lab02.json"
with open(output_path, "w") as output_file:
    json.dump(result_dict, output_file)

# Закрытие Spark-сессии

In [None]:
# Stop the session held in `sc` (a SparkSession, see its creation above in the notebook).
sc.stop()