In [1]:
import findspark
import numpy as np
import pandas as pd
import seaborn as sns
findspark.init()

from pyspark.ml.feature import MinHashLSH, BucketedRandomProjectionLSH
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import coalesce, udf, struct, col, lit, unix_timestamp, count, when, isnan, isnull, split

from pyspark.ml import Pipeline
from IPython.display import display
from pyspark.sql import SparkSession, Row
from pyspark import SparkConf, SparkContext
from pyspark.ml.feature import VectorAssembler, StandardScaler, MinMaxScaler
from pyspark.ml.clustering import KMeans, GaussianMixture, BisectingKMeans
from pyspark.mllib.evaluation import MulticlassMetrics

import pyspark.sql.functions as F
from pyspark.sql.types import IntegerType, StructType, StructField, StringType
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.feature import VectorAssembler


# Create (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName('laptop_everis')
    .getOrCreate()
)

# The string literal below is inert: it only keeps an alternative builder
# configuration around for reference and is never executed.
"""
spark = SparkSession.builder\
       .appName("Simple recommendation engine using Spark MLlib")\
       .config("spark.some.config.option", "config-value")\
       .getOrCreate()\
"""
spark

In [2]:
# SparkContext is already imported in the first cell; this re-import is redundant but harmless.
from pyspark import SparkContext
# Obtain the SparkContext via getOrCreate() and echo it as the cell output.
sc = SparkContext.getOrCreate()
sc

In [3]:
# Load the dating-agency ratings file into a Spark DataFrame.

ratings = "data/ratings.dat"

# Column layout of the header-less, comma-separated file, in file order.
schema = (
    StructType()
    .add("user_id", IntegerType(), False)
    .add("profile_id", IntegerType(), False)
    .add("rating", IntegerType(), True)
)

ratings_df = (
    spark.read
    .schema(schema)
    .options(header="false", delimiter=",")
    .csv(ratings)
)

# Quick sanity check of the first rows.
ratings_df.show(3)

+-------+----------+------+
|user_id|profile_id|rating|
+-------+----------+------+
|      1|       133|     8|
|      1|       720|     6|
|      1|       971|    10|
+-------+----------+------+
only showing top 3 rows



In [4]:
# Exploratory: show the documentation of DataFrameNaFunctions.drop (used in the next cell).
help(ratings_df.na.drop)

Help on method drop in module pyspark.sql.dataframe:

drop(how='any', thresh=None, subset=None) method of pyspark.sql.dataframe.DataFrameNaFunctions instance
    Returns a new :class:`DataFrame` omitting rows with null values.
    :func:`DataFrame.dropna` and :func:`DataFrameNaFunctions.drop` are aliases of each other.
    
    :param how: 'any' or 'all'.
        If 'any', drop a row if it contains any nulls.
        If 'all', drop a row only if all its values are null.
    :param thresh: int, default None
        If specified, drop rows that have less than `thresh` non-null values.
        This overwrites the `how` parameter.
    :param subset: optional list of column names to consider.
    
    >>> df4.na.drop().show()
    +---+------+-----+
    |age|height| name|
    +---+------+-----+
    | 10|    80|Alice|
    +---+------+-----+
    
    .. versionadded:: 1.3.1



In [5]:
# Row count before removing incomplete rows.
print(ratings_df.count())

# Discard every row containing at least one null (dropna is the documented alias of na.drop).
ratings_df = ratings_df.dropna(how="any")
ratings_df.printSchema()

17359346
root
 |-- user_id: integer (nullable = true)
 |-- profile_id: integer (nullable = true)
 |-- rating: integer (nullable = true)



In [6]:
# Exploratory: list the attributes and methods available on the ratings DataFrame.
print(dir(ratings_df))

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_collectAsArrow', '_jcols', '_jdf', '_jmap', '_jseq', '_lazy_rdd', '_sc', '_schema', '_sort_cols', 'agg', 'alias', 'approxQuantile', 'cache', 'checkpoint', 'coalesce', 'colRegex', 'collect', 'columns', 'corr', 'count', 'cov', 'createGlobalTempView', 'createOrReplaceGlobalTempView', 'createOrReplaceTempView', 'createTempView', 'crossJoin', 'crosstab', 'cube', 'describe', 'distinct', 'drop', 'dropDuplicates', 'drop_duplicates', 'dropna', 'dtypes', 'explain', 'fillna', 'filter', 'first', 'foreach', 'foreachPartition', 'freqItems', 'groupBy', 'groupby', 'head', 'hint', 'intersect', 'isLocal', 'isStreaming', 'is_cached', 

In [7]:
# Row count after the null-dropping step, for comparison with the count printed before it.
print(ratings_df.count())

17359346


In [8]:
# Load the profile -> gender lookup file into a Spark DataFrame.
gender_data = "data/gender.dat"

# NOTE: this rebinds the name `schema` that the ratings cell also uses.
schema = (
    StructType()
    .add("profile_id", IntegerType(), False)
    .add("gender", StringType(), False)
)

gender_df = (
    spark.read
    .schema(schema)
    .options(header="false", delimiter=",")
    .csv(gender_data)
)

# Quick sanity check of the first rows.
gender_df.show(10)

+----------+------+
|profile_id|gender|
+----------+------+
|         1|     F|
|         2|     F|
|         3|     U|
|         4|     F|
|         5|     F|
|         6|     F|
|         7|     F|
|         8|     M|
|         9|     M|
|        10|     M|
+----------+------+
only showing top 10 rows



In [9]:
# Exploratory: list the attributes available on the single-column selection of "rating".
print(dir(ratings_df.select("rating")))

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_collectAsArrow', '_jcols', '_jdf', '_jmap', '_jseq', '_lazy_rdd', '_sc', '_schema', '_sort_cols', 'agg', 'alias', 'approxQuantile', 'cache', 'checkpoint', 'coalesce', 'colRegex', 'collect', 'columns', 'corr', 'count', 'cov', 'createGlobalTempView', 'createOrReplaceGlobalTempView', 'createOrReplaceTempView', 'createTempView', 'crossJoin', 'crosstab', 'cube', 'describe', 'distinct', 'drop', 'dropDuplicates', 'drop_duplicates', 'dropna', 'dtypes', 'explain', 'fillna', 'filter', 'first', 'foreach', 'foreachPartition', 'freqItems', 'groupBy', 'groupby', 'head', 'hint', 'intersect', 'isLocal', 'isStreaming', 'is_cached', 

In [10]:
# Exploratory: show the documentation of DataFrame.agg.
# help() prints the text itself and returns None, so it is called bare;
# wrapping it in print() appended a stray "None" line to the output.
help(ratings_df.select("rating").agg)

Help on method agg in module pyspark.sql.dataframe:

agg(*exprs) method of pyspark.sql.dataframe.DataFrame instance
    Aggregate on the entire :class:`DataFrame` without groups
    (shorthand for ``df.groupBy.agg()``).
    
    >>> df.agg({"age": "max"}).collect()
    [Row(max(age)=5)]
    >>> from pyspark.sql import functions as F
    >>> df.agg(F.min(df.age)).collect()
    [Row(min(age)=2)]
    
    .. versionadded:: 1.3

None


In [11]:
# Maximum rating present in the data
row_max = ratings_df.agg(F.max("rating")).first()
max_rating = row_max["max(rating)"]
max_rating

10

In [12]:
# Minimum rating present in the data
row_min = ratings_df.agg(F.min("rating")).first()
min_rating = row_min["min(rating)"]
min_rating

1

In [13]:
def calc(df, col, func):
    """Aggregate one column of ``df`` and return the resulting scalar.

    :param df: object exposing ``agg(dict)`` whose result can be ``collect()``-ed
        (a Spark DataFrame in this notebook).
    :param col: name of the column to aggregate.
    :param func: aggregate function name passed to ``agg`` (e.g. ``'min'``, ``'max'``).
    :return: the single value produced by the aggregation.
    """
    # The aggregation yields exactly one row with one column, so read the value
    # by position. Looking it up under the formatted "func(col)" name breaks
    # whenever the engine names the result differently from the requested
    # function (e.g. "mean" is reported as "avg(col)").
    return df.agg({col: func}).collect()[0][0]

# Smoke check of calc(): should match min_rating computed in the previous cell.
calc(ratings_df, 'rating', 'min')

1

In [14]:
import math

# Preview of log(n + 1) + 0.69 for n = 0..9, i.e. the offset-log transform
# evaluated at arguments 1..10.
{n: math.log(n + 1) + 0.69 for n in range(10)}

{0: 0.69,
 1: 1.3831471805599453,
 2: 1.7886122886681097,
 3: 2.0762943611198903,
 4: 2.2994379124341,
 5: 2.481759469228055,
 6: 2.635910149055313,
 7: 2.7694415416798357,
 8: 2.8872245773362195,
 9: 2.992585092994046}

In [15]:
# MIN MAX SCALER
# Derived columns (all rounded to 4 decimals):
#   norm_rating    - min-max scaled rating
#   log_rating     - log(rating) + 1
#   log_rating_069 - log(rating) + 0.69
rating_col = F.col("rating")

ratings_df = (
    ratings_df
    .withColumn("norm_rating", F.round((rating_col - min_rating) / (max_rating - min_rating), 4))
    .withColumn("log_rating", F.round(F.log(rating_col) + 1, 4))
    .withColumn("log_rating_069", F.round(F.log(rating_col) + 0.69, 4))
)

# Fetch both bounds of log_rating_069 in a single aggregation instead of
# launching one full-data job for the minimum and another for the maximum.
log_069_bounds = ratings_df.agg(
    F.min("log_rating_069").alias("lo"),
    F.max("log_rating_069").alias("hi"),
).first()
min_log_069 = log_069_bounds["lo"]
max_log_069 = log_069_bounds["hi"]

# Min-max scale the offset-log rating as well.
ratings_df = ratings_df.withColumn(
    "norm_log_069",
    F.round((F.col("log_rating_069") - min_log_069) / (max_log_069 - min_log_069), 4),
)

# Modelling frame: the min-max scaled rating becomes the "label" column.
df = (
    ratings_df
    .select("user_id", "profile_id", "rating", "norm_rating", "log_rating", "log_rating_069", "norm_log_069")
    .withColumnRenamed("norm_rating", "label")  # original norm_rating --> label
)

df.show(50)

+-------+----------+------+------+----------+--------------+------------+
|user_id|profile_id|rating| label|log_rating|log_rating_069|norm_log_069|
+-------+----------+------+------+----------+--------------+------------+
|      1|       133|     8|0.7778|    3.0794|        2.7694|      0.9031|
|      1|       720|     6|0.5556|    2.7918|        2.4818|      0.7782|
|      1|       971|    10|   1.0|    3.3026|        2.9926|         1.0|
|      1|      1095|     7|0.6667|    2.9459|        2.6359|      0.8451|
|      1|      1616|    10|   1.0|    3.3026|        2.9926|         1.0|
|      1|      1978|     7|0.6667|    2.9459|        2.6359|      0.8451|
|      1|      2145|     8|0.7778|    3.0794|        2.7694|      0.9031|
|      1|      2211|     8|0.7778|    3.0794|        2.7694|      0.9031|
|      1|      3751|     7|0.6667|    2.9459|        2.6359|      0.8451|
|      1|      4062|     3|0.2222|    2.0986|        1.7886|      0.4771|
|      1|      4633|    10|   1.0|    

In [16]:
# Top 15 most popular profiles (by number of ratings received).
# NOTE: the code below is disabled by wrapping it in a string literal, so this
# cell only echoes the string; top_most_rated_profiles is never defined.
"""
top_most_rated_profiles = ratings_df.groupBy("profile_id").count().sort(F.col("count").desc()).limit(50000)
top_most_rated_profiles = top_most_rated_profiles.withColumnRenamed("profile_id", "popular_profile_id")
print("> ", top_most_rated_profiles.count())

top_most_rated_profiles.show(15)
"""

'\ntop_most_rated_profiles = ratings_df.groupBy("profile_id").count().sort(F.col("count").desc()).limit(50000)\ntop_most_rated_profiles = top_most_rated_profiles.withColumnRenamed("profile_id", "popular_profile_id")\nprint("> ", top_most_rated_profiles.count())\n\ntop_most_rated_profiles.show(15)\n'

In [17]:
# Top 50 best-rated profiles (by average rating).
# NOTE: the code below is disabled by wrapping it in a string literal, so this
# cell only echoes the string; avg_rating_by_profile is never defined.

"""
avg_rating_by_profile = ratings_df.groupBy(
    "profile_id"
).agg(
    F.avg('rating').alias('avg_rating')
).sort(
    F.col("avg_rating").desc()
)
print("> ", avg_rating_by_profile.count())

avg_rating_by_profile.show(50)
"""

'\navg_rating_by_profile = ratings_df.groupBy(\n    "profile_id"\n).agg(\n    F.avg(\'rating\').alias(\'avg_rating\')\n).sort(\n    F.col("avg_rating").desc()\n)\nprint("> ", avg_rating_by_profile.count())\n\navg_rating_by_profile.show(50)\n'

In [18]:
# Compute a ratio between the average rating value and the popularity
# (number of ratings received) of each profile.
# NOTE: both snippets below are disabled by wrapping them in string literals;
# they depend on the (also disabled) frames from the two previous cells.

"""
top_profiles = top_most_rated_profiles.join(
    avg_rating_by_profile, 
    top_most_rated_profiles["popular_profile_id"] == avg_rating_by_profile["profile_id"],
    "left_outer"
).drop(
    'profile_id'
).withColumn(
    "ratio",
    F.col("avg_rating") / F.col("count")
)
"""

# Top profiles sorted by the relation average rating - number of times rated
# Top 25 profiles ranked by that ratio

"""
top_profiles.select(
    "popular_profile_id", "ratio", "avg_rating", "count"
).sort(
    F.col("ratio").desc()
).show(25)
"""

'\ntop_profiles.select(\n    "popular_profile_id", "ratio", "avg_rating", "count"\n).sort(\n    F.col("ratio").desc()\n).show(25)\n'

In [19]:
# ratings_df.columns

In [20]:
# ratings_df.filter(F.col('profile_id') == 160749).show(20)

In [21]:
# top_profiles_gender = gender_df.join(top_profiles, on='profile_id')

In [22]:
# ratings_df.columns

In [23]:
# Column names of the modelling frame derived from ratings_df.
df.columns

['user_id',
 'profile_id',
 'rating',
 'label',
 'log_rating',
 'log_rating_069',
 'norm_log_069']

In [24]:
# Random 80/20 train/test split. A fixed seed is passed so the split is the
# same on every run; without it the evaluation below is not reproducible.
(training, test) = df.randomSplit([0.8, 0.2], seed=42)

In [25]:
# Preview the training split.
training.show()

+-------+----------+------+------+----------+--------------+------------+
|user_id|profile_id|rating| label|log_rating|log_rating_069|norm_log_069|
+-------+----------+------+------+----------+--------------+------------+
|      1|       133|     8|0.7778|    3.0794|        2.7694|      0.9031|
|      1|       720|     6|0.5556|    2.7918|        2.4818|      0.7782|
|      1|       971|    10|   1.0|    3.3026|        2.9926|         1.0|
|      1|      1095|     7|0.6667|    2.9459|        2.6359|      0.8451|
|      1|      1616|    10|   1.0|    3.3026|        2.9926|         1.0|
|      1|      1978|     7|0.6667|    2.9459|        2.6359|      0.8451|
|      1|      2145|     8|0.7778|    3.0794|        2.7694|      0.9031|
|      1|      3751|     7|0.6667|    2.9459|        2.6359|      0.8451|
|      1|      4062|     3|0.2222|    2.0986|        1.7886|      0.4771|
|      1|      4633|    10|   1.0|    3.3026|        2.9926|         1.0|
|      1|      4842|     5|0.4444|    

In [26]:
# Column names of the training split (same as df).
training.columns

['user_id',
 'profile_id',
 'rating',
 'label',
 'log_rating',
 'log_rating_069',
 'norm_log_069']

In [27]:
# Preview the held-out test split.
test.show()

+-------+----------+------+------+----------+--------------+------------+
|user_id|profile_id|rating| label|log_rating|log_rating_069|norm_log_069|
+-------+----------+------+------+----------+--------------+------------+
|      1|      2211|     8|0.7778|    3.0794|        2.7694|      0.9031|
|      1|      7576|     8|0.7778|    3.0794|        2.7694|      0.9031|
|      1|      8923|     9|0.8889|    3.1972|        2.8872|      0.9542|
|      1|      9345|    10|   1.0|    3.3026|        2.9926|         1.0|
|      1|     11671|     5|0.4444|    2.6094|        2.2994|      0.6989|
|      1|     18287|    10|   1.0|    3.3026|        2.9926|         1.0|
|      1|     19231|     5|0.4444|    2.6094|        2.2994|      0.6989|
|      1|     19727|    10|   1.0|    3.3026|        2.9926|         1.0|
|      1|     20737|     8|0.7778|    3.0794|        2.7694|      0.9031|
|      1|     21256|    10|   1.0|    3.3026|        2.9926|         1.0|
|      1|     21642|     6|0.5556|    

In [28]:
# Column names of the test split (same as df).
test.columns

['user_id',
 'profile_id',
 'rating',
 'label',
 'log_rating',
 'log_rating_069',
 'norm_log_069']

In [29]:
seed = 42

# ALS recommender trained on the min-max scaled rating stored in "label".
als = ALS(
    userCol="user_id",
    itemCol="profile_id",
    ratingCol="label",
    coldStartStrategy="drop",
    seed=seed
)

# Set considered parameter grid.
# NOTE: the grid currently holds a single combination, so cross-validation
# only estimates its error; add more values to actually compare models.
paramGrid = ParamGridBuilder().addGrid(
    als.regParam, [0.01]
).addGrid(
    als.rank, [12]
).addGrid(
    als.nonnegative, [True]
).build()

# Set evaluator: RMSE between the "label" column and the model's "prediction"
# column (spelled out explicitly instead of relying on the defaults).
modelEvaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="label",
    predictionCol="prediction",
)

# Set cross validator instance. The seed fixes the fold assignment so the
# reported metrics and the selected model are reproducible across runs.
crossval = CrossValidator(estimator=als,
                          estimatorParamMaps=paramGrid,
                          evaluator=modelEvaluator,
                          numFolds=3,
                          seed=seed)

# Perform cross-validation
cvModel = crossval.fit(training)

In [30]:
# Echo the fitted cross-validation model.
cvModel

CrossValidatorModel_47cab7358acf8bd7a0d7

In [31]:
# Exploratory: list the attributes available on the fitted cross-validation model.
print(dir(cvModel))

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__metaclass__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_clear', '_copyValues', '_copy_params', '_defaultParamMap', '_dummy', '_from_java', '_from_java_impl', '_paramMap', '_params', '_randomUID', '_resetUid', '_resolveParam', '_set', '_setDefault', '_shouldOwn', '_to_java', '_to_java_impl', '_transform', 'avgMetrics', 'bestModel', 'copy', 'estimator', 'estimatorParamMaps', 'evaluator', 'explainParam', 'explainParams', 'extractParamMap', 'getEstimator', 'getEstimatorParamMaps', 'getEvaluator', 'getOrDefault', 'getParam', 'getSeed', 'hasDefault', 'hasParam', 'isDefined', 'isSet', 'load', 'params', 'read', 'save', 'seed', 'set', 'setEstimator', 'setEstimatorParamMaps', 'setEvaluator', '

In [32]:
# Exploratory: show the documentation of CrossValidatorModel.
# help() prints the text itself and returns None, so it is called bare
# rather than wrapped in print(), which would add a stray "None" line.
help(cvModel)

Help on CrossValidatorModel in module pyspark.ml.tuning object:

class CrossValidatorModel(pyspark.ml.base.Model, ValidatorParams, pyspark.ml.util.MLReadable, pyspark.ml.util.MLWritable)
 |  CrossValidatorModel(bestModel, avgMetrics=[])
 |  
 |  CrossValidatorModel contains the model with the highest average cross-validation
 |  metric across folds and uses this model to transform input data. CrossValidatorModel
 |  also tracks the metrics for each param map evaluated.
 |  
 |  .. versionadded:: 1.4.0
 |  
 |  Method resolution order:
 |      CrossValidatorModel
 |      pyspark.ml.base.Model
 |      pyspark.ml.base.Transformer
 |      ValidatorParams
 |      pyspark.ml.param.shared.HasSeed
 |      pyspark.ml.param.Params
 |      pyspark.ml.util.Identifiable
 |      pyspark.ml.util.MLReadable
 |      pyspark.ml.util.MLWritable
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, bestModel, avgMetrics=[])
 |      Initialize self.  See help(type(self)) for accura

In [33]:
# Model selected by cross-validation (the name reads "bet", presumably meant as "best").
bet_model_als = cvModel.bestModel
bet_model_als

ALS_4c6bac31c9a3810fe610

In [34]:
# Rank of the selected ALS model.
bet_model_als.rank

12

In [35]:
# Regularisation parameter, read through the private _java_obj handle.
# NOTE(review): parent() is presumably the ALS estimator that produced this model — confirm.
bet_model_als._java_obj.parent().getRegParam()

0.01

In [36]:
# Maximum number of iterations, read through the private _java_obj handle.
bet_model_als._java_obj.parent().getMaxIter()

10

In [37]:
# Non-negativity flag, read through the private _java_obj handle.
bet_model_als._java_obj.parent().getNonnegative()

True

In [38]:
# Random seed, read through the private _java_obj handle.
bet_model_als._java_obj.parent().getSeed()

42

In [39]:
# Rank as reported by the parent object; should agree with bet_model_als.rank above.
bet_model_als._java_obj.parent().getRank()

12

In [40]:
# Exploratory: list the attributes exposed by the parent object behind the private _java_obj handle.
print(dir(bet_model_als._java_obj.parent()))



### PREDICT

In [42]:
# Score the held-out test split with the selected ALS model; transform()
# appends a "prediction" column next to the existing "label".
predictions = bet_model_als.transform(test)
predictions.show(25)

+-------+----------+------+------+----------+--------------+------------+-----------+
|user_id|profile_id|rating| label|log_rating|log_rating_069|norm_log_069| prediction|
+-------+----------+------+------+----------+--------------+------------+-----------+
|  27657|       496|     6|0.5556|    2.7918|        2.4818|      0.7782| 0.44840348|
| 133795|       496|    10|   1.0|    3.3026|        2.9926|         1.0| 0.75698173|
|  83542|       496|    10|   1.0|    3.3026|        2.9926|         1.0| 0.98405075|
|  37913|       496|     5|0.4444|    2.6094|        2.2994|      0.6989| 0.44741356|
|   4100|       496|    10|   1.0|    3.3026|        2.9926|         1.0| 0.55546737|
|  29441|       496|     6|0.5556|    2.7918|        2.4818|      0.7782| 0.68115723|
| 129820|       496|     6|0.5556|    2.7918|        2.4818|      0.7782|  0.7318838|
|  93285|      1238|     9|0.8889|    3.1972|        2.8872|      0.9542|   0.733231|
|  25280|      1238|     1|   0.0|       1.0|         