In [1]:
import findspark
import numpy as np
import pandas as pd
import seaborn as sns
findspark.init()

from pyspark.ml.feature import MinHashLSH, BucketedRandomProjectionLSH
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import coalesce, udf, struct, col, lit, unix_timestamp, count, when, isnan, isnull, split

from pyspark.ml import Pipeline
from IPython.display import display
from pyspark.sql import SparkSession, Row
from pyspark import SparkConf, SparkContext
from pyspark.ml.feature import VectorAssembler, StandardScaler, MinMaxScaler
from pyspark.ml.clustering import KMeans, GaussianMixture, BisectingKMeans
from pyspark.mllib.evaluation import MulticlassMetrics

import pyspark.sql.functions as F
from pyspark.sql.types import IntegerType, StructType, StructField, StringType
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.feature import VectorAssembler


# Create (or reuse) the SparkSession used by every cell in this notebook.
# The bare `spark` expression on the last line makes the notebook render the
# session object as this cell's output.
spark = (
    SparkSession.builder
    .appName('laptop_everis')
    .getOrCreate()
)
spark

In [2]:
# Expose the SparkContext that backs the session created above. Taking it
# from `spark` avoids re-importing SparkContext in the middle of the notebook
# and guarantees `sc` and `spark` refer to the same context.
sc = spark.sparkContext
sc

<center>
    <img src='https://2.bp.blogspot.com/-oBPhcEuXFA0/VwpQHERiVPI/AAAAAAAAFsg/r4yUWXmXeQ0ec4YsAGp-UTBeGpvS3mUDg/s1600/LEFT%2Bvs%2BRight%2BOuter%2BJoin%2Bin%2BSQL.png' alt='LEFT vs RIGHT outer join in SQL'>
</center>

In [60]:
# Left-hand table for the join examples. PC1 is the join key; the rows include
# a repeated key (1), a null key and null payload values so that each join
# type's treatment of those cases can be seen in its output.
A = spark.createDataFrame(
    data=[
        (1, 2, 'A'),
        (2, 1, 'B'),
        (3, 1, 'C'),
        (4, 11, 'D'),
        (1, None, 'D'),
        (0, 5, 'C'),
        (None, 4, 'C'),
        (9, None, None),
        (10, 4, None),
    ],
    schema=['PC1', 'P2', 'P3'],
).orderBy(F.col("PC1").asc())

A.show()

+----+----+----+
| PC1|  P2|  P3|
+----+----+----+
|null|   4|   C|
|   0|   5|   C|
|   1|   2|   A|
|   1|null|   D|
|   2|   1|   B|
|   3|   1|   C|
|   4|  11|   D|
|   9|null|null|
|  10|   4|null|
+----+----+----+



In [61]:
# Print the docstring of DataFrame.join to see the accepted `how` values
# before trying each join type in the cells that follow.
help(A.join)

Help on method join in module pyspark.sql.dataframe:

join(other, on=None, how=None) method of pyspark.sql.dataframe.DataFrame instance
    Joins with another :class:`DataFrame`, using the given join expression.
    
    :param other: Right side of the join
    :param on: a string for the join column name, a list of column names,
        a join expression (Column), or a list of Columns.
        If `on` is a string or a list of strings indicating the name of the join column(s),
        the column(s) must exist on both sides, and this performs an equi-join.
    :param how: str, default ``inner``. Must be one of: ``inner``, ``cross``, ``outer``,
        ``full``, ``full_outer``, ``left``, ``left_outer``, ``right``, ``right_outer``,
        ``left_semi``, and ``left_anti``.
    
    The following performs a full outer join between ``df1`` and ``df2``.
    
    >>> df.join(df2, df.name == df2.name, 'outer').select(df.name, df2.height).collect()
    [Row(name=None, height=80), Row(name='Bob'

In [62]:
# Right-hand table for the join examples, with the same column names as A.
# It repeats keys 0, 1 and 2, has two null-key rows, and contains keys
# (11, 12, 13) that never occur in A.
B = spark.createDataFrame(
    data=[
        (0, 0, 'A'),
        (1, 1, 'B'),
        (2, 1, 'C'),
        (0, None, 'D'),
        (2, 3, 'D'),
        (1, 4, 'C'),
        (11, 4, 'C'),
        (10, None, 'C'),
        (None, 9, 'C'),
        (None, 4, 'C'),
        (13, 4, None),
        (12, 4, None),
    ],
    schema=['PC1', 'P2', 'P3'],
).orderBy(F.col("PC1").asc())

B.show()

+----+----+----+
| PC1|  P2|  P3|
+----+----+----+
|null|   4|   C|
|null|   9|   C|
|   0|   0|   A|
|   0|null|   D|
|   1|   1|   B|
|   1|   4|   C|
|   2|   3|   D|
|   2|   1|   C|
|  10|null|   C|
|  11|   4|   C|
|  12|   4|null|
|  13|   4|null|
+----+----+----+



## INNER

In [63]:
# Inner join on PC1: only keys present in both tables survive, and every
# matching pair of rows is emitted (key 1 occurs twice on each side, giving
# four rows). Rows with a null PC1 are dropped from both sides.
(
    A
    .join(B, on='PC1', how='inner')
    .show(n=100)
)

+---+----+----+----+---+
|PC1|  P2|  P3|  P2| P3|
+---+----+----+----+---+
|  0|   5|   C|   0|  A|
|  0|   5|   C|null|  D|
|  1|   2|   A|   1|  B|
|  1|   2|   A|   4|  C|
|  1|null|   D|   1|  B|
|  1|null|   D|   4|  C|
| 10|   4|null|null|  C|
|  2|   1|   B|   1|  C|
|  2|   1|   B|   3|  D|
+---+----+----+----+---+



## LEFT_SEMI

In [64]:
# Left semi join: keep the rows of A whose PC1 has at least one match in B.
# Only A's columns are returned, and each A row appears once no matter how
# many rows of B share its key.
(
    A
    .join(B, on='PC1', how='left_semi')
    .show(n=100)
)

+---+----+----+
|PC1|  P2|  P3|
+---+----+----+
|  0|   5|   C|
|  1|   2|   A|
|  1|null|   D|
| 10|   4|null|
|  2|   1|   B|
+---+----+----+



## LEFT_ANTI

In [65]:
# Left anti join: the complement of left_semi. Keep the rows of A whose PC1
# has no match in B (the null-key row of A is kept too); only A's columns
# are returned.
(
    A
    .join(B, on='PC1', how='left_anti')
    .show(n=100)
)

+----+----+----+
| PC1|  P2|  P3|
+----+----+----+
|null|   4|   C|
|   9|null|null|
|   3|   1|   C|
|   4|  11|   D|
+----+----+----+



## OUTER

In [66]:
# Full outer join: every row of A and every row of B is kept, with the
# columns of the missing side filled with null. Null keys do not match each
# other, so A's null-key row and B's two null-key rows appear separately.
(
    A
    .join(B, on='PC1', how='outer')
    .show(n=100)
)

+----+----+----+----+----+
| PC1|  P2|  P3|  P2|  P3|
+----+----+----+----+----+
|   0|   5|   C|   0|   A|
|   0|   5|   C|null|   D|
|null|   4|   C|null|null|
|null|null|null|   9|   C|
|null|null|null|   4|   C|
|   9|null|null|null|null|
|   1|   2|   A|   1|   B|
|   1|   2|   A|   4|   C|
|   1|null|   D|   1|   B|
|   1|null|   D|   4|   C|
|  10|   4|null|null|   C|
|   3|   1|   C|null|null|
|  12|null|null|   4|null|
|  11|null|null|   4|   C|
|   2|   1|   B|   1|   C|
|   2|   1|   B|   3|   D|
|   4|  11|   D|null|null|
|  13|null|null|   4|null|
+----+----+----+----+----+



In [69]:
# Print the physical plan for how='outer': it is executed as a SortMergeJoin
# of type FullOuter, and the two PC1 columns are coalesced into one.
(
    A
    .join(B, on='PC1', how='outer')
    .explain()
)

== Physical Plan ==
*(5) Project [coalesce(PC1#785L, PC1#801L) AS PC1#930L, P2#786L, P3#787, P2#802L, P3#803]
+- SortMergeJoin [PC1#785L], [PC1#801L], FullOuter
   :- *(2) Sort [PC1#785L ASC NULLS FIRST], false, 0
   :  +- Exchange hashpartitioning(PC1#785L, 200)
   :     +- *(1) Sort [PC1#785L ASC NULLS FIRST], true, 0
   :        +- Exchange rangepartitioning(PC1#785L ASC NULLS FIRST, 200)
   :           +- Scan ExistingRDD[PC1#785L,P2#786L,P3#787]
   +- *(4) Sort [PC1#801L ASC NULLS FIRST], false, 0
      +- Exchange hashpartitioning(PC1#801L, 200)
         +- *(3) Sort [PC1#801L ASC NULLS FIRST], true, 0
            +- Exchange rangepartitioning(PC1#801L ASC NULLS FIRST, 200)
               +- Scan ExistingRDD[PC1#801L,P2#802L,P3#803]


In [71]:
# Print the physical plan for how='full_outer' to compare it with the plan
# for how='outer': both are planned as the same FullOuter SortMergeJoin.
(
    A
    .join(B, on='PC1', how='full_outer')
    .explain()
)

== Physical Plan ==
*(5) Project [coalesce(PC1#785L, PC1#801L) AS PC1#942L, P2#786L, P3#787, P2#802L, P3#803]
+- SortMergeJoin [PC1#785L], [PC1#801L], FullOuter
   :- *(2) Sort [PC1#785L ASC NULLS FIRST], false, 0
   :  +- Exchange hashpartitioning(PC1#785L, 200)
   :     +- *(1) Sort [PC1#785L ASC NULLS FIRST], true, 0
   :        +- Exchange rangepartitioning(PC1#785L ASC NULLS FIRST, 200)
   :           +- Scan ExistingRDD[PC1#785L,P2#786L,P3#787]
   +- *(4) Sort [PC1#801L ASC NULLS FIRST], false, 0
      +- Exchange hashpartitioning(PC1#801L, 200)
         +- *(3) Sort [PC1#801L ASC NULLS FIRST], true, 0
            +- Exchange rangepartitioning(PC1#801L ASC NULLS FIRST, 200)
               +- Scan ExistingRDD[PC1#801L,P2#802L,P3#803]


In [67]:
# how='full' is another spelling of the full outer join; the result has the
# same rows as how='outer'.
(
    A
    .join(B, on='PC1', how='full')
    .show(n=100)
)

+----+----+----+----+----+
| PC1|  P2|  P3|  P2|  P3|
+----+----+----+----+----+
|   0|   5|   C|   0|   A|
|   0|   5|   C|null|   D|
|null|   4|   C|null|null|
|null|null|null|   9|   C|
|null|null|null|   4|   C|
|   9|null|null|null|null|
|   1|   2|   A|   1|   B|
|   1|   2|   A|   4|   C|
|   1|null|   D|   1|   B|
|   1|null|   D|   4|   C|
|  10|   4|null|null|   C|
|   3|   1|   C|null|null|
|  12|null|null|   4|null|
|  11|null|null|   4|   C|
|   2|   1|   B|   1|   C|
|   2|   1|   B|   3|   D|
|   4|  11|   D|null|null|
|  13|null|null|   4|null|
+----+----+----+----+----+



In [68]:
# how='full_outer' is the third spelling of the full outer join; the result
# has the same rows as how='outer' and how='full'.
(
    A
    .join(B, on='PC1', how='full_outer')
    .show(n=100)
)

+----+----+----+----+----+
| PC1|  P2|  P3|  P2|  P3|
+----+----+----+----+----+
|   0|   5|   C|   0|   A|
|   0|   5|   C|null|   D|
|null|   4|   C|null|null|
|null|null|null|   9|   C|
|null|null|null|   4|   C|
|   9|null|null|null|null|
|   1|   2|   A|   1|   B|
|   1|   2|   A|   4|   C|
|   1|null|   D|   1|   B|
|   1|null|   D|   4|   C|
|  10|   4|null|null|   C|
|   3|   1|   C|null|null|
|  12|null|null|   4|null|
|  11|null|null|   4|   C|
|   2|   1|   B|   1|   C|
|   2|   1|   B|   3|   D|
|   4|  11|   D|null|null|
|  13|null|null|   4|null|
+----+----+----+----+----+



In [47]:
# Left join: keep every row of A. Rows with a match in B are repeated once
# per matching B row; rows without a match get null in B's columns.
(
    A
    .join(B, on='PC1', how='left')
    .show(n=100)
)

+----+----+----+----+----+
| PC1|  P2|  P3|  P2|  P3|
+----+----+----+----+----+
|   0|   5|   C|   0|   A|
|   0|   5|   C|null|   D|
|null|   4|   C|null|null|
|   9|null|null|null|null|
|   1|   2|   A|   1|   B|
|   1|   2|   A|   4|   C|
|   1|null|   D|   1|   B|
|   1|null|   D|   4|   C|
|  10|   4|null|null|   C|
|   3|   1|   C|null|null|
|   2|   1|   B|   1|   C|
|   2|   1|   B|   3|   D|
|   4|  11|   D|null|null|
+----+----+----+----+----+



In [48]:
# how='left_outer' is the long spelling of how='left': every row of A is
# kept and B's columns are null where no PC1 match exists.
(
    A
    .join(B, on='PC1', how='left_outer')
    .show(n=100)
)

+----+----+----+----+----+
| PC1|  P2|  P3|  P2|  P3|
+----+----+----+----+----+
|   0|   5|   C|   0|   A|
|   0|   5|   C|null|   D|
|null|   4|   C|null|null|
|   9|null|null|null|null|
|   1|   2|   A|   1|   B|
|   1|   2|   A|   4|   C|
|   1|null|   D|   1|   B|
|   1|null|   D|   4|   C|
|  10|   4|null|null|   C|
|   3|   1|   C|null|null|
|   2|   1|   B|   1|   C|
|   2|   1|   B|   3|   D|
|   4|  11|   D|null|null|
+----+----+----+----+----+



In [51]:
# Right join: keep every row of B. Rows with a match in A are repeated once
# per matching A row; rows without a match get null in A's columns.
(
    A
    .join(B, on='PC1', how='right')
    .show(n=100)
)

+----+----+----+----+----+
| PC1|  P2|  P3|  P2|  P3|
+----+----+----+----+----+
|   0|   5|   C|   0|   A|
|   0|   5|   C|null|   D|
|null|null|null|   9|   C|
|null|null|null|   4|   C|
|   1|   2|   A|   1|   B|
|   1|null|   D|   1|   B|
|   1|   2|   A|   4|   C|
|   1|null|   D|   4|   C|
|  10|   4|null|null|   C|
|  12|null|null|   4|null|
|  11|null|null|   4|   C|
|   2|   1|   B|   1|   C|
|   2|   1|   B|   3|   D|
|  13|null|null|   4|null|
+----+----+----+----+----+



In [52]:
# how='right_outer' is the long spelling of how='right': every row of B is
# kept and A's columns are null where no PC1 match exists.
(
    A
    .join(B, on='PC1', how='right_outer')
    .show(n=100)
)

+----+----+----+----+----+
| PC1|  P2|  P3|  P2|  P3|
+----+----+----+----+----+
|   0|   5|   C|   0|   A|
|   0|   5|   C|null|   D|
|null|null|null|   9|   C|
|null|null|null|   4|   C|
|   1|   2|   A|   1|   B|
|   1|null|   D|   1|   B|
|   1|   2|   A|   4|   C|
|   1|null|   D|   4|   C|
|  10|   4|null|null|   C|
|  12|null|null|   4|null|
|  11|null|null|   4|   C|
|   2|   1|   B|   1|   C|
|   2|   1|   B|   3|   D|
|  13|null|null|   4|null|
+----+----+----+----+----+

