In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=0a3345f85023f03c77636bbf026528d274562484ebf5f567ec8a841013889725
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [17]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

# Build (or reuse) a local Spark session through the builder API rather than
# calling the SparkSession constructor directly; re-running the cell then
# reuses the existing session instead of wrapping the context again.
conf = SparkConf().setAppName("pysparkProgramming3").setMaster("local")

spark = SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext

# Load the recipe table: first row is the header, column types are inferred.
data = spark.read.csv("recipes_combined.csv", header=True, inferSchema=True)
data.show()
Col = data.columns
print(Col)

# Drop the unnamed overflow columns _c2 .. _c15 and the all_ingredients column,
# leaving only what the text pipeline below consumes.
# NOTE(review): the displayed sample shows ingredient text spilling into _c2
# for some rows; dropping these columns discards it - confirm this is intended.
columns_to_drop = [f"_c{i}" for i in range(2, 16)] + ['all_ingredients']
Filter_data = data.drop(*columns_to_drop)
Filter_data.show(truncate=False)

from pyspark.sql.functions import regexp_replace

# Normalise the ingredient text in two passes:
#   1. every '//' becomes a single space,
#   2. every '(', '|' and ')' character is deleted.
filtered_data = Filter_data.withColumn(
    'ingredients',
    regexp_replace('ingredients', '//', ' '),
)
filtered_data = filtered_data.withColumn(
    'ingredients',
    regexp_replace('ingredients', '[(|)]', ''),
)

filtered_data.show(truncate=False)

from pyspark.ml.feature import Tokenizer, HashingTF, IDF
# Stage 1: split the cleaned ingredient text into a list of words.
tokenizer = Tokenizer(
    inputCol="ingredients",
    outputCol="words",
)

from pyspark.ml.feature import  HashingTF, IDF
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

# Stage 2: hash the words into a term-frequency vector.
hashingTF = HashingTF(
    inputCol="words",
    outputCol="rawFeatures",
)

# Stage 3: rescale the term frequencies by inverse document frequency.
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features",
)

# Fit the three stages on the cleaned recipes, then apply the fitted model to
# the same rows so that each recipe gains a "features" vector.
pipeline = Pipeline(stages=[tokenizer, hashingTF, idf])
pipeline_model = pipeline.fit(filtered_data)
transformed_data = pipeline_model.transform(filtered_data)

transformed_data.show()

# Remove the text column and the intermediate pipeline columns so that only
# the remaining columns (recipe name and "features" vector) reach the
# similarity step. The _c2.._c15 and all_ingredients columns were already
# dropped before the pipeline ran, so they are not listed again here.
columns_to_drop = ['ingredients', 'words', 'rawFeatures']
Filter_data = transformed_data.drop(*columns_to_drop)
Filter_data.show(truncate=False)

from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two vectors as a Python float.

    Args:
        vec1: Vector-like object exposing ``dot(other)`` and ``norm(p)``.
        vec2: Vector-like object exposing ``dot(other)`` and ``norm(p)``.

    Returns:
        ``dot(vec1, vec2) / (||vec1||_2 * ||vec2||_2)``. When either vector
        has zero L2 norm the similarity is undefined; 0.0 is returned instead
        of raising ``ZeroDivisionError``, so a single all-zero row cannot
        abort the job that calls this function.
    """
    denominator = float(vec1.norm(2)) * float(vec2.norm(2))
    if denominator == 0.0:
        # At least one vector is all zeros: no direction to compare.
        return 0.0
    return float(vec1.dot(vec2)) / denominator

# Wrap the Python function as a Spark UDF that returns a double.
cosine_similarity_udf = udf(cosine_similarity, DoubleType())

# Pair every recipe with every recipe by cross-joining the frame with itself.
joined_data = Filter_data.alias("df1").crossJoin(Filter_data.alias("df2"))

# Score each pair, then discard pairs whose two recipe names are equal so a
# recipe is not compared with itself.
# NOTE(review): both name columns keep the same name, "recipeNames".
cosine_similarity_result = (
    joined_data
    .select(
        "df1.recipeNames",
        "df2.recipeNames",
        cosine_similarity_udf("df1.features", "df2.features").alias("cosine_similarity"),
    )
    .filter("df1.recipeNames != df2.recipeNames")
)
cosine_similarity_result.show(truncate=False)

+--------------------+--------------------+--------------------+--------------------+--------------------+----+----+----+----+----+----+----+----+----+----+----+--------------------+
|         recipeNames|         ingredients|                 _c2|                 _c3|                 _c4| _c5| _c6| _c7| _c8| _c9|_c10|_c11|_c12|_c13|_c14|_c15|     all_ingredients|
+--------------------+--------------------+--------------------+--------------------+--------------------+----+----+----+----+----+----+----+----+----+----+----+--------------------+
|Poppy Seed Bread ...| 3 cups all-purpo...|                NULL|                NULL|                NULL|NULL|NULL|NULL|NULL|NULL|NULL|NULL|NULL|NULL|NULL|NULL|Poppy Seed Bread ...|
|Czech Christmas H...| 1 (0.6 ounce) ca...| beaten // 1 teas...|                NULL|                NULL|NULL|NULL|NULL|NULL|NULL|NULL|NULL|NULL|NULL|NULL|NULL|Czech Christmas H...|
|  Applesauce Bread I| 3 cups all-purpo...|                NULL|                NULL|

In [18]:
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType
from pyspark.ml.linalg import DenseVector

def centered_cosine_similarity(vec1, vec2):
    """Return the centered cosine similarity of two vectors as a Python float.

    Each vector is centered by subtracting the mean of all of its components
    (zeros included); the cosine similarity of the two centered vectors is
    then returned.

    Args:
        vec1: A vector exposing ``toArray()`` or any sequence / array of
            numbers.
        vec2: Same kind of object as ``vec1``, with the same length.

    Returns:
        The similarity as a float. 0.0 is returned when the value is
        undefined - an empty vector, or a vector whose components are all
        equal (zero norm after centering) - instead of raising
        ``ZeroDivisionError``.
    """
    # Imported locally: the notebook's import cells do not bring numpy into scope.
    import numpy as np

    def _centered(vec):
        # Densify explicitly when the vector offers toArray(); otherwise
        # treat the argument as a plain sequence / ndarray.
        dense = vec.toArray() if hasattr(vec, "toArray") else vec
        arr = np.asarray(dense, dtype=float)
        # An empty array has no mean; leave it untouched.
        return arr - arr.mean() if arr.size else arr

    centered_vec1 = _centered(vec1)
    centered_vec2 = _centered(vec2)

    denominator = float(np.linalg.norm(centered_vec1)) * float(np.linalg.norm(centered_vec2))
    if denominator == 0.0:
        # At least one centered vector is all zeros: similarity undefined.
        return 0.0
    return float(np.dot(centered_vec1, centered_vec2)) / denominator

# Wrap the centered similarity as a Spark UDF that returns a double.
centered_cosine_similarity_udf = udf(centered_cosine_similarity, DoubleType())

# Score every recipe pair from the cross join, then discard pairs whose two
# recipe names are equal.
centered_cosine_similarity_result = (
    joined_data
    .select(
        "df1.recipeNames",
        "df2.recipeNames",
        centered_cosine_similarity_udf("df1.features", "df2.features").alias("centered_cosine_similarity"),
    )
    .filter("df1.recipeNames != df2.recipeNames")
)
centered_cosine_similarity_result.show(truncate=False)


+---------------------------+-----------------------------------+--------------------------+
|recipeNames                |recipeNames                        |centered_cosine_similarity|
+---------------------------+-----------------------------------+--------------------------+
|Poppy Seed Bread with Glaze|Czech Christmas Hoska              |0.044448510638663924      |
|Poppy Seed Bread with Glaze|Applesauce Bread I                 |0.2495675615193068        |
|Poppy Seed Bread with Glaze|Raisin Brown Bread                 |0.1317615634871075        |
|Poppy Seed Bread with Glaze|Applesauce Raisin Bread            |0.29978283331911443       |
|Poppy Seed Bread with Glaze|Apple Raisin Bread                 |0.2956631593218344        |
|Poppy Seed Bread with Glaze|Buttermilk Oatmeal Bread           |0.21403369179797524       |
|Poppy Seed Bread with Glaze|Kolaches II                        |0.18403896556643162       |
|Poppy Seed Bread with Glaze|Whole Wheat Bread II               |0.108