<a href="https://colab.research.google.com/github/letiziamolinari/AMD_FINDING-SIMILAR-ITEMS/blob/main/Jaccard_distance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install pyspark



In [None]:
from google.colab import files
json = files.upload()
! mkdir /root/.kaggle
! mv kaggle.json /root/.kaggle/kaggle.json
! chmod 600 /root/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json
mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [None]:
import kaggle
from kaggle.api.kaggle_api_extended import KaggleApi

In [None]:
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark import SparkContext
from pyspark.sql import SQLContext

In [None]:
# Spark configuration for a single-machine run: local[*] master, explicit
# executor/driver memory, no cap on the size of results collected to the
# driver ('0'), and Arrow-based transfers enabled.
conf = (
    SparkConf()
    .setAppName("FindingSimilarItems")
    .setMaster('local[*]')
    .set('spark.executor.memory', '8G')
    .set('spark.driver.memory', '45G')
    .set('spark.driver.maxResultSize', '0')
    .set("spark.sql.execution.arrow.enabled", "true")
)
sc = SparkContext.getOrCreate(conf=conf)
# SQLContext is deprecated since Spark 3.0; SparkSession is the supported entry
# point and provides the same `.read` API used by the rest of the notebook.
# getOrCreate() reuses the SparkContext created just above.
spark = SparkSession.builder.config(conf=conf).getOrCreate()



In [None]:
! kaggle datasets download -d stackoverflow/stacksample -f Questions.csv --unzip -o
! unzip Questions.csv.zip

Downloading Questions.csv.zip to /content
100% 589M/589M [00:03<00:00, 203MB/s]

Archive:  Questions.csv.zip
replace Questions.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: Questions.csv           y
y



In [None]:
# Load Questions.csv with every column for a first look at the data.
# multiLine lets a quoted field span several physical lines, header takes the
# column names from the first row, and escape sets the double quote as the
# escape character inside quoted fields.
df = (
    spark.read
    .option("multiLine", "true")
    .option("header", "true")
    .option("escape", "\"")
    .csv("Questions.csv")
)
  

In [None]:
# Preview the first 15 rows of the loaded questions.
df.show(n=15)

+----+-----------+--------------------+--------------------+-----+--------------------+--------------------+
|  Id|OwnerUserId|        CreationDate|          ClosedDate|Score|               Title|                Body|
+----+-----------+--------------------+--------------------+-----+--------------------+--------------------+
|  80|         26|2008-08-01T13:57:07Z|                  NA|   26|SQLStatement.exec...|<p>I've written a...|
|  90|         58|2008-08-01T14:41:24Z|2012-12-26T03:45:49Z|  144|Good branching an...|<p>Are there any ...|
| 120|         83|2008-08-01T15:50:08Z|                  NA|   21|   ASP.NET Site Maps|<p>Has anyone got...|
| 180|    2089740|2008-08-01T18:42:19Z|                  NA|   53|Function for crea...|<p>This is someth...|
| 260|         91|2008-08-01T23:22:08Z|                  NA|   49|Adding scripting ...|<p>I have a littl...|
| 330|         63|2008-08-02T02:51:36Z|                  NA|   29|Should I use nest...|<p>I am working o...|
| 470|         71|2

In [None]:
# Re-read Questions.csv with the same reader options, keeping only the Id and
# Body columns needed for the similarity analysis.
df = (
    spark.read
    .option("multiLine", "true")
    .option("header", "true")
    .option("escape", "\"")
    .csv("Questions.csv")
    .select("Id", "Body")
)
  



In [None]:
# Preview the first 15 rows of the current `df`.
df.show(n=15)

+----+--------------------+
|  Id|                Body|
+----+--------------------+
|  80|<p>I've written a...|
|  90|<p>Are there any ...|
| 120|<p>Has anyone got...|
| 180|<p>This is someth...|
| 260|<p>I have a littl...|
| 330|<p>I am working o...|
| 470|<p>I've been writ...|
| 580|<p>I wonder how y...|
| 650|<p>I would like t...|
| 810|<p>I'm trying to ...|
| 930|<p>What's the sim...|
|1010|<p>I need to grab...|
|1040|<p>I'm looking fo...|
|1070|<p>What is the co...|
|1160|<p>I am using CCN...|
+----+--------------------+
only showing top 15 rows



In [None]:
from pyspark.sql.functions import col, lower, regexp_replace
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover

In [None]:
def cleaner(s):
    """Normalize a text column for tokenization.

    The steps run in this order:
      1. lowercase the text;
      2. drop a leading "rt " prefix;
      3. remove the literal "<p>" and "</p>" tags (other tags are not handled);
      4. remove http/https URLs, up to the next whitespace character;
      5. remove every character that is not an ASCII letter, a digit or
         whitespace (punctuation and apostrophes go, digits are kept).

    Args:
        s: the column expression to clean; it is passed straight to
            ``lower`` and ``regexp_replace``.

    Returns:
        The expression produced by chaining the replacements above.
    """
    s = lower(s)
    s = regexp_replace(s, r"^rt ", "")  # leading "rt " marker
    s = regexp_replace(s, r"<p>", "")  # opening paragraph tag
    s = regexp_replace(s, r"</p>", "")  # closing paragraph tag
    # Raw strings keep the regex backslashes literal; the non-raw form relied
    # on invalid escape sequences (\: and \S), which newer Python warns about.
    s = regexp_replace(s, r"(https?\://)\S+", "")  # URLs
    s = regexp_replace(s, r"[^a-zA-Z0-9\s]", "")  # punctuation and symbols
    return s

In [None]:
# Run the cleaner over Body; Id is carried along so every cleaned text stays
# linked to its question.
cleaned_df = df.select(
    'Id',
    cleaner(col("Body")).alias("Body"),
)

In [None]:
# Preview the first 15 cleaned bodies.
cleaned_df.show(n=15)

+----+--------------------+
|  Id|                Body|
+----+--------------------+
|  80|ive written a dat...|
|  90|are there any rea...|
| 120|has anyone got ex...|
| 180|this is something...|
| 260|i have a little g...|
| 330|i am working on a...|
| 470|ive been writing ...|
| 580|i wonder how you ...|
| 650|i would like the ...|
| 810|im trying to main...|
| 930|whats the simples...|
|1010|i need to grab th...|
|1040|im looking for a ...|
|1070|what is the corre...|
|1160|i am using ccnet ...|
+----+--------------------+
only showing top 15 rows



In [None]:
# Tokenization: split each cleaned Body into a list of tokens with
# RegexTokenizer (default settings). Stop words are removed in a later step.
tokenizer = RegexTokenizer(
    inputCol="Body",
    outputCol="tokens",
)
token_df = tokenizer.transform(cleaned_df)
token_df.show(n=15)

+----+--------------------+--------------------+
|  Id|                Body|              tokens|
+----+--------------------+--------------------+
|  80|ive written a dat...|[ive, written, a,...|
|  90|are there any rea...|[are, there, any,...|
| 120|has anyone got ex...|[has, anyone, got...|
| 180|this is something...|[this, is, someth...|
| 260|i have a little g...|[i, have, a, litt...|
| 330|i am working on a...|[i, am, working, ...|
| 470|ive been writing ...|[ive, been, writi...|
| 580|i wonder how you ...|[i, wonder, how, ...|
| 650|i would like the ...|[i, would, like, ...|
| 810|im trying to main...|[im, trying, to, ...|
| 930|whats the simples...|[whats, the, simp...|
|1010|i need to grab th...|[i, need, to, gra...|
|1040|im looking for a ...|[im, looking, for...|
|1070|what is the corre...|[what, is, the, c...|
|1160|i am using ccnet ...|[i, am, using, cc...|
+----+--------------------+--------------------+
only showing top 15 rows



In [None]:
# Stop-word removal: filter the token lists with StopWordsRemover's default
# stop-word list and store the result in `removed_stopwrd`.
stopwrd_remover = StopWordsRemover(
    inputCol='tokens',
    outputCol='removed_stopwrd',
)
final_df = stopwrd_remover.transform(token_df)
final_df.show(n=15)

+----+--------------------+--------------------+--------------------+
|  Id|                Body|              tokens|     removed_stopwrd|
+----+--------------------+--------------------+--------------------+
|  80|ive written a dat...|[ive, written, a,...|[ive, written, da...|
|  90|are there any rea...|[are, there, any,...|[really, good, tu...|
| 120|has anyone got ex...|[has, anyone, got...|[anyone, got, exp...|
| 180|this is something...|[this, is, someth...|[something, ive, ...|
| 260|i have a little g...|[i, have, a, litt...|[little, game, wr...|
| 330|i am working on a...|[i, am, working, ...|[working, collect...|
| 470|ive been writing ...|[ive, been, writi...|[ive, writing, we...|
| 580|i wonder how you ...|[i, wonder, how, ...|[wonder, guys, ma...|
| 650|i would like the ...|[i, would, like, ...|[like, version, p...|
| 810|im trying to main...|[im, trying, to, ...|[im, trying, main...|
| 930|whats the simples...|[whats, the, simp...|[whats, simplest,...|
|1010|i need to grab

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF
from pyspark.ml.feature import MinHashLSH

In [None]:
# Hashing TF: turn each filtered token list into a sparse term-frequency
# vector with 1024 hashed features (column `TF`).
hashingTF = HashingTF(
    inputCol="removed_stopwrd",
    outputCol="TF",
    numFeatures=1024,
)
hashed_df = hashingTF.transform(final_df)

In [None]:
# MinHash LSH: fit the model on the TF vectors and add a `hash` column with
# the MinHash signature of every document. The seed is fixed for repeatability.
mh = MinHashLSH(
    inputCol="TF",
    outputCol="hash",
    seed=3,
)
model = mh.fit(hashed_df)
# NOTE: this variable shadows Python's built-in hash(); the name is kept
# because later cells refer to it.
hash = model.transform(hashed_df)

In [None]:
# Preview the first 15 rows, now including the TF vector and MinHash columns.
hash.show(n=15)

+----+--------------------+--------------------+--------------------+--------------------+----------------+
|  Id|                Body|              tokens|     removed_stopwrd|                  TF|            hash|
+----+--------------------+--------------------+--------------------+--------------------+----------------+
|  80|ive written a dat...|[ive, written, a,...|[ive, written, da...|(1024,[3,6,11,24,...|   [[2078511.0]]|
|  90|are there any rea...|[are, there, any,...|[really, good, tu...|(1024,[24,65,80,4...|  [[5.527494E7]]|
| 120|has anyone got ex...|[has, anyone, got...|[anyone, got, exp...|(1024,[19,39,88,9...| [[3.0469636E7]]|
| 180|this is something...|[this, is, someth...|[something, ive, ...|(1024,[9,17,105,1...|   [[2703620.0]]|
| 260|i have a little g...|[i, have, a, litt...|[little, game, wr...|(1024,[24,43,53,5...| [[8.2415847E7]]|
| 330|i am working on a...|[i, am, working, ...|[working, collect...|(1024,[79,108,117...|   [[2703620.0]]|
| 470|ive been writing ...|[

In [None]:
from pyspark.sql.functions import size
# Keep only rows that still have at least one token after stop-word removal,
# then cap the working set at 30000 rows to bound the cost of the join.
# NOTE(review): presumably MinHash cannot handle all-zero vectors - confirm.
sample = (
    hash
    .filter(size(col("removed_stopwrd")) >= 1)
    .limit(30000)
)

In [None]:
# Approximate similarity self-join: find pairs of sampled questions whose
# Jaccard distance is below 0.6 and keep only the two ids and the distance.
match = (
    model.approxSimilarityJoin(sample, sample, 0.6, "JaccardDistance")
    .select(
        col('datasetA.id').alias('id_A'),
        col('datasetB.id').alias('id_B'),
        col("JaccardDistance"),
    )
    # Keep one row per unordered pair and drop self-matches.
    # NOTE(review): Id appears to be a string column, so '<' is presumably a
    # lexicographic comparison - harmless for de-duplication, but confirm.
    .filter('id_A < id_B')
)
# show() returns None, so it is called on its own line; chaining it into the
# assignment would leave `match` bound to None instead of the DataFrame.
match.show()



+-------+-------+------------------+
|   id_A|   id_B|   JaccardDistance|
+-------+-------+------------------+
| 103560|1503630|0.5529411764705883|
| 252660| 920670|               0.5|
| 270440| 272190|0.5636363636363637|
| 503310| 835280|             0.375|
| 612820| 634630|              0.48|
| 897770| 905410|0.4871794871794872|
|1041520|1042370|               0.5|
|1071630| 865480|0.5769230769230769|
|1082310|1276960|0.5642458100558659|
|1125640| 198460|0.5714285714285714|
|1406050| 936820|0.5906735751295337|
+-------+-------+------------------+



In [None]:
# Print the full cleaned text of questions 503310 and 835280, the pair with
# the smallest Jaccard distance (0.375) in the join output.
cleaned_df.filter(
    (col("id") == 503310) | (col("id") == 835280)
).show(truncate=False)

+------+-------------------------------------------------------------------+
|Id    |Body                                                               |
+------+-------------------------------------------------------------------+
|503310|is it possible to map an enum as a string using fluent nhibernate\n|
|835280|its possible to map a view using fluent nhibernate if so how\n     |
+------+-------------------------------------------------------------------+



In [None]:
# Print the full cleaned text of questions 612820 and 634630 (Jaccard
# distance 0.48 in the join output).
cleaned_df.filter(
    (col("id") == 612820) | (col("id") == 634630)
).show(truncate=False)

+------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
# Print the full cleaned text of questions 897770 and 905410 (Jaccard
# distance of about 0.487 in the join output).
cleaned_df.filter(
    (col("id") == 897770) | (col("id") == 905410)
).show(truncate=False)

+------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------