In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml.feature import Tokenizer, StopWordsRemover, Word2Vec, Word2VecModel
from pyspark.ml import Pipeline
import pandas as pd

# Spark session for this notebook, running on the local master.
# To simulate a cluster environment, enable and adjust the commented-out
# resource options below to test multi-instance performance.
session_builder = (
    SparkSession.builder.master("local[*]")
    .appName("Spark-Word2Vec")
    .config("spark.driver.memory", "20g")
    # Optional resource settings (disabled by default):
    #.config("spark.driver.cores", "2")
    #.config("spark.executor.cores", "2")
    #.config("spark.executor.memory", "2g")
    #.config("spark.driver.maxResultSize", "3g")
    #.config("spark.executor.instances", "2")
)

# Reuses an already-running session if one exists, otherwise starts a new one
spark = session_builder.getOrCreate()

# 1-Read and Clean the Dataset 

In [None]:
# Read the JSON lines file and normalise the text of every comment body
comments = (
    spark.read.json("data/RC_2010-07")
    .select("body", "subreddit")
    # Skip the placeholder bodies of deleted/removed comments
    .where("body != '[deleted]' AND body != '[removed]'")
    # Replace newline and carriage return characters with a space
    .withColumn("body", F.regexp_replace(F.col("body"), "[\\r\\n]+", " "))
    # Remove URLs (matches strings starting with http or https)
    .withColumn("body", F.regexp_replace(F.col("body"), "https?://\\S+", ""))
    # Remove characters that are not letters, digits, whitespace, or apostrophes
    .withColumn("body", F.regexp_replace(F.col("body"), "[^a-zA-Z0-9\\s']", ""))
    # Collapse whitespace runs into one space and trim the ends. The Tokenizer
    # used later in this notebook splits on every single whitespace character,
    # so leftover double spaces (e.g. where a URL was removed) would otherwise
    # produce empty-string tokens that end up in the Word2Vec vocabulary.
    .withColumn("body", F.trim(F.regexp_replace(F.col("body"), "\\s+", " ")))
    # Drop comments whose body became empty after cleaning (they would
    # tokenize to a single empty-string token)
    .where(F.col("body") != "")
)

comments.show(5, truncate=50)

# Get basic statistics
print(f"Number of records: {comments.count()}")
print(f"Number of columns: {len(comments.columns)}")

## 1.1-Create a pipeline to tokenize the text 

In [None]:
# Pipeline stages: split each comment body into tokens, then drop stop words
tokenizer = Tokenizer(inputCol="body", outputCol="tokens")
stopwords_remover = StopWordsRemover(inputCol="tokens", outputCol="filtered_tokens")
pipeline = Pipeline(stages=[tokenizer, stopwords_remover])

# Fit the pipeline, apply it, and keep only the cleaned token lists.
# NOTE: `comments` is rebound here, so this cell expects the "body" column
# produced by the cleaning cell and cannot simply be re-run on its own output.
pipeline_model = pipeline.fit(comments)
comments = pipeline_model.transform(comments).select("filtered_tokens")

comments.show(5, truncate=50)

# One row per token after exploding, so the row count is the corpus word count
total_words = comments.select(F.explode("filtered_tokens")).count()
print(f"Number of total words to train Word2Vec: {total_words}")

# 2-Train Word2Vec model

In [None]:
# Train the word2vec model, depending on the size of the data, this may take a while.
# The seed is fixed explicitly: without it the learned vectors (and therefore the
# synonym lists extracted later) are not guaranteed to be repeatable across
# kernel restarts.
word2vec_model = Word2Vec(
    vectorSize=50,  # dimensionality of the learned word vectors
    minCount=5,  # ignore words that occur fewer than 5 times
    maxIter=1,
    seed=42,
    inputCol="filtered_tokens",
    outputCol="word2vec_features",
).fit(comments)

In [None]:
# Persist the trained model, replacing any copy already stored at this path
model_writer = word2vec_model.write().overwrite()
model_writer.save("data/word2vec_model")

In [None]:
# Restore the saved model from disk so the cells below can run without retraining
word2vec_model = Word2VecModel.read().load("data/word2vec_model")

## 2.1-Extract Keywords from Word2Vec Synonyms

In [None]:
# For each seed topic, fetch its 50 nearest words from the model and tag them
# with the topic they came from; the combined table is used for later training.

words_list = ["music", "gaming", "politics", "programming", "science"]

keywords = pd.concat(
    [
        word2vec_model.findSynonyms(seed_word, 50)
        .toPandas()
        .drop(columns="similarity")  # only the words themselves are kept
        .assign(label=seed_word)
        for seed_word in words_list
    ],
    ignore_index=True,
)

In [None]:
# Preview the first 10 rows of the combined keyword table
keywords.head(10)

In [None]:
# Write the keyword table to Parquet, leaving out the pandas index
keywords.to_parquet("data/keywords.parquet", index=False)