#### Creating Topic & Repo Topic Junction Dataframes

In [0]:
# Load every row of the `data` table, then keep only fork and pull-request
# events.
raw_events = spark.sql("SELECT * FROM data")
df = raw_events.where(col("event_type").isin("ForkEvent", "PullRequestEvent"))

In [0]:
# Column paths of the repository structs that are read for `id` and `topics`.
# Order matters: other cells index into `paths` by position.
FORKEE_REPO_PATH = "payload_forkee"
PR_BASE_REPO_PATH = "payload_pull_request.base.repo"
PR_HEAD_REPO_PATH = "payload_pull_request.head.repo"

paths = [FORKEE_REPO_PATH, PR_BASE_REPO_PATH, PR_HEAD_REPO_PATH]

In [0]:
def _distinct_topics(repo_path):
    """Return the distinct topic values found under ``repo_path`` in ``df``.

    Rows whose topics array has no elements are dropped first; the remaining
    arrays are exploded into one row per element (default column name
    ``col``) and de-duplicated.
    """
    topics_col = col(f"{repo_path}.topics")
    return df.where(size(topics_col) > 0).select(explode(topics_col)).distinct()


head_topics = _distinct_topics(paths[2])
base_topics = _distinct_topics(paths[1])
forkee_topics = _distinct_topics(paths[0])

# Combine the three sources, drop duplicates across them, and sort by the
# exploded value.
topics = (
    head_topics.union(base_topics)
    .union(forkee_topics)
    .distinct()
    .orderBy("col")
)

In [0]:
# Assign a surrogate key to every distinct topic keyword.
#
# The key is derived with row_number() over the keyword ordering instead of
# monotonically_increasing_id(). The latter encodes the partition id, so its
# values can change when the plan is evaluated again. topicDF is evaluated
# more than once (it is written out and it is joined against the repo/topic
# pairs), so the ids must be reproducible for the two results to agree.
from pyspark.sql import Window
from pyspark.sql.functions import row_number

# A window without partitionBy moves all rows into one partition; acceptable
# here because the input holds only the distinct topic keywords.
_keyword_order = Window.orderBy("col")

topicDF = topics.select(
    # Zero-based and long-typed, matching what monotonically_increasing_id()
    # produces for single-partition input.
    (row_number().over(_keyword_order) - 1).cast("long").alias("id"),
    col("col").alias("keyword"),
)

In [0]:
def _repo_topic_pairs(repo_path):
    """Return one (repo_id, topic) row per topic under ``repo_path`` in ``df``.

    Rows whose topics array has no elements are dropped; the repository id is
    exposed as ``repo_id`` and the exploded topic keeps the default column
    name ``col``.
    """
    topics_col = col(f"{repo_path}.topics")
    return df.where(size(topics_col) > 0).select(
        col(f"{repo_path}.id").alias("repo_id"),
        explode(topics_col),
    )


jxn_1 = _repo_topic_pairs(paths[0])
jxn_2 = _repo_topic_pairs(paths[1])
jxn_3 = _repo_topic_pairs(paths[2])

# Stack the pairs from all three sources and drop repeated pairs.
repo_topic_jxn = jxn_1.union(jxn_2).union(jxn_3).distinct()

# Swap each topic keyword for its topic id by joining the pairs against the
# topic table on keyword, keeping only the two id columns.
keyword_matches = topicDF["keyword"] == repo_topic_jxn["col"]
repo_topic_with_ids = repo_topic_jxn.join(topicDF, on=keyword_matches)
repo_topic_jxnDF = repo_topic_with_ids.select(
    col("repo_id"),
    col("id").alias("topic_id"),
).sort("topic_id")

#### Writing Both to the Silver Layer as Parquet

In [0]:
# Collapse the topic table to one partition and write it as Parquet.
topic_output_path = "abfss://team1-project2@20230821desa.dfs.core.windows.net/SilverLayer/topic"
topic_single_partition = topicDF.repartition(1)
topic_single_partition.write.parquet(topic_output_path)

In [0]:
# Collapse the repo/topic junction table to one partition and write it as
# Parquet.
# NOTE(review): repartition(1) shuffles the rows, so the ordering applied when
# repo_topic_jxnDF was built may not survive in the written file. Confirm
# whether that ordering matters.
repo_topic_output_path = "abfss://team1-project2@20230821desa.dfs.core.windows.net/SilverLayer/repo-topic"
repo_topic_single_partition = repo_topic_jxnDF.repartition(1)
repo_topic_single_partition.write.parquet(repo_topic_output_path)