#### Creating Repo Event Dataframe

In [0]:
# Source events: every column of `data` (table or view; its definition is not
# visible in this notebook section).
source_query = "SELECT * FROM data"
df = spark.sql(source_query)

#### Reference variables

In [0]:
# Final column names, in output order: "id" first, then the remaining names
# alphabetically.  Order matters because the DataFrames built from this
# layout are later combined with positional unions.
col_names = """
    id
    allow_forking archive_url archived assignees_url
    blobs_url branches_url
    clone_url collaborators_url comments_url commits_url compare_url
    contents_url contributors_url created_at
    default_branch deployments_url description disabled downloads_url
    events_url
    fork forks forks_count forks_url full_name
    git_commits_url git_refs_url git_tags_url git_url
    has_downloads has_issues has_pages has_projects has_wiki
    homepage hooks_url html_url
    is_template issue_comment_url issue_events_url issues_url
    keys_url
    labels_url language languages_url license_id
    merges_url milestones_url mirror_url
    name node_id notifications_url
    open_issues open_issues_count owner_id
    private pulls_url pushed_at
    releases_url
    size ssh_url stargazers_count stargazers_url statuses_url
    subscribers_url subscription_url svn_url
    tags_url teams_url trees_url
    updated_at url
    visibility
    watchers watchers_count
""".split()

# Select list for a flattened repo struct, aligned position-for-position with
# `col_names`.  Most entries are plain column names; the ones below are
# replaced by an expression that either casts the value or pulls an id/key
# out of a nested struct (`license`, `owner`) and aliases it to the final
# name.  Overrides are keyed by column name rather than list index so that
# editing `col_names` cannot silently shift them onto the wrong column.
col_overrides = {
    "id": col("id").cast(LongType()),
    "created_at": col("created_at").cast(TimestampType()),
    "license_id": col("license.key").alias("license_id"),
    "owner_id": col("owner.id").alias("owner_id"),
    "pushed_at": col("pushed_at").cast(TimestampType()),
    "updated_at": col("updated_at").cast(TimestampType()),
}
cols = [col_overrides.get(name, name) for name in col_names]

# Select list for the top-level repo columns of `df`, laid out exactly like
# `col_names` so the result can be unioned by position with the nested repo
# DataFrames.  Only id, name and url exist at the top level (as `repo_id`,
# `repo_name`, `repo_url`); every other position is a null literal.
# The list is derived from `col_names` by name instead of a hard-coded length
# and indices, so it stays aligned if the column list changes.
top_level_sources = {
    "id": "repo_id",
    "name": "repo_name",
    "url": "repo_url",
}
top_level_cols = [
    top_level_sources[name] if name in top_level_sources else lit(None)
    for name in col_names
]

In [0]:
# Column paths in `df` that hold a full repo object (each is expanded with
# ".*" further down): the forkee payload, then the base and head repo of a
# pull request.
_pull_request_repo = "payload_pull_request.{side}.repo"
paths = [
    "payload_forkee",
    *(_pull_request_repo.format(side=side) for side in ("base", "head")),
]

In [0]:
# Distinct top-level repo rows (repo_id / repo_name / repo_url plus null
# placeholders for every other column), sorted by repo_id.
top_level_rows = df.select(top_level_cols)
top_repoDF = top_level_rows.distinct().orderBy("repo_id")

In [0]:
# Repos found in the forkee payload: keep events where that struct is
# present, flatten its fields, then project them into the shared column
# layout, de-duplicated and sorted by id.
forkee_path = paths[0]
forkee_events = df.filter(col(forkee_path).isNotNull())
forkee_fields = forkee_events.select(forkee_path + ".*")
forkeeDF = forkee_fields.select(cols).distinct().orderBy("id")

In [0]:
# Repos found as the base repo of a pull request: keep events where that
# struct is present, flatten its fields, then project them into the shared
# column layout, de-duplicated and sorted by id.
base_path = paths[1]
base_events = df.filter(col(base_path).isNotNull())
base_fields = base_events.select(base_path + ".*")
base_repoDF = base_fields.select(cols).distinct().orderBy("id")

In [0]:
# Repos found as the head repo of a pull request: keep events where that
# struct is present, flatten its fields, then project them into the shared
# column layout, de-duplicated and sorted by id.
head_path = paths[2]
head_events = df.filter(col(head_path).isNotNull())
head_fields = head_events.select(head_path + ".*")
head_repoDF = head_fields.select(cols).distinct().orderBy("id")

In [0]:
# Stack the three nested-repo DataFrames (forkee, PR base, PR head) and drop
# rows that are exact duplicates across them.
repoDF = forkeeDF.union(base_repoDF).union(head_repoDF).distinct()

In [0]:
# Append the top-level repo rows.  The union is positional, so their
# repo_id / repo_name / repo_url values land in the id / name / url columns.
with_top_level = repoDF.union(top_repoDF)
repoDF = with_top_level.orderBy("id")

In [0]:
# Max hack (see user table for explanation)
# Collapse the unioned rows to a single row per repo id, taking the max of
# every other column and keeping each column's original name.
# NOTE(review): `max` must be the Spark aggregate (pyspark.sql.functions.max),
# not the Python builtin; confirm against the notebook's imports.
# Presumably this lets real values win over the null placeholders contributed
# by the top-level rows; verify against the user-table notes.
repoDF = repoDF.groupBy("id").agg(
    *[max(name).alias(name) for name in col_names[1:]]
)

#### Write to Silver Layer (Parquet)

In [0]:
# Destination of the repo table in the silver layer (abfss storage URI).
silver_path = "abfss://team1-project2@20230821desa.dfs.core.windows.net/SilverLayer/repo"

# Redistribute into 87 partitions and write as Parquet.
# NOTE(review): the reason for 87 partitions is not visible here, and no save
# mode is set, so Spark's default applies. Confirm that is the intended
# behavior when this cell is re-run against an existing path.
repo_writer = repoDF.repartition(87).write
repo_writer.parquet(silver_path)