### 1. Find the most popular category people post to based on their country.

Output written to /tmp/parquet/most_popular_category_per_country

In [0]:
# Q1: most popular category per country.
# Reads hilla_pin inner-joined to hilla_geo on `ind` (both are expected to be
# registered with the Spark session before this cell runs; not shown here).
#   1. category_totals: COUNT(category) per (country, category).
#   2. category_ranks:  RANK() of those counts within each country, highest first.
#   3. Final select keeps rank 1 only, ordered by country.
# RANK() assigns the same rank to equal counts, so a country whose top
# categories tie yields one output row per tied category.
sql_df1 = spark.sql("""
                WITH category_totals AS (
                    SELECT
                        country,
                        category,
                        COUNT(category) as category_count
                    FROM
                        hilla_pin
                    INNER JOIN
                        hilla_geo ON hilla_pin.ind = hilla_geo.ind
                    GROUP BY
                        country,
                        category
                ), category_ranks AS (
                    SELECT
                        country,
                        category,
                        category_count,
                        RANK() OVER (
                            PARTITION BY country
                            ORDER BY category_count DESC
                        ) as category_rank
                    FROM category_totals
                )
                SELECT
                    country,
                    category,
                    category_count
                FROM
                    category_ranks
                WHERE
                    category_rank = 1
                ORDER BY country;
                        """)

# Persist the result; mode("overwrite") replaces any previous output at this path.
sql_df1.write.mode("overwrite").parquet("/tmp/parquet/most_popular_category_per_country")

### 2a. Frequency of posts per category between 2018 and 2022, in order of popularity

Output written to /tmp/parquet/frequency_of_posts_per_category_per_year

In [0]:
# Q2a: number of posts per category per year, 2018-2022 inclusive.
# The query uses the FROM-first form: `FROM (<subquery>) SELECT ... WHERE ...`.
#   - Inner subquery: extracts the year from `timestamp` as post_year and counts
#     category occurrences per (post_year, category) over hilla_pin inner-joined
#     to hilla_geo on `ind`. GROUP BY refers to the select alias `post_year`.
#   - Outer select: keeps post_year BETWEEN 2018 AND 2022 (both ends inclusive)
#     and orders by year, then by count descending (most popular first).
# sql_df2a is reused by the following cell (2b) as a temp view.
sql_df2a = spark.sql("""
            FROM (
                SELECT
                    date_part('year', timestamp) as post_year,
                    category,
                    COUNT(category) as category_count
                FROM
                    hilla_pin
                INNER JOIN
                    hilla_geo ON hilla_pin.ind = hilla_geo.ind
                GROUP BY
                    post_year,
                    category
            )
            SELECT
                post_year,
                category,
                category_count
            WHERE
                post_year BETWEEN 2018 and 2022
            ORDER BY
                post_year,
                category_count DESC;
                        """)
# Persist the result; mode("overwrite") replaces any previous output at this path.
sql_df2a.write.mode("overwrite").parquet("/tmp/parquet/frequency_of_posts_per_category_per_year")

### 2b. Most popular post category per year

Output written to /tmp/parquet/most_popular_category_per_year

In [0]:
# Q2b: most popular category per year.
# Requires sql_df2a (per-year, per-category counts already limited to 2018-2022)
# from the 2a cell; it is exposed to SQL under a temp-view name here.
sql_df2a.createOrReplaceTempView("frequency_of_posts_per_category_per_year")

# Rank categories within each post_year by category_count (highest first) and
# keep rank 1. RANK() gives tied counts the same rank, so a year with a tie for
# first place produces one row per tied category.
sql_df2b = spark.sql("""
                    WITH
                        frequency_of_posts_per_category_ranked AS (
                    SELECT post_year,
                            category,
                            category_count,
                            RANK() OVER (
                            PARTITION BY post_year
                            ORDER BY category_count DESC
                            ) as category_rank
                    FROM
                        frequency_of_posts_per_category_per_year
                    )
                    SELECT
                        post_year,
                        category,
                        category_count
                    FROM
                        frequency_of_posts_per_category_ranked
                    WHERE
                        category_rank = 1
                    ORDER BY
                        post_year;
""")

# Persist the result; mode("overwrite") replaces any previous output at this path.
sql_df2b.write.mode("overwrite").parquet("/tmp/parquet/most_popular_category_per_year")

### 3a. Find the user with most followers in each country

Output written to /tmp/parquet/user_with_most_followers_per_country

### 3b. Find the country with the user with most followers.

Output written to /tmp/parquet/country_of_user_with_most_followers

In [0]:
# Q3a: poster with the highest follower_count in each country.
#   1. all_posters: (country, poster_name, follower_count) for every pin that has
#      a matching geo row. INNER JOIN is used (as in cells 1 and 2a) so that pins
#      without a geo match do not form a spurious NULL "country" group that
#      could then be picked as the overall winner in 3b.
#   2. follower_counts_ranked: RANK() by follower_count within each country.
#   3. Final select keeps rank 1; DISTINCT collapses the repeated rows that arise
#      when the same poster has several pins in the same country.
# Posters tied on follower_count within a country are all returned.
sql_df3_1 = spark.sql("""
                WITH all_posters AS (
                    SELECT
                        country,
                        poster_name,
                        follower_count
                    FROM
                        hilla_pin
                    INNER JOIN
                        hilla_geo ON hilla_pin.ind = hilla_geo.ind
                ), follower_counts_ranked AS (
                    SELECT
                        country,
                        poster_name,
                        follower_count,
                        RANK() OVER (
                            PARTITION BY country
                            ORDER BY follower_count DESC
                        ) follower_count_ranking
                    FROM all_posters
                )
                SELECT DISTINCT
                    country,
                    poster_name,
                    follower_count
                FROM
                    follower_counts_ranked
                WHERE
                    follower_count_ranking = 1
                ORDER BY
                    country;
                        """)

# Persist the result; mode("overwrite") replaces any previous output at this path.
sql_df3_1.write.mode("overwrite").parquet("/tmp/parquet/user_with_most_followers_per_country")

In [0]:
# Q3b: country of the poster with the most followers overall.
# Requires sql_df3_1 (per-country top posters) from the 3a cell; it is exposed
# to SQL under a temp-view name here.
sql_df3_1.createOrReplaceTempView("hilla_sql_df3_1")

# Rank the per-country winners globally by follower_count (no PARTITION BY, so
# the window spans all rows) and keep rank 1. Only country and follower_count
# are returned; countries tied for the maximum are all returned.
sql_df3_2 = spark.sql("""
                SELECT
                    country,
                    follower_count
                FROM (
                    SELECT
                        country,
                        follower_count,
                        RANK() OVER (
                            ORDER BY follower_count DESC
                        ) follower_count_ranking
                    FROM
                        hilla_sql_df3_1
                    )
                WHERE follower_count_ranking = 1;
                    """)
# Persist the result; mode("overwrite") replaces any previous output at this path.
sql_df3_2.write.mode("overwrite").parquet("/tmp/parquet/country_of_user_with_most_followers")

### 4. Find the most popular category people post to based on the following age groups:

- 18-24
- 25-35
- 36-50
- +50

Output written to /tmp/parquet/most_popular_category_per_age_group

In [0]:
# Q4: most popular category per age group.
#   1. category_counts: buckets each pin by the poster's age and counts
#      categories per (age_group, category).
#   2. category_count_rankings: RANK() of those counts within each age group.
#   3. Final select keeps rank 1 (ties within a group are all returned).
#
# Age buckets are evaluated top-down, so each WHEN only needs an upper bound:
#   age < 25 -> '18-24', age < 36 -> '25-35', age < 51 -> '36-50', else '>50'.
#
# Rows without a known age are excluded (INNER JOIN + `age IS NOT NULL`).
# A NULL age fails every `age < n` comparison, so it would otherwise fall through
# to the ELSE branch and be miscounted as '>50'.
sql_df4 = spark.sql("""
            WITH category_counts AS (
                SELECT
                    category,
                    COUNT(category) as category_count,
                    CASE
                        WHEN age < 25 THEN '18-24'
                        WHEN age < 36 THEN '25-35'
                        WHEN age < 51 THEN '36-50'
                        ELSE '>50'
                    END AS age_group
                FROM
                    hilla_pin
                INNER JOIN
                    hilla_user ON hilla_pin.ind = hilla_user.ind
                WHERE
                    age IS NOT NULL
                GROUP BY
                    age_group,
                    category
            ), category_count_rankings AS (
                SELECT
                    age_group,
                    category,
                    category_count,
                    RANK() OVER (
                        PARTITION BY age_group
                        ORDER BY category_count DESC
                    ) category_count_ranking
                FROM
                    category_counts
            )
            SELECT
                age_group,
                category,
                category_count
            FROM
                category_count_rankings
            WHERE
                category_count_ranking = 1;
                    """)

# Persist the result; mode("overwrite") replaces any previous output at this path.
sql_df4.write.mode("overwrite").parquet("/tmp/parquet/most_popular_category_per_age_group")

### 5. Find the median follower count for users in the following age groups:

- 18-24
- 25-35
- 36-50
- +50

Output written to /tmp/parquet/median_follower_count_per_age_group

In [0]:
# Q5: median follower count per age group.
# Buckets each pin/user row by age and takes percentile_approx(follower_count, 0.5),
# i.e. an approximate median, per bucket. Results are ordered by age_group label.
#
# Age buckets are evaluated top-down, so each WHEN only needs an upper bound:
#   age < 25 -> '18-24', age < 36 -> '25-35', age < 51 -> '36-50', else '>50'.
#
# Rows without a known age are excluded (INNER JOIN + `age IS NOT NULL`).
# A NULL age fails every `age < n` comparison, so it would otherwise fall through
# to the ELSE branch and distort the '>50' median.
sql_df5 = spark.sql("""
            SELECT
                CASE
                    WHEN age < 25 THEN '18-24'
                    WHEN age < 36 THEN '25-35'
                    WHEN age < 51 THEN '36-50'
                    ELSE '>50'
                END AS age_group,
                percentile_approx(follower_count, 0.5) AS median_follower_count
            FROM
                hilla_pin
            INNER JOIN
                hilla_user ON hilla_pin.ind = hilla_user.ind
            WHERE
                age IS NOT NULL
            GROUP BY
                age_group
            ORDER BY
                age_group;
            """)

# Persist the result; mode("overwrite") replaces any previous output at this path.
sql_df5.write.mode("overwrite").parquet("/tmp/parquet/median_follower_count_per_age_group")

### 6. Find how many users have joined between 2015 and 2020.

Output written to /tmp/parquet/new_users_per_year

In [0]:
# Q6: number of users who joined in each year from 2015 to 2020 (inclusive).
#   - Innermost subquery: SELECT DISTINCT * removes exact duplicate rows from
#     hilla_user so the same user record is not counted more than once.
#   - Middle subquery: counts `ind` per joining year (year part of date_joined).
#   - Outer select (FROM-first form): keeps the years 2015-2020 and orders them.
# The upper bound is 2020 to match the question; it was previously 2022, which
# also returned the 2021 and 2022 cohorts.
sql_df6 = spark.sql("""
            FROM (
                SELECT
                    date_part('year', date_joined) as joining_year,
                    COUNT(ind) as number_users_joined
                FROM (
                        SELECT DISTINCT *
                        FROM hilla_user
                )
                GROUP BY
                    joining_year
                )
            SELECT
                joining_year,
                number_users_joined
            WHERE
                joining_year BETWEEN 2015 AND 2020
            ORDER BY
                joining_year;
                        """)

# Persist the result; mode("overwrite") replaces any previous output at this path.
sql_df6.write.mode("overwrite").parquet("/tmp/parquet/new_users_per_year")

### 7. Find the median follower count of posts by users who joined between 2015 and 2020, based on joining year

Output written to /tmp/parquet/median_follower_count_per_joining_year

In [0]:
# Q7: median follower count per joining year, for users who joined 2015-2020.
# The query uses the FROM-first form: `FROM (<subquery>) SELECT ... WHERE ...`.
#   - Subquery: one row per user/pin pair with the user's joining year and
#     follower_count. hilla_user LEFT JOIN hilla_pin keeps users that have no
#     matching pin; for those rows the pin-side columns are NULL
#     (follower_count is presumably a hilla_pin column - confirm against schema).
#   - Outer select: keeps joining_year BETWEEN 2015 AND 2020 (inclusive), then
#     takes percentile_approx(follower_count, 0.5), an approximate median,
#     per joining year and orders by year.
# NOTE(review): hilla_user is not de-duplicated here, unlike the Q6 cell which
# uses SELECT DISTINCT * - confirm whether duplicate user rows matter for this median.
sql_df7 = spark.sql("""
        FROM (
            SELECT
                date_part('year', date_joined) as joining_year,
                follower_count
            FROM
                hilla_user
            LEFT JOIN
                hilla_pin ON hilla_user.ind = hilla_pin.ind
            )
        SELECT
            joining_year,
            percentile_approx(follower_count, 0.5) AS median_follower_count
        WHERE
            joining_year BETWEEN 2015 AND 2020
        GROUP BY
            joining_year
        ORDER BY
            joining_year;
                """)

# Persist the result; mode("overwrite") replaces any previous output at this path.
sql_df7.write.mode("overwrite").parquet("/tmp/parquet/median_follower_count_per_joining_year")

### 8. Find the median follower count of posts by users that have joined between 2015 and 2020, based on their joining year and the age group they are part of

Output written to /tmp/parquet/median_follower_count_per_joining_year_and_age_group

In [0]:
# Q8: median follower count per (joining year, age group) for users who joined
# between 2015 and 2020 (inclusive).
#   - hilla_pin RIGHT JOIN hilla_user keeps every user row, including users with
#     no matching pin (their pin-side columns are NULL).
#   - The WHERE clause restricts to joining years 2015-2020. This filter was
#     previously missing, so every joining year was returned. The expression is
#     repeated in WHERE (rather than using the `joining_year` alias) because
#     select aliases are not visible to WHERE; this mirrors the Q9 cell.
#   - percentile_approx(follower_count, 0.5) is an approximate median.
#
# Age buckets are evaluated top-down, so each WHEN only needs an upper bound.
# NOTE: a NULL age fails every comparison and lands in the ELSE bucket '>50'.
sql_df8 = spark.sql("""
            SELECT
                CASE
                    WHEN AGE < 25 THEN '18-24'
                    WHEN AGE < 36 THEN '25-35'
                    WHEN AGE < 51 THEN '36-50'
                    ELSE '>50'
                END AS age_group,
                date_part('year', date_joined) as joining_year,
                percentile_approx(follower_count, 0.5) AS median_follower_count
            FROM
                hilla_pin
            RIGHT JOIN
                hilla_user ON hilla_pin.ind = hilla_user.ind
            WHERE
                date_part('year', date_joined) BETWEEN 2015 AND 2020
            GROUP BY
                age_group,
                joining_year
            ORDER BY
                joining_year,
                age_group;
                    """)

# Persist the result; mode("overwrite") replaces any previous output at this path.
sql_df8.write.mode("overwrite").parquet("/tmp/parquet/median_follower_count_per_joining_year_and_age_group")

### 9. Find the median follower count of posts made by users between 2015 and 2020, based on posting year and user age group

Output written to /tmp/parquet/median_follower_count_per_post_year_and_age_group

In [0]:
# Q9: median follower count per (posting year, age group) for posts made
# between 2015 and 2020 (inclusive).
#   - hilla_pin is inner-joined to both hilla_geo and hilla_user on `ind`, so only
#     pins with a match in both tables contribute.
#   - WHERE filters on the year part of `timestamp`; the expression is spelled out
#     rather than using the `post_year` select alias.
#   - GROUP BY / ORDER BY refer to the select aliases age_group and post_year.
#   - percentile_approx(follower_count, 0.5) is an approximate median.
#
# Age buckets are evaluated top-down, so each WHEN only needs an upper bound.
# NOTE: a NULL AGE fails every comparison and lands in the ELSE bucket '>50'.
sql_df9 = spark.sql("""
            SELECT
                CASE
                    WHEN AGE < 25 THEN '18-24'
                    WHEN AGE < 36 THEN '25-35'
                    WHEN AGE < 51 THEN '36-50'
                    ELSE '>50'
                END AS age_group,
                date_part('year', timestamp) as post_year,
                percentile_approx(follower_count, 0.5) AS median_follower_count
            FROM
                hilla_pin
            JOIN
                hilla_geo ON hilla_pin.ind = hilla_geo.ind
            JOIN
                hilla_user ON hilla_pin.ind = hilla_user.ind
            WHERE
                date_part('year', timestamp) BETWEEN 2015 AND 2020
            GROUP BY
                age_group,
                post_year
            ORDER BY
                post_year,
                age_group;
                    """)

# Persist the result; mode("overwrite") replaces any previous output at this path.
sql_df9.write.mode("overwrite").parquet("/tmp/parquet/median_follower_count_per_post_year_and_age_group")