In [2]:
from core.connection import get_session, save_on_database
from core.config import settings
from pyspark.sql import SparkSession
from pyspark.sql import DataFrame
import pyspark.sql.functions as F
import pyspark.sql.types as T

spark: SparkSession = get_session()

# ðŸ¥‰ Bronze Layer

### Load data from csv

In [3]:
data: DataFrame = spark.read.csv(
    f"{settings.BASE_DIR / 'data_csv' / 'survey_results_public.csv'}",
    sep=",",
    header=True,
    inferSchema=True,
)

data.show(5)

24/06/26 14:13:13 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+----------+-----+----------+--------------+--------------+------------------+--------------------+--------------------+--------------------+--------------------+----------------+---------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+----------+----------+----------+----------+----------+----------+----------+----------+-----------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+----------------+----------------+---------------------+---------------------+---------------------+---------------------+---------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+--------------------+--------------------+------+----------+---------------+--------------+--------------------+-------------------+--------------------+--------

## Select only data used in Data Warehouse

In [4]:
data = data.select(
    "Respondent",
    "Hobby",
    "OpenSource",
    "ConvertedSalary",
    "CommunicationTools",
    "LanguageWorkedWith",
    "OperatingSystem",
    "CompanySize",
    "Country",
)

data.show(5)

+----------+-----+----------+---------------+--------------------+--------------------+---------------+--------------------+--------------+
|Respondent|Hobby|OpenSource|ConvertedSalary|  CommunicationTools|  LanguageWorkedWith|OperatingSystem|         CompanySize|       Country|
+----------+-----+----------+---------------+--------------------+--------------------+---------------+--------------------+--------------+
|         1|  Yes|        No|             NA|               Slack|JavaScript;Python...|    Linux-based|  20 to 99 employees|         Kenya|
|         3|  Yes|       Yes|          70841|Confluence;Office...|JavaScript;Python...|    Linux-based|10,000 or more em...|United Kingdom|
|         4|  Yes|       Yes|             NA|                  NA|                  NA|             NA|  20 to 99 employees| United States|
|         5|   No|        No|             NA|                  NA|C#;JavaScript;SQL...|        Windows|100 to 499 employees| United States|
|         7|  Yes|  

# ðŸ¥ˆ Silver Layer

### Replace Na to None

In [5]:
data = data.replace("NA", None)

### Create function to add id in dataframe

In [6]:
def add_id_column(df: DataFrame, session: SparkSession) -> DataFrame:
    df_items = [y for x in df.collect() for y in list(x.asDict().values())]
    return session.createDataFrame(
        list(enumerate(df_items, start=1)), "id: int, name: string"
    )

### Create dataframe operating_system

In [7]:
operating_system = add_id_column(
    data.select(F.col("OperatingSystem").alias("name")).distinct().dropna(),
    spark,
)
operating_system.show()

                                                                                

+---+-----------+
| id|       name|
+---+-----------+
|  1|Linux-based|
|  2|   BSD/Unix|
|  3|      MacOS|
|  4|    Windows|
+---+-----------+



### Create dataframe country

In [8]:
country = add_id_column(
    data.select(F.col("Country").alias("name")).distinct().dropna(), spark
)
country.show()

                                                                                

+---+--------------------+
| id|                name|
+---+--------------------+
|  1|            Paraguay|
|  2|The former Yugosl...|
|  3|               Yemen|
|  4|              Sweden|
|  5|  Hong Kong (S.A.R.)|
|  6|   Republic of Korea|
|  7|         Philippines|
|  8|            Malaysia|
|  9|           Singapore|
| 10|                Fiji|
| 11|              Turkey|
| 12|             Germany|
| 13|            Cambodia|
| 14|         Afghanistan|
| 15|              Jordan|
| 16|              Rwanda|
| 17|              France|
| 18|              Greece|
| 19|           Sri Lanka|
| 20|              Taiwan|
+---+--------------------+
only showing top 20 rows



### Create dataframe company

In [9]:
company = add_id_column(
    data.select(F.col("CompanySize").alias("size")).distinct().dropna(), spark
)
company.show()

                                                                                

+---+--------------------+
| id|                name|
+---+--------------------+
|  1|Fewer than 10 emp...|
|  2|100 to 499 employees|
|  3|5,000 to 9,999 em...|
|  4|1,000 to 4,999 em...|
|  5|  20 to 99 employees|
|  6|500 to 999 employees|
|  7|10,000 or more em...|
|  8|  10 to 19 employees|
+---+--------------------+



## Create dataframe respondent

In [10]:
respondent = data.select(
    "Respondent",
    "Hobby",
    "OpenSource",
    "ConvertedSalary",
    "OperatingSystem",
    "CompanySize",
    "Country"
)
respondent.show(5)

+----------+-----+----------+---------------+---------------+--------------------+--------------+
|Respondent|Hobby|OpenSource|ConvertedSalary|OperatingSystem|         CompanySize|       Country|
+----------+-----+----------+---------------+---------------+--------------------+--------------+
|         1|  Yes|        No|           NULL|    Linux-based|  20 to 99 employees|         Kenya|
|         3|  Yes|       Yes|          70841|    Linux-based|10,000 or more em...|United Kingdom|
|         4|  Yes|       Yes|           NULL|           NULL|  20 to 99 employees| United States|
|         5|   No|        No|           NULL|        Windows|100 to 499 employees| United States|
|         7|  Yes|        No|          21426|        Windows|10,000 or more em...|  South Africa|
+----------+-----+----------+---------------+---------------+--------------------+--------------+
only showing top 5 rows



### Add id column

In [11]:
respondent = respondent.withColumn("id", F.col("Respondent"))
respondent.show(5)

+----------+-----+----------+---------------+---------------+--------------------+--------------+---+
|Respondent|Hobby|OpenSource|ConvertedSalary|OperatingSystem|         CompanySize|       Country| id|
+----------+-----+----------+---------------+---------------+--------------------+--------------+---+
|         1|  Yes|        No|           NULL|    Linux-based|  20 to 99 employees|         Kenya|  1|
|         3|  Yes|       Yes|          70841|    Linux-based|10,000 or more em...|United Kingdom|  3|
|         4|  Yes|       Yes|           NULL|           NULL|  20 to 99 employees| United States|  4|
|         5|   No|        No|           NULL|        Windows|100 to 499 employees| United States|  5|
|         7|  Yes|        No|          21426|        Windows|10,000 or more em...|  South Africa|  7|
+----------+-----+----------+---------------+---------------+--------------------+--------------+---+
only showing top 5 rows



### Change respondent name as described in the challenge.


In [12]:
@F.udf(returnType=T.StringType())
def _add_name_prefix(x):
    return f"respondent_{x}"


respondent = respondent.withColumn("name", _add_name_prefix(F.col("Respondent"))).drop(
    "Respondent"
)
respondent.show(5)

+-----+----------+---------------+---------------+--------------------+--------------+---+------------+
|Hobby|OpenSource|ConvertedSalary|OperatingSystem|         CompanySize|       Country| id|        name|
+-----+----------+---------------+---------------+--------------------+--------------+---+------------+
|  Yes|        No|           NULL|    Linux-based|  20 to 99 employees|         Kenya|  1|respondent_1|
|  Yes|       Yes|          70841|    Linux-based|10,000 or more em...|United Kingdom|  3|respondent_3|
|  Yes|       Yes|           NULL|           NULL|  20 to 99 employees| United States|  4|respondent_4|
|   No|        No|           NULL|        Windows|100 to 499 employees| United States|  5|respondent_5|
|  Yes|        No|          21426|        Windows|10,000 or more em...|  South Africa|  7|respondent_7|
+-----+----------+---------------+---------------+--------------------+--------------+---+------------+
only showing top 5 rows



#### Create function to reference id from database & replace value

In [13]:
def get_reference_from_dataframe(df: DataFrame) -> dict[str, int]:
    return {v: k for k, v in dict(df.collect()).items()}


def replace_value(x, to_replace: dict):
    return to_replace.get(x, T.NullType())

### create column company_id

In [15]:
company_ref = get_reference_from_dataframe(company)


@F.udf(returnType=T.IntegerType())
def company_replace_udf(x):
    return replace_value(x, company_ref)


respondent = respondent.withColumn(
    "company_id", company_replace_udf(F.col("CompanySize"))
).drop("CompanySize")
respondent.show(5)

+-----+----------+---------------+---------------+--------------+---+------------+----------+
|Hobby|OpenSource|ConvertedSalary|OperatingSystem|       Country| id|        name|company_id|
+-----+----------+---------------+---------------+--------------+---+------------+----------+
|  Yes|        No|           NULL|    Linux-based|         Kenya|  1|respondent_1|         5|
|  Yes|       Yes|          70841|    Linux-based|United Kingdom|  3|respondent_3|         7|
|  Yes|       Yes|           NULL|           NULL| United States|  4|respondent_4|         5|
|   No|        No|           NULL|        Windows| United States|  5|respondent_5|         2|
|  Yes|        No|          21426|        Windows|  South Africa|  7|respondent_7|         7|
+-----+----------+---------------+---------------+--------------+---+------------+----------+
only showing top 5 rows



#### create column operation_system_id

In [16]:
os_ref = get_reference_from_dataframe(operating_system)


@F.udf(returnType=T.IntegerType())
def os_replace_udf(x):
    return replace_value(x, os_ref)


respondent = respondent.withColumn(
    "operation_system_id", os_replace_udf(F.col("OperatingSystem"))
).drop("OperatingSystem")
respondent.show(5)

+-----+----------+---------------+--------------+---+------------+----------+-------------------+
|Hobby|OpenSource|ConvertedSalary|       Country| id|        name|company_id|operation_system_id|
+-----+----------+---------------+--------------+---+------------+----------+-------------------+
|  Yes|        No|           NULL|         Kenya|  1|respondent_1|         5|                  1|
|  Yes|       Yes|          70841|United Kingdom|  3|respondent_3|         7|                  1|
|  Yes|       Yes|           NULL| United States|  4|respondent_4|         5|               NULL|
|   No|        No|           NULL| United States|  5|respondent_5|         2|                  4|
|  Yes|        No|          21426|  South Africa|  7|respondent_7|         7|                  4|
+-----+----------+---------------+--------------+---+------------+----------+-------------------+
only showing top 5 rows



#### create column country_id

In [17]:
country_ref = get_reference_from_dataframe(country)


@F.udf(returnType=T.IntegerType())
def country_replace_udf(x):
    return replace_value(x, country_ref)


respondent = respondent.withColumn(
    "country_id", country_replace_udf(F.col("Country"))
).drop("Country")

respondent.show(5)

+-----+----------+---------------+---+------------+----------+-------------------+----------+
|Hobby|OpenSource|ConvertedSalary| id|        name|company_id|operation_system_id|country_id|
+-----+----------+---------------+---+------------+----------+-------------------+----------+
|  Yes|        No|           NULL|  1|respondent_1|         5|                  1|        97|
|  Yes|       Yes|          70841|  3|respondent_3|         7|                  1|       127|
|  Yes|       Yes|           NULL|  4|respondent_4|         5|               NULL|        38|
|   No|        No|           NULL|  5|respondent_5|         2|                  4|        38|
|  Yes|        No|          21426|  7|respondent_7|         7|                  4|       121|
+-----+----------+---------------+---+------------+----------+-------------------+----------+
only showing top 5 rows



### Change ConvertedSalary to Decimal and transform null to 0.0

In [18]:
respondent = respondent.withColumn(
    "ConvertedSalary", respondent["ConvertedSalary"].cast(T.DoubleType())
).fillna(0.0, subset="ConvertedSalary")
respondent.show(10)

+-----+----------+---------------+---+-------------+----------+-------------------+----------+
|Hobby|OpenSource|ConvertedSalary| id|         name|company_id|operation_system_id|country_id|
+-----+----------+---------------+---+-------------+----------+-------------------+----------+
|  Yes|        No|            0.0|  1| respondent_1|         5|                  1|        97|
|  Yes|       Yes|        70841.0|  3| respondent_3|         7|                  1|       127|
|  Yes|       Yes|            0.0|  4| respondent_4|         5|               NULL|        38|
|   No|        No|            0.0|  5| respondent_5|         2|                  4|        38|
|  Yes|        No|        21426.0|  7| respondent_7|         7|                  4|       121|
|  Yes|        No|        41671.0|  8| respondent_8|         8|                  1|       127|
|  Yes|       Yes|       120000.0|  9| respondent_9|         7|                  3|        38|
|  Yes|       Yes|            0.0| 10|respondent_1

#### Transform salary

In [19]:
@F.udf(returnType=T.DoubleType())
def convert_salary(x):
    return round((x / settings.YEAR_MONTHS) * settings.DOLLAR_EXCHANGE, 2)


respondent = respondent.withColumn(
    "salary", convert_salary(F.col("ConvertedSalary"))
).drop("ConvertedSalary")
respondent.show(5)

+-----+----------+---+------------+----------+-------------------+----------+--------+
|Hobby|OpenSource| id|        name|company_id|operation_system_id|country_id|  salary|
+-----+----------+---+------------+----------+-------------------+----------+--------+
|  Yes|        No|  1|respondent_1|         5|                  1|        97|     0.0|
|  Yes|       Yes|  3|respondent_3|         7|                  1|       127|22492.02|
|  Yes|       Yes|  4|respondent_4|         5|               NULL|        38|     0.0|
|   No|        No|  5|respondent_5|         2|                  4|        38|     0.0|
|  Yes|        No|  7|respondent_7|         7|                  4|       121| 6802.76|
+-----+----------+---+------------+----------+-------------------+----------+--------+
only showing top 5 rows



#### transform OpenSource and Hobby column and order dataframe

In [20]:
respondent = (
    respondent.select(
        "id",
        "name",
        "OpenSource",
        "Hobby",
        "salary",
        "operation_system_id",
        "country_id",
        "company_id",
    )
    .withColumns(
        {
            "open_source": F.col("OpenSource").cast(T.BooleanType()),
            "hobby": F.col("Hobby").cast(T.BooleanType()),
        }
    )
    .drop("OpenSource")
)
respondent.show(5)

+---+------------+-----+--------+-------------------+----------+----------+-----------+
| id|        name|hobby|  salary|operation_system_id|country_id|company_id|open_source|
+---+------------+-----+--------+-------------------+----------+----------+-----------+
|  1|respondent_1| true|     0.0|                  1|        97|         5|      false|
|  3|respondent_3| true|22492.02|                  1|       127|         7|       true|
|  4|respondent_4| true|     0.0|               NULL|        38|         5|       true|
|  5|respondent_5|false|     0.0|                  4|        38|         2|      false|
|  7|respondent_7| true| 6802.76|                  4|       121|         7|      false|
+---+------------+-----+--------+-------------------+----------+----------+-----------+
only showing top 5 rows



### Create function to separate values in CommunicationTools and LanguageWorkedWith

In [21]:
def sep_col_values(df: DataFrame, col: str, sep: str = ";") -> DataFrame:

    col_separated = (
        df.select((F.split(col, sep))).rdd.flatMap(lambda x: x).collect()
    )

    col_separated = set(
        [
            item
            for sublist in col_separated
            if sublist is not None
            for item in sublist
        ]
    )

    return spark.createDataFrame(
        list(enumerate(col_separated, start=1)), "id: int, name: string"
    )

### Create dataframe programming_language

In [22]:
programming_language = sep_col_values(data, "LanguageWorkedWith")
programming_language.show(10)

                                                                                

+---+----------+
| id|      name|
+---+----------+
|  1|      Hack|
|  2|      Rust|
|  3|      Java|
|  4|   Clojure|
|  5|       Lua|
|  6|     Swift|
|  7|       C++|
|  8|    Matlab|
|  9|JavaScript|
| 10|    Kotlin|
+---+----------+
only showing top 10 rows



### Create dataframe communications_tools

In [23]:
communications_tools = sep_col_values(data, "CommunicationTools")
communications_tools.show()

                                                                                

+---+--------------------+
| id|                name|
+---+--------------------+
|  1|Office / producti...|
|  2|            Facebook|
|  3|                Jira|
|  4|Google Hangouts/Chat|
|  5|Other chat system...|
|  6|              Trello|
|  7|Stack Overflow En...|
|  8|Other wiki tool (...|
|  9|          Confluence|
| 10|             HipChat|
| 11|               Slack|
+---+--------------------+



### Create function to separate every language in row

In [22]:
def col_list_to_df(df: DataFrame, col: str, sep: str = ";") -> DataFrame:

    data_rdd = df.select("id", F.split(col, ";")).dropna().rdd.collect()

    data_list = [list(zip([b] * len(c), c)) for b, c in data_rdd]

    split_data = [y for x in data_list for y in x]

    return spark.createDataFrame(
        split_data,
        T.StructType(
            [
                T.StructField("respondent_id", T.LongType()),
                T.StructField("name", T.StringType()),
            ]
        ),
    )

#### Create dataframe resp_programming_language

In [23]:
programming_language_ref = get_reference_from_dataframe(programming_language)

resp_programming_language = col_list_to_df(data, "LanguageWorkedWith")


@F.udf(returnType=T.IntegerType())
def programming_replace_udf(x):
    return replace_value(x, programming_language_ref)


resp_programming_language = resp_programming_language.withColumn(
    "programming_language_id", programming_replace_udf(F.col("name"))
).drop("name")

resp_programming_language.show(5)

                                                                                

+-------------+-----------------------+
|respondent_id|programming_language_id|
+-------------+-----------------------+
|            1|                     27|
|            1|                     38|
|            1|                     20|
|            1|                     25|
|            3|                     27|
+-------------+-----------------------+
only showing top 5 rows



#### Create dataframe resp_tools

In [24]:
communications_tools_ref = get_reference_from_dataframe(communications_tools)

resp_tools = col_list_to_df(data, "CommunicationTools")


@F.udf(returnType=T.IntegerType())
def communication_replace_udf(x):
    return replace_value(x, communications_tools_ref)


resp_tools = resp_tools.withColumn(
    "communications_tools_id", communication_replace_udf(F.col("name"))
).drop("name")

resp_tools.show(5)

+-------------+-----------------------+
|respondent_id|communications_tools_id|
+-------------+-----------------------+
|            1|                      2|
|            3|                     11|
|            3|                      4|
|            3|                      2|
|            3|                      5|
+-------------+-----------------------+
only showing top 5 rows

