In [19]:
from core.connection import get_session, save_on_database
from core.config import settings
from pyspark.sql import SparkSession
from pyspark.sql import DataFrame
import pyspark.sql.functions as func
from pyspark.sql.types import StringType

# Obtain the Spark session through the project's `get_session` helper
# (imported from core.connection); it is used below to read the source CSV.
spark: SparkSession = get_session()

### Load data from CSV

In [20]:
# Location of the sample survey file under the project's base directory.
csv_path = settings.BASE_DIR / "data_csv" / "base_de_respostas_10k_amostra.csv"

# Comma-separated file with a header row; column types are inferred by Spark.
data: DataFrame = spark.read.csv(
    str(csv_path), header=True, inferSchema=True, sep=","
)

# Preview the first five rows.
data.show(5)

+----------+-----+----------+-------------+--------------+------------------+--------------------+--------------------+--------------------+--------------------+-----------+---------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+----------+----------+----------+----------+----------+----------+----------+----------+-----------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+---------------+----------------+----------------+---------------------+---------------------+---------------------+---------------------+---------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+--------------------+----------------+------+----------+---------------+--------------+--------------------+-------------------+--------------------+------------------

### Select only the columns used in the Data Warehouse

In [21]:
# Columns kept for the Data Warehouse load; every other column is discarded.
warehouse_columns = [
    "Respondent",
    "Hobby",
    "ConvertedSalary",
    "CommunicationTools",
    "LanguageWorkedWith",
    "OperatingSystem",
    "CompanySize",
    "Country",
]
data = data.select(*warehouse_columns)

# Preview the first five rows of the narrowed frame.
data.show(5)

+----------+-----+---------------+--------------------+--------------------+---------------+--------------------+-------------+
|Respondent|Hobby|ConvertedSalary|  CommunicationTools|  LanguageWorkedWith|OperatingSystem|         CompanySize|      Country|
+----------+-----+---------------+--------------------+--------------------+---------------+--------------------+-------------+
|    101346|   No|        50000.0|               Slack|C#;JavaScript;SQL...|        Windows|100 to 499 employees|United States|
|     44791|  Yes|        79552.0|Jira;Other wiki t...|C;C++;Java;JavaSc...|        Windows|1,000 to 4,999 em...|      Germany|
|     32306|  Yes|       125000.0|Office / producti...|Assembly;C;C++;C#...|        Windows|10,000 or more em...|United States|
|     37142|  Yes|           NULL|        Slack;Trello|C++;Java;JavaScri...|          MacOS|  10 to 19 employees|United States|
|     21745|  Yes|           NULL|        Slack;Trello|C;C++;Java;JavaSc...|        Windows|  20 to 99 e

### Add id column

In [22]:
# Copy the respondent identifier into a column named "id"; the original
# "Respondent" column stays in the frame alongside it.
respondent_id = data["Respondent"]
data = data.withColumn("id", respondent_id)

data.show(5)

+----------+-----+---------------+--------------------+--------------------+---------------+--------------------+-------------+------+
|Respondent|Hobby|ConvertedSalary|  CommunicationTools|  LanguageWorkedWith|OperatingSystem|         CompanySize|      Country|    id|
+----------+-----+---------------+--------------------+--------------------+---------------+--------------------+-------------+------+
|    101346|   No|        50000.0|               Slack|C#;JavaScript;SQL...|        Windows|100 to 499 employees|United States|101346|
|     44791|  Yes|        79552.0|Jira;Other wiki t...|C;C++;Java;JavaSc...|        Windows|1,000 to 4,999 em...|      Germany| 44791|
|     32306|  Yes|       125000.0|Office / producti...|Assembly;C;C++;C#...|        Windows|10,000 or more em...|United States| 32306|
|     37142|  Yes|           NULL|        Slack;Trello|C++;Java;JavaScri...|          MacOS|  10 to 19 employees|United States| 37142|
|     21745|  Yes|           NULL|        Slack;Trello|

### Build the respondent name (`respondent_<id>`) as described in the challenge


In [23]:
def _add_name_prefix(x):
    """Return a string Column holding ``respondent_<x>`` for every row.

    The value is built with Spark's native ``concat``/``lit``/``cast``
    expressions instead of a Python UDF, so no row has to be serialized to a
    Python worker just to prepend a constant prefix.

    Args:
        x: The identifier column, given either as a ``Column`` or as a
            column name (``str``).

    Returns:
        A string ``Column``. A NULL identifier yields NULL (the former UDF
        produced the literal text ``"respondent_None"`` in that case).
    """
    # Accept a column name as well as a Column, as a UDF call would.
    column = func.col(x) if isinstance(x, str) else x
    # Cast explicitly so the result is a string whatever type the identifier has.
    return func.concat(func.lit("respondent_"), column.cast("string"))


# Derive "name" from the respondent identifier, then drop the source column
# ("id", added earlier, still carries the raw identifier).
data = data.withColumn("name", _add_name_prefix(func.col("Respondent"))).drop(
    "Respondent"
)
data.show(5)

+-----+---------------+--------------------+--------------------+---------------+--------------------+-------------+------+-----------------+
|Hobby|ConvertedSalary|  CommunicationTools|  LanguageWorkedWith|OperatingSystem|         CompanySize|      Country|    id|             name|
+-----+---------------+--------------------+--------------------+---------------+--------------------+-------------+------+-----------------+
|   No|        50000.0|               Slack|C#;JavaScript;SQL...|        Windows|100 to 499 employees|United States|101346|respondent_101346|
|  Yes|        79552.0|Jira;Other wiki t...|C;C++;Java;JavaSc...|        Windows|1,000 to 4,999 em...|      Germany| 44791| respondent_44791|
|  Yes|       125000.0|Office / producti...|Assembly;C;C++;C#...|        Windows|10,000 or more em...|United States| 32306| respondent_32306|
|  Yes|           NULL|        Slack;Trello|C++;Java;JavaScri...|          MacOS|  10 to 19 employees|United States| 37142| respondent_37142|
|  Yes

### Insert data into the `operation_system` table

In [24]:
# One row per distinct, non-null operating system, exposed under the column "name".
os_names = data.select(data["OperatingSystem"].alias("name"))
operating_system = os_names.dropDuplicates().na.drop()

# Hand the frame to the project's persistence helper with the target name
# "operation_system".
# NOTE(review): write mode (append vs. overwrite) is decided inside
# core.connection.save_on_database — confirm before re-running this cell.
save_on_database(operating_system, "operation_system")

### Insert data in country

In [25]:
# One row per distinct, non-null country, exposed under the column "name".
country_names = data.select(data["Country"].alias("name"))
country = country_names.dropDuplicates().na.drop()

# Hand the frame to the project's persistence helper with the target name "country".
# NOTE(review): write mode (append vs. overwrite) is decided inside
# core.connection.save_on_database — confirm before re-running this cell.
save_on_database(country, "country")

### Insert data in company

In [26]:
# One row per distinct, non-null company-size bucket, exposed under the column "size".
company_sizes = data.select(data["CompanySize"].alias("size"))
company = company_sizes.dropDuplicates().na.drop()

# Hand the frame to the project's persistence helper with the target name "company".
# NOTE(review): write mode (append vs. overwrite) is decided inside
# core.connection.save_on_database — confirm before re-running this cell.
save_on_database(company, "company")