In [1]:
import pyspark.sql.functions as F
from core.connection import get_from_database, get_session

spark = get_session()

24/06/27 16:51:40 WARN Utils: Your hostname, IdeaPad-Gaming-3-15IHU6 resolves to a loopback address: 127.0.1.1; using 192.168.1.4 instead (on interface wlp0s20f3)
24/06/27 16:51:40 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
24/06/27 16:51:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/27 16:51:41 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


### Import data from database to spark-warehouse

In [3]:
dw_ame = get_from_database(spark, "information_schema.tables")
tables = (
    dw_ame.select("table_name")
    .where(F.col("table_schema") == "public")
    .rdd.flatMap(lambda x: x)
    .collect()
)
for table in tables:
    get_from_database(spark, table).write.saveAsTable(table)

spark.sql("SHOW TABLES;").show()

                                                                                

+---------+--------------------+-----------+
|namespace|           tableName|isTemporary|
+---------+--------------------+-----------+
|  default| communication_tools|      false|
|  default|             company|      false|
|  default|             country|      false|
|  default|    operation_system|      false|
|  default|programming_language|      false|
|  default|resp_programming_...|      false|
|  default|          resp_tools|      false|
|  default|          respondent|      false|
+---------+--------------------+-----------+



1. Qual a quantidade de respondentes de cada país?

In [4]:
spark.sql(
    """
SELECT 
    c.name AS country, COUNT(c.name) AS total 
FROM 
    respondent r 
INNER JOIN 
    country c ON r.country_id = c.id 
GROUP BY country 
ORDER BY total DESC;
"""
).show()

+------------------+-----+
|           country|total|
+------------------+-----+
|     United States|20309|
|             India|13721|
|           Germany| 6459|
|    United Kingdom| 6221|
|            Canada| 3393|
|Russian Federation| 2869|
|            France| 2572|
|            Brazil| 2505|
|            Poland| 2122|
|         Australia| 2018|
|       Netherlands| 1841|
|             Spain| 1769|
|             Italy| 1535|
|           Ukraine| 1279|
|            Sweden| 1164|
|          Pakistan| 1050|
|             China| 1037|
|       Switzerland| 1010|
|            Turkey| 1004|
|            Israel| 1003|
+------------------+-----+
only showing top 20 rows



2. Quantos usuários que moram em "United States" gostam de Windows?

In [5]:
spark.sql(
    """
SELECT 
    count(*) AS `Windows user's in United States`
FROM respondent r 
INNER JOIN 
    operation_system os ON r.operation_system_id = os.id 
INNER JOIN 
    country c ON r.country_id = c.id 
WHERE c.name = 'United States' AND os.name = 'Windows';
"""
).show()

+-------------------------------+
|Windows user's in United States|
+-------------------------------+
|                           7635|
+-------------------------------+



3. Qual a média de salário dos usuários que moram em Israel e gostam de Linux?

In [6]:
spark.sql(
    """
SELECT 
    ROUND(AVG(salary), 2) AS `Israel linux user's salary mean`
FROM respondent r 
INNER JOIN 
    operation_system os ON r.operation_system_id = os.id 
INNER JOIN 
    country c ON r.country_id = c.id
WHERE c.name = 'Israel' and os.name = 'Linux-based';
"""
).show()

+-------------------------------+
|Israel linux user's salary mean|
+-------------------------------+
|                       16809.26|
+-------------------------------+



4. Qual a média e o desvio padrão do salário dos usuários que usam Slack para cada tamanho de empresa disponível?

In [7]:
spark.sql(
    """
SELECT 
	size, ROUND(AVG(salary), 2) AS `salary mean`,
    ROUND(STDDEV(salary), 2)  AS `standard deviation`
FROM
	(SELECT 
		r.id, r.name, ct.name AS `communication_tools`, r.salary, r.company_id 
	FROM 
		resp_tools rt 
	INNER JOIN 
		communication_tools ct on rt.communication_tools_id = ct.id 
	INNER JOIN 
		respondent r on rt.respondent_id = r.id) r
INNER JOIN company c on r.company_id = c.id
WHERE communication_tools = 'Slack'
GROUP BY size;
"""
).show()

+--------------------+-----------+------------------+
|                size|salary mean|standard deviation|
+--------------------+-----------+------------------+
|Fewer than 10 emp...|   21037.84|          52621.87|
|100 to 499 employees|   27616.54|          59485.82|
|5,000 to 9,999 em...|   30055.67|          68600.93|
|1,000 to 4,999 em...|   30874.92|          66487.79|
|  20 to 99 employees|   24677.47|          56113.57|
|500 to 999 employees|   27407.43|          58763.03|
|10,000 or more em...|   34710.87|          75875.98|
|  10 to 19 employees|   21523.31|          53277.85|
+--------------------+-----------+------------------+



5. Qual a diferença entre a média de salário dos respondentes do Brasil que acham que criar código é um hobby e a média de todos de salário de todos os respondentes brasileiros agrupado por cada sistema operacional que eles usam?

In [8]:
spark.sql(
    """
WITH salary_mean AS (
    SELECT
        os.name AS operation_system, ROUND(AVG(r.salary), 2) AS `salary_mean`
    FROM
        respondent r 
    INNER JOIN 
        country c ON r.country_id = c.id 
    INNER JOIN
        operation_system os ON r.operation_system_id = os.id
    WHERE
        c.name = 'Brazil'
    GROUP BY os.name
), hobby_salary_mean AS (
    SELECT
        os.name AS operation_system, ROUND(AVG(r.salary), 2) AS `salary_mean`
    FROM
        respondent r 
    INNER JOIN 
        country c ON r.country_id = c.id 
    INNER JOIN
        operation_system os ON r.operation_system_id = os.id
    WHERE
        c.name = 'Brazil' AND r.hobby = TRUE
    GROUP BY os.name
)
SELECT
    sm.operation_system, sm.salary_mean AS `brazil salary mean`,
    hm.salary_mean AS `brazil hobby salary mean`,
    sm.salary_mean - hm.salary_mean AS `diff salary mean`
FROM 
    salary_mean sm 
JOIN 
    hobby_salary_mean hm ON sm.operation_system = hm.operation_system;
"""
).show()

+----------------+------------------+------------------------+----------------+
|operation_system|brazil salary mean|brazil hobby salary mean|diff salary mean|
+----------------+------------------+------------------------+----------------+
|     Linux-based|           9831.36|                10141.35|         -309.99|
|        BSD/Unix|          90025.22|                90025.22|            0.00|
|           MacOS|          10638.05|                10279.19|          358.86|
|         Windows|          10634.36|                10567.10|           67.26|
+----------------+------------------+------------------------+----------------+



6. Quais são as top 3 tecnologias mais usadas pelos desenvolvedores?

In [9]:
spark.sql(
    """
SELECT 
    pl.name, COUNT(rpl.respondent_id) AS `Total users`
FROM
    resp_programming_language rpl
INNER JOIN 
    programming_language pl ON rpl.programming_language_id = pl.id
GROUP BY pl.name
ORDER BY 2 DESC
LIMIT 3;

"""
).show()

+----------+-----------+
|      name|Total users|
+----------+-----------+
|JavaScript|      54686|
|      HTML|      53628|
|       CSS|      50979|
+----------+-----------+



7. Quais são os top 5 países em questão de salário?

In [10]:
spark.sql(
    """
SELECT DISTINCT 
    c.name AS country, 
    ROUND(r.salary, 2) AS salary
FROM respondent r 
INNER JOIN 
    country c ON r.country_id = c.id 
ORDER BY salary DESC 
LIMIT 5;
"""
).show()

+-------------+---------+
|      country|   salary|
+-------------+---------+
|     Malaysia|635000.00|
|    Argentina|635000.00|
|   Bangladesh|635000.00|
|United States|635000.00|
|     Colombia|635000.00|
+-------------+---------+



8. A tabela abaixo contém os salários mínimos mensais de cinco países presentes na amostra de dados. Baseado nesses valores, gostaríamos de saber quantos usuários ganham mais de 5 salários mínimos em cada um desses países.

In [11]:
countries = {
    "United States": 4787.9,
    "India": 243.52,
    "United Kingdom": 6925.63,
    "Germany": 6664.0,
    "Canada": 5567.68,
}

dfs = []
for country, minimum in countries.items():
    dfs.append(
        spark.sql(
            """
select 
	count(r.salary) as salary, c.name as country 
from 
	respondent r 
inner join 
	country c on r.country_id = c.id 
where c.name = '{}' and round(r.salary / {}) >= 5
group by country;
""".format(
                country, minimum
            )
        )
    )
spark.createDataFrame(
    [(row.country, row.salary) for df in dfs for row in df.collect()],
    ["Country", "GT 5 salaries"],
).show()

+--------------+-------------+
|       Country|GT 5 salaries|
+--------------+-------------+
| United States|        10148|
|         India|         3412|
|United Kingdom|          897|
|       Germany|          498|
|        Canada|          590|
+--------------+-------------+

