In [5]:
import pyspark.sql.functions as F
from core.connection import get_from_database, get_session

spark = get_session()

### Import data from database to spark-warehouse

In [6]:
dw_ame = get_from_database(spark, "information_schema.tables")
tables = (
    dw_ame.select("table_name")
    .where(F.col("table_schema") == "public")
    .rdd.flatMap(lambda x: x)
    .collect()
)
for table in tables:
    get_from_database(spark, table).write.saveAsTable(table)

spark.sql("SHOW TABLES;").show()

                                                                                

+---------+--------------------+-----------+
|namespace|           tableName|isTemporary|
+---------+--------------------+-----------+
|  default|communications_tools|      false|
|  default|             company|      false|
|  default|             country|      false|
|  default|    operation_system|      false|
|  default|programming_language|      false|
|  default|resp_programming_...|      false|
|  default|          resp_tools|      false|
|  default|          respondent|      false|
+---------+--------------------+-----------+



1. Qual a quantidade de respondentes de cada país?

In [7]:
spark.sql(
    """
SELECT 
    c.name AS country, COUNT(c.name) AS total 
FROM 
    respondent r 
INNER JOIN 
    country c ON r.country_id = c.id 
GROUP BY country 
ORDER BY total DESC;
"""
    ).show()

+--------------------+-----+
|             country|total|
+--------------------+-----+
|       United States| 2350|
|               India| 1124|
|      United Kingdom|  749|
|             Germany|  655|
|              Canada|  360|
|              France|  278|
|  Russian Federation|  270|
|              Brazil|  255|
|              Poland|  233|
|               Spain|  203|
|           Australia|  194|
|         Netherlands|  193|
|               Italy|  166|
|              Sweden|  129|
|             Ukraine|  109|
|         Switzerland|  107|
|              Israel|  101|
|              Turkey|   98|
|               China|   92|
|Iran, Islamic Rep...|   89|
+--------------------+-----+
only showing top 20 rows



2. Quantos usuários que moram em "United States" gostam de Windows?

In [8]:
spark.sql(
    """
SELECT 
    count(*) AS `Windows user's in United States`
FROM respondent r 
INNER JOIN 
    operation_system os ON r.operation_system_id = os.id 
INNER JOIN 
    country c ON r.country_id = c.id 
WHERE c.name = 'United States' AND os.name = 'Windows';
"""
).show()

+-------------------------------+
|Windows user's in United States|
+-------------------------------+
|                            961|
+-------------------------------+



3. Qual a média de salário dos usuários que moram em Israel e gostam de Linux?

In [9]:
spark.sql(
    """
SELECT 
    ROUND(AVG(salary), 2) AS `Israel linux user's salary mean`
FROM respondent r 
INNER JOIN 
    operation_system os ON r.operation_system_id = os.id 
INNER JOIN 
    country c ON r.country_id = c.id
WHERE c.name = 'Israel' and os.name = 'Linux-based';
"""
).show()

+-------------------------------+
|Israel linux user's salary mean|
+-------------------------------+
|                       19278.15|
+-------------------------------+



4. Qual a média e o desvio padrão do salário dos usuários que usam Slack para cada tamanho de empresa disponível?

In [13]:
spark.sql(
    """
SELECT 
	size, ROUND(AVG(salary), 2) AS `salary mean`,
    ROUND(STDDEV(salary), 2)  AS `standard deviation`
FROM
	(SELECT 
		r.id, r.name, ct.name AS `communication_tools`, r.salary, r.company_id 
	FROM 
		resp_tools rt 
	INNER JOIN 
		communications_tools ct on rt.communications_tools_id = ct.id 
	INNER JOIN 
		respondent r on rt.respondent_id = r.id) r
INNER JOIN company c on r.company_id = c.id
WHERE communication_tools = 'Slack'
GROUP BY size;
"""
    ).show()

+--------------------+-----------+------------------+
|                size|salary mean|standard deviation|
+--------------------+-----------+------------------+
|Fewer than 10 emp...|   24457.96|          54390.66|
|100 to 499 employees|   30852.68|          57919.33|
|5,000 to 9,999 em...|   30124.88|          61310.04|
|1,000 to 4,999 em...|   30821.72|          51641.98|
|  20 to 99 employees|   30084.28|          63245.62|
|500 to 999 employees|   28238.84|          47406.25|
|10,000 or more em...|   38286.31|          77029.57|
|  10 to 19 employees|   24472.32|          54963.28|
+--------------------+-----------+------------------+

