# Occupation

### Introduction:

Special thanks to: https://github.com/justmarkham for sharing the dataset and materials.

### Step 1. Import the necessary libraries

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T

In [2]:
spark = SparkSession.builder \
    .appName('Spark exercises based on pandas exercises') \
    .getOrCreate()

23/03/03 20:23:29 WARN Utils: Your hostname, karlos-300E5M-300E5L resolves to a loopback address: 127.0.1.1; using 192.168.10.20 instead (on interface enp1s0)
23/03/03 20:23:29 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/03/03 20:23:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Step 2. Import the dataset from this [address](https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user). 

### Step 3. Assign it to a variable called users.

In [3]:
from pyspark import SparkFiles

URL = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user'

spark.sparkContext.addFile(URL)

schema = T.StructType([
    T.StructField('user_id', T.IntegerType()),
    T.StructField('age', T.IntegerType()),
    T.StructField('gender', T.StringType()),
    T.StructField('occupation', T.StringType()),
    T.StructField('zip_code', T.IntegerType())
])

users = spark.read.csv(
    path='file://'+SparkFiles.get('u.user'),
    sep='|',
    header=True,
    schema=schema,
    mode='FAILFAST'
)

users.show()

[Stage 0:>                                                          (0 + 1) / 1]

+-------+---+------+-------------+--------+
|user_id|age|gender|   occupation|zip_code|
+-------+---+------+-------------+--------+
|      1| 24|     M|   technician|   85711|
|      2| 53|     F|        other|   94043|
|      3| 23|     M|       writer|   32067|
|      4| 24|     M|   technician|   43537|
|      5| 33|     F|        other|   15213|
|      6| 42|     M|    executive|   98101|
|      7| 57|     M|administrator|   91344|
|      8| 36|     M|administrator|    5201|
|      9| 29|     M|      student|    1002|
|     10| 53|     M|       lawyer|   90703|
|     11| 39|     F|        other|   30329|
|     12| 28|     F|        other|    6405|
|     13| 47|     M|     educator|   29206|
|     14| 45|     M|    scientist|   55106|
|     15| 49|     F|     educator|   97301|
|     16| 21|     M|entertainment|   10309|
|     17| 30|     M|   programmer|    6355|
|     18| 35|     F|        other|   37212|
|     19| 40|     M|    librarian|    2138|
|     20| 42|     F|    homemake

                                                                                

### Step 4. Discover what is the mean age per occupation

In [23]:
## First, let's find out how many occuptions there are.

number_of_occupations = users.select('occupation').distinct().count()
print(f'{number_of_occupations = }')

number_of_occupations = 21


In [24]:
## Alternative:

users.selectExpr('COUNT(DISTINCT occupation) number_of_occupations') \
    .show()

+---------------------+
|number_of_occupations|
+---------------------+
|                   21|
+---------------------+



In [55]:
users.groupBy('occupation') \
    .agg(F.round(F.mean('age'), 1).alias('mean age')) \
    .orderBy('mean age', ascending=False) \
    .show(number_of_occupations, truncate=False)

+-------------+--------+
|occupation   |mean age|
+-------------+--------+
|retired      |63.1    |
|doctor       |43.6    |
|educator     |42.0    |
|healthcare   |41.6    |
|librarian    |40.0    |
|administrator|38.7    |
|executive    |38.7    |
|marketing    |37.6    |
|lawyer       |36.8    |
|engineer     |36.4    |
|writer       |36.3    |
|salesman     |35.7    |
|scientist    |35.5    |
|other        |34.5    |
|programmer   |33.1    |
|technician   |33.1    |
|homemaker    |32.6    |
|artist       |31.4    |
|entertainment|29.2    |
|none         |26.6    |
|student      |22.1    |
+-------------+--------+



### Step 5. Discover the Male ratio per occupation and sort it from the most to the least

In [72]:
users.groupBy('occupation', 'gender') \
    .count() \
    .orderBy('occupation') \
    .show()

+-------------+------+-----+
|   occupation|gender|count|
+-------------+------+-----+
|administrator|     M|   43|
|administrator|     F|   36|
|       artist|     F|   13|
|       artist|     M|   15|
|       doctor|     M|    7|
|     educator|     M|   69|
|     educator|     F|   26|
|     engineer|     M|   65|
|     engineer|     F|    2|
|entertainment|     F|    2|
|entertainment|     M|   16|
|    executive|     M|   29|
|    executive|     F|    3|
|   healthcare|     M|    5|
|   healthcare|     F|   11|
|    homemaker|     F|    6|
|    homemaker|     M|    1|
|       lawyer|     F|    2|
|       lawyer|     M|   10|
|    librarian|     M|   22|
+-------------+------+-----+
only showing top 20 rows



In [127]:
# Let's see if all occupations have at least one woman:

# If the result is 0, then every occupation has at least one woman:
users.groupBy('occupation', 'gender') \
    .count() \
    .where('gender = "F" AND count = 0') \
    .count()

0

In [129]:
users.groupBy('occupation') \
    .pivot('gender') \
    .count() \
    .selectExpr('occupation', 'ROUND(M / (M + F), 2) as male_ratio') \
    .orderBy('male_ratio', ascending=False) \
    .show(number_of_occupations)

+-------------+----------+
|   occupation|male_ratio|
+-------------+----------+
|     engineer|      0.97|
|   technician|      0.96|
|      retired|      0.93|
|   programmer|      0.91|
|    executive|      0.91|
|    scientist|       0.9|
|entertainment|      0.89|
|       lawyer|      0.83|
|     salesman|      0.75|
|     educator|      0.73|
|      student|      0.69|
|        other|      0.66|
|    marketing|      0.62|
|       writer|      0.58|
|         none|      0.56|
|       artist|      0.54|
|administrator|      0.54|
|    librarian|      0.43|
|   healthcare|      0.31|
|    homemaker|      0.14|
|       doctor|      null|
+-------------+----------+



In [26]:
# Fixing the null, when there is no male or female for
# some occupation.

"""
users.groupBy('occupation') \
    .agg(
        F.round(
            F.sum(
                F.when(F.col('gender') == 'M', 1)
                .otherwise(0)
            ) / F.count('*'), 2) \
        .alias('male_ratio')
    ).orderBy('male_ratio', ascending=False) \
    .show()
"""

users.groupBy('occupation') \
    .agg(
        (F.sum(
            F.when(F.col('gender') == 'M', 1)
            .otherwise(0)
        ) / F.count('*')).alias('male_ratio')
    ).selectExpr('occupation', 'ROUND(male_ratio, 2) AS male_ratio') \
    .orderBy('male_ratio', ascending=False) \
    .show(number_of_occupations)


+-------------+----------+
|   occupation|male_ratio|
+-------------+----------+
|       doctor|       1.0|
|     engineer|      0.97|
|   technician|      0.96|
|      retired|      0.93|
|   programmer|      0.91|
|    executive|      0.91|
|    scientist|       0.9|
|entertainment|      0.89|
|       lawyer|      0.83|
|     salesman|      0.75|
|     educator|      0.73|
|      student|      0.69|
|        other|      0.66|
|    marketing|      0.62|
|       writer|      0.58|
|         none|      0.56|
|administrator|      0.54|
|       artist|      0.54|
|    librarian|      0.43|
|   healthcare|      0.31|
|    homemaker|      0.14|
+-------------+----------+



In [28]:
users.createOrReplaceTempView('users')

spark.sql(
    """
    SELECT
        occupation,
        ROUND(
            SUM(
                CASE gender
                    WHEN "M" THEN 1
                    ELSE 0
                END
            ) / COUNT(*),
        2) as male_ratio
    FROM users
    GROUP BY occupation
    ORDER BY male_ratio DESC
    """
).show(number_of_occupations)

+-------------+----------+
|   occupation|male_ratio|
+-------------+----------+
|       doctor|       1.0|
|     engineer|      0.97|
|   technician|      0.96|
|      retired|      0.93|
|   programmer|      0.91|
|    executive|      0.91|
|    scientist|       0.9|
|entertainment|      0.89|
|       lawyer|      0.83|
|     salesman|      0.75|
|     educator|      0.73|
|      student|      0.69|
|        other|      0.66|
|    marketing|      0.62|
|       writer|      0.58|
|         none|      0.56|
|administrator|      0.54|
|       artist|      0.54|
|    librarian|      0.43|
|   healthcare|      0.31|
|    homemaker|      0.14|
+-------------+----------+



### Step 6. For each occupation, calculate the minimum and maximum ages

In [137]:
users.groupBy('occupation') \
    .agg(
        F.min('age').alias('min_age'),
        F.max('age').alias('max_age')
    ).show()

+-------------+-------+-------+
|   occupation|min_age|max_age|
+-------------+-------+-------+
|    librarian|     23|     69|
|      retired|     51|     73|
|       lawyer|     21|     53|
|         none|     11|     55|
|       writer|     18|     60|
|   programmer|     20|     63|
|    marketing|     24|     55|
|        other|     13|     64|
|    executive|     22|     69|
|    scientist|     23|     55|
|      student|      7|     42|
|     salesman|     18|     66|
|       artist|     19|     48|
|   technician|     21|     55|
|administrator|     21|     70|
|     engineer|     22|     70|
|   healthcare|     22|     62|
|     educator|     23|     63|
|entertainment|     15|     50|
|    homemaker|     20|     50|
+-------------+-------+-------+
only showing top 20 rows



### Step 7. For each combination of occupation and gender, calculate the mean age

In [142]:
users.groupBy('occupation', 'gender') \
    .agg(F.round(F.avg('age'), 2).alias('mean_age')) \
    .orderBy('occupation', 'gender') \
    .show()

+-------------+------+--------+
|   occupation|gender|mean_age|
+-------------+------+--------+
|administrator|     F|   40.64|
|administrator|     M|   37.16|
|       artist|     F|   30.31|
|       artist|     M|   32.33|
|       doctor|     M|   43.57|
|     educator|     F|   39.12|
|     educator|     M|    43.1|
|     engineer|     F|    29.5|
|     engineer|     M|    36.6|
|entertainment|     F|    31.0|
|entertainment|     M|    29.0|
|    executive|     F|    44.0|
|    executive|     M|   38.17|
|   healthcare|     F|   39.82|
|   healthcare|     M|    45.4|
|    homemaker|     F|   34.17|
|    homemaker|     M|    23.0|
|       lawyer|     F|    39.5|
|       lawyer|     M|    36.2|
|    librarian|     F|    40.0|
+-------------+------+--------+
only showing top 20 rows



### Step 8.  For each occupation present the percentage of women and men

In [29]:
spark.sql(
    """
    SELECT
        occupation,
        COUNT(*) AS total_count,
        ROUND(
            SUM(
                CASE
                    WHEN gender = "F" THEN 1
                    ELSE 0
                END
            ) / COUNT(*), 2
        ) AS percentage_of_women,
        ROUND(
            SUM(
                CASE
                    WHEN gender = "M" THEN 1
                    ELSE 0
                END
            ) / COUNT(*), 2
        ) AS percentage_of_men
    FROM users
    GROUP BY occupation
    ORDER BY occupation
    """
).show(number_of_occupations)

+-------------+-----------+-------------------+-----------------+
|   occupation|total_count|percentage_of_women|percentage_of_men|
+-------------+-----------+-------------------+-----------------+
|administrator|         79|               0.46|             0.54|
|       artist|         28|               0.46|             0.54|
|       doctor|          7|                0.0|              1.0|
|     educator|         95|               0.27|             0.73|
|     engineer|         67|               0.03|             0.97|
|entertainment|         18|               0.11|             0.89|
|    executive|         32|               0.09|             0.91|
|   healthcare|         16|               0.69|             0.31|
|    homemaker|          7|               0.86|             0.14|
|       lawyer|         12|               0.17|             0.83|
|    librarian|         51|               0.57|             0.43|
|    marketing|         26|               0.38|             0.62|
|         

In [30]:
spark.stop()